# 0. Split the dataset into train and test subsets
Data from the same origin should be kept in the same subset to avoid data leakage.

This notebook cross-references the image file names with the identifiers in the CSV file to get the number of FOVs (Fields of View) for each sample. It also counts the number of empty FOVs per sample.

These counts can then be used to split the dataset into train and test subsets, usually 80% for training and 20% for testing.


## 0.1. Load libraries and custom functions

Load the `pandas`, `os`, and `skimage` libraries.


In [2]:
import pandas as pd
import os
from skimage.io import imread


def get_file_list(directory: str) -> list[str]:
    """
    Get a sorted list of all image files (.tif, .tiff) in the specified directory.

    The extension check is case-insensitive, so files such as "scan.TIF" are
    included as well. The result is sorted because os.listdir returns entries
    in arbitrary order, which would otherwise make downstream outputs
    non-reproducible between runs or machines.

    Parameters:
    - directory: str, path to the directory containing image files.

    Returns:
    - list of image file names (not full paths) with .tif or .tiff extensions,
      sorted alphabetically.
    """
    return sorted(
        name
        for name in os.listdir(directory)
        if name.lower().endswith((".tif", ".tiff"))
    )


def file_list_info_df(
    data_dir: str, img: str, identifier_csv: pd.DataFrame, data_dict: dict
) -> dict:
    """
    Append the information of one image file to data_dict.

    Every row of identifier_csv whose label ID (first column) occurs as a
    substring of the image file name adds one entry to each list in
    data_dict. The image is read from disk at most once, and only if at
    least one label ID matches.

    Parameters:
    - data_dir: str, path to the directory containing image files.
    - img: str, name of the image file.
    - identifier_csv: pd.DataFrame, DataFrame with the label ID in the first
      column and the sample ID in the second column.
    - data_dict: dict, dictionary of lists with the keys "img_name",
      "label_id", "sample_id", "z_dim", "x_dim", "y_dim", "max_value" and
      "min_value". It is modified in place.

    Returns:
    - data_dict: dict, the same dictionary, updated with file information.

    Raises:
    - IndexError: if a matching image has fewer than three dimensions.
    """
    # Filled on the first match so the file is read only once, and never read
    # at all for images that match no label ID.
    image_info = None

    for row in identifier_csv.itertuples(index=False):
        lbl_id, sample_id = row[0], row[1]

        # Label IDs may be parsed by pandas as numbers, or be NaN for empty
        # cells; compare their text form and skip missing values instead of
        # failing on the substring test.
        if pd.isna(lbl_id) or str(lbl_id) not in img:
            continue

        if image_info is None:
            image = imread(os.path.join(data_dir, img))
            # The first three axes are stored as z, x and y, in that order.
            image_info = {
                "z_dim": image.shape[0],
                "x_dim": image.shape[1],
                "y_dim": image.shape[2],
                "max_value": image.max(),
                "min_value": image.min(),
            }

        data_dict["img_name"].append(img)
        data_dict["label_id"].append(lbl_id)
        data_dict["sample_id"].append(sample_id)
        for key, value in image_info.items():
            data_dict[key].append(value)

    return data_dict

## 0.2. Code


### User input variables

In [None]:
# Folder that holds the .tif/.tiff images to analyse.
# On Windows, write the path as a raw string (r"D:\data\masks") or use forward
# slashes, so that backslashes are not interpreted as escape sequences.
directory = "path/to/your/directory"

# CSV file that maps label IDs to sample IDs.
identifier_csv_path = "path/to/your/data_split.csv"

### Code to run

In [None]:
# Load the identifier CSV file (semicolon-separated, first row is the header)
identifier_csv = pd.read_csv(identifier_csv_path, header=0, sep=";")

# Get the list of image files in the specified directory
img_in_dir_list = get_file_list(directory)

# Initialize a dictionary of lists; each matched image adds one entry per key
data_dict = {
    "img_name": [],
    "label_id": [],
    "sample_id": [],
    "z_dim": [],
    "x_dim": [],
    "y_dim": [],
    "max_value": [],
    "min_value": [],
}

# Iterate through the list of image files and populate the data_dict
for img in img_in_dir_list:
    data_dict = file_list_info_df(directory, img, identifier_csv, data_dict)

# Convert the data_dict dictionary to a DataFrame (one row per matched image)
df = pd.DataFrame.from_dict(data_dict)

# Both output files are written next to the image directory and are named
# after it. normpath removes a trailing path separator first; without it,
# basename would be empty and the files would be written inside the image
# directory as "_info.csv" and "_summary.csv".
normalized_directory = os.path.normpath(directory)
output_prefix = os.path.join(
    os.path.dirname(normalized_directory), os.path.basename(normalized_directory)
)

# Save the per-image DataFrame to a CSV file
df.to_csv(output_prefix + "_info.csv", index=False)

# Group by "sample_id" and calculate summary statistics per sample:
# the number of images (FOVs), the number of empty FOVs (images whose maximum
# value is 0), and the minimum size along each dimension.
summary_df = (
    df.groupby("sample_id")
    .agg(
        FOV_count=("img_name", "count"),
        empty_FOV_count=("max_value", lambda values: (values == 0).sum()),
        min_x_dim=("x_dim", "min"),
        min_y_dim=("y_dim", "min"),
        min_z_dim=("z_dim", "min"),
    )
    .reset_index()
)

# Print the summary DataFrame
print("Summary DataFrame:")
print(summary_df)

# Save the summary DataFrame to a CSV file
summary_csv_path = output_prefix + "_summary.csv"
summary_df.to_csv(summary_csv_path, index=False)

  directory = "D:\AI4LifeOpenCAll\_Full_data\masks_labels"


   sample_id  FOV_count  empty_FOV_count  min_x_dim  min_y_dim  min_z_dim
0    Pt. 101          9                6       2304       2304        119
1    Pt. 108          2                0       2048       2048        119
2    Pt. 116          2                0       2304       2304         73
3    Pt. 149          1                0       2304       2304         73
4     Pt. 20         11                0       2304       2304        119
5     Pt. 27         17                5       2304       2304        119
6    Pt. 329          1                0       2304       2304         73
7    Pt. 343          4                0       2304       2304         80
8    Pt. 367         22                0        396        399         31
9     Pt. 73         36               21       2304       2304        119
10    Pt. 91          3                0       2304       2304        119
