# Split the dataset into train and test sets
Care is taken not to split data from the same origin (sample) across both the train and test sets.


### Load dependencies and define helper functions

In [1]:
import pandas as pd

import os
from skimage.io import imread


def get_file_list(directory):
    """
    List the TIFF image file names found directly inside ``directory``.

    Only names ending with ".tif" are kept, and any name containing
    "relabel" is skipped. The names are returned without the directory
    prefix and in sorted order, so the result does not depend on the order
    in which the operating system lists the directory.
    """
    return sorted(
        name
        for name in os.listdir(directory)
        if name.endswith(".tif") and "relabel" not in name
    )


def file_list_info_df(data_dir, img, data_split_csv, data_dict):
    """
    Append one row of information about the image ``img`` to ``data_dict``.

    The file name is matched against ``data_split_csv``: a row matches when
    the value in its first column (the label id) occurs as a substring of
    ``img``; the value in its second column is taken as the sample id. The
    image is read from ``data_dir`` and its first three shape entries plus
    its maximum and minimum values are recorded.

    Exactly one entry is appended to every list per call, so all lists in
    ``data_dict`` keep the same length and can be turned into a DataFrame:
    only the first matching row is used, and if no row matches, the label id
    and sample id are recorded as ``None``.

    Args:
        data_dir: Directory that contains the image file.
        img: File name of the image inside ``data_dir``.
        data_split_csv: DataFrame whose first column holds label ids and
            whose second column holds sample ids.
        data_dict: Dict of lists with the keys "img_name", "label_id",
            "sample_id", "z_dim", "x_dim", "y_dim", "max_value" and
            "min_value". It is modified in place.

    Returns:
        The same ``data_dict`` object, with one more entry in every list.
    """
    # Read and measure the image before touching data_dict, so a failed read
    # or an unexpected shape cannot leave a half-written row behind.
    image = imread(os.path.join(data_dir, img))
    # NOTE(review): assumes the image has at least three axes, ordered
    # (z, x, y) -- confirm against the data.
    z_dim = image.shape[0]
    x_dim = image.shape[1]
    y_dim = image.shape[2]
    max_value = image.max()
    min_value = image.min()

    lbl_id = None
    sample_id = None
    for _, row in data_split_csv.iterrows():
        if row.iloc[0] in img:
            lbl_id = row.iloc[0]
            sample_id = row.iloc[1]
            # Stop at the first match: recording several matches would add
            # several name/label entries for a single set of image stats.
            break

    data_dict["img_name"].append(img)
    data_dict["label_id"].append(lbl_id)
    data_dict["sample_id"].append(sample_id)
    data_dict["z_dim"].append(z_dim)
    data_dict["x_dim"].append(x_dim)
    data_dict["y_dim"].append(y_dim)
    data_dict["max_value"].append(max_value)
    data_dict["min_value"].append(min_value)

    return data_dict

In [2]:
# Semicolon-separated table with a header row; it is passed to
# file_list_info_df to look up the label id and sample id of each image.
data_split_csv_path = "data_identifier.csv"

# Raw string: the Windows path contains backslash pairs ("\A", "\_", "\m")
# that are not valid escape sequences and make Python emit an
# invalid-escape warning when written in a normal string literal.
directory = r"D:\AI4LifeOpenCAll\_Full_data\masks_labels"

data_split_csv = pd.read_csv(data_split_csv_path, header=0, sep=";")

img_in_dir_list = get_file_list(directory)

# One list per output column; file_list_info_df appends to these in place.
data_dict = {
    "img_name": [],
    "label_id": [],
    "sample_id": [],
    "z_dim": [],
    "x_dim": [],
    "y_dim": [],
    "max_value": [],
    "min_value": [],
}

for img in img_in_dir_list:
    data_dict = file_list_info_df(directory, img, data_split_csv, data_dict)

# Per-image table, written next to the image directory as
# "<directory name>_info.csv".
df = pd.DataFrame.from_dict(data_dict)
df.to_csv(
    os.path.join(os.path.dirname(directory), os.path.basename(directory) + "_info.csv"),
    index=False,
)

# Per-sample summary: how many images each sample has, how many of those
# have a maximum value of 0, and the smallest size seen along each dimension.
# Named aggregation yields the final column names directly.
summary_df = (
    df.groupby("sample_id")
    .agg(
        FOV_count=("img_name", "count"),
        empty_FOV_count=("max_value", lambda values: (values == 0).sum()),
        min_x_dim=("x_dim", "min"),
        min_y_dim=("y_dim", "min"),
        min_z_dim=("z_dim", "min"),
    )
    .reset_index()
)

print(summary_df)

# Write the summary next to the image directory as
# "<directory name>_summary.csv".
summary_csv_path = os.path.join(
    os.path.dirname(directory), f"{os.path.basename(directory)}_summary.csv"
)
summary_df.to_csv(summary_csv_path, index=False)

  directory = "D:\AI4LifeOpenCAll\_Full_data\masks_labels"


   sample_id  FOV_count  empty_FOV_count  min_x_dim  min_y_dim  min_z_dim
0    Pt. 101          9                6       2304       2304        119
1    Pt. 108          2                0       2048       2048        119
2    Pt. 116          2                0       2304       2304         73
3    Pt. 149          1                0       2304       2304         73
4     Pt. 20         11                0       2304       2304        119
5     Pt. 27         17                5       2304       2304        119
6    Pt. 329          1                0       2304       2304         73
7    Pt. 343          4                0       2304       2304         80
8    Pt. 367         22                0        396        399         31
9     Pt. 73         36               21       2304       2304        119
10    Pt. 91          3                0       2304       2304        119
