Sample images and reports from `raw/` into a train-valid-test split, ready for the data loader.


In [None]:
import pandas as pd
from brainclip.config import *

# Load the parsed reports table from the path bound to `parsed_reports_ext`
# (that name comes from the star import of brainclip.config above).
parsed_reports_df = pd.read_csv(parsed_reports_ext)
# Last expression of the cell: the notebook renders the frame for inspection.
parsed_reports_df

In [None]:
# Alternative view of the reports: "tumor" and "hemorrhage" values are relabelled
# as "others", then only rows whose class is "infarct" or "normal" are kept.
# `replace` returns a new frame, so `parsed_reports_df` itself is left untouched.
infarct_only = (
    parsed_reports_df
    .replace({"tumor":"others","hemorrhage":"others"})
    .loc[lambda frame: frame["class"].isin(["infarct","normal"])]
)

# The full table is the one used downstream; assign `infarct_only` here instead
# to restrict the splits to the infarct/normal rows.
df_in_use = parsed_reports_df

In [None]:
from sklearn.model_selection import train_test_split

# Inputs are the identifier columns plus the report text; the target is "class".
X = df_in_use[["ID","StudyInstanceUID","ParsedImpressions"]]
y = df_in_use["class"]

# First hold out 10% of all rows as the test set, stratified on the class ...
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)
# ... then hold out 10% of the remaining rows as the validation set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    stratify=y_train,
    random_state=42,
)

# To work with a single (train, valid) split, fold the test rows into the
# validation set by enabling the two lines below.

#X_valid = pd.concat([X_valid,X_test])
#y_valid = pd.concat([y_valid,y_test])

In [None]:
import numpy as np

# Training-set size, followed by the (labels, counts) pair of the train,
# validation and test label series, in that order.
(
    len(y_train),
    *(
        np.unique(labels, return_counts=True)
        for labels in (y_train, y_valid, y_test)
    ),
)

In [None]:
# Render the test and validation label series for a quick visual check.
y_test, y_valid

In [None]:
import os
import json
import shutil
from brainclip.model.utils.processing import preprocess_image
from brainclip.model.utils.file_utils import concat_sequences

# File-name suffixes looked up in every study folder; the part of a file name
# after its first underscore has to equal one of these.
sequences = [
    "T2S_0.nii.gz",
    "T2_FLAIR_0.nii.gz",
    "DWI_B1000_0.nii.gz",
]
#sequences = sequences[0]


def access_dir(filename):
    """Return the path of the ``Nifti`` folder of study *filename* under ``raw_images_ext``."""
    return os.path.join(raw_images_ext, filename, "Nifti")


moved_images = []
# Names of all entries currently present in the raw images directory.
all_images = os.listdir(raw_images_ext)

def move_images_and_text(X, y, target_folder):
    """Build one combined image per study and write the JSON index of a split.

    For every row of ``X`` the files matching the names in ``sequences`` are
    looked up in the study's ``Nifti`` folder and handed to
    ``concat_sequences`` together with the target path
    ``<data_folder>/<target_folder>/image_<ID>.nii.gz``. Once all rows are
    processed, an index mapping the row position to image path, report text
    and label is dumped as JSON.

    Studies whose folder cannot be found, or that lack one of the required
    sequences, are reported and left out of the index instead of aborting the
    whole run. Because entries are keyed by row position, skipped studies
    leave gaps in the keys.

    Args:
        X: DataFrame providing the ``ID``, ``StudyInstanceUID`` and
            ``ParsedImpressions`` columns.
        y: Labels, looked up with the index values of ``X``.
        target_folder: Name of the split (e.g. ``"train"``); used both as the
            image sub-folder and as the stem of the JSON file name.
    """
    # Create the output folder up front so that a missing split directory is
    # not later mistaken for a missing patient folder by the handler below.
    os.makedirs(os.path.join(data_folder, target_folder), exist_ok=True)

    data_dict = {}

    for n, (idx, row) in enumerate(X.iterrows()):
        img_id = row["ID"]
        img_name, txt = row["StudyInstanceUID"], row["ParsedImpressions"]
        patient_folder = access_dir(img_name)

        try:
            available = os.listdir(patient_folder)

            sequences_list = []
            missing = []
            for seq in sequences:
                # Compare everything after the first underscore; partition()
                # yields "" for names without one, so stray files never match
                # and never raise.
                matches = [s for s in available if s.partition("_")[2] == seq]
                if not matches:
                    missing.append(seq)
                    continue
                seq_path = os.path.join(patient_folder, matches[0])
                print(seq_path)
                sequences_list.append(seq_path)

            if missing:
                # An incomplete study is skipped like a missing folder is,
                # rather than crashing before the index gets written.
                print(f"{target_folder}: Missing sequence(s) {missing} in {patient_folder}, {img_name}")
                continue

            target_path = os.path.join(data_folder, target_folder, f"image_{img_id}.nii.gz")
            concat_sequences(sequences_list, target_path)
            print(target_path)
        except FileNotFoundError:
            print(f"{target_folder}: Couldn't find {patient_folder}, {img_name}")
            display(X[X["StudyInstanceUID"] == img_name])
            continue

        # Register the study only after its image was produced.
        data_dict[n] = {
            "name": target_path,
            "report": txt,
            "label": y.loc[idx],
        }

    # The index path is plain string concatenation, so `data_folder` is
    # expected to end with a path separator.
    with open(f"{data_folder}{target_folder}.json", "w") as f:
        json.dump(data_dict, f)
        

In [None]:
# Export the training split: images under "train", index in the "train" JSON file.
move_images_and_text(
    X=X_train,
    y=y_train,
    target_folder="train",
)

In [None]:
# Export the validation split: images under "valid", index in the "valid" JSON file.
move_images_and_text(
    X=X_valid,
    y=y_valid,
    target_folder="valid",
)

In [None]:
# Export the test split: images under "test", index in the "test" JSON file.
move_images_and_text(
    X=X_test,
    y=y_test,
    target_folder="test",
)