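Sample images and reports from `raw/` into stratified train and validation splits, ready for the data loader. A test split is also drawn, but it is currently merged into the validation split and is not written out separately.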


In [11]:
import pandas as pd
from brainclip.config import *

# Load the parsed reports table. `parsed_reports` is not defined in this
# notebook; presumably it is a path supplied by the star import above.
parsed_reports_df = pd.read_csv(parsed_reports)
# Preview the first rows only instead of rendering the whole frame.
# NOTE(review): the "Unnamed: 0" / "Unnamed: 0.1" columns look like index
# columns persisted by earlier to_csv round-trips — confirm and drop upstream.
parsed_reports_df.head(10)

Unnamed: 0.1,Unnamed: 0,Dataset,StudyInstanceUID,ParsedImpressions,class
0,720,IN-BodyScanData-03,1.2.826.1.3680043.9.5282.150415.40716.11040716...,Acute infarct in the right corona radiata Chro...,infarct
1,256,IN-BodyScanData-03,1.2.826.1.3680043.9.5282.150415.43774.11043774...,Patchy areas of acute infarct in left caudate ...,infarct
2,822,IN-BodyScanData-03,1.2.826.1.3680043.9.5282.150415.26309.80263092...,Mild chronic ischemic small vessel changes in ...,infarct
3,1030,IN-BodyScanData-03,1.2.826.1.3680043.9.5282.150415.27625.80276252...,Asymmetrical moderate dilatation of right vent...,infarct
4,800,IN-BodyScanData-03,1.2.826.1.3680043.9.5282.150415.23406.80234062...,Chronic ischemic small vessel changes in bilat...,infarct
5,610,IN-BodyScanData-03,1.2.826.1.3680043.9.5282.150415.26626.80266262...,Chronic ischemic small vessel changes in bilat...,infarct
6,815,IN-BodyScanData-03,1.2.826.1.3680043.9.5282.150415.16379.11016379...,Acute infarct involving left lentiform form an...,infarct
7,796,IN-BodyScanData-03,1.2.826.1.3680043.9.5282.150415.39304.11039304...,Acute infarcts involving in left centrum semio...,infarct
8,860,IN-BodyScanData-03,1.2.826.1.3680043.9.5282.150415.42976.11042976...,Chronic infarcts with gliosis in right occipit...,infarct
9,524,IN-BodyScanData-03,1.2.826.1.3680043.9.5282.150415.38559.80385592...,Multiple lacunar areas of acute infarct in the...,infarct


In [2]:
#infarct_only = parsed_reports_df.copy()
#infarct_only = infarct_only.replace({"tumor":"others","hemorrhage":"others"})


In [14]:
from sklearn.model_selection import train_test_split

# Inputs are the study identifier plus its parsed report text; the target is the class label.
X = parsed_reports_df[["StudyInstanceUID", "ParsedImpressions"]]
y = parsed_reports_df["class"]

# Both stages hold out 10% with the same fixed seed.
split_options = {"test_size": 0.1, "random_state": 42}

# Stage 1: stratified hold-out of the test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, **split_options)
# Stage 2: stratified hold-out of the validation rows from what is left.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, stratify=y_train, **split_options
)

# Collapse to a single (train, valid) partition by appending the test rows to
# the validation set. X_test / y_test themselves are left unchanged.
X_valid = pd.concat([X_valid, X_test])
y_valid = pd.concat([y_valid, y_test])

In [15]:
import numpy as np

# Training-set size followed by the (classes, counts) pair of each split.
class_counts = [
    np.unique(split_labels, return_counts=True)
    for split_labels in (y_train, y_valid, y_test)
]
(len(y_train), *class_counts)

(40,
 (array(['hemorrhage', 'infarct', 'normal', 'others', 'tumor'], dtype=object),
  array([8, 8, 8, 8, 8])),
 (array(['hemorrhage', 'infarct', 'normal', 'others', 'tumor'], dtype=object),
  array([2, 2, 2, 2, 2])),
 (array(['hemorrhage', 'infarct', 'normal', 'others', 'tumor'], dtype=object),
  array([1, 1, 1, 1, 1])))

In [16]:
# Show the test labels next to the validation labels. y_valid was built by
# concatenating y_test onto it, so the test rows appear in both series.
y_test, y_valid

(43        normal
 2        infarct
 25    hemorrhage
 38        others
 49         tumor
 Name: class, dtype: object,
 19    hemorrhage
 17         tumor
 47        normal
 5        infarct
 31        others
 43        normal
 2        infarct
 25    hemorrhage
 38        others
 49         tumor
 Name: class, dtype: object)

In [17]:
import os
import json
import shutil
from brainclip.model.utils.processing import preprocess_image

# Sequence file names that can be matched inside a study's "Nifti" folder.
available_sequences = ["DWI_B1000_0.nii.gz", "T2_FLAIR_0.nii.gz", "T2S_0.nii.gz"]
# Only the first entry is used for now. `sequences` is therefore a single
# file-name string (not a list) that is substring-matched against folder entries.
sequences = available_sequences[0]


def access_dir(filename):
    """Return the path of the "Nifti" sub-folder of study `filename` under `raw_images`."""
    return os.path.join(raw_images, filename, "Nifti")


# NOTE(review): neither `moved_images` nor `all_images` is read in the visible
# cells, and the listing below uses a machine-specific absolute path — confirm
# whether they are still needed, and derive the path from configuration if so.
moved_images = []
all_images = os.listdir("/datadrive_m2/alice/brain-CLIP/data/raw/images/Alice_IN-BodyScanData-03")

def move_images_and_text(X, y, target_folder):
    """Copy each study's selected sequence image into a split folder and write the split's JSON index.

    For every row of `X`, the study folder returned by `access_dir` is listed
    and every entry whose name contains the module-level `sequences` string is
    copied to ``<data_folder>/<target_folder>/image_<idx>.nii.gz``, passed to
    `preprocess_image`, and recorded together with its report text and label.
    Studies whose folder does not exist are reported and skipped.

    Parameters
    ----------
    X : pandas.DataFrame
        Rows to export; must provide the "StudyInstanceUID" and
        "ParsedImpressions" columns.
    y : pandas.Series
        Class labels, looked up with the index labels of `X`.
    target_folder : str
        Split name (e.g. "train", "valid"). Used both as the image
        sub-folder and as the stem of the JSON file.

    Side effects
    ------------
    Creates the split folder if needed, copies and preprocesses images, and
    writes ``f"{data_folder}{target_folder}.json"`` mapping the row position
    in `X` to ``{"name", "report", "label"}``.
    """
    target_dir = os.path.join(data_folder, target_folder)
    # Create the split folder up front. Without it every copy fails with a
    # FileNotFoundError that would be misreported as a missing patient folder.
    os.makedirs(target_dir, exist_ok=True)

    data_dict = {}

    for n, (idx, row) in enumerate(X.iterrows()):
        img_name, txt = row["StudyInstanceUID"], row["ParsedImpressions"]
        patient_folder = access_dir(img_name)

        # Guard only the directory listing: that is the failure the message
        # below describes. Errors raised while copying or preprocessing are
        # genuine problems and are allowed to propagate.
        try:
            folder_entries = os.listdir(patient_folder)
        except FileNotFoundError:
            print(f"{target_folder}: Couldn't find {patient_folder}, {img_name}")
            display(X[X["StudyInstanceUID"] == img_name])
            continue

        for seq in folder_entries:  # Modify this to add more sequences
            if sequences not in seq:
                continue

            img_path = os.path.join(patient_folder, seq)
            target_path = os.path.join(target_dir, f"image_{idx}.nii.gz")
            # Copy (the raw file is left in place), then preprocess the copy.
            shutil.copy(img_path, target_path)
            preprocess_image(target_path)

            # Keyed by the row position in X: skipped studies leave gaps, and
            # several matching files in one folder overwrite the same entry.
            data_dict[n] = {
                "name": target_path,
                "report": txt,
                "label": y.loc[idx],
            }

    # NOTE(review): this path is built by plain concatenation, so `data_folder`
    # is assumed to end with a path separator — confirm against the config.
    with open(f"{data_folder}{target_folder}.json", "w") as f:
        json.dump(data_dict, f)
        

In [18]:
# Export every split that is actually used. X_valid / y_valid already include
# the test rows, so the separate "test" export below stays disabled.
for split_name, split_X, split_y in (
    ("train", X_train, y_train),
    ("valid", X_valid, y_valid),
):
    move_images_and_text(split_X, split_y, split_name)
#move_images_and_text(X_test, y_test, "test")