Sample images and reports from `raw/` into a train-valid-test split, ready for the data loader.


In [1]:
import pandas as pd
from brainclip.config import *

# Load the parsed radiology reports table.
# NOTE(review): `parsed_reports_ext` is not defined in this notebook; presumably it
# is a path constant pulled in by the star-import of brainclip.config — confirm.
parsed_reports_df = pd.read_csv(parsed_reports_ext)
# Bare expression so the notebook renders the frame as the cell output.
parsed_reports_df

Unnamed: 0.1,Unnamed: 0,ID,Dataset,StudyInstanceUID,ParsedImpressions,nlp_diagnosis,infarct,tumor,hemorrhage,others,normal,class
0,524,524,IN-BodyScanData-03,1.2.826.1.3680043.9.5282.150415.38559.80385592...,Multiple lacunar areas of acute infarct in the...,"infarct,loss of signal void,thrombosis,",1,0,0,0,0,infarct
1,923,923,IN-BodyScanData-03,1.2.826.1.3680043.9.5282.150415.67450.11067450...,Acute infarct in the right centrum semiovale. ...,"infarct,microangiopathy changes,atrophy,",1,0,0,0,0,infarct
2,793,793,IN-BodyScanData-03,1.2.826.1.3680043.9.5282.150415.30936.80309362...,Chronic ischemic small vessel changes in bilat...,"changes,infarcts,atrophy,hyperintensities,mast...",1,0,0,0,0,infarct
3,1027,1027,IN-BodyScanData-03,1.2.826.1.3680043.9.5282.150415.43331.11043331...,HYPERINTENSITY SEEN INVOLVING THE LEFT OCCIPIT...,"infarcts,atrophic changes,white matter hyperin...",1,0,0,0,0,infarct
4,597,597,IN-BodyScanData-03,1.2.826.1.3680043.9.5282.150415.17006.80170062...,Chronic ischaemic small vessel changes in bila...,"changes,infarcts,",1,0,0,0,0,infarct
...,...,...,...,...,...,...,...,...,...,...,...,...
257,907,907,IN-BodyScanData-03,1.2.826.1.3680043.9.5282.150415.13633.80136332...,No significant abnormality is seen in the brain.,,0,0,0,0,1,normal
258,170,170,IN-BodyScanData-03,1.2.826.1.3680043.9.5282.150415.5992.110059922...,No significant abnormality is seen in the brai...,,0,0,0,0,1,normal
259,922,922,IN-BodyScanData-03,1.2.826.1.3680043.9.5282.150415.45405.11045405...,No significant abnormality is seen in the brai...,,0,0,0,0,1,normal
260,236,236,IN-BodyScanData-03,1.2.826.1.3680043.9.5282.150415.35454.11035454...,No significant abnormality is seen in the brain.,,0,0,0,0,1,normal


In [2]:
# Two-class subset (infarct vs. normal) kept around as an alternative input.
# NOTE(review): replace() is applied to every column of the frame, and any row it
# relabels to "others" is removed by the isin() filter right after — confirm the
# relabelling step is still needed.
infarct_only = (
    parsed_reports_df
    .copy()
    .replace({"tumor":"others","hemorrhage":"others"})
    .loc[lambda frame: frame["class"].isin(["infarct","normal"])]
)

# The full multi-class table is what the rest of the notebook uses;
# swap in `infarct_only` here to run the two-class variant instead.
df_in_use = parsed_reports_df # infarct_only

In [3]:
from sklearn.model_selection import train_test_split

# Inputs are the study identifiers plus the report text; the target is the class label.
X = df_in_use[["ID","StudyInstanceUID","ParsedImpressions"]]
y = df_in_use["class"]

# Two chained stratified splits sharing the same size and seed:
# first hold out 10% as test, then 10% of the remainder as validation.
split_options = dict(test_size=0.1, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, **split_options)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, stratify=y_train, **split_options)

# Optional: to use a single train/valid split, uncomment the two lines below to fold the test set into validation

#X_valid = pd.concat([X_valid,X_test])
#y_valid = pd.concat([y_valid,y_test])

In [4]:
import numpy as np

# Training-set size, followed by the (labels, counts) pair for train, valid and test.
(len(y_train), *(np.unique(labels, return_counts=True) for labels in (y_train, y_valid, y_test)))

(211,
 (array(['hemorrhage', 'infarct', 'normal', 'others', 'tumor'], dtype=object),
  array([29, 40, 48, 47, 47])),
 (array(['hemorrhage', 'infarct', 'normal', 'others', 'tumor'], dtype=object),
  array([3, 5, 6, 5, 5])),
 (array(['hemorrhage', 'infarct', 'normal', 'others', 'tumor'], dtype=object),
  array([4, 5, 6, 6, 6])))

In [5]:
# Show the labels (with their row index) that ended up in the test and validation sets.
y_test, y_valid

(36        infarct
 260        normal
 2         infarct
 82          tumor
 46        infarct
 182        others
 43        infarct
 246        normal
 166        others
 86          tumor
 216        normal
 188        others
 72          tumor
 261        normal
 103         tumor
 158        others
 122    hemorrhage
 230        normal
 186        others
 115    hemorrhage
 50          tumor
 172        others
 136    hemorrhage
 242        normal
 6         infarct
 81          tumor
 128    hemorrhage
 Name: class, dtype: object,
 214        normal
 39        infarct
 110    hemorrhage
 113    hemorrhage
 248        normal
 35        infarct
 129    hemorrhage
 107         tumor
 37        infarct
 60          tumor
 257        normal
 255        normal
 249        normal
 80          tumor
 24        infarct
 155        others
 92          tumor
 167        others
 193        others
 74          tumor
 213        normal
 18        infarct
 157        others
 196        others
 N

In [6]:
import os
import json
import shutil
from brainclip.model.utils.processing import preprocess_image
from brainclip.model.utils.file_utils import concat_sequences

# File-name suffixes of the sequences collected for every study; the matching
# files are handed to `concat_sequences` in this order.
sequences = [
    "T2S_0.nii.gz",
    "T2_FLAIR_0.nii.gz",
    "DWI_B1000_0.nii.gz",
]
#sequences = sequences[0]

def access_dir(filename):
    """Return the "Nifti" sub-folder of the study folder `filename` under `raw_images_ext`."""
    return os.path.join(raw_images_ext, filename, "Nifti")

# NOTE(review): neither of these two names is used in the cells visible here —
# confirm they are still needed.
moved_images = []
all_images = os.listdir(raw_images_ext)

def _match_sequence_files(patient_folder, wanted_sequences):
    """Resolve one file in `patient_folder` for each entry of `wanted_sequences`.

    A file matches when the part of its name after the first underscore is exactly
    the wanted sequence name; the first match in `os.listdir` order is used.

    Returns the full paths of the matches, in the order of `wanted_sequences`.
    Raises FileNotFoundError if the folder does not exist or a sequence has no match.
    """
    available = os.listdir(patient_folder)
    matched_paths = []
    for seq in wanted_sequences:
        # partition() gives "" as the suffix of a name without an underscore, so such
        # files are simply never matched instead of raising IndexError.
        hit = next((name for name in available if name.partition("_")[2] == seq), None)
        if hit is None:
            raise FileNotFoundError(f"no '{seq}' file in {patient_folder}")
        matched_paths.append(os.path.join(patient_folder, hit))
    return matched_paths


def move_images_and_text(X, y, target_folder):
    """Export one data split: one combined image per study plus a JSON manifest.

    For every row of `X`, the files matching the module-level `sequences` list are
    looked up in the study's "Nifti" folder (see `access_dir`) and passed to
    `concat_sequences` together with the output path
    `<data_folder>/<target_folder>/image_<ID>.nii.gz`. Studies whose folder or
    sequence files cannot be found are reported, displayed, and skipped, so one
    incomplete study no longer aborts the export before the manifest is written.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns "ID", "StudyInstanceUID" and "ParsedImpressions".
    y : pandas.Series
        Labels, looked up by the index labels of `X`.
    target_folder : str
        Name of the split; used as the output sub-folder and as the manifest name.

    Returns
    -------
    None. The manifest is written to f"{data_folder}{target_folder}.json". Its keys
    are the row positions in `X` (skipped rows leave gaps) and each value holds
    "name" (output image path), "report" (report text) and "label".
    """
    data_dict = {}

    # Create the output folder up front so that a missing *output* folder cannot
    # surface as FileNotFoundError below and be reported as a missing input.
    os.makedirs(os.path.join(data_folder, target_folder), exist_ok=True)

    for n, (idx, row) in enumerate(X.iterrows()):
        img_id = row["ID"]
        img_name, txt = row["StudyInstanceUID"], row["ParsedImpressions"]
        patient_folder = access_dir(img_name)

        try:
            sequences_list = _match_sequence_files(patient_folder, sequences)
            for seq_path in sequences_list:
                print(seq_path)

            target_path = os.path.join(data_folder, target_folder, f"image_{img_id}.nii.gz")
            concat_sequences(sequences_list, target_path)
            print(target_path)
            # append label
            data_dict[n] = {
                "name": target_path,
                "report": txt,
                "label": y.loc[idx],
            }

        except FileNotFoundError as err:
            print(f"{target_folder}: Couldn't find {patient_folder}, {img_name}")
            print(f"  ({err})")
            display(X[X["StudyInstanceUID"] == img_name])

    # NOTE(review): unlike the image paths above, this path is built by plain string
    # concatenation, so it only lands inside `data_folder` when that value ends with
    # a path separator — confirm against the config and the loader that reads it.
    with open(f"{data_folder}{target_folder}.json", "w") as f:
        json.dump(data_dict, f)
        

In [7]:
# Export the training split: combined images under "train" plus the "train.json" manifest.
move_images_and_text(X_train, y_train, "train")

/datadrive_m2/alice/brain-CLIP/data/raw/images/IN-BodyScanData-03-Alice-265/1.2.826.1.3680043.9.5282.150415.37583.11037583212327/Nifti/1.3.12.2.1107.5.2.40.39073.2021070320055236707799423.0.0.0_T2S_0.nii.gz
/datadrive_m2/alice/brain-CLIP/data/raw/images/IN-BodyScanData-03-Alice-265/1.2.826.1.3680043.9.5282.150415.37583.11037583212327/Nifti/1.3.12.2.1107.5.2.40.39073.2021070319581822120999000.0.0.0_T2_FLAIR_0.nii.gz
/datadrive_m2/alice/brain-CLIP/data/raw/images/IN-BodyScanData-03-Alice-265/1.2.826.1.3680043.9.5282.150415.37583.11037583212327/Nifti/1.3.12.2.1107.5.2.40.39073.2021070319531362320596794.0.0.0_DWI_B1000_0.nii.gz
/datadrive_m2/alice/brain-CLIP/data/train/image_550.nii.gz
/datadrive_m2/alice/brain-CLIP/data/raw/images/IN-BodyScanData-03-Alice-265/1.2.826.1.3680043.9.5282.150415.4439.11004439222327/Nifti/1.3.12.2.1107.5.2.40.39073.2022012110491841624315377.0.0.0_T2S_0.nii.gz
/datadrive_m2/alice/brain-CLIP/data/raw/images/IN-BodyScanData-03-Alice-265/1.2.826.1.3680043.9.5282.15

In [8]:
# Export the validation split: combined images under "valid" plus the "valid.json" manifest.
move_images_and_text(X_valid, y_valid, "valid")

/datadrive_m2/alice/brain-CLIP/data/raw/images/IN-BodyScanData-03-Alice-265/1.2.826.1.3680043.9.5282.150415.36618.8036618222327/Nifti/1.3.12.2.1107.5.2.40.39004.2022071921163195368773824.0.0.0_T2S_0.nii.gz
/datadrive_m2/alice/brain-CLIP/data/raw/images/IN-BodyScanData-03-Alice-265/1.2.826.1.3680043.9.5282.150415.36618.8036618222327/Nifti/1.3.12.2.1107.5.2.40.39004.2022071921143381338373672.0.0.0_T2_FLAIR_0.nii.gz
/datadrive_m2/alice/brain-CLIP/data/raw/images/IN-BodyScanData-03-Alice-265/1.2.826.1.3680043.9.5282.150415.36618.8036618222327/Nifti/1.3.12.2.1107.5.2.40.39004.2022071921060278455570749.0.0.0_DWI_B1000_0.nii.gz
/datadrive_m2/alice/brain-CLIP/data/valid/image_1020.nii.gz
/datadrive_m2/alice/brain-CLIP/data/raw/images/IN-BodyScanData-03-Alice-265/1.2.826.1.3680043.9.5282.150415.19090.8019090222327/Nifti/1.3.12.2.1107.5.2.40.39004.2022040314505722412525221.0.0.0_T2S_0.nii.gz
/datadrive_m2/alice/brain-CLIP/data/raw/images/IN-BodyScanData-03-Alice-265/1.2.826.1.3680043.9.5282.1504

In [9]:
# Export the test split: combined images under "test" plus the "test.json" manifest.
move_images_and_text(X_test, y_test, "test")

/datadrive_m2/alice/brain-CLIP/data/raw/images/IN-BodyScanData-03-Alice-265/1.2.826.1.3680043.9.5282.150415.46825.11046825212327/Nifti/1.3.12.2.1107.5.2.40.39073.2021082415515434602751156.0.0.0_T2S_0.nii.gz
/datadrive_m2/alice/brain-CLIP/data/raw/images/IN-BodyScanData-03-Alice-265/1.2.826.1.3680043.9.5282.150415.46825.11046825212327/Nifti/1.3.12.2.1107.5.2.40.39073.2021082415440854620650719.0.0.0_T2_FLAIR_0.nii.gz
/datadrive_m2/alice/brain-CLIP/data/raw/images/IN-BodyScanData-03-Alice-265/1.2.826.1.3680043.9.5282.150415.46825.11046825212327/Nifti/1.3.12.2.1107.5.2.40.39073.2021082415375841442549185.0.0.0_DWI_B1000_0.nii.gz
/datadrive_m2/alice/brain-CLIP/data/test/image_985.nii.gz
/datadrive_m2/alice/brain-CLIP/data/raw/images/IN-BodyScanData-03-Alice-265/1.2.826.1.3680043.9.5282.150415.35454.11035454222327/Nifti/1.3.12.2.1107.5.2.40.39073.2022072822410041150060419.0.0.0_T2S_0.nii.gz
/datadrive_m2/alice/brain-CLIP/data/raw/images/IN-BodyScanData-03-Alice-265/1.2.826.1.3680043.9.5282.15