# Python notebook to convert DICOM files from Ambra to NIfTI

## Import Libraries and Set directories

In [None]:
#Import Libraries
import os
import glob
import SimpleITK as sitk
import sys, time
import numpy as np
import pandas as pd
import re
import shutil

In [None]:
def clear_dir(target_dir):
    """Delete everything inside ``target_dir`` while keeping the directory itself.

    Regular files and symbolic links are unlinked with ``os.remove``; real
    sub-directories are removed recursively with ``shutil.rmtree``.
    """
    with os.scandir(target_dir) as listing:
        for item in listing:
            # Links are tested before the directory check, so a link is
            # unlinked with os.remove rather than handed to rmtree.
            removable_as_file = item.is_file() or item.is_symlink()
            if removable_as_file:
                os.remove(item.path)
                continue
            if item.is_dir():
                shutil.rmtree(item.path)

In [None]:
# Set input/output directories (both live directly under the working directory)
os.chdir('/home/jovyan')
cwd = os.getcwd()
dir_MC = f'{cwd}/UPLOAD'
dir_train = f'{cwd}/CCHMC_NIFTI'
# The first tuple produced by os.walk() describes the top level only;
# element [1] of it is the list of immediate sub-directory names.
dirs_MC = next(os.walk(dir_MC))[1]
dirs_train = next(os.walk(dir_train))[1]
print(len(dirs_train), len(dirs_MC))

## Define some handy functions for creating DICOM

In [None]:
# Keep only visible names (no leading '.'); np.unique also sorts and removes
# duplicates. Slice assignment rewrites the existing list objects in place.
dirs_MC[:] = np.unique([name for name in dirs_MC if not name.startswith('.')])
dirs_train[:] = np.unique([name for name in dirs_train if not name.startswith('.')])
# Partition the MC studies by whether the same name also exists in dirs_train.
dirs_trainMC = [study for study in dirs_MC if study in dirs_train]
dirs_testMC = [study for study in dirs_MC if study not in dirs_train]
# Disabled sequence-name extraction, kept for reference:
# seqs = list()
# for d in dirs:
#     #match = re.sub('\d+$','',d) # Pattern for Duke studies
#     match = re.sub('\d*_\d*$|\d*$|\d*_b$','',d) # Pattern for UCSD studies
#     if match:
#         seqs.append(match)
# unique_seqs = np.unique(seqs)

# Header row followed by the matching counts
print('MC studies', ' CCHMC train', ' CCHMC test in MC', ' CCHMC train in MC', sep=' ')
print(
    *(len(group) for group in (dirs_MC, dirs_train, dirs_testMC, dirs_trainMC)),
    sep='   ',
)

In [None]:
# Load CSV with patient details.
# index_col=False stops pandas from using the first column as the row index;
# the file is decoded as latin-1.
df = pd.read_csv('patient_details.csv',index_col=False,encoding='latin-1')
# Show up to the first 100 rows for a visual check
display(df.head(100))

In [None]:
# Row count of the patient-details table
print('Total number of entries in patient_details.csv', df.shape[0])

In [None]:
#np.unique(df['Field'])
# Frequency of each distinct value in the 'Field' column
a = df['Field'].value_counts()
# Count stored under label 3, or None when that label is absent
# (presumably the 3 T field strength -- TODO confirm)
a.get(3)

In [None]:
# Calculate demographics for a list of patients
def _format_mean_std(values):
    """Format a numeric Series as ``mean ± std``, one decimal place each."""
    return f"{values.mean():.1f} \u00B1 {values.std():.1f}"


def calculate_demographics(pats,df,indexname):
    """Summarise demographics for the patients listed in ``pats``.

    Args:
        pats: Collection of patient keys, matched against ``df['Key']``.
        df: Patient-details table. The columns read are ``Key``, ``WEIGHT``,
            ``HEIGHT``, ``Age (DOS)``, ``GENDER``, ``RACE``, ``Vendor`` and
            ``Field``.
        indexname: Label returned as the first element of the row.

    Returns:
        list: ``[indexname, N, age, height, weight, females, whites, GEs,
        Field_3s]``. ``N`` is the number of distinct matching keys; ``age``,
        ``height`` and ``weight`` are ``"mean ± std"`` strings; the last four
        entries are integer counts of rows whose ``GENDER`` is ``'Female'``,
        ``RACE`` is ``'WHITE'``, ``Vendor`` is ``'GE MEDICAL SYSTEMS'`` and
        ``Field`` equals ``3``. A count is ``0`` when no row matches.
    """
    # A key may appear on several rows; only its first row is counted.
    df_pats = df[df['Key'].isin(pats)].drop_duplicates(subset=['Key'],keep='first')

    def count_equal(column, value):
        # A direct comparison yields 0 for an absent category, whereas
        # value_counts().get(value) would yield None.
        return int((df_pats[column] == value).sum())

    return [
        indexname,
        len(df_pats),
        _format_mean_std(df_pats["Age (DOS)"]),
        _format_mean_std(df_pats["HEIGHT"]),
        _format_mean_std(df_pats["WEIGHT"]),
        count_equal('GENDER', 'Female'),
        count_equal('RACE', 'WHITE'),
        count_equal('Vendor', 'GE MEDICAL SYSTEMS'),
        count_equal('Field', 3),
    ]

In [None]:
# Number of MC studies whose name also appears in dirs_train
len(dirs_trainMC)

In [None]:
# Patient lists and the label each one gets in the summary table (same order)
dirs = [dirs_MC, dirs_train, dirs_testMC, dirs_trainMC]
datasets = [
    'Phase1 CCHMC data',
    'CCHMC train data',
    'CCHMC test in Phase1',
    'CCHMC train in Phase1',
]
out_df = pd.DataFrame(
    columns=['Dataset', 'N', 'Age', 'Height', 'Weight', 'Females', 'Whites', 'GE', 'Field_3T']
)
# One summary row per dataset, stored under its position as the row label
for i, (pat_list, label) in enumerate(zip(dirs, datasets)):
    out = calculate_demographics(pat_list, df, label)
    out_df.loc[i] = out

display(out_df)

In [None]:
# Save the demographics summary table to CSV in the working directory
out_df.to_csv('CCHMC_demographic_information.csv')

In [None]:
# Side-by-side table of the exams used for training and for testing.
split_df = pd.DataFrame(columns=['Exams used for Training','Exams used for Testing'])

# DataFrame columns must all have the same length, so the shorter list is
# padded with blank ('   ') entries up to the length of the longer one.
n_rows = max(len(dirs_testMC), len(dirs_trainMC))
dirs_testMC2 = list(dirs_testMC) + ['   '] * (n_rows - len(dirs_testMC))
dirs_trainMC2 = list(dirs_trainMC) + ['   '] * (n_rows - len(dirs_trainMC))

split_df['Exams used for Testing'] = dirs_testMC2
print(len(split_df))
print(len(dirs_trainMC2),len(dirs_trainMC))
split_df['Exams used for Training'] = dirs_trainMC2

split_df.to_csv('CCHMC_phase1_train_test_split.csv')

In [None]:
# Get MultiCenterLS study names from processed CCHMC DICOMs and match with local IDs (ELAST-999) for internal use
dir_dcm = os.path.join(cwd,'UPLOAD')
# Skip hidden entries (names starting with '.'), which are not patient
# folders, and sort so the patient order is reproducible between runs.
pats = sorted(entry for entry in os.listdir(dir_dcm) if not entry.startswith('.'))
# pydicom 3.0 removed read_file; dcmread is its replacement. Importing it
# under the old name keeps existing read_file(...) calls working.
from pydicom import dcmread as read_file

In [None]:
# Build the Key -> ExternalID table: one row per patient folder, taken from
# the PatientID / PatientName attributes of one file in its 'T2' sub-folder.
df_extmap = pd.DataFrame(columns=['Key', 'ExternalID'])
for i, pat in enumerate(pats):
    MR_path = os.path.join(dir_dcm, pat, 'T2')
    # Whichever entry the directory listing returns first is used
    first_entry = os.listdir(MR_path)[0]
    MR_file = os.path.join(MR_path, first_entry)
    t2 = read_file(MR_file)
    df_extmap.loc[i] = [t2.PatientID, t2.PatientName]
df_extmap.to_csv('CCHMC_phase1_internal_external_IDmap.csv')

In [None]:
# Number of rows in the ID map.
# NOTE(review): this is not the last expression of the cell, so the notebook
# presumably does not display it -- confirm, or wrap it in print().
len(df_extmap)
# Column names of the ID map
list(df_extmap)

In [None]:
# Load the per-patient results table; index_col=False stops pandas from using
# the first column as the row index, and the file is decoded as latin-1.
df_DSC = pd.read_csv('CCHMC_isensee_results_151pats_32.csv',index_col=False,encoding='latin-1')

In [None]:
# Keep only the ID and DSC columns, with ID renamed to Key
df_DSC = df_DSC[['ID', 'DSC']].rename(columns={'ID': 'Key'})
# Resulting column names
list(df_DSC)

In [None]:
# Preview the first 10 rows of the ID map
display(df_extmap.head(10))

In [None]:
# Preview the first 10 rows of the DSC table
display(df_DSC.head(10))

In [None]:
# Inner join on Key: only keys present in both tables survive
df_DSC_extmap = pd.merge(df_DSC, df_extmap, how='inner', on='Key')
# Number of matched rows
len(df_DSC_extmap)

In [None]:
# Put the columns in Key, ExternalID, DSC order and save the result.
df_DSC_extmap = df_DSC_extmap[['Key','ExternalID','DSC']]
# The file name carries a .csv extension like the other tables written by
# this notebook.
df_DSC_extmap.to_csv('CCHMC_isensee_results_151pats_32_wexternalID.csv')