# Load libraries and directories

In [97]:
from IPython.display import display, HTML

In [98]:
# from IPython import get_ipython
from tqdm.notebook import tqdm
import pickle
import os
import pprint
pp = pprint.PrettyPrinter(indent=1)

# Custom modules for debugging
from SliceViewer import ImageSliceViewer3D, ImageSliceViewer3D_1view,ImageSliceViewer3D_2views
from investigate import *

#pd.set_option("display.max_rows", 10)
      
import json
from run_sma_experiment import find_l3_images,output_images
import pprint
from L3_finder import *

# Custom functions
def save_object(obj, filename):
    """Pickle ``obj`` into the file at ``filename``.

    The file is opened in binary write mode, so any existing file at that
    path is replaced. The highest pickle protocol available is used.
    """
    with open(filename, mode='wb') as pickle_file:
        pickle.dump(obj, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

def load_object(filename):
    """Return the object unpickled from the file at ``filename``.

    NOTE: unpickling can execute arbitrary code, so only load pickle files
    that come from a trusted source.
    """
    with open(filename, mode='rb') as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded

In [99]:
get_ipython().run_line_magic('tb', '')

NameError: name 'dump' is not defined

### In CCHMC's workflow, there were two DICOM dumps, with different folder structures and naming conventions:
<br>
Folder structure: <br>
Dump-1: Project_folder/Patient_folder/Series_Folder/dicom_files <br>
Dump-2: Project_folder/Patient_folder/Study_folder/Series_Folder/dicom_files<br> 
<br>
Naming Convention for patient folder: <br>
Dump-1: PATID-GMRN-PATID-STUDYNAME <br>
Dump-2: PT-PATID-PATID 

In [100]:
#Select which dump you are processing here [dump1: 1, dump2: 2]
dump = 2

In [101]:
cwd = os.getcwd()
data = '/tf/data'
output = '/tf/pickles'

## Section 1 - Load list of normal patients filtered from Epic data and select those patients from the DICOM dump of all patients

In [5]:
# Load normal patient list
infile  = 'patlist_with_validBMI.csv'
patlist_all = pd.read_csv(infile, index_col=False)
# Drop every column whose name starts with 'Unnamed', then keep only the
# three identifier columns used downstream.
is_named_column = ~patlist_all.columns.str.contains('^Unnamed')
df_P = patlist_all.loc[:, is_named_column][['GIVEN_MRN','PAT_ID','ACC']]
print('Columns of df_P: ', df_P.columns.tolist())
print('Length of df_P: ', df_P.shape[0])
display(df_P.head(10))
#print('# of Unique patients: ', len(df_P.subject_id.unique()))

Columns of df_P:  ['GIVEN_MRN', 'PAT_ID', 'ACC']
Length of df_P:  2399


Unnamed: 0,GIVEN_MRN,PAT_ID,ACC
0,1343508,Z704566,8018969
1,11699446,Z1931299,8842054
2,1299521,Z664930,8024856
3,11437502,Z1465933,8000527
4,11063680,Z1091411,8006713
5,1212646,Z585709,8051250
6,11133354,Z1161168,8833553
7,11109370,Z1137138,8831692
8,11301610,Z1329744,8057737
9,11451698,Z1683576,8046105


In [6]:
pats = next(os.walk(data))[1]
print('Total patient folders in data dir: ',len(pats))

Total patient folders in data dir:  383


In [8]:
# Extract the patient ID from each patient-folder name; its position depends
# on the naming convention of the dump being processed.
if dump == 1:
    # Dump-1 folders: PATID-GMRN-PATID-STUDYNAME -> first field
    patids = [pat.split('-')[0] for pat in pats]
elif dump == 2:
    # Dump-2 folders: PT-PATID-PATID -> last field
    patids = [pat.split('-')[-1] for pat in pats]
else:
    # Fail here with a clear message instead of a NameError on `patids` below.
    raise ValueError('dump must be 1 or 2, got: ' + repr(dump))

# Keep only the IDs that appear in the normal-patient list. The lookup set is
# built once so each membership test is not a linear scan of the column.
known_pat_ids = set(df_P.PAT_ID.values)
valid_ids = {patid for patid in patids if patid in known_pat_ids}

In [9]:
print('valid ids: ',len(valid_ids))

valid ids:  381


## Section 2 - Load each study into subject object
<br>
The Subject object is defined in l3finder.ingest

In [10]:
# Load the JSON config file and keep only its "series_filter" section
configfile = os.path.join(cwd,'config/debug_ES/series_filter_ds1.json')
with open(configfile, "r") as config_fh:
    config = json.load(config_fh)["series_filter"]

print('Current config dict: ')
pp.pprint(config)

Current config dict: 
{'dicom_dir': '/tf/data',
 'model_path': 'None',
 'output_directory': '/tf/output',
 'overwrite': True,
 'save_plots': True,
 'series_to_skip_pickle_file': '/tf/output/broken_sagittal_and_axial_series.pkl',
 'show_plots': False}


In [11]:
# Set the directory-structure flag that is later passed to find_subjects.
if dump==2:
    config["new_tim_dicom_dir_structure"] = False
elif dump==1:
    config["new_tim_dicom_dir_structure"] = True
else:
    # Without this branch the key stays unset and the failure only shows up
    # later as a KeyError when the flag is read.
    raise ValueError('dump must be 1 or 2, got: ' + repr(dump))

In [12]:
# Debug
print("Finding subjects")

# Materialise the result of find_subjects into a list so it can be counted
# here and reused by the following cells.
subjects = [
    *find_subjects(
        config["dicom_dir"],
        new_tim_dir_structure=config["new_tim_dicom_dir_structure"]
    )
]

print('Subjects found: ', len(subjects))

Finding subjects
Subjects found:  383


## Section 3 - Check if there are subjects with multiple folders (studies)

In [14]:
subjects = [subject for subject in subjects if subject.id_ in valid_ids]
print('Subjects found: ', len(subjects))
print('Valid Subjects: ', len(valid_ids))

Subjects found:  381
Valid Subjects:  381


In [15]:
# Find Duplicate Subjects
# Walk the subjects in order: the first time an id is seen it is recorded in
# unique_subjects, every later occurrence is recorded in duplicate_subjects.
unique_subjects, duplicate_subjects = [], []
for subject in subjects:
    target = duplicate_subjects if subject.id_ in unique_subjects else unique_subjects
    target.append(subject.id_)

print(duplicate_subjects)

[]


In [None]:
# Use these for interactive investigation of subject/studies
print_subject_paths(subjects)

In [None]:
print_subject_series('Z619766','/tf/data/Z619766-19070630-Z619766-_')

#### DICOM Visualizer to select correct study for subjects with more than 1 study

In [None]:
# Look up one series of one subject for visual inspection. `subjects` is
# passed explicitly, matching the other get_subject_series calls in this
# notebook.
imseries = get_subject_series('Z619766','Z619766-SE-1-2.0',subjects)
print(imseries.orientation,' ' , imseries.slice_thickness)
imdata = imseries.pixel_data

In [None]:
%matplotlib inline
print(imdata.shape)
ImageSliceViewer3D(imdata)

In [16]:
## Save subjects without duplicates
save_object(subjects, os.path.join(output,'subjects_noduplicates.pkl'))

## Section 4 - Load each series into series object and keep only axials and sagittals

In [17]:
subjects = load_object(os.path.join(output,'subjects_noduplicates.pkl'))

In [19]:
len(subjects)

381

In [29]:
%%time
# Find series images
print("Finding series")
# One find_series() result per subject, wrapped in a progress bar and then
# flattened into a single list of series.
per_subject_series = (s.find_series() for s in subjects)
with_progress = tqdm(per_subject_series, total=len(subjects))
series = list(flatten(with_progress))
print("Total number of series found: ", len(series))

Finding series


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=381.0), HTML(value='')))


Total number of series found:  3301
CPU times: user 98 ms, sys: 27.1 ms, total: 125 ms
Wall time: 112 ms


In [31]:
%%time
sagittal_series, axial_series, excluded_series = separate_series(series)

Filtering series
CPU times: user 1.41 s, sys: 692 ms, total: 2.11 s
Wall time: 11.9 s


In [32]:
print("Length of valid pats: ", len(subjects))
print("Length of sagittal series", len(sagittal_series))
print("Length of axial series", len(axial_series))
#print("Length of excluded series", len(excluded_series))
#print("Length of all series in dataset", len(series))

Length of valid pats:  381
Length of sagittal series 362
Length of axial series 802


In [33]:
# Save required objects
save_object(axial_series, os.path.join(output,'axial_series.pkl'))
save_object(sagittal_series, os.path.join(output,'sagittal_series.pkl'))

## Start from here tomorrow

## Section 5 - Investigate subjects and series using pandas

In [34]:
axial_series = load_object(os.path.join(output,'axial_series.pkl'))
sagittal_series = load_object(os.path.join(output,'sagittal_series.pkl'))
subjects = load_object(os.path.join(output,'subjects_noduplicates.pkl'))

In [35]:
df_a = get_summary_dfs(axial_series,sagittal_series,subjects)
save_object(df_a, os.path.join(output,'df_a.pkl'))

In [40]:
display(df_a.head(10))

Unnamed: 0,ID,Axials,Sagittals
0,Z911879,2,0
1,Z667765,2,3
2,Z720947,2,1
3,Z1396897,0,0
4,Z1041413,2,3
5,Z1268454,2,1
6,Z1467058,2,2
7,Z604607,2,0
8,Z1230777,2,0
9,Z697015,2,0


In [36]:
df_a_axials = get_summary_by_serieslength(axial_series)
df_a_sags = get_summary_by_serieslength(sagittal_series)
save_object(df_a_axials, os.path.join(output,'df_a_axials.pkl'))
save_object(df_a_sags, os.path.join(output,'df_a_sags.pkl'))

In [37]:
print("Length of subjects with atleast 1 axial or sagittal series: ", len(df_a))
print("Length of subjects with atleast 1 axial series: ", len(df_a_axials['ID'].unique()))
print("Length of subjects with atleast 1 sagittal series: ", len(df_a_sags['ID'].unique()))

Length of subjects with atleast 1 axial or sagittal series:  381
Length of subjects with atleast 1 axial series:  377
Length of subjects with atleast 1 sagittal series:  242


In [38]:
# Patients without Axial
# IDs listed in df_a that never appear in the per-series axial frame.
ids_with_axial = df_a_axials['ID'].values
pats = [pat for pat in df_a['ID'].values if pat not in ids_with_axial]
print(len(pats))
print(pats)

4
['Z1396897', 'Z1041077', 'Z441008', 'Z525377']


In [39]:
# Patients without Sagittal
# IDs listed in df_a that never appear in the per-series sagittal frame.
ids_with_sagittal = df_a_sags['ID'].values
pats = [pat for pat in df_a['ID'].values if pat not in ids_with_sagittal]
print(len(pats))

139


In [None]:
imseries = get_subject_series('Z837620','Z837620-SE-6-Vol_Body_Vol._0.5',subjects)
print(imseries.orientation,' ' , imseries.slice_thickness)
imdata = imseries.pixel_data

In [None]:
%matplotlib inline
print(imdata.shape)
ImageSliceViewer3D(imdata)

In [None]:
print_summary_by_serieslength(df_a_axials)

In [None]:
print_summary_by_serieslength(df_a_sags)

## Section 6 - Create dataframe of optimal axial sagittal pairs

The function filter_finalpairs in investigate.py is used

In [7]:
axial_series = load_object(os.path.join(output,'axial_series.pkl'))
sagittal_series = load_object(os.path.join(output,'sagittal_series.pkl'))
subjects = load_object(os.path.join(output,'subjects_noduplicates.pkl'))

df_a_axials = load_object(os.path.join(output,'df_a_axials.pkl'))
df_a_sags = load_object(os.path.join(output,'df_a_sags.pkl'))
df_a = load_object(os.path.join(output,'df_a.pkl'))

In [8]:
%%time
from L3_finder import *
from l3finder.ingest import *
# Imported explicitly (after the star imports so these bindings win): the cell
# uses functools.partial and multiprocessing.cpu_count below.
import functools
import multiprocessing
from multiprocessing import get_context
from multiprocessing import set_start_method
#set_start_method("spawn")
df_filt = None

# Upper bound on the number of worker processes in the pool.
MAX_WORKERS = 40

if __name__=='__main__':
    # Find series images
    print("Finding IDs")

    IDs = [s.id_ for s in subjects]
    # Bind the axial/sagittal frames and the subject list up front so that
    # each worker call only needs a subject ID.
    pair_filter = functools.partial(
        filter_finalpairs,
        df_ax=df_a_axials,
        df_sag=df_a_sags,
        subjects=subjects
    )

    def pool_filter(pool, func, candidates):
        """Apply ``func`` to every candidate on ``pool`` and return the results as a list.

        Results are collected with ``imap_unordered`` behind a tqdm progress
        bar, so their order follows task completion rather than the order of
        ``candidates``.
        """
        return list(tqdm(pool.imap_unordered(func, candidates), total=len(candidates)))

    # Never start more workers than there are cores, and report the number of
    # workers actually used rather than only the machine's core count.
    n_cores = multiprocessing.cpu_count()
    n_workers = min(MAX_WORKERS, n_cores)
    print('Filtering series using ', n_workers, ' of ', n_cores, ' cores:')

    with get_context("spawn").Pool(processes=n_workers) as p:
        result_list = pool_filter(p, pair_filter, IDs)
        p.close()
        p.join()

    print('parallel processing over')
    # One row per result, in the order the results came back from the pool.
    df_filt  = pd.DataFrame(columns=['ID','Axial','Sagittal','Overlap','MissingScore','PairValidity',
                                'AxSlices','SagSlices','AxThick','SagThick'])
    for i,op in enumerate(result_list):
        df_filt.loc[i] = op


    print("Processed")

Finding IDs
Filtering series using  48  cores:


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=381.0), HTML(value='')))


parallel processing over
Processed
CPU times: user 5.73 s, sys: 1.89 s, total: 7.62 s
Wall time: 4min 37s
Compiler : 138 ms


In [9]:
save_object(df_filt, os.path.join(output,'df_filteredpairs.pkl'))

## Section 7 - Investigate the dataframe for missing and low-quality pairs

In [11]:
# Load all params
df_filt = load_object(os.path.join(output,'df_filteredpairs.pkl'))
axial_series = load_object(os.path.join(output,'axial_series.pkl'))
sagittal_series = load_object(os.path.join(output,'sagittal_series.pkl'))
subjects = load_object(os.path.join(output,'subjects_noduplicates.pkl'))

df_a_axials = load_object(os.path.join(output,'df_a_axials.pkl'))
df_a_sags = load_object(os.path.join(output,'df_a_sags.pkl'))
df_a = load_object(os.path.join(output,'df_a.pkl'))

In [12]:
# Make sure filtered df and subjects are equal length
print('Length of filtered df: ',len(df_filt))
print('Length of subjects: ',len(subjects))

Length of filtered df:  381
Length of subjects:  381


In [13]:
# View and Remove subjects without axial series
df_noaxials =  df_filt[df_filt['Axial'].isnull()]
print('Number of subjects without Axials: ', len(df_noaxials))
display(df_noaxials)

Number of subjects without Axials:  4


Unnamed: 0,ID,Axial,Sagittal,Overlap,MissingScore,PairValidity,AxSlices,SagSlices,AxThick,SagThick
0,Z1396897,,,,,,,,,
1,Z1041077,,,,,,,,,
114,Z441008,,,,,,,,,
304,Z525377,,,,,,,,,


In [14]:
# Remove subjects without axials from subjects list:
subjects = [s for s in subjects if s.id_ not in df_noaxials.ID.values]
print('Length of subjects with axials: ',len(subjects))


Length of subjects with axials:  377


In [16]:
# Print cases that don't have sagittals
df_nosags = df_filt[df_filt['Sagittal'].isnull()]
print("Number of cases without sagittals: ", len(df_nosags))

Number of cases without sagittals:  139


In [22]:
# Investigate cases with less than 0.7 overlap OR < 0.9 Missing Score [Tracks Slices missing from stack]
df_pooroverlap = df_filt[(df_filt['Overlap'] < 0.7) | (df_filt['MissingScore'] < 0.9)]
# The message names both criteria, because the mask above selects on either one.
print('Cases with overlap < 0.7 or missing score < 0.9: ', len(df_pooroverlap))

Cases with overlap < 0.7:  18


In [69]:
display(df_pooroverlap.sort_values(by=['MissingScore'],ascending=[True]))

Unnamed: 0,ID,Axial,Sagittal,Overlap,MissingScore,PairValidity,AxSlices,SagSlices,AxThick,SagThick
203,Z505334,Z505334-SE-2-ABD_PELVIS,Z505334-SE-602-Sagital_Abdomen,0,0.615,0.615,175,72,2.5,0.703125
308,Z854069,Z854069-SE-5-Venous_Phase_Standard_Axial,Z854069-SE-500-Venous_Phase_Standard_Vol_Vol,0.352,0.755,1.107,93,513,5.0,0.5
311,Z1248225,Z1248225-SE-2-Trauma_C_A_P,Z1248225-SE-401-SAGITTAL,False,0.759,0.759,129,59,5.0,2.5
85,Z1014113,Z1014113-SE-2-Axial_Stnd,Z1014113-SE-501-SAG,0.155,0.772,0.927,125,68,5.0,2.52
109,Z1165230,Z1165230-SE-2-,Z1165230-SE-1001-Vitrea_Snapshot__Vol._Vol,0.419,0.923,1.342,55,108,5.0,0.5
271,Z1429940,Z1429940-SE-6-ABDOMEN,Z1429940-SE-600-Reformatted,0.503,0.966,1.469,288,42,1.25,5.0
287,Z1017392,Z1017392-SE-4-Axial_Body__5.0,Z1017392-SE-12-Sagittal_Bone_Sagittal_3.000,False,1.0,1.0,67,79,5.0,3.0
247,Z1194146,Z1194146-SE-4-Pediatric_3.0__CE,Z1194146-SE-7-Pediatric_3.0_Sagittal_CE,0,1.0,1.0,126,96,3.0,3.0
221,Z700667,Z700667-SE-4-Axial_Body__5.0,Z700667-SE-11-Sagittal__Sagittal_1.0,0.5,1.0,1.5,101,204,5.0,1.0
219,Z342284,Z342284-SE-4-,Z342284-SE-13-Vol._Sagittal,0.486,1.0,1.486,113,136,5.0,1.0


In [53]:
# Handy Functions to investigate the poor pairs
def get_ax_sag(df,ind):
    """Return the ``(axial, sagittal)`` series for row ``ind`` of ``df``.

    The row's 'ID', 'Axial' and 'Sagittal' values are read with ``df.loc`` and
    each series is looked up through ``get_subject_series`` using the
    notebook-level ``subjects`` list.
    """
    row_id, axial_name, sagittal_name = (
        df.loc[ind, column] for column in ('ID', 'Axial', 'Sagittal')
    )
    axial = get_subject_series(row_id, axial_name, subjects)
    sagittal = get_subject_series(row_id, sagittal_name, subjects)
    return axial, sagittal

In [89]:
# Missing-slice score of the sagittal series (element [1] of the pair) for row 85.
# `verbose=True` is passed inside the call; the stray ')' made this a SyntaxError.
calculate_missing_slices_sagittals(get_ax_sag(df_pooroverlap,85)[1],verbose=True)

0.423

In [82]:
calculate_series_overlap(*get_ax_sag(df_pooroverlap,85),verbose=True)

374

### Based on investigation, eliminate series and subjects not eligible and create final df

In [93]:
# Keep a Sagittal only when its overlap is truthy and not below 0.7;
# otherwise blank the Sagittal entry in the copy.
df_final = df_filt.copy()
for ind, overlap in df_final['Overlap'].items():
    keep_sagittal = bool(overlap) and not (overlap < 0.7)
    if not keep_sagittal:
        df_final.loc[ind,'Sagittal'] = None

In [94]:
# Print cases that don't have sagittals
print("Number of cases without sagittals in filter df: ", len(df_nosags))
df_nosags2 = df_final[df_final['Sagittal'].isnull()]
print("Number of cases without sagittals in final df: ", len(df_nosags2))

Number of cases without sagittals in filter df:  139
Number of cases without sagittals in final df:  157


In [103]:
# Save the final dataframe and subject list, tagged with the dump number
final_df_file = 'df_final_dump_{}.pkl'.format(dump)
final_subs_file = 'subjects_final_dump_{}.pkl'.format(dump)

for final_obj, final_name in ((df_final, final_df_file), (subjects, final_subs_file)):
    save_object(final_obj, os.path.join(output, final_name))

In [104]:
axial_series[0]

ImageSeries(subject=Subject(path=PosixPath('/tf/data/PT-Z911879-Z911879')), series_path=PosixPath('/tf/data/PT-Z911879-Z911879/ST-10231915-Z911879-CT_ABDOMEN_PELVIS_W_CONTRAST/SE-5-Stnd__Pediatric_Vol_Stnd'), accession_path=PosixPath('/tf/data/PT-Z911879-Z911879/ST-10231915-Z911879-CT_ABDOMEN_PELVIS_W_CONTRAST/SE-5-Stnd__Pediatric_Vol_Stnd'))

In [107]:
axial_series[0].series_path

PosixPath('/tf/data/PT-Z911879-Z911879/ST-10231915-Z911879-CT_ABDOMEN_PELVIS_W_CONTRAST/SE-5-Stnd__Pediatric_Vol_Stnd')

In [108]:
axial_series[0].accession_path

PosixPath('/tf/data/PT-Z911879-Z911879/ST-10231915-Z911879-CT_ABDOMEN_PELVIS_W_CONTRAST/SE-5-Stnd__Pediatric_Vol_Stnd')