# Notebook to move the final dcm files selected for processing to the project folder

In [48]:
import shutil
import pickle
import pandas as pd
from l3finder.ingest import *
import os
import multiprocessing
from multiprocessing import get_context
from multiprocessing import set_start_method
from tqdm.notebook import tqdm
from investigate import *

# Custom functions
def save_object(obj, filename):
    """Pickle ``obj`` to ``filename`` using the highest available protocol.

    The file is opened in binary write mode, so any existing file at that
    path is overwritten.
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_object(filename):
    """Return the object unpickled from the file at ``filename``.

    Unpickling can execute arbitrary code, so only call this on trusted files.
    """
    with open(filename, mode='rb') as source:
        loaded = pickle.load(source)
    return loaded

In [49]:
# Directory holding the pickled dataframes and subject lists used by this notebook.
output = '/tf/pickles'
# Current working directory of the kernel process.
cwd = os.getcwd()

### Open dataframes from both dumps and merge them after investigating duplicate cases

In [50]:
# Dump 1: read the pickled dataframe and the pickled collection of subjects.
df_final1_path = os.path.join(output, 'df_final_dump_1.pkl')
subjects1_path = os.path.join(output, 'subjects_final_dump_1.pkl')
df_final1 = load_object(df_final1_path)
subjects1 = load_object(subjects1_path)

# Keep only the dataframe rows whose ID matches the id_ of a loaded subject.
valid_subs = [subject.id_ for subject in subjects1]
df_final1 = df_final1[df_final1['ID'].isin(valid_subs)]

In [51]:
# Dump 2: read the pickled dataframe and the pickled collection of subjects.
df_final2_path = os.path.join(output, 'df_final_dump_2.pkl')
subjects2_path = os.path.join(output, 'subjects_final_dump_2.pkl')
df_final2 = load_object(df_final2_path)
subjects2 = load_object(subjects2_path)
valid_subs = [subject.id_ for subject in subjects2]

# Show the rows that have no matching subject before they are dropped.
has_subject = df_final2['ID'].isin(valid_subs)
display(df_final2[~has_subject])

df_final2 = df_final2[has_subject]

Unnamed: 0,ID,Axial,Sagittal,Overlap,MissingScore,PairValidity,AxSlices,SagSlices,AxThick,SagThick
0,Z1396897,,,,,,,,,
1,Z1041077,,,,,,,,,
114,Z441008,,,,,,,,,
304,Z525377,,,,,,,,,


In [52]:
# Report the size of each dataframe and each subjects collection.
for label, collection in (
    ('Df_final1: ', df_final1),
    ('subjects1: ', subjects1),
    ('Df_final2: ', df_final2),
    ('subjects2: ', subjects2),
):
    print(label, len(collection))

Df_final1:  2052
subjects1:  2052
Df_final2:  377
subjects2:  377


In [53]:
# IDs in df_final1 that also occur in df_final2
in_dump2 = df_final1['ID'].isin(df_final2['ID'].values)
overlaps = df_final1.loc[in_dump2, 'ID'].values
print('No of overlaps: ', len(overlaps))

No of overlaps:  62


In [54]:
# Show the df_final1 rows whose ID is in `overlaps`, ordered by ID
overlap_rows1 = df_final1[df_final1['ID'].isin(overlaps)]
display(overlap_rows1.sort_values(by='ID'))

Unnamed: 0,ID,Axial,Sagittal,Overlap,MissingScore,PairValidity,AxSlices,SagSlices,AxThick,SagThick
29,Z1014113,Z1014113-SE-2-Axial_Stnd,,0.155,0.772,0.927,125,68,5.00,2.52
36,Z1017392,Z1017392-SE-4-Axial_Body_5.0,,False,1.000,1,67,79,5.00,3
44,Z1023089,Z1023089-SE-2-Axial_Body_5.0,Z1023089-SE-5-Sagittal_Body_Sagittal_3.000,0.997,1.000,1.997,64,80,5.00,3
45,Z1024771,Z1024771-SE-3-Body_5.0,Z1024771-SE-10-Body_Sagittal_3.000,0.989,1.000,1.989,72,76,5.00,3
57,Z1035204,Z1035204-SE-5-Axial_Body_5.0,Z1035204-SE-8-Sagittal_Body_Sagittal_3.000,0.994,1.000,1.994,66,81,5.00,3
65,Z1041413,Z1041413-SE-4-Stnd_Pediatric_2.0_CE,,0.662,1.000,1.662,213,135,2.00,2
114,Z1068362,Z1068362-SE-3-KIDNEY_STONE,,,1.000,,124,,2.50,
194,Z1140026,Z1140026-SE-2-ABD_PELVIS,,0,1.000,1,144,167,2.50,2
209,Z1154679,Z1154679-SE-2-Axial_Body_5.0,Z1154679-SE-5-Sagittal_Body_Sagittal_3.000,0.997,1.000,1.997,61,67,5.00,3
217,Z1165230,Z1165230-SE-2-_,,0.419,0.923,1.342,55,108,5.00,0.5


In [None]:
# Show the df_final2 rows whose ID is in `overlaps`, ordered by ID
overlap_rows2 = df_final2[df_final2['ID'].isin(overlaps)]
display(overlap_rows2.sort_values(by='ID'))

### Studies present in both data dumps have exactly the same series selected, and their overlap and missing scores are identical, so keep the copies from dump 2 (the local dump)

In [55]:
# Preparation for concatenating df_final1 and df_final2 into df_final:
# drop from df_final1 every row whose ID is in `overlaps`.
is_duplicate = df_final1['ID'].isin(overlaps)
df_final1 = df_final1[~is_duplicate]
print(len(df_final1))

1990


In [56]:
# Remove duplicates in subjects1.
# `overlaps` is an array of IDs; build a set once so each membership test is a
# hash lookup instead of an element-wise scan of the whole array per subject.
overlap_ids = set(overlaps)
subjects1 = [s for s in subjects1 if s.id_ not in overlap_ids]
print(len(subjects1))

1990


In [57]:
# Finally concatenate df_final1 and df_final2, subjects1 and subjects2.

# Tag every row with the dump it came from. `assign` returns new frames, so the
# previously filtered frames are not written to in place (this avoids pandas'
# SettingWithCopyWarning).
# `pd.concat` replaces `DataFrame.append`, which was removed in pandas 2.0;
# with default arguments it keeps the original row index, as `append` did.
df_final = pd.concat([df_final1.assign(dump=1), df_final2.assign(dump=2)])
subjects = subjects1 + subjects2

print('Final no of studies: ', len(df_final))
print('Final no of studies: ', len(subjects))

# Remove unwanted
del df_final1
del df_final2

Final no of studies:  2367
Final no of studies:  2367


In [59]:
# Save the merged dataframe and the merged subjects list as pickles under `output`
for final_obj, final_name in (
    (df_final, 'df_final.pkl'),
    (subjects, 'subjects_final.pkl'),
):
    save_object(final_obj, os.path.join(output, final_name))