# Isolating the image of interest for the positive scans

The plan for now is to keep everything in its current directory structure and just delete the DICOM files that do not contribute anything. If we extract the image from a DICOM file, we will keep it in the same folder as that DICOM file.

In [1]:
import pandas as pd
import os
import sys
import shutil
import pydicom

In [2]:
"""
Import the table of positive scans.
It is used below to look up sct_slice_num.
"""

# Hard-coded absolute path to the positive-scan cohort CSV.
path2_positive_scan_data = '/Users/carlosolivares/throwaway/throaway_wqe/positive_scan_cohort.csv'
# Later cells filter this table on pid and study_yr and read sct_slice_num.
positive_scan_data = pd.read_csv(path2_positive_scan_data)


In [3]:
# Preview the first rows of the table to check the columns used below.
positive_scan_data.head()

Unnamed: 0.1,Unnamed: 0,pid,study_yr,sct_ab_desc,sct_slice_num
0,3,100004,0,51,26.0
1,6,100004,1,51,22.0
2,9,100005,0,51,32.0
3,13,100005,1,51,38.0
4,18,100005,2,51,38.0


In [4]:
"""
Find the path to the positive scans and put it here.
Also get the list of pids.
"""

# Hard-coded root folder; each numeric-named entry inside it is treated as a pid.
path2pids = '/Volumes/My Passport for Mac/positive_scans/batch_0/manifest-1660146378931/NLST'

# Keep only entries whose names are entirely numeric (the names stay strings).
pids = [file for file in os.listdir(path2pids) if file.isnumeric()]

In [5]:
"""
The directory structure of the dataset is as such:

pid -> years -> scan file -> slices

The goal of the code below is to step through this structure and:

1. pull the pid
2. pull the year
3. use that information to find the sct_slice_num

"""

def isolate_scan(pid, study_yr, ab_scan_data):
    """
    Look up the slice number recorded for one pid in one study year.

    pid          -- value compared against the 'pid' column
    study_yr     -- value compared against the 'study_yr' column
    ab_scan_data -- pandas DataFrame with 'pid', 'study_yr' and
                    'sct_slice_num' columns

    Returns the 'sct_slice_num' of the first matching row as an int.

    Raises IndexError when no row matches, and ValueError when the
    matching row has no slice number (NaN).
    """
    is_match = (ab_scan_data['pid'] == pid) & (ab_scan_data['study_yr'] == study_yr)
    candidates = ab_scan_data.loc[is_match, 'sct_slice_num']

    # Fail with a message that names the scan instead of an opaque
    # "index 0 is out of bounds" coming from the underlying array.
    if candidates.empty:
        raise IndexError(
            'no row for pid {} and study_yr {}'.format(pid, study_yr)
        )

    # If several rows match, the first one wins.
    slice_value = candidates.iloc[0]
    if pd.isna(slice_value):
        raise ValueError(
            'sct_slice_num is missing for pid {} and study_yr {}'.format(pid, study_yr)
        )
    return int(slice_value)

def get_slice_nums(path2pid, ab_scan_data):
    """
    Map each study year found under a pid folder to its slice number.

    path2pid     -- path to a pid folder; its last component must be the
                    numeric pid
    ab_scan_data -- table passed through to isolate_scan

    Returns a dict {study_yr: sct_slice_num}. The dict is empty when the
    pid folder has no visible year folders.
    """
    # normpath drops a trailing separator, so '.../100004/' still yields the pid.
    pid = int(os.path.basename(os.path.normpath(path2pid)))

    # get_years returns None for a folder with no visible entries; treat
    # that as "no years" rather than failing when iterating over None.
    years = get_years(path2pid) or []

    return {year: isolate_scan(pid, year, ab_scan_data) for year in years}

def get_years(path2pid):
    """
    List the study years (0, 1 or 2) present under a pid folder.

    Entries whose names start with '.' are ignored. The third
    '-'-separated field of each remaining name is read as a calendar
    year; 1999, 2000 and 2001 map to study years 0, 1 and 2, and any
    other year is skipped.

    Returns None when there are no visible entries, otherwise the list
    of study years in directory-listing order. Asserts that no study
    year appears twice.
    """
    visible_entries = [
        entry for entry in os.listdir(path2pid) if not entry.startswith('.')
    ]
    if len(visible_entries) == 0:
        return None

    calendar_to_study_yr = {1999: 0, 2000: 1, 2001: 2}
    years = []
    for entry in visible_entries:
        calendar_year = int(entry.split('-')[2])
        if calendar_year in calendar_to_study_yr:
            years.append(calendar_to_study_yr[calendar_year])

    assert len(set(years)) == len(years)
    return years

def get_slice_files(path2pid, yr):
    """
    Locate the slice files for one study year of one pid.

    path2pid -- path to the pid folder
    yr       -- study year: 0, 1 or 2 (mapped to 1999, 2000, 2001)

    Returns (slices, path2slices): the list of slice file names and the
    folder that contains them.

    Raises KeyError for an unknown study year. Asserts that exactly one
    year folder matches and that it holds exactly one visible scan folder.
    """
    study_yr_to_calendar = {
        0: 1999,
        1: 2000,
        2: 2001,
    }
    calendar_year = study_yr_to_calendar[yr]

    # The third '-'-separated field of a year folder's name is the calendar year.
    year_folders = [
        folder for folder in os.listdir(path2pid)
        if not folder.startswith('.')
        and int(folder.split('-')[2]) == calendar_year
    ]
    assert len(year_folders) == 1, (
        'expected one folder for {} under {}, found {}'.format(
            calendar_year, path2pid, len(year_folders))
    )
    path2year = os.path.join(path2pid, year_folders[0])

    # Skip hidden entries here as well, as the other listings do; otherwise
    # a stray dot-file next to the scan folder trips the check below.
    scan_folders = [
        folder for folder in os.listdir(path2year) if not folder.startswith('.')
    ]
    assert len(scan_folders) == 1, (
        'expected one scan folder under {}, found {}'.format(
            path2year, len(scan_folders))
    )
    path2slices = os.path.join(path2year, scan_folders[0])

    # Drop dot-files and any name containing '._' so only slice files remain.
    slices = [
        name for name in os.listdir(path2slices)
        if not name.startswith('.') and '._' not in name
    ]
    return slices, path2slices

def get_slice_of_interest(interesting_slice_num, lst_of_slices):
    """
    Return the first file name whose slice number matches, or None.

    Each name in lst_of_slices is expected to look like '1-026.dcm': the
    slice number is the second '-'-separated field with '.dcm' removed.
    The list must already be free of other files.
    """
    def slice_number(file_name):
        return int(file_name.split('-')[1].split('.dcm')[0])

    return next(
        (
            file_name
            for file_name in lst_of_slices
            if interesting_slice_num == slice_number(file_name)
        ),
        None,
    )

In [6]:
# For every pid folder, print the slice number wanted for each study year
# and the name of the file that holds that slice (None if no file matches).
for pid in pids:
    pid_folder = os.path.join(path2pids, pid)
    slice_dict = get_slice_nums(pid_folder, positive_scan_data)
    print('pid: {}'.format(pid))
    print('slices: {}'.format(slice_dict))
    for year, wanted_slice_num in slice_dict.items():
        slices, path2slices = get_slice_files(pid_folder, year)
        matching_file = get_slice_of_interest(wanted_slice_num, slices)
        print(matching_file)

pid: 100004
slices: {0: 26, 1: 22}
1-026.dcm
1-022.dcm
pid: 100005
slices: {0: 32, 1: 38, 2: 38}
1-032.dcm
1-038.dcm
1-038.dcm
pid: 100012
slices: {0: 38, 1: 39}
1-038.dcm
1-039.dcm
pid: 100026
slices: {1: 92, 2: 95}
1-092.dcm
1-095.dcm
pid: 100035
slices: {1: 81, 2: 81}
1-081.dcm
1-081.dcm
pid: 100053
slices: {0: 58, 2: 50}
1-058.dcm
1-050.dcm
pid: 100056
slices: {0: 25, 1: 45, 2: 45}
1-025.dcm
1-045.dcm
1-045.dcm
pid: 100069
slices: {0: 29, 2: 79}
1-029.dcm
1-079.dcm
pid: 100072
slices: {1: 74, 2: 75}
1-074.dcm
1-075.dcm
pid: 100073
slices: {0: 67, 1: 84, 2: 82}
1-067.dcm
1-084.dcm
1-082.dcm
pid: 100079
slices: {1: 46}
1-046.dcm
pid: 100080
slices: {0: 55}
1-055.dcm
pid: 100082
slices: {1: 24, 2: 27}
1-024.dcm
1-027.dcm
pid: 100085
slices: {1: 101, 2: 104}
1-101.dcm
1-104.dcm
pid: 100092
slices: {1: 42}
1-042.dcm
pid: 100095
slices: {0: 24, 1: 24, 2: 28}
1-024.dcm
1-024.dcm
1-028.dcm
pid: 100108
slices: {1: 62, 2: 68}
1-062.dcm
1-068.dcm
pid: 100111
slices: {0: 98}
1-098.dcm
pid: 100