In [2]:
import pandas as pd
import os
import sys
import shutil
from collections import namedtuple

In [3]:
paths = {
    'patient': 'nlst_780_prsn_idc_20210527.csv',
    'abnormalities': 'nlst_780_ctab_idc_20210527.csv',
    'screen': 'nlst_780_screen_idc_20210527.csv'
}

ClinicalData = namedtuple('ClinicalData', 'patient abnormalities screen')

In [4]:
clinical_data = ClinicalData(
    pd.read_csv(paths['patient']),
    pd.read_csv(paths['abnormalities']),
    pd.read_csv(paths['screen'])
)

In [5]:
clinical_data.abnormalities.head()

Unnamed: 0,sct_ab_desc,sct_ab_num,sct_epi_loc,sct_long_dia,sct_margins,sct_perp_dia,sct_pre_att,study_yr,sct_slice_num,sct_found_after_comp,pid,dataset_version
0,65,1,,,,,,0,,0.0,100002,2011.02.03/05.12.21
1,64,1,,,,,,1,,0.0,100002,2011.02.03/05.12.21
2,65,1,,,,,,2,,0.0,100002,2011.02.03/05.12.21
3,51,1,1.0,4.0,2.0,3.0,1.0,0,26.0,0.0,100004,2011.02.03/05.12.21
4,64,2,,,,,,0,,0.0,100004,2011.02.03/05.12.21


In [6]:
clinical_data.patient.head()

Unnamed: 0,race,cigsmok,gender,age,loclhil,locllow,loclup,locrhil,locrlow,locrmid,...,can_scr,canc_rpt_link,pid,dataset_version,scr_days0,scr_days1,scr_days2,candx_days,canc_free_days,de_stag_7thed
0,1,1,1,70,,,,,,,...,0,0,100001,2011.02.03/05.12.21,0.0,511.0,,,2353,
1,1,1,1,66,,,,,,,...,0,0,100002,2011.02.03/05.12.21,13.0,349.0,731.0,,2391,
2,1,1,1,64,,,,,,,...,0,0,100003,2011.02.03/05.12.21,12.0,374.0,738.0,,2381,
3,1,0,1,60,,,,,,,...,0,0,100004,2011.02.03/05.12.21,8.0,452.0,743.0,,2688,
4,1,0,1,64,,,,,,,...,0,0,100005,2011.02.03/05.12.21,56.0,382.0,746.0,,2435,


In [7]:
"""
First, let's get the patients that we know had ct scans by isolationg all the PIDs 
that are in the screen data table
"""
ct_pids = set(clinical_data.screen.pid)
print('there are {} pds that underwent ct imaging'.format(len(ct_pids)))

there are 26453 pds that underwent ct imaging


In [8]:
"""
Now, let us check if there are non-ct patients in the abnormalities dataset
"""

abnormality_pids = set(
    clinical_data.abnormalities.pid
)

if abnormality_pids.issubset(ct_pids):
    print('all abnormality pids are in the ct pids')
else:
    print('there are pids in the abnormalities dataset that may not be from ct scans')

there are pids in the abnormalities dataset that may not be from ct scans


In [9]:
"""
How many scans are in the screen dataset?
"""

print(clinical_data.screen.shape)

(75138, 20)


In [10]:
print(clinical_data.abnormalities.shape)

(177487, 12)


In [11]:
print(len(abnormality_pids.intersection(ct_pids)))

24512


In [12]:
"""
apply the first filter: 

only the pids that are in screen dataset
"""

ct_abnormalities_data = clinical_data.abnormalities[clinical_data.abnormalities.pid.isin(ct_pids)]

In [13]:
"""
Find the positive patient pids
"""

positive_labels = {4, 5, 6}
negative_labels = {1, 2, 3}

negative_pids = set()
negative_pid_rows = []
positive_pids = set()
positive_pid_rows = []
for index, row in clinical_data.patient.iterrows():
    if row['scr_iso0'] in positive_labels or row['scr_iso1'] in positive_labels or row['scr_iso2'] in positive_labels:
        positive_pids.add(row['pid'])
        positive_pid_rows.append(index)
    elif ((row['scr_iso0'] in negative_labels) and 
    (row['scr_iso1'] in negative_labels) and
    (row['scr_iso2'] in negative_labels)):
        negative_pids.add(row['pid'])
        negative_pid_rows.append(index)

In [14]:
"""
print the results of the thing

"""

print('there are {} positive patients'.format(len(positive_pids)))
print('there are {} negative patients'.format(len(negative_pids)))

there are 14750 positive patients
there are 33556 negative patients


In [15]:
positive_scans_data = clinical_data.abnormalities[clinical_data.abnormalities.pid.isin(positive_pids)]

In [16]:
"""
The next filter is for those abnormality types that were 51. 

that is, sct_ab_desc == 51
"""

positive_scans_data = positive_scans_data[positive_scans_data['sct_ab_desc'] == 51]

## Interlude: How many of these abnormalities were found only after comparison?



In [17]:
"""
Filter the dataset with sct_found_after_comp == 1

"""

comp_positives = positive_scans_data[positive_scans_data.sct_found_after_comp == 1]
#comp_positives = comp_positives[comp_positives.sct_ab_num == 1]
print(comp_positives.shape)

(465, 12)


So a not-insignificant number of these abnormalities were found after comparison. This is an interesting thing to keep in mind later. The easiest way to test this is by just preparing this datset for inference after we have trained the model. 

In [18]:
"""
The next filter is to account for scans that have more than one kind of abnormality. 
We do this by only choosing the first abnormality of type 51. 

also: filter by abnormalities found only on first look. Maybe prepare another dataset that has

sct_ab_num == 1
"""


positive_scans_data = positive_scans_data[positive_scans_data.sct_ab_num == 1]
print(positive_scans_data.shape)


(18003, 12)


In [19]:
"""
Now we filter for abnormalitites that were only fond in isolation
"""

positive_scans_data = positive_scans_data[positive_scans_data.sct_found_after_comp == 0]
print(positive_scans_data.shape)

(17907, 12)


In [20]:
set(positive_scans_data.sct_ab_desc)

{51}

In [21]:
download_data = positive_scans_data[['pid', 'study_yr', 'sct_ab_desc', 'sct_slice_num']]

In [22]:
download_data.head()

Unnamed: 0,pid,study_yr,sct_ab_desc,sct_slice_num
3,100004,0,51,26.0
6,100004,1,51,22.0
9,100005,0,51,32.0
13,100005,1,51,38.0
18,100005,2,51,38.0


In [23]:
"""
Now let's get the data for the negative patients. 

For each patient, we get three negative scans. We do not have any way of tracking the number of slices or depth of the lkj
"""

#check that no negative patient ids are in the dataset we just go 

if set(positive_scans_data.pid).intersection(negative_pids):
    print('there is intersect between the negative and positive patients')

In [24]:
#very good. 

In [25]:
#now to get the data about the negative patients. We just need a table

negative_patient_data = clinical_data.patient[clinical_data.patient.pid.isin(negative_pids)]

In [29]:
negative_patient_data = negative_patient_data[['pid', 'scr_iso0', 'scr_iso1', 'scr_iso2']]
negative_patient_data.head()

Unnamed: 0,pid,scr_iso0,scr_iso1,scr_iso2
1,100002,2,3,2
5,100006,1,1,1
7,100008,1,1,1
8,100009,1,2,2
9,100010,2,1,2


In [30]:
"""
just save both of them

"""

#negative_patient_data.to_csv('negative_patient_cohort.csv')

#download_data.to_csv('positive_scan_cohort.csv')