# Preamble

In [1]:
import numpy as np
import pandas as pd
import shutil
import matplotlib.pyplot as plt

In [2]:
# Set this to False for faster execution time: the image preview loops further down
# stop before any image file is read or rendered.
SHOW_IMAGES = False

# Whether matplotlib should typeset figure text with LaTeX (serif font) so that exported
# figures match a LaTeX document.
# NOTE(review): 'text.usetex' presumably requires a working LaTeX installation - confirm on the target machine.
USE_LATEX_ENGINE = True

if USE_LATEX_ENGINE:
    import matplotlib
    # matplotlib.use("pgf")     # pgf doesn't work for some plots because they exceed the max value that pgf can calculate
    matplotlib.rcParams.update({
        # "pgf.texsystem": "pdflatex",
        # 'pgf.rcfonts': False,
        'font.family': 'serif',
        'text.usetex': True,
    })

# Function Definitions

In [3]:
def _generate_output_string(element, data):
    """
    Returns a string that contains information about the percentual share of a value in a list
    """
    # if not isinstance(element, str): # make sure element is of type string, convert otherwise
    #     element = str(element)
    
    n_total = len(data)
    bool_list = (data == element)
    n_elements = sum(bool_list)

    return "{:<30}".format(str(n_elements) + '/' + str(n_total) + ' (' + "{:.2f}".format(((n_elements / n_total) * 100)) + '%)')

In [4]:
def calculate_percentages(normal_data, pneumonia_data, additional_df=None):
    """
    Prints, for every category, how often it occurs in the normal and in the pneumonia data.

    Both inputs must contain exactly the same set of distinct values. If they do not, a
    warning with both category sets and the per-category counts of the normal data is
    printed and no table is produced.

    Parameters
    ----------
    normal_data : pandas Series with the values of the normal samples
    pneumonia_data : pandas Series with the values of the pneumonia samples
    additional_df : optional third collection of values; when given, its shares are printed
        in an extra column between the two others (labelled 'normal (filtered)')

    Returns
    -------
    None - all results are printed.
    """
    normal_categories = set(normal_data)
    pneumonia_categories = set(pneumonia_data)

    # Compare the sets themselves (value equality), not merely their sizes: two equally
    # sized sets can still hold different categories, and comparing integers by identity
    # (``is not``) is not a reliable equality test.
    if normal_categories != pneumonia_categories:
        print('WARNING! Categories of healthy and pneumonia data are not identical!')
        print('Normal: ' + str(normal_categories))
        print('Pneumonia: ' + str(pneumonia_categories))
        print('Number of appearences (normal):')
        n_normal_set = ''
        for category in normal_categories:
            n_normal_set += "'" + str(category) + "': " + str(sum(normal_data == category)) + '\n'   # single quote: non-escaped string, double quote: escaped string
        print(n_normal_set)
        return

    print(normal_categories)

    # Header row: one column per data source, each 30 characters wide.
    if additional_df is None:
        print('{:<21}'.format('') + '{:<30}'.format('normal') + '{:<30}'.format('pneumonia'))
    else:
        print('{:<21}'.format('') + '{:<30}'.format('normal (total)') + '{:<30}'.format('normal (filtered)') + '{:<30}'.format('pneumonia'))

    for element in normal_categories:
        category_string = "{:<21}".format(str(element) + ': ')
        if additional_df is None:
            print(category_string + _generate_output_string(element, normal_data) + _generate_output_string(element, pneumonia_data))
        else:
            print(category_string + _generate_output_string(element, normal_data) + _generate_output_string(element, additional_df) + _generate_output_string(element, pneumonia_data))

In [5]:
def find_appearences(df, column, value, convert_to_string=True):
    """
    searches for appearances of a specified value within the column of a dataframe - prints the number of appearances and the corresponding indexes

    Parameters
    ----------
    df : pandas dataframe to search in
    column : label of the column to search
    value : value to look for
    convert_to_string : if True (default), ``value`` is converted with ``str()`` before the
        comparison; pass False to look for non-string entries such as floats

    Returns
    -------
    list with the index labels of all rows whose entry in ``column`` equals ``value``
    """
    if convert_to_string:
        value = str(value)

    # Evaluate the comparison once and derive both the count and the labels from it.
    index_list = df.index[df[column] == value].tolist()
    print('found ' + str(len(index_list)) + ' rows')
    print('Indexes: ' + str(index_list))
    return index_list

In [6]:
def drop_from_column(df, column, value, convert_to_string=True):
    """
    Removes, in place, every row of a dataframe whose entry in the given column equals the given value.

    Parameters
    ----------
    df : pandas dataframe that is modified in place
    column : label of the column to inspect
    value : value that marks a row for removal
    convert_to_string : if True (default), ``value`` is converted with ``str()`` before the comparison

    Returns
    -------
    None - the number of dropped rows is printed.
    """
    target = str(value) if convert_to_string else value
    hits = df[column] == target
    print('Dropping ' + str(int(hits.sum())) + ' rows')
    df.drop(df.index[hits].tolist(), inplace=True)

# Data preparation

In [7]:
# Root folder of the MIMIC-CXR-JPG release. The trailing slash is required because file
# names are appended to it by plain string concatenation.
path_mimic_cxr = '/mnt/f/05_DatensaetzeJan/MIMIC-CXR-JPG_Chest-Radiographs/physionet.org/files/mimic-cxr-jpg/2.0.0/'
file_name_meta_data = 'mimic-cxr-2.0.0-metadata.csv'
file_name_findings = 'mimic-cxr-2.0.0-negbio.csv'

# Read both tables and give each a fresh 0..n-1 index.
meta_data_csv_df = pd.read_csv(path_mimic_cxr + file_name_meta_data).reset_index(drop=True)
findings_df = pd.read_csv(path_mimic_cxr + file_name_findings).reset_index(drop=True)

print('length of meta data:', len(meta_data_csv_df))
print('length of findings:', len(findings_df))


length of meta data: 377110
length of findings: 227827


In [8]:
# Count the rows in which both procedure description columns hold exactly the same value.
print(sum(meta_data_csv_df.ProcedureCodeSequence_CodeMeaning == meta_data_csv_df.PerformedProcedureStepDescription), 'of', len(meta_data_csv_df), 'Samples have the same entries for both procedure Labels')

304688 of 377110 Samples have the same entries for both procedure Labels


In [9]:
# Iterating a dataframe yields its column labels, so set(...) is the set of column names.
print('Description of', len(set(meta_data_csv_df)), 'columns (meta data):')
set(meta_data_csv_df)

Description of 12 columns (meta data):


{'Columns',
 'PatientOrientationCodeSequence_CodeMeaning',
 'PerformedProcedureStepDescription',
 'ProcedureCodeSequence_CodeMeaning',
 'Rows',
 'StudyDate',
 'StudyTime',
 'ViewCodeSequence_CodeMeaning',
 'ViewPosition',
 'dicom_id',
 'study_id',
 'subject_id'}

In [10]:
# Iterating a dataframe yields its column labels, so set(...) is the set of column names.
print('Description of', len(set(findings_df)), 'columns (findings):')
set(findings_df)

Description of 16 columns (findings):


{'Atelectasis',
 'Cardiomegaly',
 'Consolidation',
 'Edema',
 'Enlarged Cardiomediastinum',
 'Fracture',
 'Lung Lesion',
 'Lung Opacity',
 'No Finding',
 'Pleural Effusion',
 'Pleural Other',
 'Pneumonia',
 'Pneumothorax',
 'Support Devices',
 'study_id',
 'subject_id'}

In [11]:
# Number of distinct subject_id values in the findings table.
len(set(findings_df['subject_id']))

65379

In [12]:
# Number of distinct subject_id values in the meta data table.
len(set(meta_data_csv_df['subject_id']))

65379

In [13]:
# Replace missing values with the literal string 'nan' so that they appear as a regular
# category when values are compared and counted further down.
meta_data_csv_df.fillna('nan', inplace=True)

findings_df.fillna('nan', inplace=True)
# Remove the spaces from two label columns so that they can be read as attributes of
# itertuples() rows (e.g. sample.NoFinding).
findings_df.rename(columns={"No Finding": "NoFinding", "Support Devices": "SupportDevices"}, errors='raise', inplace=True)

In [14]:
# Print the dicom_id of ten randomly drawn rows; no random_state is passed to sample() here.
print(meta_data_csv_df.sample(n=10).dicom_id)

69233     d11d846d-a8712fc6-6255f03b-3151579a-daa9ce87
91224     88e7202b-f6a935af-4a217de5-dfeaee4f-6af14df0
259079    addecea6-adda0c2b-5df30aac-63b4e019-0af175c6
372474    9dd61767-a3387e42-47e0dbcd-f522f7eb-44b46c84
301588    12ece850-c6960161-abc7f321-78fca705-7899a3b6
66782     55186a73-7e819d4c-5c43f6be-598458b7-2021bfe4
272640    7a7d4955-4f39770b-da4fd657-50119623-b1311579
229239    abf6b836-96445478-9021bae3-f5af9134-10cecb86
9146      c96a3609-3ccd0269-9d55c79c-b794bca9-7bb6715f
157336    f79ad1c9-ed5edf06-ff437a6b-c3ff4486-a19a3969
Name: dicom_id, dtype: object


In [15]:
# Concatenate both procedure descriptions, separated by a single space, into one column
# so that later cells can search both texts with a single substring match.
meta_data_csv_df['ProcedureCombined'] = meta_data_csv_df['ProcedureCodeSequence_CodeMeaning'] + ' ' + meta_data_csv_df['PerformedProcedureStepDescription']

In [16]:
# join dataframes together so it's easier to drop samples, etc.
# Rows are matched on study_id. Meta data columns whose name already exists in findings_df
# (subject_id) receive the suffix '_meta'.
meta_data_df = findings_df.join(meta_data_csv_df.set_index('study_id'), rsuffix='_meta', on='study_id')
meta_data_df.reset_index(drop=True, inplace=True)

In [17]:
import matplotlib.image as mpimg

# Preview ten randomly drawn rows together with every image of the study each row belongs to.
# SHOW_IMAGES is checked once, before sampling, so nothing is drawn or read while previews are off.
num = 1
if SHOW_IMAGES:
    for sample in meta_data_df.sample(n=10).itertuples():
        print('=== ' + str(num) + '. sample study ===')
        print('NoFinding:', sample.NoFinding, ', Pneumonia:', sample.Pneumonia, ', SupportDevices', sample.SupportDevices)

        # All rows (one per image) that share the study of the sampled row.
        study_meta_data = meta_data_df[meta_data_df['study_id'] == sample.study_id]
        print('\nPatientOrientationCodeSequence_CodeMeaning, PerformedProcedureStepDescription, ProcedureCodeSequence_CodeMeaning')
        print('StudyDate, StudyTime, ViewCodeSequence_CodeMeaning, ViewPosition\n')

        num = num + 1

        for image in study_meta_data.itertuples():
            print(image.PatientOrientationCodeSequence_CodeMeaning, image.PerformedProcedureStepDescription, image.ProcedureCodeSequence_CodeMeaning)
            print(image.StudyDate, image.StudyTime, image.ViewCodeSequence_CodeMeaning, image.ViewPosition)

            # Path layout used here: files/p<first two digits of subject_id>/p<subject_id>/s<study_id>/<dicom_id>.jpg
            file_path = path_mimic_cxr + 'files/p' + str(sample.subject_id)[:2] + '/p' + str(sample.subject_id) + '/s' + str(sample.study_id) + '/' + str(image.dicom_id) + '.jpg'
            plt.figure(num)
            img = mpimg.imread(file_path)
            plt.imshow(img, cmap="gray")
            plt.show()

        print('\n')



In [18]:
# analyze and remove images with lateral perspective
# Each column is passed as both arguments, so the 'normal' and 'pneumonia' columns of the
# printed table are identical and simply show the overall distribution.
overview_columns = [
    'ViewPosition',
    'ViewCodeSequence_CodeMeaning',
    'ProcedureCodeSequence_CodeMeaning',
    'PerformedProcedureStepDescription',
]
for position, column_name in enumerate(overview_columns):
    if position > 0:
        print('\n')   # separator between two tables
    calculate_percentages(meta_data_df[column_name], meta_data_df[column_name])

{'LATERAL', 'AP AXIAL', 'PA RLD', 'XTABLE LATERAL', 'PA LLD', 'LAO', 'AP LLD', 'LL', 'AP RLD', 'RAO', 'PA', 'AP', 'SWIMMERS', 'nan', 'LPO'}
                     normal                        pneumonia                     
LATERAL:             82852/377095 (21.97%)         82852/377095 (21.97%)         
AP AXIAL:            2/377095 (0.00%)              2/377095 (0.00%)              
PA RLD:              1/377095 (0.00%)              1/377095 (0.00%)              
XTABLE LATERAL:      2/377095 (0.00%)              2/377095 (0.00%)              
PA LLD:              4/377095 (0.00%)              4/377095 (0.00%)              
LAO:                 3/377095 (0.00%)              3/377095 (0.00%)              
AP LLD:              2/377095 (0.00%)              2/377095 (0.00%)              
LL:                  35129/377095 (9.32%)          35129/377095 (9.32%)          
AP RLD:              2/377095 (0.00%)              2/377095 (0.00%)              
RAO:                 3/377095 (0.00%)   

In [19]:
# Drop irrelevant rows for ViewPosition first and afterwards try to get information for samples where ViewPosition is nan from other fields
# The values are processed in this order, one "Dropping ... rows" line is printed per value.
irrelevant_view_positions = [
    'LL',
    'AP LLD',
    'PA LLD',
    'RAO',
    'SWIMMERS',
    'XTABLE LATERAL',
    'AP RLD',
    'PA RLD',
    'LAO',
    'LPO',
    'AP AXIAL',
    'LATERAL',
]
for view_position in irrelevant_view_positions:
    drop_from_column(meta_data_df, 'ViewPosition', view_position)

Dropping 35129 rows
Dropping 2 rows
Dropping 4 rows
Dropping 3 rows
Dropping 1 rows
Dropping 2 rows
Dropping 2 rows
Dropping 1 rows
Dropping 3 rows
Dropping 1 rows
Dropping 2 rows
Dropping 82852 rows


In [20]:
# Distribution of the remaining ViewPosition values (same column passed twice, so both table columns match).
calculate_percentages(meta_data_df['ViewPosition'], meta_data_df['ViewPosition'])

{'nan', 'PA', 'AP'}
                     normal                        pneumonia                     
nan:                 15769/259093 (6.09%)          15769/259093 (6.09%)          
PA:                  96155/259093 (37.11%)         96155/259093 (37.11%)         
AP:                  147169/259093 (56.80%)        147169/259093 (56.80%)        


In [21]:
# Independent copy of all rows whose ViewPosition is unknown ('nan').
nan_subset_df = meta_data_df.loc[meta_data_df['ViewPosition'] == 'nan'].copy()

# Show, for this subset, the distribution of every column that may reveal the projection.
nan_overview_columns = [
    'ViewPosition',
    'ViewCodeSequence_CodeMeaning',
    'ProcedureCodeSequence_CodeMeaning',
    'PerformedProcedureStepDescription',
]
for position, column_name in enumerate(nan_overview_columns):
    if position > 0:
        print('\n')   # separator between two tables
    calculate_percentages(nan_subset_df[column_name], nan_subset_df[column_name])

{'nan'}
                     normal                        pneumonia                     
nan:                 15769/15769 (100.00%)         15769/15769 (100.00%)         


{'nan'}
                     normal                        pneumonia                     
nan:                 15769/15769 (100.00%)         15769/15769 (100.00%)         


{'DX CHEST & RIBS', 'DX CHEST 2 VIEW PICC LINE PLACEMENT', 'CHEST (SINGLE VIEW)', 'DX CHEST PORTABLE PICC LINE PLACEMENT', 'CHEST (PA AND LAT)', 'CHEST (PRE-OP PA & LAT)', 'CHEST (PORTABLE AP)', 'CHEST SGL VIEW/LINE PLACEMENT', 'DX CHEST WITH DECUB'}
                     normal                        pneumonia                     
DX CHEST & RIBS:     10/15769 (0.06%)              10/15769 (0.06%)              
DX CHEST 2 VIEW PICC LINE PLACEMENT: 1/15769 (0.01%)               1/15769 (0.01%)               
CHEST (SINGLE VIEW): 2/15769 (0.01%)               2/15769 (0.01%)               
DX CHEST PORTABLE PICC LINE PLACEMENT: 1/15769 (0.01%)   

In [22]:
# Group the rows without ViewPosition by patient. Despite its name, the variable holds a
# GroupBy object, not a dataframe; head() then shows the leading rows of every group.
nan_patientgroups_df = nan_subset_df.groupby(['subject_id'])
nan_patientgroups_df.head()

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,...,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,ProcedureCombined
44,10001122,53447138,,,,,,,,,...,CHEST (PA AND LAT),,2140,1760,21431210,162003.0,CHEST (PA AND LAT),,,CHEST (PA AND LAT) CHEST (PA AND LAT)
45,10001122,53447138,,,,,,,,,...,CHEST (PA AND LAT),,2140,1760,21431210,162003.0,CHEST (PA AND LAT),,,CHEST (PA AND LAT) CHEST (PA AND LAT)
130,10002013,55312734,,,,,,,,0,...,CHEST (PA AND LAT),,2140,1760,21601005,164810.0,CHEST (PA AND LAT),,,CHEST (PA AND LAT) CHEST (PA AND LAT)
131,10002013,55312734,,,,,,,,0,...,CHEST (PA AND LAT),,2140,1760,21601005,164810.0,CHEST (PA AND LAT),,,CHEST (PA AND LAT) CHEST (PA AND LAT)
145,10002221,53781756,,,,,,,,,...,CHEST (PA AND LAT),,2140,1760,22001221,153523.0,CHEST (PA AND LAT),,,CHEST (PA AND LAT) CHEST (PA AND LAT)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
377069,19999068,59390811,-1,0,,,0,,,1,...,CHEST (PORTABLE AP),,2140,1760,21610827,53735.0,CHEST (PORTABLE AP),,,CHEST (PORTABLE AP) CHEST (PORTABLE AP)
377073,19999270,55274188,,0,,0,,,,,...,CHEST (PA AND LAT),,2140,1760,21821112,104527.0,CHEST (PA AND LAT),,,CHEST (PA AND LAT) CHEST (PA AND LAT)
377074,19999270,55274188,,0,,0,,,,,...,CHEST (PA AND LAT),,2140,1760,21821112,104527.0,CHEST (PA AND LAT),,,CHEST (PA AND LAT) CHEST (PA AND LAT)
377075,19999270,56267753,,,,,,,,,...,CHEST (PA AND LAT),,2140,1760,21831203,135727.0,CHEST (PA AND LAT),,,CHEST (PA AND LAT) CHEST (PA AND LAT)


In [23]:
# Rows of the unknown-view subset that are labelled as portable AP in either procedure column.
portable_ap_label = "CHEST (PORTABLE AP)"
code_meaning = nan_subset_df.loc[nan_subset_df['ProcedureCodeSequence_CodeMeaning'] == portable_ap_label]
step_description = nan_subset_df.loc[nan_subset_df['PerformedProcedureStepDescription'] == portable_ap_label]

# Index labels for which both columns agree on the portable AP label.
equal_indizes = set(code_meaning.index) & set(step_description.index)

In [24]:
# Report how many rows carry the portable AP label in both procedure columns.
print('There are', len(equal_indizes), 'matching indizes. It\'s very likely, that they are actually AP projected samples.')

There are 2201 matching indizes. It's very likely, that they are actually AP projected samples.


In [25]:
# Index labels for which exactly one of the two procedure columns carries the portable AP label.
unequal_indizes = set(code_meaning.index).symmetric_difference(set(step_description.index))

In [26]:
# Same agreement check as on the raw meta data, now on the joined and view-filtered table.
print(sum(meta_data_df.ProcedureCodeSequence_CodeMeaning == meta_data_df.PerformedProcedureStepDescription), 'of', len(meta_data_df), 'Samples have the same entries for both procedure Labels')

222627 of 259093 Samples have the same entries for both procedure Labels


In [27]:
# Distribution of the combined procedure text among the rows without ViewPosition.
calculate_percentages(nan_subset_df['ProcedureCombined'], nan_subset_df['ProcedureCombined'])

{'CHEST (PORTABLE AP) CHEST (PA AND LAT) PORT', 'CHEST (PA AND LAT) CHEST (PORTABLE AP)', 'CHEST (PA AND LAT) RIB BILAT, W/AP CHEST', 'DX CHEST & RIBS CHEST (PA AND LAT)', 'CHEST (SINGLE VIEW) CHEST (PORTABLE AP)', 'CHEST (PA AND LAT) KNEE (AP, LAT AND TUNNEL) LEFT', 'CHEST (PRE-OP PA & LAT) CHEST (PRE-OP PA AND LAT)', 'DX CHEST 2 VIEW PICC LINE PLACEMENT DX CHEST 2 VIEW PICC LINE PLACEMENT', 'CHEST (PORTABLE AP) CHEST (PA AND LAT)', 'CHEST (PA AND LAT) CHEST (PA AND LAT)', 'DX CHEST WITH DECUB CHEST (PA AND LAT)', 'CHEST (PORTABLE AP) PORTABLE ABDOMEN', 'CHEST (PORTABLE AP) BABYGRAM (CHEST ONLY)', 'CHEST (PORTABLE AP) CHEST SGL VIEW/LINE PLACEMENT PORT', 'CHEST (PORTABLE AP) CHEST (SINGLE VIEW)', 'CHEST (PORTABLE AP) ABDOMEN (SUPINE AND ERECT)', 'CHEST (PA AND LAT) nan', 'CHEST (PA AND LAT) CHEST (PA, LAT AND OBLIQUES)', 'CHEST (PORTABLE AP) nan', 'CHEST (SINGLE VIEW) CHEST (SINGLE VIEW)', 'CHEST (PA AND LAT) CHEST (PRE-OP PA AND LAT)', 'DX CHEST PORTABLE PICC LINE PLACEMENT CHEST (PA

In [28]:
# Select the rows whose combined procedure text names a portable AP chest exam and does
# not mention "LAT" anywhere.
# The first pattern is a regular expression, so the parentheses are escaped. It is written
# as a raw string because "\(" is not a valid escape sequence in a normal string literal.
qry = r'ProcedureCombined.str.contains("CHEST \(PORTABLE AP\)")'
qry += ' & ~(ProcedureCombined.str.contains("LAT"))'
ap_df = nan_subset_df.query(qry)

print(len(ap_df), 'Samples have been determined to be AP projection')
set(ap_df.ProcedureCombined)


2252 Samples have been determined to be AP projection


{'CHEST (PORTABLE AP) ABDOMEN (SUPINE AND ERECT)',
 'CHEST (PORTABLE AP) ABDOMEN (SUPINE ONLY) PORT',
 'CHEST (PORTABLE AP) BABYGRAM (CHEST ONLY)',
 'CHEST (PORTABLE AP) CHEST (PORTABLE AP)',
 'CHEST (PORTABLE AP) CHEST (SINGLE VIEW)',
 'CHEST (PORTABLE AP) CHEST (SINGLE VIEW) PORT',
 'CHEST (PORTABLE AP) CHEST SGL VIEW/LINE PLACEMENT PORT',
 'CHEST (PORTABLE AP) PELVIS (AP ONLY)',
 'CHEST (PORTABLE AP) PORTABLE ABDOMEN',
 'CHEST (PORTABLE AP) nan',
 'CHEST (SINGLE VIEW) CHEST (PORTABLE AP)'}

In [31]:
# Distribution of the combined procedure text among the rows inferred to be AP projections.
calculate_percentages(ap_df['ProcedureCombined'], ap_df['ProcedureCombined'])

{'CHEST (PORTABLE AP) CHEST (PORTABLE AP)', 'CHEST (PORTABLE AP) ABDOMEN (SUPINE AND ERECT)', 'CHEST (PORTABLE AP) ABDOMEN (SUPINE ONLY) PORT', 'CHEST (PORTABLE AP) PORTABLE ABDOMEN', 'CHEST (PORTABLE AP) BABYGRAM (CHEST ONLY)', 'CHEST (PORTABLE AP) nan', 'CHEST (PORTABLE AP) PELVIS (AP ONLY)', 'CHEST (SINGLE VIEW) CHEST (PORTABLE AP)', 'CHEST (PORTABLE AP) CHEST SGL VIEW/LINE PLACEMENT PORT', 'CHEST (PORTABLE AP) CHEST (SINGLE VIEW)', 'CHEST (PORTABLE AP) CHEST (SINGLE VIEW) PORT'}
                     normal                        pneumonia                     
CHEST (PORTABLE AP) CHEST (PORTABLE AP): 2201/2252 (97.74%)            2201/2252 (97.74%)            
CHEST (PORTABLE AP) ABDOMEN (SUPINE AND ERECT): 1/2252 (0.04%)                1/2252 (0.04%)                
CHEST (PORTABLE AP) ABDOMEN (SUPINE ONLY) PORT: 2/2252 (0.09%)                2/2252 (0.09%)                
CHEST (PORTABLE AP) PORTABLE ABDOMEN: 2/2252 (0.09%)                2/2252 (0.09%)                
CHEST (PORT

In [33]:
# Inspect all rows of one specific study by its study_id.
meta_data_df[meta_data_df['study_id'] == 51676183]

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,...,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,ProcedureCombined
75573,12009998,51676183,1,0,,,,,,,...,ABDOMEN (SUPINE ONLY) PORT,,2140,1760,21710425,202609.0,CHEST (PORTABLE AP),,,CHEST (PORTABLE AP) ABDOMEN (SUPINE ONLY) PORT


In [34]:
# Inspect the inferred-AP rows whose second procedure description is an abdomen exam.
ap_df[ap_df['ProcedureCombined'] == 'CHEST (PORTABLE AP) ABDOMEN (SUPINE ONLY) PORT']

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,...,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning,ProcedureCombined
75573,12009998,51676183,1.0,0.0,,,,,,,...,ABDOMEN (SUPINE ONLY) PORT,,2140,1760,21710425,202609.0,CHEST (PORTABLE AP),,,CHEST (PORTABLE AP) ABDOMEN (SUPINE ONLY) PORT
292488,17755234,59218035,,,,-1.0,0.0,,,1.0,...,ABDOMEN (SUPINE ONLY) PORT,,2140,1760,21221016,135232.0,CHEST (PORTABLE AP),,,CHEST (PORTABLE AP) ABDOMEN (SUPINE ONLY) PORT


In [35]:
# Preview every inferred-AP image whose two procedure descriptions are not both "CHEST (PORTABLE AP)".
# SHOW_IMAGES is checked once up front, so the filtered frame is not even built while previews are off.
num = 1
if SHOW_IMAGES:
    atypical_ap_df = ap_df[ap_df.ProcedureCombined != "CHEST (PORTABLE AP) CHEST (PORTABLE AP)"]
    for image in atypical_ap_df.itertuples():
        print('=== ' + str(num) + '. sample ===')
        print('ProcedureCombined:', image.ProcedureCombined)

        num = num + 1

        # Path layout used here: files/p<first two digits of subject_id>/p<subject_id>/s<study_id>/<dicom_id>.jpg
        file_path = path_mimic_cxr + 'files/p' + str(image.subject_id)[:2] + '/p' + str(image.subject_id) + '/s' + str(image.study_id) + '/' + str(image.dicom_id) + '.jpg'
        plt.figure(num)
        img = mpimg.imread(file_path)
        plt.imshow(img, cmap="gray")
        plt.show()

        print('\n')

In [36]:
# Preview up to 50 random inferred-AP images whose two procedure descriptions are both "CHEST (PORTABLE AP)".
# SHOW_IMAGES is checked once up front, so no rows are sampled while previews are off.
num = 1
if SHOW_IMAGES:
    typical_ap_df = ap_df[ap_df.ProcedureCombined == "CHEST (PORTABLE AP) CHEST (PORTABLE AP)"]
    # sample() raises a ValueError when more rows are requested than exist, so cap the request.
    n_preview = min(50, len(typical_ap_df))
    for image in typical_ap_df.sample(n=n_preview).itertuples():
        print('=== ' + str(num) + '. sample ===')
        print('ProcedureCombined:', image.ProcedureCombined)

        num = num + 1

        # Path layout used here: files/p<first two digits of subject_id>/p<subject_id>/s<study_id>/<dicom_id>.jpg
        file_path = path_mimic_cxr + 'files/p' + str(image.subject_id)[:2] + '/p' + str(image.subject_id) + '/s' + str(image.study_id) + '/' + str(image.dicom_id) + '.jpg'
        print(file_path)
        plt.figure(num)
        img = mpimg.imread(file_path)
        plt.imshow(img, cmap="gray")
        plt.show()

        print('\n')

In [38]:
# Count how many index labels of ap_df are (still) present in meta_data_df.
total = int(ap_df.index.isin(meta_data_df.index).sum())

In [39]:
# Number of inferred-AP rows whose index label exists in meta_data_df.
print(total)

2252


In [40]:
# Label every row that was inferred to be an AP projection.
# One .loc assignment raises a KeyError if an index label is missing from meta_data_df,
# whereas assigning label by label through .at would silently append a new row for it.
meta_data_df.loc[ap_df.index, 'ViewPosition'] = 'AP'

In [41]:
# Distribution of ViewPosition after the inferred AP labels have been written back.
calculate_percentages(meta_data_df['ViewPosition'], meta_data_df['ViewPosition'])

{'nan', 'PA', 'AP'}
                     normal                        pneumonia                     
nan:                 13517/259093 (5.22%)          13517/259093 (5.22%)          
PA:                  96155/259093 (37.11%)         96155/259093 (37.11%)         
AP:                  149421/259093 (57.67%)        149421/259093 (57.67%)        


In [42]:
# Preview up to 50 random images whose ViewPosition is still unknown ('nan').
# SHOW_IMAGES is checked once up front, so no rows are sampled while previews are off.
num = 1
if SHOW_IMAGES:
    unknown_view_df = meta_data_df[meta_data_df.ViewPosition == "nan"]
    # sample() raises a ValueError when more rows are requested than exist, so cap the request.
    n_preview = min(50, len(unknown_view_df))
    for image in unknown_view_df.sample(n=n_preview).itertuples():
        print('=== ' + str(num) + '. sample ===')
        print('ProcedureCombined:', image.ProcedureCombined)

        num = num + 1

        # Path layout used here: files/p<first two digits of subject_id>/p<subject_id>/s<study_id>/<dicom_id>.jpg
        file_path = path_mimic_cxr + 'files/p' + str(image.subject_id)[:2] + '/p' + str(image.subject_id) + '/s' + str(image.study_id) + '/' + str(image.dicom_id) + '.jpg'
        print(file_path)
        plt.figure(num)
        img = mpimg.imread(file_path)
        plt.imshow(img, cmap="gray")
        plt.show()

        print('\n')

In [43]:
# Remove the rows whose projection could not be determined.
drop_from_column(meta_data_df, 'ViewPosition', 'nan')

Dropping 13517 rows


In [44]:
# Distribution of ViewPosition after the rows with unknown projection have been dropped.
calculate_percentages(meta_data_df['ViewPosition'], meta_data_df['ViewPosition'])

{'PA', 'AP'}
                     normal                        pneumonia                     
PA:                  96155/245576 (39.15%)         96155/245576 (39.15%)         
AP:                  149421/245576 (60.85%)        149421/245576 (60.85%)        


In [45]:
# Distribution of the Pneumonia label values (missing labels were replaced by the string 'nan' earlier).
calculate_percentages(meta_data_df['Pneumonia'], meta_data_df['Pneumonia'])

{'nan', 1.0, -1.0, 0.0}
                     normal                        pneumonia                     
nan:                 183869/245576 (74.87%)        183869/245576 (74.87%)        
1.0:                 16807/245576 (6.84%)          16807/245576 (6.84%)          
-1.0:                19806/245576 (8.07%)          19806/245576 (8.07%)          
0.0:                 25094/245576 (10.22%)         25094/245576 (10.22%)         


In [46]:
# Split the samples into the two classes. Rows labelled -1.0 end up in neither class.
# The explicit copies make both frames independent of meta_data_df, so that modifying
# them later neither touches meta_data_df nor triggers a SettingWithCopyWarning.
pneumonia_df = meta_data_df.query('Pneumonia == 1.0').copy()
print('Pneumonia samples', len(pneumonia_df))
normal_df = meta_data_df.query('Pneumonia == 0.0 | Pneumonia == "nan"').copy()
print('Normal samples', len(normal_df))

Pneumonia samples 16807
Normal samples 208963


In [48]:
# Patients that contribute rows to both classes.
pneumonia_patientids = set(pneumonia_df['subject_id'])
normal_patientids = set(normal_df['subject_id'])

duplicate_ids = pneumonia_patientids & normal_patientids
print(len(duplicate_ids), 'of', len(pneumonia_patientids), 'pneumonia patients are also present in normal patients (total of', len(normal_patientids), 'unique normal patients)')

# Normal rows that belong to one of those patients.
res = normal_df[normal_df['subject_id'].isin(duplicate_ids)]
print('This affects', len(res), 'of', len(normal_df), 'normal samples')

7896 of 9558 pneumonia patients are also present in normal patients (total of 60499 unique normal patients)
This affects 71565 of 208963 normal samples


In [50]:
# Dropping duplicates in normal data
# Rebind the name to a new frame instead of dropping in place: normal_df was derived from
# meta_data_df, and in-place modification of such a derived frame can trigger a SettingWithCopyWarning.
normal_df = normal_df.drop(res.index)

In [51]:
# Number of normal samples left after removing rows of patients that also have pneumonia samples.
len(normal_df)

137398

In [52]:
# Column names of the joined table (iterating a dataframe yields its column labels).
set(meta_data_df)

{'Atelectasis',
 'Cardiomegaly',
 'Columns',
 'Consolidation',
 'Edema',
 'Enlarged Cardiomediastinum',
 'Fracture',
 'Lung Lesion',
 'Lung Opacity',
 'NoFinding',
 'PatientOrientationCodeSequence_CodeMeaning',
 'PerformedProcedureStepDescription',
 'Pleural Effusion',
 'Pleural Other',
 'Pneumonia',
 'Pneumothorax',
 'ProcedureCodeSequence_CodeMeaning',
 'ProcedureCombined',
 'Rows',
 'StudyDate',
 'StudyTime',
 'SupportDevices',
 'ViewCodeSequence_CodeMeaning',
 'ViewPosition',
 'dicom_id',
 'study_id',
 'subject_id',
 'subject_id_meta'}

# Data Analysis

In [59]:
# Compare how the projection-related columns are distributed in the normal and the pneumonia samples.
print('==== Distribution of projections ====\n')
calculate_percentages(normal_df['ViewPosition'], pneumonia_df['ViewPosition'])
print('\n==== Distribution of ViewCodeSequence_CodeMeaning ====\n')
calculate_percentages(normal_df['ViewCodeSequence_CodeMeaning'], pneumonia_df['ViewCodeSequence_CodeMeaning'])

==== Distribution of projections ====

{'PA', 'AP'}
                     normal                        pneumonia                     
PA:                  67563/137398 (49.17%)         5563/16807 (33.10%)           
AP:                  69835/137398 (50.83%)         11244/16807 (66.90%)          

==== Distribution of ViewCodeSequence_CodeMeaning ====

{'antero-posterior', 'Erect', 'Recumbent', 'postero-anterior', 'nan', 'left anterior oblique'}
                     normal                        pneumonia                     
antero-posterior:    68630/137398 (49.95%)         10932/16807 (65.04%)          
Erect:               239/137398 (0.17%)            27/16807 (0.16%)              
Recumbent:           6/137398 (0.00%)              1/16807 (0.01%)               
postero-anterior:    67340/137398 (49.01%)         5539/16807 (32.96%)           
nan:                 1173/137398 (0.85%)           306/16807 (1.82%)             
left anterior oblique: 10/137398 (0.01%)             2/168

In [54]:
# Compare how the SupportDevices label is distributed in the normal and the pneumonia samples.
print('==== Distribution of support devices ====\n')
calculate_percentages(normal_df['SupportDevices'], pneumonia_df['SupportDevices'])

==== Distribution of support devices ====

{'nan', 1.0, -1.0, 0.0}
                     normal                        pneumonia                     
nan:                 103304/137398 (75.19%)        11744/16807 (69.88%)          
1.0:                 31835/137398 (23.17%)         4810/16807 (28.62%)           
-1.0:                245/137398 (0.18%)            39/16807 (0.23%)              
0.0:                 2014/137398 (1.47%)           214/16807 (1.27%)             


# Stratified Sampling from Kaggle

In [88]:
'''
This module contains functions that computes stratified sampling of pandas dataframes.
'''
# Required libraries
import pandas as pd
import numpy as np
import warnings
# NOTE(review): this installs a global "ignore" filter, i.e. it hides all warnings raised
# from here on, not only those caused by the sampling helpers below.
warnings.filterwarnings("ignore")

# Functions

In [89]:
def __smpl_size(population, size):
    '''
    A function to compute the sample size. If not informed, a sampling 
    size will be calculated using Cochran adjusted sampling formula:
        cochran_n = (Z**2 * p * q) /e**2

        where:
            - Z is the z-value. In this case we use 1.96 representing 95%
            - p is the estimated proportion of the population which has an
                attribute. In this case we use 0.5
            - q is 1-p
            - e is the margin of error

        This formula is adjusted as follows:
        adjusted_cochran = cochran_n / 1+((cochran_n -1)/N)

        where:
            - cochran_n = result of the previous formula
            - N is the population size
    Parameters
    ----------
        :population: population size
        :size: sample size (default = None)
    Returns
    -------
    Calculated sample size to be used in the functions:
        - stratified_sample
        - stratified_sample_report
    '''
    if size is None:
        cochran_n = round(((1.96)**2 * 0.5 * 0.5)/ 0.02**2)
        n = round(cochran_n/(1+((cochran_n -1) /population)))
    elif size >= 0 and size < 1:
        n = round(population * size)
    elif size < 0:
        raise ValueError('Parameter "size" must be an integer or a proportion between 0 and 0.99.')
    elif size >= 1:
        n = size
    return n

In [90]:
def stratified_sample_report(df, strata, size=None):
    '''
    Generates a dataframe reporting the counts in each stratum and the counts
    for the final sampled dataframe.

    Parameters
    ----------
    :df: pandas dataframe from which data will be sampled.
    :strata: list containing columns that will be used in the stratified sampling.
    :size: sampling size. If not informed, a sampling size will be calculated
        using Cochran adjusted sampling formula:
        cochran_n = (Z**2 * p * q) / e**2

        where:
            - Z is the z-value. In this case we use 1.96 representing 95%
            - p is the estimated proportion of the population which has an
                attribute. In this case we use 0.5
            - q is 1-p
            - e is the margin of error

        This formula is adjusted as follows:
        adjusted_cochran = cochran_n / (1 + (cochran_n - 1) / N)

        where:
            - cochran_n = result of the previous formula
            - N is the population size

    Returns
    -------
    A dataframe reporting the counts in each stratum ('size') and the counts
    for the final sampled dataframe ('samp_size').
    '''
    population = len(df)
    size = __smpl_size(population, size)
    # assign() builds a new frame, so no column is written into a slice of df
    # (which would trigger a SettingWithCopyWarning).
    tmp = df[strata].assign(size=1)
    # Counting the helper column per group yields the number of rows in each stratum.
    tmp_grpd = tmp.groupby(strata).count().reset_index()
    # Proportionate allocation: every stratum gets its population share of the sample.
    tmp_grpd['samp_size'] = round(size/population * tmp_grpd['size']).astype(int)
    return tmp_grpd

In [91]:
def stratified_sample_transferred(df_target, df_proportion, strata, size=None, seed=None, keep_index= True):
    '''
    Samples rows from df_target such that the sample follows the stratum
    distribution found in df_proportion. It uses proportionate stratification:
    n1 = (N1/N) * n
    where:
        - n1 is the sample size of stratum 1
        - N1 is the population size of stratum 1 (counted in df_proportion)
        - N is the total population size (number of rows of df_proportion)
        - n is the sampling size

    Parameters
    ----------
    :df_target: pandas dataframe from which data will be sampled.
    :df_proportion: pandas dataframe from which the proportions for sampling will be used.
    :strata: list containing columns that will be used in the stratified sampling.
    :size: sampling size. If not informed, a sampling size will be calculated
        using Cochran adjusted sampling formula:
        cochran_n = (Z**2 * p * q) / e**2
        where:
            - Z is the z-value. In this case we use 1.96 representing 95%
            - p is the estimated proportion of the population which has an
                attribute. In this case we use 0.5
            - q is 1-p
            - e is the margin of error
        This formula is adjusted as follows:
        adjusted_cochran = cochran_n / (1 + ((cochran_n - 1) / N))
        where:
            - cochran_n = result of the previous formula
            - N is the population size
    :seed: sampling seed; the same seed is used for every stratum
    :keep_index: if True, it keeps a column with the original population index indicator

    Returns
    -------
    A sampled pandas dataframe based in a set of strata. If a stratum of
    df_target holds fewer rows than requested, a warning is printed and all of
    its rows are used. If there are no strata at all, an empty dataframe is
    returned.

    Raises
    ------
    ValueError if df_target and df_proportion do not produce the same groups
    for the given strata.

    Examples
    --------
    >> df.head()
        id  sex age city
    0   123 M   20  XYZ
    1   456 M   25  XYZ
    2   789 M   21  YZX
    3   987 F   40  ZXY
    4   654 M   45  ZXY
    ...
    # This samples from df, stratified by sex and city, following the
    # distribution of df_reference; the sample holds 30% of the number of rows
    # of df_reference
    >> stratified = stratified_sample_transferred(df_target=df, df_proportion=df_reference, strata=['sex', 'city'], size=0.3)

    Requirements
    ------------
    - pandas
    - numpy
    '''
    population = len(df_proportion)
    size = __smpl_size(population, size)

    # assign() builds new frames, so the helper column is never written into a
    # slice of the inputs (which would trigger pandas' SettingWithCopyWarning).
    proportion_counts = df_proportion[strata].assign(size=1).groupby(strata).count().reset_index()
    target_counts = df_target[strata].assign(size=1).groupby(strata).count().reset_index()

    if not target_counts[strata].equals(proportion_counts[strata]):
        print(target_counts)
        print(proportion_counts)
        raise ValueError("Dataframes don't have the same groups for the given set of stratas. You can check the differences in the terminal output.")

    # The sample sizes are derived from the proportion dataframe, not from the
    # target. Both group tables list the same groups in the same order (checked
    # above), so row i of samp_sizes belongs to row i of target_counts.
    samp_sizes = (size / population * proportion_counts['size']).round().astype(int)

    samples = []
    for i in range(len(target_counts)):
        # Read column-wise and cast: fetching a whole row first would coerce
        # all of its values to one common dtype, which can turn the integer
        # sample size into a float.
        n = int(samp_sizes.iloc[i])

        # query generator for each iteration
        conditions = []
        for stratum in strata:
            value = target_counts[stratum].iloc[i]
            if isinstance(value, str):
                # repr() quotes the string and stays valid even if the value
                # itself contains a quote character
                value = repr(str(value))
            conditions.append(stratum + ' == ' + str(value))
        qry = ' & '.join(conditions)

        # query dataframe
        queried_df = df_target.query(qry)

        # check if result contains enough samples / rows
        if len(queried_df) < n:
            print("Warning! The required number of samples (" + str(n) + ") could not be retrieved.")
            print("Using all available entries (" + str(len(queried_df)) + ") of group:")
            print(qry)
            n = len(queried_df)

        # sample dataframe
        samples.append(
            queried_df.sample(n=n, random_state=seed).reset_index(drop=(not keep_index))
        )

    if not samples:
        # no groups at all: return an empty frame with the expected columns
        return df_target.iloc[0:0].reset_index(drop=(not keep_index))

    # A single concat replaces the repeated DataFrame.append calls, which were
    # removed in pandas 2.0 and copied the accumulated frame on every iteration.
    return pd.concat(samples, ignore_index=True)

In [92]:
# Fixed seed so that the random sample (and the CSV written from it later) is
# identical on every Restart & Run All.
SAMPLING_SEED = 42

# Draw rows from normal_df following the ViewPosition / SupportDevices
# distribution of pneumonia_df; the requested sample size is the number of
# pneumonia rows.
filtered_df = stratified_sample_transferred(
    normal_df,
    pneumonia_df,
    ['ViewPosition', 'SupportDevices'],
    size=len(pneumonia_df),
    seed=SAMPLING_SEED,
)

print('\nFiltered dataframe has', len(filtered_df), 'samples')


Filtered dataframe has 16807 samples


In [94]:
# Compare the sampled normal data with the pneumonia data, one column at a
# time. Each entry pairs the heading to print with the column to compare.
distribution_checks = (
    ('==== Distribution of projections ====\n', 'ViewPosition'),
    ('\n==== Distribution of ViewCodeSequence_CodeMeaning ====\n', 'ViewCodeSequence_CodeMeaning'),
    ('\n==== Distribution of SupportDevices ====\n', 'SupportDevices'),
)

for check_heading, check_column in distribution_checks:
    print(check_heading)
    calculate_percentages(filtered_df[check_column], pneumonia_df[check_column])

==== Distribution of projections ====

{'PA', 'AP'}
                     normal                        pneumonia                     
PA:                  5563/16807 (33.10%)           5563/16807 (33.10%)           
AP:                  11244/16807 (66.90%)          11244/16807 (66.90%)          

==== Distribution of ViewCodeSequence_CodeMeaning ====

Normal: {'antero-posterior', 'Erect', 'postero-anterior', 'nan', 'left anterior oblique'}
Pneumonia: {'antero-posterior', 'Erect', 'Recumbent', 'postero-anterior', 'nan', 'left anterior oblique'}
Number of appearences (normal):
'antero-posterior': 11040
'Erect': 23
'postero-anterior': 5544
'nan': 199
'left anterior oblique': 1


==== Distribution of SupportDevices ====

{0.0, 1.0, 'nan', -1.0}
                     normal                        pneumonia                     
0.0:                 214/16807 (1.27%)             214/16807 (1.27%)             
1.0:                 4810/16807 (28.62%)           4810/16807 (28.62%)           
na

# Store results as csv files

In [96]:
# Write both metadata tables to the current working directory:
# filtered_df holds the rows sampled from the normal data, pneumonia_df the
# pneumonia rows they were matched against.
filtered_df.to_csv(path_or_buf='./normal_meta_data_filtered.csv')
pneumonia_df.to_csv(path_or_buf='./pneumonia_meta_data_filtered.csv')