In [None]:
import numpy as np
import pandas as pd
import shutil
import matplotlib.pyplot as plt

In [None]:
# Toggle for the figure output: when True, matplotlib renders text through LaTeX with a
# serif font, and the plotting cells further down write their figures to PDF files.
USE_LATEX_ENGINE = True

if USE_LATEX_ENGINE:
    import matplotlib
    # matplotlib.use("pgf")
    # "pgf.texsystem": "pdflatex",
    # 'pgf.rcfonts': False,
    matplotlib.rcParams['font.family'] = 'serif'
    matplotlib.rcParams['text.usetex'] = True

In [None]:
# Locations of the two metadata CSV files. The paths live in their own variables so that
# re-running this cell always hands a path (and never an already loaded DataFrame) to
# pd.read_csv.
normal_meta_data_path = '/mnt/c/Users/Jan/Daten/Geschäftlich/Capgemini/scripts/tmp/filtered_PADCHEST_chest_x_ray_images_labels_160K_01.02.19.csv'
pneumonia_meta_data_path = '/mnt/c/Users/Jan/Daten/Geschäftlich/Capgemini/scripts/tmp/filtered_pneumonia.csv'

# read csv files, engine='python' improves parsing, column differentiation, etc.
normal_meta_data = pd.read_csv(normal_meta_data_path, engine='python')
pneumonia_meta_data = pd.read_csv(pneumonia_meta_data_path, engine='python')

In [None]:
def _generate_output_string(element, data):
    """
    Returns a string that contains information about the percentual share of a value in a list
    """
    if not isinstance(element, str): # make sure element is of type string, convert otherwise
        element = str(element)
    
    n_total = len(data)
    bool_list = (data == element)
    n_elements = sum(bool_list)

    return "{:<30}".format(str(n_elements) + '/' + str(n_total) + ' (' + "{:.2f}".format(((n_elements / n_total) * 100)) + '%)')

In [None]:
def calculate_percentages(normal_data, pneumonia_data):
    """
    Prints the distribution of categories in two data collections side by side.

    The set of distinct values of both inputs is compared first. If the sets differ,
    a warning with both sets and the number of occurrences of every category in
    ``normal_data`` is printed and the function returns. If they are identical, a
    table with one row per category is printed, showing count and percentage for
    the normal and the pneumonia data.

    Parameters
    ----------
    normal_data : sized iterable
        Values of the normal (healthy) group, e.g. a dataframe column.
    pneumonia_data : sized iterable
        Values of the pneumonia group.

    Returns
    -------
    None
        All results are printed.
    """
    normal_categories = set(normal_data)
    pneumonia_categories = set(pneumonia_data)

    # Compare the sets themselves by value: comparing only their sizes (or using `is`
    # on integers) lets different categories of equal number slip through.
    if normal_categories != pneumonia_categories:
        print('WARNING! Categories of healthy and pneumonia data are not identical!')
        print('Normal: ' + str(normal_categories))
        print('Pneumonia: ' + str(pneumonia_categories))
        print('Number of appearences (normal):')
        n_normal_set = ''
        # sorted by string form for a reproducible order, also with mixed value types
        for category in sorted(normal_categories, key=str):
            n_category = sum(1 for value in normal_data if value == category)
            n_normal_set += "'" + str(category) + "': " + str(n_category) + '\n'   # single quote: non-escaped string, double quote: escaped string
        print(n_normal_set)
        return

    print(normal_categories)

    print('{:<21}'.format('') + '{:<30}'.format('normal') + '{:<30}'.format('pneumonia'))

    # Use the same category for both columns of a row; iterating two separate sets
    # in parallel does not guarantee that their elements line up.
    for category in sorted(normal_categories, key=str):
        category_string = "{:<21}".format(str(category) + ': ')

        print(category_string + _generate_output_string(category, normal_data) + _generate_output_string(category, pneumonia_data))

In [None]:
def find_appearences(df, column, value, replace_nan_with=None):
    """
    Reports where a value occurs in one column of a dataframe.

    Prints the number of rows whose entry in ``column`` equals ``str(value)`` and
    the index labels of those rows.

    If ``replace_nan_with`` is given, every NaN in the dataframe (all columns, not
    only ``column``) is replaced with it first. The replacement happens in place,
    so the dataframe passed by the caller is modified.
    """
    if replace_nan_with is not None:
        nan_total = df.isnull().sum().sum()
        print('Replacing ' + str(nan_total) + ' appearances of NaN with: ' + str(replace_nan_with))
        # in place on purpose: the caller's dataframe is changed
        df.fillna(replace_nan_with, inplace=True)

    # one boolean mask serves both the row count and the index lookup
    matches = df[column] == str(value)
    matching_indexes = df.index[matches].tolist()

    print('found ' + str(len(matching_indexes)) + ' rows')
    print('Indexes: ' + str(matching_indexes))

In [None]:
def drop_from_column(df, column, value):
    """
    Removes every row whose entry in ``column`` equals ``str(value)``.

    The number of affected rows is printed before they are dropped. The rows are
    dropped in place, so the dataframe passed by the caller is modified.
    """
    matches = df[column] == str(value)
    labels_to_drop = df.index[matches].tolist()

    print('Dropping ' + str(len(labels_to_drop)) + ' rows')
    df.drop(labels_to_drop, inplace=True)

In [None]:
# Distribution of the 'Projection' column in both groups, before and after cleaning.
print('==== Distribution of projections ====\n')

# First pass: distribution on the data as loaded.
calculate_percentages(normal_meta_data['Projection'], pneumonia_meta_data['Projection'])

# List the rows of the normal data whose projection is 'UNK', then drop them.
# drop_from_column modifies normal_meta_data in place, so every later cell sees
# the reduced dataframe.
find_appearences(normal_meta_data, 'Projection', 'UNK')
drop_from_column(normal_meta_data, 'Projection', 'UNK')

# Second pass: distribution after the 'UNK' rows were removed from the normal data.
calculate_percentages(normal_meta_data['Projection'], pneumonia_meta_data['Projection'])

In [None]:
print('==== Distribution of MethodLabel ====\n')

# Select the column from both groups, then compare their distributions
method_label_normal = normal_meta_data['MethodLabel']
method_label_pneumonia = pneumonia_meta_data['MethodLabel']
calculate_percentages(method_label_normal, method_label_pneumonia)

In [None]:
# Distribution of the 'PatientSex_DICOM' column in both groups.
print('==== Distribution of Patient Sex ====\n')

calculate_percentages(normal_meta_data['PatientSex_DICOM'], pneumonia_meta_data['PatientSex_DICOM'])

# Search for empty strings in the column. NOTE: because a replacement value is passed,
# find_appearences first fills NaN with 'U' in place, and it does so in ALL columns of
# normal_meta_data, not only in 'PatientSex_DICOM'.
find_appearences(normal_meta_data, 'PatientSex_DICOM', '', 'U')     # 'U' for unknown

In [None]:
# Distribution of 'PatientSex_DICOM' at the time this cell runs.
calculate_percentages(normal_meta_data['PatientSex_DICOM'], pneumonia_meta_data['PatientSex_DICOM'])

find_appearences(normal_meta_data, 'PatientSex_DICOM', '')      # look for empty strings; no NaN replacement in this call

# Both drops modify normal_meta_data in place; only the normal data is cleaned here.
drop_from_column(normal_meta_data, 'PatientSex_DICOM', 'U')     # drop 'U' and 'O' since they are heavily underrepresented
drop_from_column(normal_meta_data, 'PatientSex_DICOM', 'O')

# Distribution after the 'U' and 'O' rows were removed from the normal data.
calculate_percentages(normal_meta_data['PatientSex_DICOM'], pneumonia_meta_data['PatientSex_DICOM'])

In [None]:
print('==== Distribution of ExposureTime ====\n')

# Select the column from both groups, then compare their distributions
exposure_time_normal = normal_meta_data['ExposureTime']
exposure_time_pneumonia = pneumonia_meta_data['ExposureTime']
calculate_percentages(exposure_time_normal, exposure_time_pneumonia)


In [None]:
calculate_percentages(normal_meta_data['Exposure_DICOM'], pneumonia_meta_data['Exposure_DICOM'])

# Skip entries whose string form is 'None' (presumably missing values) and convert
# the remaining ones to int
filtered_exposure_normal = list(map(int, filter(lambda entry: str(entry) != 'None', normal_meta_data['Exposure_DICOM'])))
filtered_exposure_pneumonia = list(map(int, filter(lambda entry: str(entry) != 'None', pneumonia_meta_data['Exposure_DICOM'])))

# number of usable exposure values per group
print(len(filtered_exposure_normal))
print(len(filtered_exposure_pneumonia))


In [None]:
filtered_exposure_normal.sort()
filtered_exposure_pneumonia.sort()

# Bin edges are shifted by -0.5 so that every integer value sits centred on its x tick, see
# https://stackoverflow.com/questions/27083051/matplotlib-xticks-not-lining-up-with-histogram
exposure_bins = np.arange(42) - 0.5

fig, ax_exp_time = plt.subplots()  # one figure holding a single Axes
ax_exp_time.set_yscale('log')      # counts are shown on a logarithmic y axis

ax_exp_time.hist(filtered_exposure_normal, bins=exposure_bins, alpha=0.5, label="Normal")
ax_exp_time.hist(filtered_exposure_pneumonia, bins=exposure_bins, alpha=0.5, color='r', label="Pneumonie")
ax_exp_time.set(xlabel='Strom * Zeit in mAs', ylabel='Anzahl Röntgenbilder')
ax_exp_time.legend()
ax_exp_time.grid(True)

if USE_LATEX_ENGINE:
    fig.savefig("/mnt/c/Users/Jan/Daten/Dropbox/Master/3_Semester/Masterarbeit/Latex/python_output/padchest_combined_exposure.pdf")
    # plt.savefig('/mnt/c/Users/Jan/Daten/Dropbox/Master/3_Semester/Masterarbeit/Latex/python_output/padchest_combined_exposure.pgf')

In [None]:
print('==== Distribution of XRayTubeCurrent_DICOM ====\n')

# Select the column from both groups, then compare their distributions
tube_current_normal = normal_meta_data['XRayTubeCurrent_DICOM']
tube_current_pneumonia = pneumonia_meta_data['XRayTubeCurrent_DICOM']
calculate_percentages(tube_current_normal, tube_current_pneumonia)

In [None]:
print('==== Distribution of Exposure_DICOM ====\n')

# Select the column from both groups, then compare their distributions
exposure_dicom_normal = normal_meta_data['Exposure_DICOM']
exposure_dicom_pneumonia = pneumonia_meta_data['Exposure_DICOM']
calculate_percentages(exposure_dicom_normal, exposure_dicom_pneumonia)


In [None]:
print('==== Distribution of Modality_DICOM ====\n')

# Select the column from both groups, then compare their distributions
modality_normal = normal_meta_data['Modality_DICOM']
modality_pneumonia = pneumonia_meta_data['Modality_DICOM']
calculate_percentages(modality_normal, modality_pneumonia)

In [None]:
print('==== Distribution of Manufacturer_DICOM ====\n')

# Select the column from both groups, then compare their distributions
manufacturer_normal = normal_meta_data['Manufacturer_DICOM']
manufacturer_pneumonia = pneumonia_meta_data['Manufacturer_DICOM']
calculate_percentages(manufacturer_normal, manufacturer_pneumonia)

In [None]:
print('==== Distribution of PixelAspectRatio_DICOM ====\n')

# Select the column from both groups, then compare their distributions
pixel_aspect_ratio_normal = normal_meta_data['PixelAspectRatio_DICOM']
pixel_aspect_ratio_pneumonia = pneumonia_meta_data['PixelAspectRatio_DICOM']
calculate_percentages(pixel_aspect_ratio_normal, pixel_aspect_ratio_pneumonia)

In [None]:
print('==== Distribution of Age ====\n')

# The comparison runs on the 'PatientBirth' column of both groups
patient_birth_normal = normal_meta_data['PatientBirth']
patient_birth_pneumonia = pneumonia_meta_data['PatientBirth']
calculate_percentages(patient_birth_normal, patient_birth_pneumonia)

In [None]:
# Drop rows of the normal data whose 'PatientBirth' is 'U' (unknown); the author notes
# there is only one such entry. The drop modifies normal_meta_data in place.
# NOTE(review): presumably the 'U' stems from the earlier NaN replacement in the
# patient sex cell, which fills every column -- confirm against the raw CSV.
drop_from_column(normal_meta_data, 'PatientBirth', 'U') # drop unknown birth years, since there's only one entry

In [None]:
# Number of rows per distinct 'PatientBirth' value in the normal data.
# value_counts does the counting in one pass (and ignores NaN); the (year, count)
# pairs are sorted by year so that the two lists are ordered left to right when they
# are used as x/y values of a line plot, instead of following the arbitrary
# iteration order of a set.
normal_birth_counts = normal_meta_data['PatientBirth'].value_counts()
normal_year_counts = sorted((int(year), int(count)) for year, count in normal_birth_counts.items())

normal_age_list = [year for year, _ in normal_year_counts]
number_of_normal = [count for _, count in normal_year_counts]

print(normal_age_list)
print(number_of_normal)

In [None]:
# Number of rows per distinct 'PatientBirth' value in the pneumonia data.
# value_counts does the counting in one pass (and ignores NaN); the (year, count)
# pairs are sorted by year so that the two lists are ordered left to right when they
# are used as x/y values of a line plot, instead of following the arbitrary
# iteration order of a set.
pneumonia_birth_counts = pneumonia_meta_data['PatientBirth'].value_counts()
pneumonia_year_counts = sorted((int(year), int(count)) for year, count in pneumonia_birth_counts.items())

pneumonia_age_list = [year for year, _ in pneumonia_year_counts]
number_of_pneumonia = [count for _, count in pneumonia_year_counts]

print(pneumonia_age_list)
print(number_of_pneumonia)

In [None]:
# Year-of-birth line plot for both groups. The plot gets its own figure and Axes so it
# can never be drawn onto a figure that an earlier cell left open (which the implicit
# pyplot "current figure" allows when figures are not closed between cells).
fig_birth, ax_birth = plt.subplots()

# fixed view: birth years 1900-2021 on x, 0-1001 rows on y
ax_birth.axis([1900, 2021, 0, 1001])
ax_birth.plot(normal_age_list, number_of_normal, label="Normal")
ax_birth.plot(pneumonia_age_list, number_of_pneumonia, 'r', label="Pneumonie")
ax_birth.set_ylabel('Anzahl Patienten')
ax_birth.set_xlabel('Geburtsjahr')
ax_birth.legend()
ax_birth.grid(True)
# plt.show()

if USE_LATEX_ENGINE:
    fig_birth.savefig('/mnt/c/Users/Jan/Daten/Dropbox/Master/3_Semester/Masterarbeit/Latex/python_output/year_of_birth_diagram.pdf')


In [None]:
# Age per row of the normal data: the first four characters of 'StudyDate_DICOM'
# (assumed to be the study year) minus the birth year
normal_ages = [
    int(str(study_date)[:4]) - int(birth_year)
    for study_date, birth_year in zip(normal_meta_data['StudyDate_DICOM'], normal_meta_data['PatientBirth'])
]
print(normal_ages[:20])

In [None]:
# Age per row of the pneumonia data: the first four characters of 'StudyDate_DICOM'
# (assumed to be the study year) minus the birth year
pneumonia_ages = [
    int(str(study_date)[:4]) - int(birth_year)
    for study_date, birth_year in zip(pneumonia_meta_data['StudyDate_DICOM'], pneumonia_meta_data['PatientBirth'])
]
print(pneumonia_ages[:20])

In [None]:
# Age histogram of both groups.
# The bin edges sit at k - 0.5 so that every integer age is centred on its x tick.
# The upper edge is derived from BOTH groups; taking it from the normal ages alone
# would silently leave out pneumonia patients older than the oldest normal patient.
max_age = max(max(normal_ages), max(pneumonia_ages))
x_axis_age = np.arange(max_age + 2) - 0.5

# own figure/Axes, so the histogram is never drawn onto a figure left open by an earlier cell
fig_age, ax_age = plt.subplots()
ax_age.hist(normal_ages, x_axis_age, alpha=0.5, label="Normal")
ax_age.hist(pneumonia_ages, x_axis_age, alpha=0.5, color='r', label="Pneumonie")
ax_age.set_ylabel('Anzahl Patienten')
ax_age.set_xlabel('Alter')
ax_age.legend()
ax_age.grid(True)
# plt.show()

if USE_LATEX_ENGINE:
    fig_age.savefig('/mnt/c/Users/Jan/Daten/Dropbox/Master/3_Semester/Masterarbeit/Latex/python_output/padchest_age_histogram.pdf')


In [None]:
# Show the column labels of the normal metadata as a set
set(normal_meta_data.columns)

In [None]:
# Distinct patient ids per group and the ids that occur in both groups
normal_patient_ids = set(normal_meta_data['PatientID'])
pneumonia_patient_ids = set(pneumonia_meta_data['PatientID'])
duplicate_patients = normal_patient_ids.intersection(pneumonia_patient_ids)

print(str(len(duplicate_patients)) + ' Patients are both in Normal and Pneumonia data present')
print('Total number of Patients:\nNormal: ' + str(len(normal_patient_ids)) + '\nPneumonia: ' + str(len(pneumonia_patient_ids)))

In [None]:
# Stack the 'PatientID' columns of both groups into one Series (original index labels
# are kept). pd.concat replaces Series.append, which was removed in pandas 2.0.
all_patient_ids = pd.concat([normal_meta_data['PatientID'], pneumonia_meta_data['PatientID']])

# total number of rows, followed by the number of rows in each group
print(str(len(all_patient_ids)))
print(str(len(normal_meta_data['ImageID'])))
print(str(len(pneumonia_meta_data['ImageID'])))