Simple Notebook to create a modified version of the original CheXpert CSV

In [1]:
# Importing necessary packages
import sys
import pandas as pd 
import numpy as np
import os
import torch
from sklearn.model_selection import train_test_split
sys.path.append('/cis/home/bbharti1/projects/CheXpert_PyTorch/')
from utils import get_base_rates
import matplotlib.pyplot as plt

In [2]:
# Set root dir: use the entry that is currently last on sys.path.
# NOTE(review): this is the project directory only while the
# sys.path.append(...) in the import cell is the most recent change to
# sys.path — confirm before running cells out of order or after other code
# has modified sys.path.
root_dir = sys.path[-1]

In [3]:
# Read the training CSV into a dataframe
csv_path = 'train.csv'
org_df = pd.read_csv(csv_path)

# Strip the last two components from every image path; the remaining prefix is
# stored as a per-patient identifier.
# NOTE(review): assumes paths look like <prefix>/<patient>/<study>/<file> —
# confirm against the CSV.
file_heads = [os.path.dirname(os.path.dirname(image_path)) for image_path in org_df['Path']]

# Store the identifier as the first column of the dataframe
org_df.insert(loc=0, column='Patient_ID_path', value=file_heads)

In [4]:
# Summary counts for the unfiltered dataset
view_column = org_df['Frontal/Lateral']
n_images = len(org_df)
# dropna=False so that a missing identifier would still be counted as a value
n_patients = org_df['Patient_ID_path'].nunique(dropna=False)
n_frontal = (view_column == 'Frontal').sum()
n_lateral = (view_column == 'Lateral').sum()

print('Number of images = ', n_images)
print('Number of patients = ', n_patients)
print('Number of frontal images = ', n_frontal)
print('Number of lateral images = ', n_lateral)

Number of images =  223414
Number of patients =  64540
Number of frontal images =  191027
Number of lateral images =  32387


In [5]:
# Work on a deep copy so org_df stays untouched; every -1 and every NaN in the
# frame becomes 0.
new_df = org_df.copy(deep=True).replace(-1, 0).replace(np.nan, 0)

# Encode sex numerically (Male -> 1, Female -> 0), discard the rows whose sex
# is 'Unknown', then store the column as uint8.
new_df = new_df.replace('Male', 1).replace('Female', 0)
is_unknown_sex = new_df['Sex'] == 'Unknown'
new_df = new_df.drop(index=new_df.loc[is_unknown_sex].index)
new_df['Sex'] = new_df['Sex'].astype('uint8')

# Keep frontal images only, then remove the columns that are no longer needed
is_lateral = new_df['Frontal/Lateral'] == 'Lateral'
new_df = new_df.drop(index=new_df.loc[is_lateral].index)
new_df = new_df.drop(columns=['Age', 'Frontal/Lateral', 'AP/PA'])

# Report how many images are left after filtering
print('Remaining number of images = ', len(new_df))

Remaining number of images =  191026


In [24]:
# Hold out 20% of the cleaned rows, stratified by sex, for the sex classifier
# (sex_data); the remaining 80% (d1) is used for the per-condition CSVs.
d1, sex_data = train_test_split(new_df, test_size=0.2, stratify=new_df['Sex'], random_state=10)

# Individual findings that are pooled into the 'grouped_condition' indicator.
# Defined once so the pooled column and the list of targets cannot drift apart.
base_conditions = ['Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity', 'Lung Lesion', 'Edema',
                   'Consolidation', 'Pneumonia', 'Atelectasis', 'Pneumothorax', 'Pleural Effusion',
                   'Pleural Other', 'Fracture']

# Row-wise sum over the finding columns; every positive sum is set to 1, so the
# column marks rows with at least one positive finding.
grouped_cond = np.array(d1[base_conditions].sum(axis=1))
grouped_cond[grouped_cond > 0] = 1

# assign() returns a new frame instead of writing a column into the object
# handed back by train_test_split, which keeps d1 independent of new_df and
# avoids chained-assignment warnings.
d1 = d1.assign(grouped_condition=grouped_cond)

# Every target that gets its own CSVs: the individual findings plus the pooled one
conditions = base_conditions + ['grouped_condition']

In [25]:
# Write one stratified train CSV and one test CSV for every entry in `conditions`
a = d1['Sex'].values
return_dict = True

# DataFrame.to_csv does not create missing directories, so make sure the
# output folders exist before the loop starts.
train_dir = os.path.join(root_dir, 'csvs', 'train')
test_dir = os.path.join(root_dir, 'csvs', 'test')
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

for condition in conditions:
    # Base rates and per-row group assignment for this condition versus sex.
    # NOTE(review): get_base_rates is project code not visible here; its second
    # return value is used as one group label per row of d1 — confirm.
    y = d1[condition].values
    rates, group_labels = get_base_rates(y, a, return_dict)

    # Keep the first three columns of d1 plus the current condition, and attach
    # the group label used for stratification.
    df = pd.concat([d1.iloc[:, 0:3], d1[condition]], axis=1)
    df['group'] = group_labels

    # 80/20 split that preserves the group proportions
    df_train, df_test = train_test_split(df, test_size=0.2, stratify=df['group'], random_state=10)

    # Write each split to its own file; the test file receives df_test.
    train_file_name = os.path.join(train_dir, 'train_' + condition + '.csv')
    df_train.to_csv(train_file_name, index=False)
    test_file_name = os.path.join(test_dir, 'test_' + condition + '.csv')
    df_test.to_csv(test_file_name, index=False)

In [16]:
# Save the rows held out for the sex classifier as a CSV in the csvs folder
sex_csv_path = os.path.join(root_dir, 'csvs', 'sex_model_train.csv')
sex_data.to_csv(sex_csv_path, index=False)