In [18]:
# Load libraries
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from lifelines import KaplanMeierFitter, CoxPHFitter

In [36]:
# Load data
# The folder holding the competition files is named once, so the notebook can be
# pointed at another location by editing a single line instead of every read_csv call.
DATA_DIR = "C:/Users/ashle/OneDrive/Documents/Courses/UNC Courses/BIOS 635 - ML/Midterm"

data = pd.read_csv(f"{DATA_DIR}/train.csv")       # training set
data_test = pd.read_csv(f"{DATA_DIR}/test.csv")   # test set

# Data Preprocessing 

In [21]:
# Examine data
# Shapes of the training and test frames, one per line, then a preview of the
# first training rows as the cell's rich output.
print(data.shape, data_test.shape, sep="\n")
data.head()

(28800, 60)
(3, 58)


Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time
0,0,N/A - non-malignant indication,No,,No,,,No TBI,No,6.0,...,,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,42.356
1,1,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,...,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,Yes,10.0,1.0,4.672
2,2,N/A - non-malignant indication,No,,No,2.0,8.0,No TBI,No,6.0,...,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,19.793
3,3,High,No,Intermediate,No,2.0,8.0,No TBI,No,6.0,...,Permissive mismatched,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,102.349
4,4,High,No,,No,2.0,8.0,No TBI,No,6.0,...,Permissive mismatched,Related,MEL,8.0,No,2.0,No,10.0,0.0,16.223


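Because most training rows contain at least one missing value (26,826 of 28,800), dropping incomplete rows would discard most of the data. Instead, we impute the missing values: the median for numeric columns and the most frequent value (mode) for categorical columns.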

In [37]:
# Replace missing data

def count_rows_with_nan(df):
    """Return the number of rows of ``df`` that contain at least one missing value."""
    return df.isnull().any(axis=1).sum()


def compute_fill_values(df):
    """
    Learn one replacement value per column of ``df``.

    Numeric columns map to their median; object/category columns map to their
    most frequent value. Columns without any observed value are left out of the
    result, because no statistic can be computed for them.

    Returns a ``{column name: fill value}`` dict.
    """
    fill_values = {}
    for col in df.select_dtypes(include=['number']).columns:
        fill_values[col] = df[col].median()
    for col in df.select_dtypes(include=['object', 'category']).columns:
        modes = df[col].mode()
        if not modes.empty:  # mode() is empty when the whole column is NaN
            fill_values[col] = modes.iloc[0]
    # The median of an all-NaN column is NaN; drop such entries.
    return {col: value for col, value in fill_values.items() if pd.notna(value)}


def fill_missing(df, fill_values):
    """
    Return a copy of ``df`` whose missing values are replaced per column.

    ``fill_values`` maps column names to replacement values; entries for columns
    that ``df`` does not have are ignored. The input frame is not modified.
    """
    filled = df.copy()
    for col, value in fill_values.items():
        if col in filled.columns:
            # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
            # the chained in-place form is deprecated and has no effect under
            # pandas Copy-on-Write.
            filled[col] = filled[col].fillna(value)
    return filled


# Check missing values
print(f"Number of rows with NaN: {count_rows_with_nan(data)}")

# Fill numerical variables with median and categorical variables with mode
# NOTE(review): this covers every column, including the outcome columns
# (efs, efs_time) if they contain NaNs - confirm they are complete.
train_fill_values = compute_fill_values(data)
data = fill_missing(data, train_fill_values)

# Check NaNs are removed
print(f"Number of rows with NaN in Train now: {count_rows_with_nan(data)}")

# Do the same preprocessing for test set
print(f"Number of rows with NaN in Test: {count_rows_with_nan(data_test)}")

# Impute the test set with the statistics learned on the training set, so both
# sets are transformed identically. The test set's own statistics are only a
# fallback for columns that have no training value.
test_fill_values = {**compute_fill_values(data_test), **train_fill_values}
data_test = fill_missing(data_test, test_fill_values)

print(f"Number of rows with NaN in Test now: {count_rows_with_nan(data_test)}")

Number of rows with NaN: 26826
Number of rows with NaN in Train now: 0
Number of rows with NaN in Test: 2
Number of rows with NaN in Test now: 0


In [24]:
# Define a function to calculate the survival rate
def transform_survival_rate(df, time_col='efs_time', event_col='efs'):
    """
    Attach a Kaplan-Meier survival probability to every row of ``df``.

    One Kaplan-Meier curve is fitted on all rows, with ``time_col`` passed as the
    durations and ``event_col`` as the observed-event indicator. The fitted curve
    is then evaluated at each row's own ``time_col`` value.

    The probabilities are written to a new ``"survival_rate"`` column. ``df`` is
    modified in place and also returned.
    """
    durations = df[time_col]

    # Fit a single estimator on the whole frame.
    estimator = KaplanMeierFitter()
    estimator.fit(durations, df[event_col])

    # Survival probability at each row's time, flattened to a 1-D array.
    curve_at_times = estimator.survival_function_at_times(durations)
    df["survival_rate"] = curve_at_times.to_numpy().flatten()
    return df

In [38]:
# Adding survival rate variable
# The Kaplan-Meier based "survival_rate" column is computed from the event
# indicator (efs) and the event/censoring time (efs_time).
data = transform_survival_rate(data, time_col="efs_time", event_col="efs")
data.head()

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time,survival_rate
0,0,N/A - non-malignant indication,No,Poor,No,2.0,8.0,No TBI,No,6.0,...,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,42.356,0.458687
1,1,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,...,Related,"N/A, Mel not given",8.0,No,2.0,Yes,10.0,1.0,4.672,0.847759
2,2,N/A - non-malignant indication,No,Poor,No,2.0,8.0,No TBI,No,6.0,...,Related,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,19.793,0.462424
3,3,High,No,Intermediate,No,2.0,8.0,No TBI,No,6.0,...,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,102.349,0.456661
4,4,High,No,Poor,No,2.0,8.0,No TBI,No,6.0,...,Related,MEL,8.0,No,2.0,No,10.0,0.0,16.223,0.464674


# Creating Models