In [1]:
import pandas as pd
import numpy as np
import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns

from lifelines import CoxPHFitter

from sklearn.model_selection import GridSearchCV, KFold, RepeatedKFold, train_test_split, ParameterGrid
import sksurv.util
import warnings
from lifelines.utils import k_fold_cross_validation
from sklearn.linear_model import ElasticNet

from sksurv.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')


# Load the preprocessed dataset from the project's private data folder.
DATA_PATH = "../1_data/private/vitalant_preprocessed_data.csv"
df = pd.read_csv(DATA_PATH)


In [2]:
# Split the visits into pre-pandemic and intra-pandemic periods, then split
# each period again on the Fixed_mobile flag.

# Cut-off date: visits on or before it go to `pre`, later visits go to `intra`.
split_date = pd.to_datetime('2020-01-1')
df["Visit_Date"] = df["Visit_Date"].astype("datetime64[ns]")

# Period subsets, selected with explicit boolean masks on the visit date.
pre = df[df["Visit_Date"] <= split_date]
intra = df[df["Visit_Date"] > split_date]

# Fixed_mobile == 1 feeds the *_fixed frames, Fixed_mobile == 0 the *_mobile frames.
pre_fixed = pre[pre["Fixed_mobile"] == 1]
pre_mobile = pre[pre["Fixed_mobile"] == 0]

intra_fixed = intra[intra["Fixed_mobile"] == 1]
intra_mobile = intra[intra["Fixed_mobile"] == 0]

# Dummy-column names that later cells drop after one-hot encoding
# (named ref_*, presumably the reference category of each factor).
ref_fix = [
    'OUTCOME_TYPE_completed',
    'RACE_ETHNICITY_White',
    'DONOR_GENDER_M',
    'DONOR_ABORH_Opos',
    'DONOR_EDU_Post_Secondary',
]
# The mobile list is the fixed list plus one extra level for Opp_to_donate.
ref_mob = ref_fix + ['Opp_to_donate_12_more']

In [9]:
# Pre-COVID design matrices (built from pre_fixed and pre_mobile).

# Columns selected for the fixed subset; the mobile subset uses the same
# columns, in the same order, followed by 'Opp_to_donate'.
pre_fix_cols = [
    'time_to_return', 'CENSORED', 'DONOR_AGE_AT_DONATION', 'first_time',
    'DONOR_GENDER', 'RACE_ETHNICITY', 'OUTCOME_TYPE', 'DONOR_ABORH',
    'cum_lifetime_donations', 'rbc_loss_last_12_months',
    'rbc_loss_last_24_months', 'DONOR_WEIGHT', 'DONOR_HEIGHT',
    'DONOR_BORN_IN_US', 'DONOR_EDU',
]
pre_mob_cols = pre_fix_cols + ['Opp_to_donate']

# --- fixed ---
# One-hot encode the selected columns, then remove the ref_fix dummy columns.
Xpre_fix = pd.get_dummies(pre_fixed.loc[:, pre_fix_cols]).drop(columns=ref_fix)
# Interaction term: first_time multiplied by the low-hgb outcome dummy.
Xpre_fix['first_time_hgb'] = Xpre_fix['first_time'] * Xpre_fix['OUTCOME_TYPE_low hgb']
# Any remaining missing value is replaced by 0.
Xpre_fix = Xpre_fix.fillna(0)
# Duration target, kept as a one-column DataFrame.
ypre_fix = Xpre_fix[['time_to_return']]

# --- mobile ---
Xpre_mob = pd.get_dummies(pre_mobile.loc[:, pre_mob_cols]).drop(columns=ref_mob)
Xpre_mob['first_time_hgb'] = Xpre_mob['first_time'] * Xpre_mob['OUTCOME_TYPE_low hgb']
Xpre_mob = Xpre_mob.fillna(0)
ypre_mob = Xpre_mob[['time_to_return']]

# Replace spaces in column names with underscores (e.g. the 'low hgb' dummy).
for frame in (Xpre_fix, Xpre_mob):
    frame.columns = frame.columns.str.replace(' ', '_')

In [10]:
# Intra-COVID design matrices (built from intra_fixed and intra_mobile).

# Columns selected for the fixed subset; the mobile subset uses the same
# columns, in the same order, followed by 'Opp_to_donate'.
intra_fix_cols = [
    'time_to_return', 'CENSORED', 'DONOR_AGE_AT_DONATION', 'first_time',
    'DONOR_GENDER', 'RACE_ETHNICITY', 'OUTCOME_TYPE', 'DONOR_ABORH',
    'cum_lifetime_donations', 'rbc_loss_last_12_months',
    'rbc_loss_last_24_months', 'DONOR_WEIGHT', 'DONOR_HEIGHT',
    'DONOR_BORN_IN_US', 'DONOR_EDU',
]
intra_mob_cols = intra_fix_cols + ['Opp_to_donate']

# --- fixed ---
# One-hot encode the selected columns, then remove the ref_fix dummy columns.
Xintra_fix = pd.get_dummies(intra_fixed.loc[:, intra_fix_cols]).drop(columns=ref_fix)
# Interaction term: first_time multiplied by the low-hgb outcome dummy.
Xintra_fix['first_time_hgb'] = Xintra_fix['first_time'] * Xintra_fix['OUTCOME_TYPE_low hgb']
# Any remaining missing value is replaced by 0.
Xintra_fix = Xintra_fix.fillna(0)
# Duration target, kept as a one-column DataFrame.
yintra_fix = Xintra_fix[['time_to_return']]

# --- mobile ---
Xintra_mob = pd.get_dummies(intra_mobile.loc[:, intra_mob_cols]).drop(columns=ref_mob)
Xintra_mob['first_time_hgb'] = Xintra_mob['first_time'] * Xintra_mob['OUTCOME_TYPE_low hgb']
Xintra_mob = Xintra_mob.fillna(0)
yintra_mob = Xintra_mob[['time_to_return']]

# Replace spaces in column names with underscores (e.g. the 'low hgb' dummy).
for frame in (Xintra_fix, Xintra_mob):
    frame.columns = frame.columns.str.replace(' ', '_')

In [11]:
#plt.figure(figsize=(16, 16))
#sns.heatmap(Xintra_fix.corr(), annot=True, cmap='coolwarm', linewidths=0.5)

In [12]:
# Drop the "unknown / unavailable" indicator columns from every design matrix.
#
# The four hand-written lists this replaces differed only in whether
# 'DONOR_GENDER_UNKNOWN' was included - presumably because that dummy is not
# present in every subset (TODO confirm). A single shared list with
# errors='ignore' applies the same rule to all four frames, does not raise
# KeyError when a level is missing from a subset, and lets the cell be re-run
# safely.
unknown_level_cols = [
    'RACE_ETHNICITY_UNKNOWN',
    'DONOR_ABORH_UNK',
    'DONOR_EDU_UNAVAILABLE',
    'DONOR_GENDER_UNKNOWN',
]
for design_matrix in (Xpre_fix, Xintra_fix, Xpre_mob, Xintra_mob):
    # In-place so the existing names keep pointing at the trimmed frames.
    design_matrix.drop(columns=unknown_level_cols, errors='ignore', inplace=True)

In [13]:
#cph = CoxPHFitter()
#cph.fit(Xintra_mob, 'time_to_return', 'CENSORED')
#cph.print_summary()

In [14]:
# Remove the duration column from the feature matrices; it is passed to the
# model separately through the y* frames created when the matrices were built.
# errors='ignore' makes the cell idempotent: without it, running the cell a
# second time raises KeyError because the column is already gone.
for design_matrix in (Xpre_fix, Xintra_fix, Xpre_mob, Xintra_mob):
    design_matrix.drop(columns=['time_to_return'], errors='ignore', inplace=True)

In [15]:
from lifelines.utils.sklearn_adapter import sklearn_adapter

# Hyper-parameter grid for the penalised Cox model. It currently holds a single
# candidate: penalizer (the alpha value) 0.01 with l1_ratio 1, i.e. lasso,
# chosen for variable selection.
# NOTE(review): some recorded outputs further down report penalizer 0.001 and
# l1_ratio 0.99, which this grid cannot produce - they look like results of an
# earlier grid; re-run before citing them.
param_grid = dict(
    penalizer=[0.01],
    l1_ratio=[1],
)

# Wrap CoxPHFitter in a scikit-learn compatible estimator class.
# NOTE(review): lifelines expects event_col to be 1 when the event was
# observed; confirm the 'CENSORED' column is coded that way and not inverted.
base_class = sklearn_adapter(CoxPHFitter, event_col='CENSORED')

# NOTE(review): 'solver' and 'tol' are not among CoxPHFitter's documented
# constructor arguments - confirm the adapter actually uses them.
cph = base_class(solver='newton-cg', tol=1e-4)

# 5-fold cross-validated search; any fold that fails raises immediately.
gcv = GridSearchCV(
    estimator=cph,
    param_grid=param_grid,
    cv=5,
    error_score='raise',
)

In [86]:
# Grid-search fit on the pre-COVID fixed design matrix.
gcv.fit(Xpre_fix, ypre_fix)

best_params = gcv.best_params_
best_model = gcv.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:")
print(best_params)

# Show the summary of the best model. print_summary() renders the table itself
# and returns None, so it is called directly; wrapping it in print() only
# appended a stray "None" line to the output.
print("\nBest Model Summary:")
best_model.lifelines_model.print_summary()

Best Hyperparameters:
{'l1_ratio': 1, 'penalizer': 0.001}

Best Model Summary:


0,1
model,lifelines.CoxPHFitter
duration col,'duration_col'
event col,'CENSORED'
penalizer,0.001
l1 ratio,1
baseline estimation,breslow
number of observations,1.09079e+06
number of events observed,908291
partial log-likelihood,-11952326.95
time fit was run,2023-10-10 18:35:53 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
DONOR_AGE_AT_DONATION,0.01,1.01,0.0,0.01,0.01,1.01,1.01,0.0,172.85,<0.005,inf
first_time,-0.74,0.48,0.01,-0.75,-0.73,0.47,0.48,0.0,-140.36,<0.005,inf
cum_lifetime_donations,0.0,1.0,0.01,-0.01,0.01,0.99,1.01,0.0,0.74,0.46,1.12
rbc_loss_last_12_months,-0.0,1.0,0.0,-0.01,0.01,0.99,1.01,0.0,-0.04,0.97,0.04
rbc_loss_last_24_months,-0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,-0.03,0.98,0.04
DONOR_WEIGHT,-0.0,1.0,0.0,-0.0,-0.0,1.0,1.0,0.0,-4.0,<0.005,13.96
DONOR_HEIGHT,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,24.55,<0.005,439.56
DONOR_BORN_IN_US,0.3,1.35,0.0,0.29,0.31,1.34,1.36,0.0,92.83,<0.005,inf
DONOR_GENDER_F,-0.01,0.99,0.0,-0.02,-0.01,0.98,0.99,0.0,-5.82,<0.005,27.37
RACE_ETHNICITY_Asian,0.02,1.02,0.01,0.01,0.04,1.01,1.04,0.0,3.27,<0.005,9.86

0,1
Concordance,0.62
Partial AIC,23904713.89
log-likelihood ratio test,174943.53 on 30 df
-log2(p) of ll-ratio test,inf


None


In [28]:
# Grid-search fit on the pre-COVID mobile design matrix.
gcv.fit(Xpre_mob, ypre_mob)

best_params = gcv.best_params_
best_model = gcv.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:")
print(best_params)

# Show the summary of the best model. print_summary() renders the table itself
# and returns None, so it is called directly; wrapping it in print() only
# appended a stray "None" line to the output.
print("\nBest Model Summary:")
best_model.lifelines_model.print_summary()

Best Hyperparameters:
{'l1_ratio': 0.99, 'penalizer': 0.001}

Best Model Summary:


0,1
model,lifelines.CoxPHFitter
duration col,'duration_col'
event col,'CENSORED'
penalizer,0.001
l1 ratio,0.99
baseline estimation,breslow
number of observations,2.00683e+06
number of events observed,1.18434e+06
partial log-likelihood,-16489866.59
time fit was run,2023-09-26 20:55:18 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
DONOR_AGE_AT_DONATION,0.02,1.02,0.0,0.02,0.02,1.02,1.02,0.0,283.36,<0.005,inf
first_time,-0.76,0.47,0.0,-0.76,-0.75,0.47,0.47,0.0,-289.14,<0.005,inf
cum_lifetime_donations,-0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,-0.04,0.96,0.05
unit_rbc_loss,0.0,1.0,0.01,-0.01,0.01,0.99,1.01,0.0,0.01,0.99,0.01
rbc_loss_last_12_months,-0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,-0.04,0.97,0.04
rbc_loss_last_24_months,-0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,-0.05,0.96,0.06
days_since_last_rbc_loss,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,0.07,0.95,0.08
days_since_last_drbc_loss,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,0.05,0.96,0.06
DONOR_GENDER_F,0.08,1.08,0.01,0.07,0.09,1.07,1.1,0.0,14.03,<0.005,146.23
DONOR_GENDER_M,-0.0,1.0,0.01,-0.01,0.01,0.99,1.01,0.0,-0.01,0.99,0.01

0,1
Concordance,0.68
Partial AIC,32979783.18
log-likelihood ratio test,461702.11 on 25 df
-log2(p) of ll-ratio test,inf


None


In [16]:
# Grid-search fit on the intra-COVID fixed design matrix.
gcv.fit(Xintra_fix, yintra_fix)

best_params = gcv.best_params_
best_model = gcv.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:")
print(best_params)

# Show the summary of the best model. print_summary() renders the table itself
# and returns None, so it is called directly; wrapping it in print() only
# appended a stray "None" line to the output.
print("\nBest Model Summary:")
best_model.lifelines_model.print_summary()

Best Hyperparameters:
{'l1_ratio': 1, 'penalizer': 0.01}

Best Model Summary:


0,1
model,lifelines.CoxPHFitter
duration col,'duration_col'
event col,'CENSORED'
penalizer,0.01
l1 ratio,1
baseline estimation,breslow
number of observations,1.28711e+06
number of events observed,913579
partial log-likelihood,-12249988.47
time fit was run,2023-10-11 17:16:11 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
DONOR_AGE_AT_DONATION,0.01,1.01,0.0,0.01,0.01,1.01,1.01,0.0,164.36,<0.005,inf
first_time,-0.76,0.47,0.0,-0.77,-0.75,0.46,0.47,0.0,-171.43,<0.005,inf
cum_lifetime_donations,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,0.01,0.99,0.01
rbc_loss_last_12_months,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,0.0,1.00,0.01
rbc_loss_last_24_months,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,0.01,0.99,0.01
DONOR_WEIGHT,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,0.04,0.97,0.05
DONOR_HEIGHT,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,56.24,<0.005,inf
DONOR_BORN_IN_US,0.21,1.24,0.0,0.21,0.22,1.23,1.25,0.0,63.73,<0.005,inf
DONOR_GENDER_F,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,0.02,0.99,0.02
RACE_ETHNICITY_Asian,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,0.04,0.97,0.05

0,1
Concordance,0.61
Partial AIC,24500032.94
log-likelihood ratio test,132878.07 on 28 df
-log2(p) of ll-ratio test,inf


None


In [None]:
# Grid-search fit on the intra-COVID mobile design matrix.
gcv.fit(Xintra_mob, yintra_mob)

best_params = gcv.best_params_
best_model = gcv.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:")
print(best_params)

# Show the summary of the best model. print_summary() renders the table itself
# and returns None, so it is called directly; wrapping it in print() only
# appended a stray "None" line to the output.
print("\nBest Model Summary:")
best_model.lifelines_model.print_summary()