In [1]:
import pandas as pd
import numpy as np
import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns

from lifelines import CoxPHFitter

from sklearn.model_selection import GridSearchCV, KFold, RepeatedKFold, train_test_split, ParameterGrid
import sksurv.util
import warnings
from lifelines.utils import k_fold_cross_validation
from sklearn.linear_model import ElasticNet

from sklearn.preprocessing import OrdinalEncoder
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so any warnings raised by the model
# fits further down are hidden as well — confirm that is acceptable.
warnings.filterwarnings(action='ignore')

# Load the preprocessed data set and list the available columns.
DATA_PATH = "../1_data/private/preprocessed_data.csv"
df = pd.read_csv(DATA_PATH)
df.columns

Index(['DonorID', 'Visit_Date', 'Visit_yr', 'Visit_Mo', 'donation_time',
       'don_Sequence', 'col_Int', 'Visit_Age', 'sex', 'Fixed', 'MobileID',
       'donation_product', 'DonProc', 'Outcome', 'ABO_RH',
       'Disease_marker_results', 'HB_Value', 'race_group', 'race', 'Ferritin',
       'category', 'DefCode', 'Def_start', 'Def_end', 'Deferral_permanent',
       'EffectiveYr', 'EffectiveMo', 'rbc_loss_last_12_months',
       'rbc_loss_last_24_months', 'OUTCOME_TYPE', 'CENSORED', 'time_to_return',
       'time_to_return2', 'def_Year', 'first_time', 'high_school_age',
       'cum_lifetime_donations', 'Fixed_mobile', 'FIXED_NEXT',
       'fixed_mobile_pattern', 'competing_events', 'NEXT_DON', 'PREV_DON',
       'def_prod_imputed', 'return_to_same_ID_by_dn', 'return_to_same_ID',
       'hgb_deferral', 'count_in_next_12_months', 'Opp_to_donate'],
      dtype='object')

In [2]:
# Split visits into pre-pandemic and intra-pandemic subsets.

# The date that divides the DataFrame; visits on this exact date fall in `pre`.
split_date = pd.to_datetime('2020-01-1')

# Parse the visit dates with pd.to_datetime: the unit-less
# astype("datetime64") spelling is rejected by pandas >= 2.0.
df["Visit_Date"] = pd.to_datetime(df["Visit_Date"])

# Create two subsets based on the split date
pre = df.loc[df['Visit_Date'] <= split_date]
intra = df.loc[df['Visit_Date'] > split_date]



In [3]:
# Split each period by site type: rows with Fixed_mobile == 1 go to the
# *_fixed frames, rows with Fixed_mobile == 0 go to the *_mobile frames.
pre_fixed = pre[pre["Fixed_mobile"] == 1]
pre_mobile = pre[pre["Fixed_mobile"] == 0]

intra_fixed = intra[intra["Fixed_mobile"] == 1]
intra_mobile = intra[intra["Fixed_mobile"] == 0]

# Dummy columns dropped after one-hot encoding so that they act as the
# reference level of their categorical variable.
ref_fix = ['OUTCOME_TYPE_completed', 'race_White', 'ABO_RH_Opos', 'sex_M']
# The mobile models additionally encode Opp_to_donate, so they need one more reference level.
ref_mob = ref_fix + ['Opp_to_donate_12_more']

In [4]:
#pre COVID
# Columns entering the models; the mobile matrix additionally carries 'Opp_to_donate'.
model_cols_fix = ['time_to_return', 'CENSORED', 'Visit_Age', 'first_time', 'sex', 'race', 'OUTCOME_TYPE', 'cum_lifetime_donations', 'rbc_loss_last_12_months', 'rbc_loss_last_24_months', 'high_school_age', 'ABO_RH']
model_cols_mob = model_cols_fix + ['Opp_to_donate']

# --- fixed sites ---
# One-hot encode the categorical columns. The dummy dtype is pinned because
# pandas >= 2.0 switched the get_dummies default from uint8 to bool, and the
# downstream models expect numeric 0/1 indicators.
Xpre_fix = pd.get_dummies(pre_fixed.loc[:, model_cols_fix], dtype='uint8')
# Remove the reference level of each categorical variable.
Xpre_fix = Xpre_fix.drop(columns=ref_fix)

# Interaction term: first-time donor with a low-hgb outcome.
Xpre_fix['first_time_hgb'] = Xpre_fix['first_time'] * Xpre_fix['OUTCOME_TYPE_low hgb']
# NOTE(review): this zero-fills every column, including time_to_return and
# CENSORED if they contain missing values — confirm that is intended.
Xpre_fix = Xpre_fix.fillna(0)
ypre_fix = Xpre_fix[['time_to_return']]

# --- mobile sites ---
Xpre_mob = pd.get_dummies(pre_mobile.loc[:, model_cols_mob], dtype='uint8')
Xpre_mob = Xpre_mob.drop(columns=ref_mob)

Xpre_mob['first_time_hgb'] = Xpre_mob['first_time'] * Xpre_mob['OUTCOME_TYPE_low hgb']
Xpre_mob = Xpre_mob.fillna(0)
ypre_mob = Xpre_mob[['time_to_return']]

# Replace spaces in column names with underscores (e.g. 'OUTCOME_TYPE_low hgb').
Xpre_fix.columns = Xpre_fix.columns.str.replace(' ', '_')
Xpre_mob.columns = Xpre_mob.columns.str.replace(' ', '_')

In [5]:
#intra covid
# Columns entering the models; the mobile matrix additionally carries 'Opp_to_donate'.
model_cols_fix = ['time_to_return', 'CENSORED', 'Visit_Age', 'first_time', 'sex', 'race', 'OUTCOME_TYPE', 'cum_lifetime_donations', 'rbc_loss_last_12_months', 'rbc_loss_last_24_months', 'high_school_age', 'ABO_RH']
model_cols_mob = model_cols_fix + ['Opp_to_donate']

# --- fixed sites ---
# One-hot encode the categorical columns. The dummy dtype is pinned because
# pandas >= 2.0 switched the get_dummies default from uint8 to bool, and the
# downstream models expect numeric 0/1 indicators.
Xintra_fix = pd.get_dummies(intra_fixed.loc[:, model_cols_fix], dtype='uint8')
# Remove the reference level of each categorical variable.
Xintra_fix = Xintra_fix.drop(columns=ref_fix)

# Interaction term: first-time donor with a low-hgb outcome.
Xintra_fix['first_time_hgb'] = Xintra_fix['first_time'] * Xintra_fix['OUTCOME_TYPE_low hgb']
# NOTE(review): this zero-fills every column, including time_to_return and
# CENSORED if they contain missing values — confirm that is intended.
Xintra_fix = Xintra_fix.fillna(0)
yintra_fix = Xintra_fix[['time_to_return']]

# --- mobile sites ---
Xintra_mob = pd.get_dummies(intra_mobile.loc[:, model_cols_mob], dtype='uint8')
Xintra_mob = Xintra_mob.drop(columns=ref_mob)

Xintra_mob['first_time_hgb'] = Xintra_mob['first_time'] * Xintra_mob['OUTCOME_TYPE_low hgb']
Xintra_mob = Xintra_mob.fillna(0)
yintra_mob = Xintra_mob[['time_to_return']]

# Replace spaces in column names with underscores (e.g. 'OUTCOME_TYPE_low hgb').
Xintra_fix.columns = Xintra_fix.columns.str.replace(' ', '_')
Xintra_mob.columns = Xintra_mob.columns.str.replace(' ', '_')


In [6]:
# Remove the dummy columns for unknown race / unknown blood group from all
# four design matrices (modified in place).
unknown_levels = ['race_unknown', 'ABO_RH_UNK']
for design_matrix in (Xpre_fix, Xintra_fix, Xpre_mob, Xintra_mob):
    design_matrix.drop(columns=unknown_levels, inplace=True)

In [None]:
#univariate regression
#pre-fixed
# Requires 'time_to_return' to still be present in Xpre_fix, so this cell must
# run before the cell that drops that column from the design matrices.
covariates = Xpre_fix.columns.drop(['time_to_return', 'CENSORED'])

# Fit one single-covariate Cox model per column. Rows are collected in a list
# and the frame is built once at the end: DataFrame.append was removed in
# pandas 2.0 and re-copied the whole frame on every iteration.
univariate_rows = []
for covariate in covariates:
    cph_univariate = CoxPHFitter()
    cph_univariate.fit(Xpre_fix, duration_col='time_to_return', event_col='CENSORED', formula=covariate)
    hazard_ratio = cph_univariate.summary.loc[covariate, 'exp(coef)']
    lower = cph_univariate.summary.loc[covariate, 'exp(coef) lower 95%']
    upper = cph_univariate.summary.loc[covariate, 'exp(coef) upper 95%']
    univariate_rows.append({'Covariate': covariate, 'Hazard Ratio': hazard_ratio, 'Lower': lower, 'Upper': upper})

# Explicit columns keep the original layout even when there are no covariates.
results_pre_fix = pd.DataFrame(univariate_rows, columns=['Covariate', 'Hazard Ratio', 'Lower', 'Upper'])
results_pre_fix


In [None]:
#pre mobile
# Requires 'time_to_return' to still be present in Xpre_mob, so this cell must
# run before the cell that drops that column from the design matrices.
covariates = Xpre_mob.columns.drop(['time_to_return', 'CENSORED'])

# Fit one single-covariate Cox model per column. Rows are collected in a list
# and the frame is built once at the end: DataFrame.append was removed in
# pandas 2.0 and re-copied the whole frame on every iteration.
univariate_rows = []
for covariate in covariates:
    cph_univariate = CoxPHFitter()
    cph_univariate.fit(Xpre_mob, duration_col='time_to_return', event_col='CENSORED', formula=covariate)
    hazard_ratio = cph_univariate.summary.loc[covariate, 'exp(coef)']
    lower = cph_univariate.summary.loc[covariate, 'exp(coef) lower 95%']
    upper = cph_univariate.summary.loc[covariate, 'exp(coef) upper 95%']
    univariate_rows.append({'Covariate': covariate, 'Hazard Ratio': hazard_ratio, 'Lower': lower, 'Upper': upper})

# Explicit columns keep the original layout even when there are no covariates.
results_pre_mob = pd.DataFrame(univariate_rows, columns=['Covariate', 'Hazard Ratio', 'Lower', 'Upper'])
results_pre_mob


In [None]:
#intra fixed
# Requires 'time_to_return' to still be present in Xintra_fix, so this cell must
# run before the cell that drops that column from the design matrices.
covariates = Xintra_fix.columns.drop(['time_to_return', 'CENSORED'])

# Fit one single-covariate Cox model per column. Rows are collected in a list
# and the frame is built once at the end: DataFrame.append was removed in
# pandas 2.0 and re-copied the whole frame on every iteration.
univariate_rows = []
for covariate in covariates:
    cph_univariate = CoxPHFitter()
    cph_univariate.fit(Xintra_fix, duration_col='time_to_return', event_col='CENSORED', formula=covariate)
    hazard_ratio = cph_univariate.summary.loc[covariate, 'exp(coef)']
    lower = cph_univariate.summary.loc[covariate, 'exp(coef) lower 95%']
    upper = cph_univariate.summary.loc[covariate, 'exp(coef) upper 95%']
    univariate_rows.append({'Covariate': covariate, 'Hazard Ratio': hazard_ratio, 'Lower': lower, 'Upper': upper})

# Explicit columns keep the original layout even when there are no covariates.
results_intra_fix = pd.DataFrame(univariate_rows, columns=['Covariate', 'Hazard Ratio', 'Lower', 'Upper'])
results_intra_fix


In [None]:
#intra mobile
# Requires 'time_to_return' to still be present in Xintra_mob, so this cell must
# run before the cell that drops that column from the design matrices.
covariates = Xintra_mob.columns.drop(['time_to_return', 'CENSORED'])

# Fit one single-covariate Cox model per column. Rows are collected in a list
# and the frame is built once at the end: DataFrame.append was removed in
# pandas 2.0 and re-copied the whole frame on every iteration.
univariate_rows = []
for covariate in covariates:
    cph_univariate = CoxPHFitter()
    cph_univariate.fit(Xintra_mob, duration_col='time_to_return', event_col='CENSORED', formula=covariate)
    hazard_ratio = cph_univariate.summary.loc[covariate, 'exp(coef)']
    lower = cph_univariate.summary.loc[covariate, 'exp(coef) lower 95%']
    upper = cph_univariate.summary.loc[covariate, 'exp(coef) upper 95%']
    univariate_rows.append({'Covariate': covariate, 'Hazard Ratio': hazard_ratio, 'Lower': lower, 'Upper': upper})

# Explicit columns keep the original layout even when there are no covariates.
results_intra_mob = pd.DataFrame(univariate_rows, columns=['Covariate', 'Hazard Ratio', 'Lower', 'Upper'])
results_intra_mob

In [7]:
# Remove the duration column from every design matrix (modified in place);
# the durations are passed to the grid search separately through the y* frames.
for design_matrix in (Xpre_fix, Xintra_fix, Xpre_mob, Xintra_mob):
    design_matrix.drop(columns=['time_to_return'], inplace=True)

In [8]:
# Disabled check: direct CoxPHFitter fit on the intra-pandemic mobile matrix.
# NOTE(review): 'time_to_return' has already been dropped from Xintra_mob by
# the preceding cell, so the duration column would have to be restored before
# this can be re-enabled.
#cph = CoxPHFitter()
#cph.fit(Xintra_mob, 'time_to_return', 'CENSORED')
#cph.print_summary()

In [11]:
from lifelines.utils.sklearn_adapter import sklearn_adapter

# Hyperparameter grid for the penalized Cox model. It currently holds a single
# point, so the grid search only cross-validates and refits that one setting.
param_grid = {
    'penalizer': [0.01],  # penalty strength values to try
    'l1_ratio': [1],      # lasso penalty, chosen for variable selection
}

# Wrap CoxPHFitter in a scikit-learn compatible estimator.
# NOTE(review): event_col='CENSORED' makes the CENSORED column the event
# indicator — confirm its coding (1 = event observed) matches that use.
base_class = sklearn_adapter(CoxPHFitter, event_col='CENSORED')

# NOTE(review): solver/tol are not parameters of CoxPHFitter's documented
# constructor — confirm the adapter does not silently ignore them.
cph = base_class(solver='newton-cg', tol=1e-4)

# 5-fold cross-validated search; error_score='raise' surfaces fit failures
# instead of recording them as a score.
gcv = GridSearchCV(
    estimator=cph,
    param_grid=param_grid,
    cv=5,
    error_score='raise',
)

In [12]:
# Cross-validate and refit the penalized Cox model on the pre-pandemic fixed-site data.
gcv.fit(Xpre_fix, ypre_fix)

best_params = gcv.best_params_
best_model = gcv.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:")
print(best_params)

# Print the summary of the best model.
# print_summary() renders the table itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" after the table.
print("\nBest Model Summary:")
best_model.lifelines_model.print_summary()

Best Hyperparameters:
{'l1_ratio': 1, 'penalizer': 0.01}

Best Model Summary:


0,1
model,lifelines.CoxPHFitter
duration col,'duration_col'
event col,'CENSORED'
penalizer,0.01
l1 ratio,1
baseline estimation,breslow
number of observations,763017
number of events observed,761715
partial log-likelihood,-9481790.29
time fit was run,2023-10-11 17:24:35 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
Visit_Age,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,18.07,<0.005,239.95
first_time,0.26,1.29,0.01,0.24,0.28,1.27,1.32,0.0,28.34,<0.005,584.49
cum_lifetime_donations,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,22.56,<0.005,372.01
rbc_loss_last_12_months,0.16,1.17,0.0,0.16,0.16,1.17,1.18,0.0,117.57,<0.005,inf
rbc_loss_last_24_months,0.05,1.05,0.0,0.05,0.05,1.05,1.06,0.0,71.48,<0.005,inf
high_school_age,-0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,-0.02,0.99,0.02
sex_F,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,0.04,0.97,0.04
race_African_Black,0.08,1.08,0.0,0.07,0.08,1.07,1.09,0.0,27.98,<0.005,569.89
race_Asian,-0.03,0.97,0.01,-0.04,-0.02,0.96,0.98,0.0,-5.75,<0.005,26.72
race_Mixed_Race,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,0.04,0.97,0.04

0,1
Concordance,0.65
Partial AIC,18963622.58
log-likelihood ratio test,159433.75 on 21 df
-log2(p) of ll-ratio test,inf


None


In [13]:
# Cross-validate and refit the penalized Cox model on the pre-pandemic mobile-site data.
gcv.fit(Xpre_mob, ypre_mob)

best_params = gcv.best_params_
best_model = gcv.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:")
print(best_params)

# Print the summary of the best model.
# print_summary() renders the table itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" after the table.
print("\nBest Model Summary:")
best_model.lifelines_model.print_summary()

KeyboardInterrupt: 

In [306]:
# Cross-validate and refit the penalized Cox model on the intra-pandemic fixed-site data.
gcv.fit(Xintra_fix, yintra_fix)

best_params = gcv.best_params_
best_model = gcv.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:")
print(best_params)

# Print the summary of the best model.
# print_summary() renders the table itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" after the table.
print("\nBest Model Summary:")
best_model.lifelines_model.print_summary()

Best Hyperparameters:
{'l1_ratio': 1, 'penalizer': 0.001}

Best Model Summary:


0,1
model,lifelines.CoxPHFitter
duration col,'duration_col'
event col,'CENSORED'
penalizer,0.001
l1 ratio,1
baseline estimation,breslow
number of observations,1.05748e+06
number of events observed,923399
partial log-likelihood,-11912902.52
time fit was run,2023-10-10 19:04:53 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
Visit_Age,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,25.73,<0.005,482.65
first_time,0.34,1.4,0.01,0.32,0.35,1.38,1.43,0.0,41.1,<0.005,inf
cum_lifetime_donations,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,37.08,<0.005,997.47
rbc_loss_last_12_months,0.2,1.22,0.0,0.19,0.2,1.21,1.22,0.0,161.07,<0.005,inf
rbc_loss_last_24_months,0.05,1.05,0.0,0.04,0.05,1.04,1.05,0.0,67.28,<0.005,inf
high_school_age,0.04,1.04,0.01,0.03,0.05,1.03,1.05,0.0,8.34,<0.005,53.61
sex_F,-0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,-0.11,0.91,0.13
race_African_Black,0.06,1.06,0.0,0.06,0.07,1.06,1.07,0.0,24.41,<0.005,434.89
race_Asian,-0.04,0.96,0.01,-0.05,-0.03,0.96,0.97,0.0,-6.8,<0.005,36.49
race_Mixed_Race,0.03,1.03,0.01,0.02,0.04,1.02,1.04,0.0,6.06,<0.005,29.44

0,1
Concordance,0.66
Partial AIC,23825847.03
log-likelihood ratio test,229414.49 on 21 df
-log2(p) of ll-ratio test,inf


None


In [307]:
# Cross-validate and refit the penalized Cox model on the intra-pandemic mobile-site data.
gcv.fit(Xintra_mob, yintra_mob)

best_params = gcv.best_params_
best_model = gcv.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:")
print(best_params)

# Print the summary of the best model.
# print_summary() renders the table itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" after the table.
print("\nBest Model Summary:")
best_model.lifelines_model.print_summary()

Best Hyperparameters:
{'l1_ratio': 1, 'penalizer': 0.001}

Best Model Summary:


0,1
model,lifelines.CoxPHFitter
duration col,'duration_col'
event col,'CENSORED'
penalizer,0.001
l1 ratio,1
baseline estimation,breslow
number of observations,1.38844e+06
number of events observed,1.04366e+06
partial log-likelihood,-13788197.85
time fit was run,2023-10-10 19:13:11 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
Visit_Age,0.01,1.01,0.0,0.01,0.01,1.01,1.01,0.0,56.64,<0.005,inf
first_time,0.16,1.18,0.0,0.15,0.17,1.17,1.19,0.0,37.1,<0.005,998.55
cum_lifetime_donations,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,34.48,<0.005,862.99
rbc_loss_last_12_months,0.18,1.2,0.0,0.18,0.19,1.2,1.2,0.0,143.55,<0.005,inf
rbc_loss_last_24_months,0.06,1.06,0.0,0.06,0.06,1.06,1.06,0.0,77.55,<0.005,inf
high_school_age,-0.1,0.9,0.0,-0.11,-0.1,0.89,0.91,0.0,-29.52,<0.005,633.71
sex_F,0.02,1.02,0.0,0.02,0.03,1.02,1.03,0.0,10.55,<0.005,83.98
race_African_Black,-0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,-0.05,0.96,0.06
race_Asian,-0.12,0.89,0.0,-0.12,-0.11,0.88,0.9,0.0,-30.09,<0.005,658.52
race_Mixed_Race,0.06,1.06,0.0,0.05,0.06,1.05,1.07,0.0,14.29,<0.005,151.37

0,1
Concordance,0.66
Partial AIC,27576445.70
log-likelihood ratio test,265738.92 on 25 df
-log2(p) of ll-ratio test,inf


None
