In [0]:
!pip install lifelines

In [0]:
import pandas as pd, numpy as np, re
from numpy import exp, mean
from fancyimpute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier
from lifelines import CoxPHFitter, AalenJohansenFitter
# import statsmodels.api as sm
# from matplotlib import pyplot as plt
# from resample.bootstrap import bootstrap_ci, bootstrap

In [0]:
# Load the whole cohort (N=1,478,506).
d = pd.read_csv('Fortable3mi_ld_dec_censored.csv')

# Derive a metro indicator from RUCC_2013:
#   codes 1, 2, 3        -> 1
#   missing              -> NaN (left missing here; filled by the imputation cell)
#   any other value      -> 0
rucc_code = d['RUCC_2013']
is_metro = rucc_code.isin([1, 2, 3])
is_missing = rucc_code.isna()
non_metro_or_nan = np.where(is_missing, np.nan, 0)
d['rucc_metro'] = np.where(is_metro, 1, non_metro_or_nan)

# The raw code is no longer needed once the indicator exists.
d = d.drop(columns=['RUCC_2013'])

In [0]:
# Random forest imputation for categorical
# Each target variable is imputed from sex, age category and race using one
# pass (n_iter=1) of IterativeImputer with a 10-tree RandomForestClassifier.
for var in ['dialysis_mod1', 'insurance_esrd', 'rucc_metro']:
    pred = ['sex_new', 'age_cat', 'race_new', var]
    imputer = IterativeImputer(n_iter=1, random_state=7, predictor=RandomForestClassifier(n_estimators=10))
    # Re-attach d's index to the imputed values. DataFrame.join aligns on index
    # labels, so without this the imputed frame would carry a fresh 0..n-1
    # index and rows would silently misalign (or become NaN) whenever d's
    # index is not the default RangeIndex, e.g. after filtering rows upstream.
    imputed = pd.DataFrame(imputer.fit_transform(d[pred]), columns=pred, index=d.index)
    # Replace the original column with its imputed version (moves it to the end).
    d = d.drop(var, axis=1).join(imputed[var])

In [0]:
# Bayesian Ridge linear imputation for continuous
# For every target variable, five posterior-sampling imputations are run
# (random_state 0..4) and the completed matrices are averaged element-wise.
for var in ['Hospitalization_Rate_facility', 'Mortality_Rate_Facility', 'NEAR_DIST']:
    # Predictor set is the same for all five draws, so define it once here
    # rather than inside the inner loop (it is also needed after that loop).
    pred = ['sex_new', 'age_cat', 'race_new', var]
    completed = []
    for i in range(5):
        imputer = IterativeImputer(n_iter=5, sample_posterior=True, random_state=i)
        completed.append(imputer.fit_transform(d[pred]))
    completed_mean = np.mean(completed, axis=0)
    # Keep d's index on the imputed frame: the .join() below aligns on index
    # labels, and a fresh 0..n-1 index would misalign rows if d had been
    # filtered or re-indexed before this cell.
    imputed = pd.DataFrame(completed_mean, columns=pred, index=d.index)
    if var == 'NEAR_DIST':
        # Posterior sampling can produce negative values; replace those with
        # the mean of the strictly positive values. Zeros are left untouched.
        m = imputed.loc[imputed['NEAR_DIST'] > 0, 'NEAR_DIST'].mean()
        imputed['NEAR_DIST'] = np.where(imputed['NEAR_DIST'] < 0, m, imputed['NEAR_DIST'])
    # Replace the original column with its imputed version (moves it to the end).
    d = d.drop(var, axis=1).join(imputed[var])


In [0]:
## Cox model
# Build the modelling frame: pick the analysis columns from d, then swap every
# multi-level categorical for indicator columns with one level dropped.
PH_data = d[[
    'PROVUSRD', 'chain_class2', 'for_profit', 'sex_new', 'age_cat', 'race_new', 'dialysis_mod1', 'esrd_cause', 'bmi_35',
    'ashd_new', 'chf', 'other_cardiac', 'cva_new', 'pvasc_new', 'hypertension', 'diabetes', 'copd_new',
    'smoke_new', 'cancer_new', 'insurance_esrd', 'PATTXOP_MEDUNFITn',
    'network_us_region_dfr', 'NEAR_DIST', 'rucc_metro', 'wl', 'wl_time', 'livingd', 'ld_time', 'deceasedt', 'dec_time',
]]

# (column, explicit level order or None), listed in the order the indicator
# columns are appended. With an explicit order, the first listed level is the
# one removed by drop_first; with None, get_dummies uses its default ordering.
dummy_specs = [
    ('dialysis_mod1', None),
    ('insurance_esrd', None),
    ('age_cat', [5, 1, 2, 3, 4, 6]),
    ('race_new', None),
    ('esrd_cause', None),
    ('network_us_region_dfr', None),
    ('chain_class2', [6, 5, 2, 1, 3, 4]),
]
for col_name, level_order in dummy_specs:
    if level_order is None:
        source = PH_data[col_name]
    else:
        source = pd.Categorical(PH_data[col_name], categories=level_order, ordered=True)
    PH_data = PH_data.join(pd.get_dummies(source, prefix=col_name, drop_first=True))

# The raw categorical columns are superseded by their indicator columns.
PH_data = PH_data.drop(columns=[col_name for col_name, _ in dummy_specs])

In [0]:
# Nested '|'-separated alternation patterns, each one extending the previous.
# NOTE(review): presumably passed to PH_data.filter(regex=...) to pick model
# columns, as the univariable loop does with its own pattern; the use of
# model1..model3 is not visible in this part of the notebook.
crude = '|'.join(['chain', 'deceasedt', 'dec_time'])

model1 = '|'.join([crude, 'sex_new', 'age_cat', 'race_new'])

model2 = '|'.join([
    model1,
    'dialysis_mod1', 'esrd_cause', 'bmi_35', 'ashd_new', 'other_cardiac',
    'hypertension', 'diabetes', 'copd_new', 'smoke_new', 'cancer_new',
    'chf', 'cva_new', 'pvasc_new',
])

model3 = '|'.join([
    model2,
    'insurance_esrd', 'network_us_region_dfr', 'NEAR_DIST', 'rucc_metro',
    'PATTXOP_MEDUNFITn',
])


In [0]:
# Variables summarised one at a time (univariable Cox models here, and
# descriptive counts in the next cell). The opening bracket must sit on the
# same line as the assignment; `varlist=` followed by a newline does not parse.
varlist = [
    'age_cat',
    'sex_new',
    'race_new',
    'insurance_esrd',
    'esrd_cause',
    'dialysis_mod1',
    'bmi_35',
    'chf',
    'ashd_new',
    'other_cardiac',
    'cva_new',
    'pvasc_new',
    'hypertension',
    'diabetes',
    'copd_new',
    'smoke_new',
    'cancer_new',
    'nephcare_cat2',
    'PATTXOP_MEDUNFITn',
    'network_us_region_dfr',
    'rucc_metro',
    'NEAR_DIST',
]

# One unadjusted Cox model per variable: outcome is deceasedt, time is dec_time.
cph = CoxPHFitter()
for main_var in varlist:
    # Use a dedicated name for the per-variable pattern so the `crude` pattern
    # defined in the model-specification cell is not overwritten by this loop.
    univ_pattern = main_var + '|deceasedt|dec_time'
    # filter(regex=...) keeps every column whose name contains the variable
    # name, which also picks up its indicator columns (e.g. age_cat_1, ...).
    univ_data = PH_data.filter(regex=univ_pattern)
    covariates = univ_data.columns.difference(['deceasedt', 'dec_time'])
    if covariates.empty:
        # e.g. 'nephcare_cat2' is not among the columns selected into PH_data;
        # a model with no covariate cannot be fitted, so report and move on.
        print('Skipping ' + main_var + ': no matching covariate columns in PH_data')
        continue
    cph.fit(univ_data, duration_col='dec_time', event_col='deceasedt', step_size=0.5)
    # Hazard ratio and exponentiated confidence limits, side by side.
    # axis must be given by keyword: the positional form is deprecated in
    # pandas 1.x and rejected in pandas 2.x.
    hr_table = pd.concat([exp(cph.hazards_).rename('HR'), exp(cph.confidence_intervals_)], axis=1)
    print(hr_table.round(2))

In [78]:
# Subset of d with wl == 1 and wl_time <= 24, then for each variable print the
# count of every level next to its percentage of the subset size. The
# denominator is the full subset length, so missing values lower the
# percentages rather than being excluded.
tmp = d[(d.wl == 1) & (d.wl_time <= 24)]
n_subset = len(tmp)
for main_var in varlist:
    level_counts = tmp[main_var].value_counts()
    level_pct = (level_counts / n_subset * 100).round(1)
    print(pd.concat([level_counts, level_pct], axis=1))

   age_cat  age_cat
4    23952     28.8
5    19254     23.1
3    17887     21.5
2    10806     13.0
1     7362      8.8
6     3976      4.8
   sex_new  sex_new
0    51814     62.2
1    31423     37.8
   race_new  race_new
1     38348      46.1
2     23772      28.6
3     14449      17.4
4      6668       8.0
     insurance_esrd  insurance_esrd
3.0           31698            38.1
1.0           17446            21.0
2.0           14906            17.9
5.0           10259            12.3
4.0            8928            10.7
   esrd_cause  esrd_cause
1       32873        39.5
2       19964        24.0
3       17116        20.6
4       13284        16.0
     dialysis_mod1  dialysis_mod1
0.0          60269           72.4
1.0          20704           24.9
2.0           2264            2.7
   bmi_35  bmi_35
0   69598    83.6
1   13639    16.4
     chf   chf
0  71234  85.6
1  12003  14.4
   ashd_new  ashd_new
0     77601      93.2
1      5636       6.8
   other_cardiac  other_cardiac
0          