In [0]:
# Colab setup: install lifelines, which provides the CoxPHFitter and
# AalenJohansenFitter classes imported in the next cell.
!pip install lifelines

In [2]:
import pandas as pd, numpy as np, re
from numpy import exp, mean
from fancyimpute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier
from lifelines import CoxPHFitter, AalenJohansenFitter
# import statsmodels.api as sm
# from matplotlib import pyplot as plt
# from resample.bootstrap import bootstrap_ci, bootstrap

Using TensorFlow backend.


In [0]:
# Mount Google Drive at /content/Drive (Colab only). The data-loading cell reads
# the cohort CSV through the relative path 'Drive/My Drive/...'.
# NOTE(review): that relative path presumably relies on the working directory
# being /content — confirm.
from google.colab import drive
drive.mount('/content/Drive')

In [0]:
# Load the whole cohort (N=1,478,506) and derive a rurality indicator.
d = pd.read_csv('Drive/My Drive/facility/Fortable3mi_ld_dec_censored.csv')

# rucc_rural is 0 for RUCC_2013 codes 1-3, NaN where the code is missing,
# and 1 for every other code; the raw code column is then discarded.
rucc_code = d['RUCC_2013']
rural_unless_missing = np.where(rucc_code.isna(), np.nan, 1)
d['rucc_rural'] = np.where(rucc_code.isin([1, 2, 3]), 0, rural_unless_missing)
d = d.drop(columns=['RUCC_2013'])

In [0]:
# Random forest imputation for categorical variables.
# Each variable is imputed on its own from sex, age category and race, and the
# completed column replaces the original one in `d`.
for var in ['dialysis_mod1', 'insurance_esrd', 'rucc_rural', 'nephcare_cat2']:
    pred = ['sex_new', 'age_cat', 'race_new', var]
    # Seed the forest as well as the imputer: RandomForestClassifier defaults to
    # random_state=None, so without this the imputed values differ between runs.
    imputer = IterativeImputer(
        n_iter=1,
        random_state=7,
        predictor=RandomForestClassifier(n_estimators=10, random_state=7),
    )
    # fit_transform returns a bare array; reattach d's index so the join below
    # aligns row-for-row even if d does not carry a default RangeIndex.
    imputed = pd.DataFrame(imputer.fit_transform(d[pred]), columns=pred, index=d.index)
    d = d.drop(var, axis=1).join(imputed[var])

In [0]:
# Bayesian Ridge linear imputation for continuous variables.
# For each variable, five imputations are drawn with sample_posterior=True
# (seeds 0-4) and averaged element-wise; the averaged column replaces the
# original one in `d`.
for var in ['Hospitalization_Rate_facility', 'Mortality_Rate_Facility', 'NEAR_DIST']:
    # Predictor set is the same for every draw, so build it once per variable.
    pred = ['sex_new', 'age_cat', 'race_new', var]
    completed = []
    for i in range(5):
        imputer = IterativeImputer(n_iter=5, sample_posterior=True, random_state=i)
        completed.append(imputer.fit_transform(d[pred]))
    completed_mean = np.mean(completed, axis=0)
    # Reattach d's index so the join below aligns row-for-row even if d does
    # not carry a default RangeIndex.
    imputed = pd.DataFrame(completed_mean, columns=pred, index=d.index)
    if var == 'NEAR_DIST':
        # Negative imputed values are replaced by the mean of the strictly
        # positive values.
        m = imputed.loc[imputed['NEAR_DIST'] > 0, 'NEAR_DIST'].mean()
        imputed['NEAR_DIST'] = np.where(imputed['NEAR_DIST'] < 0, m, imputed['NEAR_DIST'])
    d = d.drop(var, axis=1).join(imputed[var])


In [0]:
## Cox model: build the analysis frame with dummy-coded categorical covariates.
cox_columns = [
    'PROVUSRD', 'chain_class2', 'for_profit', 'sex_new', 'age_cat', 'race_new',
    'dialysis_mod1', 'esrd_cause', 'bmi_35', 'ashd_new', 'chf', 'other_cardiac',
    'cva_new', 'pvasc_new', 'hypertension', 'diabetes', 'copd_new', 'smoke_new',
    'cancer_new', 'insurance_esrd', 'PATTXOP_MEDUNFITn', 'nephcare_cat2',
    'network_us_region_dfr', 'NEAR_DIST', 'rucc_rural', 'wl', 'wl_time',
    'livingd', 'ld_time', 'deceasedt', 'dec_time',
]
PH_data = d[cox_columns]

# (column, level order) pairs, processed in this order so the dummy columns are
# appended in the same sequence as before. With drop_first=True the first level
# listed is the one dropped; None means the column's own values are dummy-coded
# without an explicit ordering.
dummy_specs = [
    ('dialysis_mod1', None),
    ('insurance_esrd', [3, 2, 1, 4, 5]),
    ('age_cat', [5, 1, 2, 3, 4, 6]),
    ('race_new', None),
    ('esrd_cause', None),
    ('network_us_region_dfr', None),
    ('chain_class2', [6, 5, 2, 1, 3, 4]),
]
for column, levels in dummy_specs:
    if levels is None:
        values = PH_data[column]
    else:
        values = pd.Categorical(PH_data[column], categories=levels, ordered=True)
    PH_data = PH_data.join(pd.get_dummies(values, prefix=column, drop_first=True))

# The raw categorical columns are superseded by their dummies.
PH_data = PH_data.drop(
    columns=['dialysis_mod1', 'esrd_cause', 'age_cat', 'race_new', 'chain_class2',
             'insurance_esrd', 'network_us_region_dfr'],
)

In [0]:
# Nested covariate sets written as '|'-separated regex alternations; `crude`
# is used elsewhere in this notebook with DataFrame.filter(regex=...), and the
# model patterns are presumably consumed the same way.
# Each model extends the previous one.
crude = 'chain|deceasedt|dec_time'
model1 = crude + '|sex_new|age_cat|race_new'
model2 = model1 + ('|dialysis_mod1|esrd_cause|bmi_35|ashd_new|other_cardiac|hypertension|diabetes|'
                   'copd_new|smoke_new|cancer_new|chf|cva_new|pvasc_new')
# The rurality column created from RUCC_2013 is named 'rucc_rural'; the former
# 'rucc_metro' alternative matched no column, so rurality was silently left out.
model3 = model2 + '|insurance_esrd|network_us_region_dfr|NEAR_DIST|rucc_rural|PATTXOP_MEDUNFITn'


In [0]:
# Covariates iterated over by the per-variable loops in this notebook.
# The order of this list is the order in which results are printed.
varlist = [
    'age_cat', 'sex_new', 'race_new', 'insurance_esrd',
    'esrd_cause', 'dialysis_mod1', 'bmi_35',
    'chf', 'ashd_new', 'other_cardiac', 'cva_new', 'pvasc_new',
    'hypertension', 'diabetes', 'copd_new', 'smoke_new', 'cancer_new',
    'nephcare_cat2', 'PATTXOP_MEDUNFITn',
    'network_us_region_dfr', 'rucc_rural', 'NEAR_DIST',
]


In [0]:
# Single-covariate Cox models for the 'deceasedt' event: for each entry in
# varlist, fit on the PH_data columns whose names match that entry plus the
# event/duration columns, then print exponentiated coefficients ('HR') with
# exponentiated confidence limits, rounded to two decimals.
cph = CoxPHFitter()
for main_var in varlist:
  # NOTE(review): this rebinds the notebook-level name `crude` on every
  # iteration; anything run afterwards that reads `crude` sees the last value.
  crude = main_var+'|deceasedt|dec_time'
  cph.fit(PH_data.filter(regex=crude), duration_col='dec_time', event_col='deceasedt', step_size=0.5)
  # axis must be passed by keyword: pandas >= 2.0 rejects it positionally.
  hr_table = pd.concat([exp(cph.hazards_).rename('HR'), exp(cph.confidence_intervals_)], axis=1)
  print(round(hr_table, 2))

In [0]:
# Distribution of each covariate among rows with wl == 1 and wl_time <= 24,
# printed as "count (percent)" per level; missing values get their own row.
tmp = d[(d.wl==1) & (d.wl_time<=24)]
for main_var in varlist:
  freq = tmp[main_var].value_counts(dropna=False)
  count = freq.map('{:,}'.format)
  # Parenthesise the percentage before formatting: the previous form
  # `... *100.map(...)` was a syntax error.
  pct = (freq/len(tmp)*100).map(' ({:.1f})'.format)
  print(count+pct)
  
 

In [18]:
# For each outcome, restrict to rows whose follow-up time is <= 24 and print,
# first by chain_class2 and then by for_profit, "events /follow-up", where
# follow-up is the summed time column divided by 12.
outcomes = [('wl', 'wl_time'), ('livingd', 'ld_time'), ('deceasedt', 'dec_time')]
for case, time in outcomes:
  tmp = d[d[time]<=24]
  for group_col in ['chain_class2', 'for_profit']:
    by_group = tmp.groupby(group_col)
    events = by_group[case].sum().map('{:,}'.format)
    follow_up = (by_group[time].sum()/12).map(' /{:,.0f}'.format)
    print(events+follow_up)

chain_class2
1    19,176 /198,079
2    21,753 /197,692
3      9,634 /93,703
4      4,377 /40,542
5     18,522 /35,320
6      9,775 /45,650
dtype: object
for_profit
0     28,297 /80,970
1    54,940 /530,016
dtype: object
chain_class2
1    4,426 /190,650
2    4,465 /188,172
3     1,789 /89,362
4       948 /38,578
5     2,695 /28,085
6     1,996 /42,332
dtype: object
for_profit
0      4,691 /70,417
1    11,628 /506,762
dtype: object
chain_class2
1    3,244 /190,650
2    3,238 /188,172
3     1,458 /89,362
4       553 /38,578
5     1,926 /28,085
6     1,383 /42,332
dtype: object
for_profit
0     3,309 /70,417
1    8,493 /506,762
dtype: object
