In [None]:
import lifelines
from lifelines import CoxPHFitter
from lifelines import CoxTimeVaryingFitter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def create_survival_data(data_df, id_col, index_time_col, outcome_time_col,
                         followup_end_time, followup_max_time_from_index=np.inf,
                         censoring_time_col=None):
    """Build a (duration, event) table for survival analysis.

    All time columns are expected to be numeric and on the same scale; every
    duration is measured from ``index_time_col``.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Input data, one row per subject. It is not modified.
    id_col : str
        Column identifying the subject; carried through to the result.
    index_time_col : str
        Column holding the time at which follow-up starts.
    outcome_time_col : str
        Column holding the event time (missing when no event was observed).
    followup_end_time : number
        Absolute time at which follow-up ends for everybody.
    followup_max_time_from_index : number, default ``np.inf``
        Maximum follow-up length, measured from the index time.
    censoring_time_col : str or None, default None
        Optional column with a subject-specific censoring time. When None,
        a helper column filled with NaN is used (no individual censoring).

    Returns
    -------
    pandas.DataFrame
        Columns ``[id_col, 'E', 'T']`` sorted by ``T``. ``E`` is 1 when the
        event occurred no later than the earliest censoring time and 0
        otherwise; ``T`` is the integer time from index to the event or to
        the earliest censoring time.
    """
    tmp_data_df = data_df.copy()
    if censoring_time_col is None:
        censoring_time_col = 'censoring_time_col'
        tmp_data_df[censoring_time_col] = np.nan

    survival_df = tmp_data_df[[id_col, index_time_col, outcome_time_col, censoring_time_col]].copy()
    index_time = survival_df[index_time_col]
    survival_df['event_time_from_index'] = survival_df[outcome_time_col] - index_time
    survival_df['censoring_time_from_index'] = survival_df[censoring_time_col] - index_time
    survival_df['followup_end_time_from_index'] = followup_end_time - index_time
    survival_df['max_time_from_index'] = followup_max_time_from_index
    # Row-wise minimum skips NaN, so a missing censoring time is simply ignored.
    survival_df['earliest_censoring_time_from_index'] = survival_df[['censoring_time_from_index',
                                                                     'followup_end_time_from_index',
                                                                     'max_time_from_index']].min(axis=1)

    # Separate into 2 types of people:
    # (1) those who have an event before followup_max_time_from_index,
    #     followup_end_time_from_index and censoring_time_from_index
    # (2) everybody else, who is censored at the earliest censoring time
    idx_event = ((survival_df[outcome_time_col].notna()) &
                 (survival_df['event_time_from_index'] <= followup_max_time_from_index) &
                 (survival_df['event_time_from_index'] <= survival_df['earliest_censoring_time_from_index']))

    # Positional assignment (no index alignment), so this also works when
    # data_df carries duplicate index labels.
    survival_df['E'] = idx_event.astype(int)
    survival_df['T'] = np.where(idx_event,
                                survival_df['event_time_from_index'],
                                survival_df['earliest_censoring_time_from_index'])
    # An infinite censoring time (no bound applied) is replaced by the
    # placeholder 1000 so that the column can be cast to int.
    survival_df['T'] = survival_df['T'].replace(np.inf, 1000)
    survival_df['T'] = survival_df['T'].astype(int)
    survival_df = survival_df.sort_values(['T'])
    return survival_df[[id_col, 'E', 'T']]


In [None]:
# Load the tab-separated input table from the local data folder.
doc_matrix = pd.read_csv(
    'c:/corona_segal/merge_matrix_outcome_recovery.txt',
    sep='\t',
)

In [None]:
# Work on a copy of the loaded table; reset_index() exposes the row index as
# an 'index' column, which is used later as the subject id.
data_df = doc_matrix.reset_index().copy()

# Parse the YYYYMMDD date columns. -9 is replaced by NaN (treated as missing)
# before parsing. The result is assigned back to the column: an in-place
# replace on a `.loc` selection may act on a temporary copy and leave
# data_df unchanged (always the case under pandas Copy-on-Write).
for col in ['death_date', 'test_date', 'recover_date', 'censor_date']:
    data_df[col] = pd.to_datetime(data_df[col].replace(-9, np.nan), format='%Y%m%d')

# Overall date range of the study (censor_date is not included).
MAX_DATE = data_df[['death_date', 'test_date', 'recover_date']].max().max()
MIN_DATE = data_df[['death_date', 'test_date', 'recover_date']].min().min()

# Outcome date: recover_date, kept only for rows with recover_ind == 1.
data_df['pos_test_date'] = data_df['recover_date'].where(data_df['recover_ind'] == 1)

# Censoring date: censor_date, blanked for rows with recover_ind > 0.
data_df['neg_test_date'] = data_df['censor_date'].mask(data_df['recover_ind'] > 0)

# Express every date as a number of days since MIN_DATE ('<col>_T').
for col in ['test_date', 'pos_test_date', 'neg_test_date', 'recover_date', 'censor_date']:
    data_df[col + '_T'] = (data_df[col] - MIN_DATE).dt.days

In [None]:
# Calendar-time covariate: test date as a (float) number of days since MIN_DATE.
days_since_start = data_df['test_date'] - MIN_DATE
data_df['Date_adjustment'] = days_since_start / pd.Timedelta(days=1)

In [None]:
def prepare_time_to_pos_test_surv_df(data_df, followup_max_time_from_index=60, censoring_time_col=None):
    """Build the survival table and attach the covariates used downstream.

    Durations are measured from 'test_date_T' to 'pos_test_date_T', and
    follow-up ends for everybody at ``(MAX_DATE - MIN_DATE).days``. Relies on
    the module-level ``MAX_DATE`` / ``MIN_DATE`` and on ``create_survival_data``.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Prepared data; must contain 'index', the '*_T' day columns and the
        covariate columns merged below.
    followup_max_time_from_index : number, default 60
        Maximum follow-up length from the index time, forwarded to
        ``create_survival_data``.
    censoring_time_col : str or None, default None
        Optional per-subject censoring column, forwarded to
        ``create_survival_data``.

    Returns
    -------
    pandas.DataFrame
        One row per subject with 'index', 'E', 'T' and the merged covariates;
        rows with a negative duration are dropped.
    """
    surv_df = create_survival_data(
        data_df,
        id_col='index',
        index_time_col='test_date_T',
        outcome_time_col='pos_test_date_T',
        followup_end_time=(MAX_DATE - MIN_DATE).days,
        # Forward the caller's value; it used to be hard-coded to 60 here,
        # which silently ignored the argument.
        followup_max_time_from_index=followup_max_time_from_index,
        censoring_time_col=censoring_time_col,
    )
    # Drop rows whose event/censoring time precedes the index time.
    surv_df = surv_df[surv_df['T'] >= 0]

    symp_cols = [
        'age', 'gender'
    ]
    meta_cols = ['index', 'id_disease', 'Date_adjustment', 'test_date',
                 'recover_ind', 'death_date', 'censor_date', 'recover_date']

    surv_df = surv_df.merge(data_df[meta_cols + symp_cols], on='index', how='left')

    return surv_df

In [None]:
# Survival table with a 60-unit follow-up cap, censoring on 'neg_test_date_T'.
surv_df = prepare_time_to_pos_test_surv_df(
    data_df,
    followup_max_time_from_index=60,
    censoring_time_col='neg_test_date_T',
)


In [None]:
# 0/1 age-group indicators split at 18; a missing age yields 0 in both columns.
surv_df['Children'] = (surv_df['age'] < 18).astype(int)
surv_df['Adults'] = (surv_df['age'] >= 18).astype(int)

In [None]:
from lifelines import KaplanMeierFitter
from lifelines.plotting import add_at_risk_counts
# NOTE(review): this fitter is not referenced by the cells visible here, which
# create their own kmf_0 / kmf_1 instances — confirm it is still needed.
kmf = KaplanMeierFitter()

# Kaplan Meier

In [None]:
# Kaplan-Meier curves of time to recovery: children vs. adults.
symp_col1 = 'Children'
symp_col2 = 'Adults'

# Select each group once instead of repeating the boolean filter.
is_group_0 = surv_df[symp_col1] == 1
is_group_1 = surv_df[symp_col2] == 1

kmf_0 = KaplanMeierFitter()
kmf_0.fit(surv_df.loc[is_group_0, 'T'], surv_df.loc[is_group_0, 'E'], label=symp_col1)

kmf_1 = KaplanMeierFitter()
kmf_1.fit(surv_df.loc[is_group_1, 'T'], surv_df.loc[is_group_1, 'E'], label=symp_col2)

fig, ax = plt.subplots(1, 1, figsize=(8, 6), dpi=100)
kmf_0.plot(ax=ax)
kmf_1.plot(ax=ax)

ax.set_xticks(np.arange(0, 61, 2))
ax.set_yticks(np.arange(0, 1.01, 0.1))
# T is measured in days from the test date and the event is recovery, so the
# curve shows the share of subjects who have not recovered yet.
ax.set_xlabel('Days since test date')
ax.set_ylabel('Proportion not yet recovered')
ax.legend()
ax.set_title('Time To Recovery')

fig.tight_layout()

In [None]:
# Kaplan-Meier curves of time to recovery by gender (0 -> Male, 1 -> Female).
symp_col1 = 'Male'
symp_col2 = 'Female'
gender_col = 'gender'

# Select each group once instead of repeating the boolean filter.
is_group_0 = surv_df[gender_col] == 0
is_group_1 = surv_df[gender_col] == 1

kmf_0 = KaplanMeierFitter()
kmf_0.fit(surv_df.loc[is_group_0, 'T'], surv_df.loc[is_group_0, 'E'], label=symp_col1)

kmf_1 = KaplanMeierFitter()
kmf_1.fit(surv_df.loc[is_group_1, 'T'], surv_df.loc[is_group_1, 'E'], label=symp_col2)

fig, ax = plt.subplots(1, 1, figsize=(8, 6), dpi=100)
kmf_0.plot(ax=ax)
kmf_1.plot(ax=ax)

ax.set_xticks(np.arange(0, 61, 2))
ax.set_yticks(np.arange(0, 1.01, 0.1))
# Optional: uncomment to show the number at risk below the axes.
#add_at_risk_counts(kmf_0, kmf_1, ax=ax)
# T is measured in days from the test date and the event is recovery, so the
# curve shows the share of subjects who have not recovered yet.
ax.set_xlabel('Days since test date')
ax.set_ylabel('Proportion not yet recovered')
ax.legend()
ax.set_title('Time To Recovery')

fig.tight_layout()

## Cox

In [None]:
# Copy covariates under the readable names used as labels in the Cox summary.
for _alias, _source in (('Female', 'gender'),
                        ('Presence of a chronic medical condition', 'id_disease')):
    surv_df[_alias] = surv_df[_source]

In [None]:
# Cox proportional-hazards model of time to recovery on the chronic-condition,
# gender, calendar-time and age-group covariates.
cph = CoxPHFitter().fit(
    surv_df[[
        'T',
        'E',
        'Presence of a chronic medical condition',
        'Female',
        'Date_adjustment',
        'Children',
    ]],
    duration_col='T',
    event_col='E',
)
# Keep the coefficient table for export.
orig_summary = cph.summary

In [None]:
# Persist the Cox summary table (comma-separated, despite the .txt extension).
orig_summary.to_csv(
    'c:/corona_segal/summary_recover.txt',
)