In [42]:
import pandas as pd
import numpy as np
import os
import glob
import sidetable
import altair as alt
import lifelines as lf

In [43]:
# Working directory of the running kernel; the CSV files are searched for here.
path = os.getcwd()

In [44]:
# glob returns matches in arbitrary, filesystem-dependent order. Sort them so the
# list order is reproducible, because a later cell picks tables by list position.
# NOTE(review): confirm that alphabetical order gives the intended table-to-index mapping.
csv_files = sorted(glob.glob(os.path.join(path, "*.csv")))

In [45]:
# Load every CSV into its own DataFrame, preserving the order of csv_files.
# low_memory=False is passed through to pandas unchanged.
df = [pd.read_csv(csv_path, low_memory=False) for csv_path in csv_files]

In [46]:
# Give each loaded table a descriptive name. The position inside `df` decides which
# table receives which name, so this relies on the order in which the files were loaded.
(
    cancer_idx,
    cancer_nonidx,
    cancer_panel,
    cancer_imaging,
    cancer_manifest,
    cancer_medonc,
    cancer_path,
    cancer_patientlevel,
    cancer_drugs,
) = (pd.DataFrame(df[position]) for position in range(9))

In [47]:
# Frequency table over the (drugs_drug_1, pfs_i_g_status) combinations.
drug_status_columns = ['drugs_drug_1', 'pfs_i_g_status']
cancer_drugs.stb.freq(drug_status_columns)

Unnamed: 0,drugs_drug_1,pfs_i_g_status,count,percent,cumulative_count,cumulative_percent
0,"Carboplatin(Blastocarb,CBDCA,Carboplat,Carbopl...",1.0,346,11.792774,346,11.792774
1,Investigational Drug,1.0,294,10.020450,640,21.813224
2,"Nivolumab(BMS936558,MDX1106,NIVO,ONO4538,Opdivo)",1.0,227,7.736878,867,29.550102
3,"Pemetrexed Disodium(Alimta,LY231514)",1.0,209,7.123381,1076,36.673483
4,"Erlotinib Hydrochloride(CP358,774,OSI774,Tarceva)",1.0,182,6.203136,1258,42.876619
...,...,...,...,...,...,...
87,"Dabrafenib(Tafinlar, GSK2118436A, GSK2118436)",0.0,1,0.034083,2930,99.863667
88,"Cobimetinib(Cotellic,GDC0973,XL518)",1.0,1,0.034083,2931,99.897751
89,"Cetuximab(Cetuximab CDP1,Cetuximab CMAB009,Cet...",0.0,1,0.034083,2932,99.931834
90,"Capecitabine(Ro091978 000,Xeloda)",1.0,1,0.034083,2933,99.965917


In [48]:
data = cancer_drugs

# Rows where both the imaging-based time and status are present.
imaging_columns = ['tt_pfs_i_g_mos', 'pfs_i_g_status']
df = data.dropna(subset=imaging_columns)

# Rows where both the notes-based time and status are present.
notes_columns = ['tt_pfs_m_g_mos', 'pfs_m_g_status']
df_md = data.dropna(subset=notes_columns)

In [49]:
# Set the row limit of Altair's 'default' data transformer to 10,000.
max_chart_rows = 10000
alt.data_transformers.enable('default', max_rows=max_chart_rows)

DataTransformerRegistry.enable('default')

In [50]:
# Reshape to long format: each input row becomes two rows, one per status column
# (notes-based `pfs_m_g_status` and imaging-based `pfs_i_g_status`). The column name
# goes into `progression_type`, its value into `progression_occurred`.
df_melted = pd.melt(
    df,
    id_vars=["record_id", "regimen_drugs", "tt_pfs_m_g_mos", "tt_pfs_i_g_mos"],
    value_vars=["pfs_m_g_status", "pfs_i_g_status"],
    var_name="progression_type",
    value_name="progression_occurred",
)

# Pick the time column that belongs to each row's status column. np.where works on
# whole columns, so it avoids a Python-level call per row and does not depend on
# the frame having at least one row.
df_melted["time_till_progression"] = np.where(
    df_melted["progression_type"] == "pfs_i_g_status",
    df_melted["tt_pfs_i_g_mos"],
    df_melted["tt_pfs_m_g_mos"],
)

# The two source time columns are now redundant. Reassign instead of dropping in place.
df_melted = df_melted.drop(columns=["tt_pfs_i_g_mos", "tt_pfs_m_g_mos"])

In [51]:
# Replace the raw status-column names with display labels. An explicit mapping leaves
# already-relabelled values untouched, so running this cell a second time no longer
# turns every row into 'Imaging Based'.
progression_labels = {
    'pfs_m_g_status': 'Notes Based',
    'pfs_i_g_status': 'Imaging Based',
}
df_melted['progression_type'] = df_melted['progression_type'].replace(progression_labels)

In [52]:
# Dropdown listing every regimen; the chart starts on 'Docetaxel'.
drug_options = sorted(df_melted['regimen_drugs'].unique())
regimen_dropdown = alt.binding_select(options=drug_options, name='Select Drug Regimen')
selection = alt.selection_single(
    fields=['regimen_drugs'],
    bind=regimen_dropdown,
    init={'regimen_drugs': 'Docetaxel'},
)

# One bar per progression status (row counts), faceted into one column per
# progression type. The x axis is hidden; the colour legend identifies the bars.
progression_bars = alt.Chart(df_melted).mark_bar().encode(
    x=alt.X('progression_occurred:N', axis=None),
    y=alt.Y('count():Q', axis=alt.Axis(grid=True)),
    color=alt.Color('progression_occurred:N', legend=alt.Legend(title='Progression Status')),
    column=alt.Column('progression_type:N'),
)
progression_bars = progression_bars.configure_view(stroke='transparent')
progression_bars = progression_bars.properties(
    width=50,
    height=300,
    title='Frequency of Progression for each Drug',
)

# Attach the dropdown and keep only the rows of the selected regimen.
progression_bars.add_selection(selection).transform_filter(selection)


In [53]:
def _km_survival_frame(frame, duration_col, event_col, group_col='regimen_drugs'):
    """Fit one Kaplan-Meier curve per value of `group_col`.

    Returns a tuple `(fitters, table)`: `fitters` maps each group value to its
    fitted KaplanMeierFitter, and `table` stacks every curve into one long
    DataFrame with the columns `survival_prob`, `drug` and `time`.
    """
    fitters = {}
    curves = []
    for group in frame[group_col].unique():
        in_group = frame[group_col] == group
        fitter = lf.KaplanMeierFitter()
        fitter.fit(frame.loc[in_group, duration_col], frame.loc[in_group, event_col], label=group)
        fitters[group] = fitter

        # Work on a copy: renaming/adding columns on survival_function_ itself
        # would alter the fitted object's own state.
        curve = fitter.survival_function_.copy()
        curve.columns = ['survival_prob']
        curve['drug'] = group
        curve['time'] = curve.index
        curves.append(curve)

    # Concatenate once instead of growing a DataFrame inside the loop.
    if curves:
        table = pd.concat(curves, axis=0)
    else:
        table = pd.DataFrame(columns=['survival_prob', 'drug', 'time'])
    return fitters, table


def _km_dropdown_chart(table, fitters, title=None):
    """Build a line chart of `table` filtered by a drug dropdown.

    Returns `(selection, chart)`. The dropdown lists the keys of `fitters` in
    sorted order and starts on 'Docetaxel'.
    """
    picker = alt.selection_single(
        fields=['drug'],
        bind=alt.binding_select(options=sorted(list(fitters.keys()))),
        name='Select',
        init={'drug': 'Docetaxel'},
    )
    chart = alt.Chart(table).mark_line().encode(
        x='time:Q',
        y='survival_prob:Q',
        color='drug:N',
    )
    if title is not None:
        chart = chart.properties(title=title)
    chart = chart.add_selection(picker).transform_filter(picker)
    return picker, chart


# Kaplan-Meier curves per drug regimen from the imaging-based columns.
kmf_dict, survival_df = _km_survival_frame(df, 'tt_pfs_i_g_mos', 'pfs_i_g_status')
selection, alt_chart = _km_dropdown_chart(survival_df, kmf_dict, title='Imaging Based')

# Same analysis from the notes-based columns.
kmf_dict, survival_df = _km_survival_frame(df_md, 'tt_pfs_m_g_mos', 'pfs_m_g_status')
selection, alt_chart2 = _km_dropdown_chart(survival_df, kmf_dict, title='Notes Based')

# Place the two panels side by side; the combined chart is displayed in the next cell.
new = alt_chart | alt_chart2

In [54]:
# Display the two Kaplan-Meier panels side by side.
new

In [55]:
# Rebuild the long-format table from `df`, so `progression_type` again holds the raw
# status-column names ('pfs_m_g_status' / 'pfs_i_g_status').
df_melted = pd.melt(
    df,
    id_vars=["record_id", "regimen_drugs", "tt_pfs_m_g_mos", "tt_pfs_i_g_mos"],
    value_vars=["pfs_m_g_status", "pfs_i_g_status"],
    var_name="progression_type",
    value_name="progression_occurred",
)

# Pick the time column that belongs to each row's status column. np.where works on
# whole columns, so it avoids a Python-level call per row and does not depend on
# the frame having at least one row.
df_melted["time_till_progression"] = np.where(
    df_melted["progression_type"] == "pfs_i_g_status",
    df_melted["tt_pfs_i_g_mos"],
    df_melted["tt_pfs_m_g_mos"],
)

# The two source time columns are now redundant. Reassign instead of dropping in place.
df_melted = df_melted.drop(columns=["tt_pfs_i_g_mos", "tt_pfs_m_g_mos"])

In [56]:
# Number of distinct regimens; dropna=False counts a missing value as its own entry.
df_melted['regimen_drugs'].nunique(dropna=False)

190

In [57]:
# Fit one Kaplan-Meier curve per (regimen, progression type) pair and collect the
# curves in long format for plotting.
kmf_dict = {}
survival_frames = []
fit_columns = ['time_till_progression', 'progression_occurred']

for drug in df_melted['regimen_drugs'].unique():
    for prog_type in df_melted['progression_type'].unique():
        mask = (df_melted['regimen_drugs'] == drug) & (df_melted['progression_type'] == prog_type)

        # `df` was only filtered for missing imaging-based values, so rows of the
        # other progression type can still contain NaN. Drop them per group and skip
        # groups with nothing left rather than passing them to the fitter.
        observed = df_melted.loc[mask, fit_columns].dropna()
        if observed.empty:
            continue

        kmf = lf.KaplanMeierFitter()
        kmf.fit(
            observed['time_till_progression'],
            observed['progression_occurred'],
            label=f'{drug} ({prog_type})',
        )
        kmf_dict[(drug, prog_type)] = kmf

        # Work on a copy so the fitter's own survival_function_ stays untouched.
        survival_prob = kmf.survival_function_.copy()
        survival_prob.columns = ['survival_prob']
        survival_prob['drug'] = drug
        survival_prob['progression_type'] = prog_type
        survival_prob['time'] = survival_prob.index
        survival_frames.append(survival_prob)

# Concatenate once instead of growing a DataFrame inside the loop.
if survival_frames:
    survival_df = pd.concat(survival_frames, axis=0)
else:
    survival_df = pd.DataFrame(columns=['survival_prob', 'drug', 'progression_type', 'time'])

# Dropdown over all regimens, starting on 'Docetaxel'.
drug_options = sorted(df_melted['regimen_drugs'].unique())
selection = alt.selection_single(
    fields=['drug'],
    bind=alt.binding_select(options=drug_options,
                            name='Select Drug Regimen'),
    init={'drug': 'Docetaxel'}
)

# One line per progression type for the selected regimen; labelExpr turns the raw
# column names into readable legend entries.
alt_chart = alt.Chart(survival_df).mark_line().encode(
    x='time:Q',
    y='survival_prob:Q',
    color=alt.Color('progression_type:N', legend=alt.Legend(title='Progression Source',  labelExpr="{'pfs_i_g_status': 'Imaging based', 'pfs_m_g_status': 'MedOnc Notes based'}[datum.label]"))
).add_selection(
    selection
).transform_filter(
    selection
)

alt.data_transformers.enable('default', max_rows=10000)

# display the plot
alt_chart

In [58]:
data = cancer_idx

# Long format: one row per record and outcome status column. `time_till_event` takes
# the time column that belongs to the row's outcome; rows missing the event flag,
# the time or the stage are dropped.
df_melted2 = (
    pd.melt(
        data,
        id_vars=["record_id", "stage_dx", "tt_os_dx_mos", "tt_pfs_i_or_m_adv_mos"],
        value_vars=["os_dx_status", "pfs_i_or_m_adv_status"],
        var_name="outcome_type",
        value_name="event_occurred",
    )
    .assign(
        time_till_event=lambda long: np.where(
            long["outcome_type"] == "pfs_i_or_m_adv_status",
            long["tt_pfs_i_or_m_adv_mos"],
            long["tt_os_dx_mos"],
        )
    )
    .drop(columns=["tt_os_dx_mos", "tt_pfs_i_or_m_adv_mos"])
    .dropna(subset=["event_occurred", "time_till_event", "stage_dx"])
)

# Slider position -> stage label. The inverse mapping encodes stage_dx as a number
# so that it can be driven by a range slider.
labels = {
    0: 'Stage I-III NOS',
    1: 'Stage I',
    2: 'Stage II',
    3: 'Stage III',
    4: 'Stage IV'
}
stage_codes = {stage_label: code for code, stage_label in labels.items()}

# Minimum number of observations required before a curve is fitted.
min_observations = 10

# Fit one Kaplan-Meier curve per (stage_dx, outcome_type) combination.
kmf_dict = {}
survival_frames = []
for stage in df_melted2['stage_dx'].unique():
    for out_type in df_melted2['outcome_type'].unique():
        mask = (df_melted2['stage_dx'] == stage) & (df_melted2['outcome_type'] == out_type)
        observed = df_melted2.loc[mask, ['time_till_event', 'event_occurred']].dropna()

        # Skip combinations with too few observations for a reliable estimate.
        if len(observed) < min_observations:
            continue

        kmf = lf.KaplanMeierFitter()
        kmf.fit(observed['time_till_event'], observed['event_occurred'], label=f'{stage} ({out_type})')
        kmf_dict[(stage, out_type)] = kmf

        # Work on a copy so the fitter's own survival_function_ stays untouched.
        survival_prob = kmf.survival_function_.copy()
        survival_prob.columns = ['survival_prob']
        # Any stage value not listed in `labels` is encoded as 4.
        survival_prob['stage_dx'] = stage_codes.get(stage, 4)
        survival_prob['out_type'] = out_type
        survival_prob['time'] = survival_prob.index
        survival_frames.append(survival_prob)

# Concatenate once instead of growing a DataFrame inside the loop.
if survival_frames:
    survival_df = pd.concat(survival_frames, axis=0)
else:
    survival_df = pd.DataFrame(columns=['survival_prob', 'stage_dx', 'out_type', 'time'])

# Range slider over the numeric stage codes. Without a starting value nothing is
# filtered on first render, and since colour only separates the outcome, the curves
# of all stages would be joined into a single line. Start on 4 ('Stage IV').
selection = alt.selection_single(
    fields=['stage_dx'],
    bind=alt.binding_range(min=0, max=4, step=1,
                           name='Select Stage'),
    init={'stage_dx': 4}
)

alt_chart = alt.Chart(survival_df).mark_line().encode(
    x='time:Q',
    y='survival_prob:Q',
    color=alt.Color('out_type:N', legend=alt.Legend(title='Outcome',  labelExpr="{'os_dx_status': 'Overall Survival', 'pfs_i_or_m_adv_status': 'Progression-free survival'}[datum.label]"))
).add_selection(
    selection
).transform_filter(
    selection
)

alt.data_transformers.enable('default', max_rows=10000)

# display the plot
alt_chart

In [59]:
data = cancer_idx

# Long format: one row per record and outcome status column. `time_till_event` takes
# the time column that belongs to the row's outcome; rows missing the event flag,
# the time or the stage are dropped.
df_melted2 = (
    pd.melt(
        data,
        id_vars=["record_id", "stage_dx", "tt_os_dx_mos", "tt_pfs_i_or_m_adv_mos"],
        value_vars=["os_dx_status", "pfs_i_or_m_adv_status"],
        var_name="outcome_type",
        value_name="event_occurred",
    )
    .assign(
        time_till_event=lambda long: np.where(
            long["outcome_type"] == "pfs_i_or_m_adv_status",
            long["tt_pfs_i_or_m_adv_mos"],
            long["tt_os_dx_mos"],
        )
    )
    .drop(columns=["tt_os_dx_mos", "tt_pfs_i_or_m_adv_mos"])
    .dropna(subset=["event_occurred", "time_till_event", "stage_dx"])
)

# Minimum number of observations required before a curve is fitted.
min_observations = 10

# Fit one Kaplan-Meier curve per (stage_dx, outcome_type) combination.
kmf_dict = {}
survival_frames = []
for stage in df_melted2['stage_dx'].unique():
    for out_type in df_melted2['outcome_type'].unique():
        mask = (df_melted2['stage_dx'] == stage) & (df_melted2['outcome_type'] == out_type)
        observed = df_melted2.loc[mask, ['time_till_event', 'event_occurred']].dropna()

        # Skip combinations with too few observations for a reliable estimate.
        if len(observed) < min_observations:
            continue

        kmf = lf.KaplanMeierFitter()
        kmf.fit(observed['time_till_event'], observed['event_occurred'], label=f'{stage} ({out_type})')
        kmf_dict[(stage, out_type)] = kmf

        # Work on a copy so the fitter's own survival_function_ stays untouched.
        survival_prob = kmf.survival_function_.copy()
        survival_prob.columns = ['survival_prob']
        survival_prob['stage_dx'] = stage
        survival_prob['out_type'] = out_type
        survival_prob['time'] = survival_prob.index
        survival_frames.append(survival_prob)

# Concatenate once instead of growing a DataFrame inside the loop.
if survival_frames:
    survival_df = pd.concat(survival_frames, axis=0)
else:
    survival_df = pd.DataFrame(columns=['survival_prob', 'stage_dx', 'out_type', 'time'])

# Dropdown over the stage labels, starting on 'Stage IV'.
stage_options = sorted(df_melted2['stage_dx'].unique())
selection = alt.selection_single(
    fields=['stage_dx'],
    bind=alt.binding_select(options=stage_options,
                            name='Select Stage'),
    init={'stage_dx': 'Stage IV'}
)

# One line per outcome for the selected stage; labelExpr turns the raw column names
# into readable legend entries.
alt_chart = alt.Chart(survival_df).mark_line().encode(
    x='time:Q',
    y='survival_prob:Q',
    color=alt.Color('out_type:N', legend=alt.Legend(title='Outcome',  labelExpr="{'os_dx_status': 'Overall Survival', 'pfs_i_or_m_adv_status': 'Progression-free survival'}[datum.label]"))
).add_selection(
    selection
).transform_filter(
    selection
)

alt.data_transformers.enable('default', max_rows=10000)

# display the plot
alt_chart

In [60]:
data = cancer_idx

# Long format: one row per record and outcome status column. `time_till_event` takes
# the time column that belongs to the row's outcome; rows missing the event flag,
# the time or the stage are dropped. The raw column names in `outcome_type` are
# replaced by display labels as the last step, after they were used to pick the time.
df_melted2 = (
    pd.melt(
        data,
        id_vars=["record_id", "stage_dx", "tt_os_dx_mos", "tt_pfs_i_or_m_adv_mos"],
        value_vars=["os_dx_status", "pfs_i_or_m_adv_status"],
        var_name="outcome_type",
        value_name="event_occurred",
    )
    .assign(
        time_till_event=lambda long: np.where(
            long["outcome_type"] == "pfs_i_or_m_adv_status",
            long["tt_pfs_i_or_m_adv_mos"],
            long["tt_os_dx_mos"],
        )
    )
    .drop(columns=["tt_os_dx_mos", "tt_pfs_i_or_m_adv_mos"])
    .dropna(subset=["event_occurred", "time_till_event", "stage_dx"])
    .assign(
        outcome_type=lambda long: np.where(
            long["outcome_type"] == "pfs_i_or_m_adv_status",
            "Progression-free survival",
            "Overall survival",
        )
    )
)

# Minimum number of observations required before a curve is fitted.
min_observations = 10

# Fit one Kaplan-Meier curve per (stage_dx, outcome_type) combination.
kmf_dict = {}
survival_frames = []
for stage in df_melted2['stage_dx'].unique():
    for out_type in df_melted2['outcome_type'].unique():
        mask = (df_melted2['stage_dx'] == stage) & (df_melted2['outcome_type'] == out_type)
        observed = df_melted2.loc[mask, ['time_till_event', 'event_occurred']].dropna()

        # Skip combinations with too few observations for a reliable estimate.
        if len(observed) < min_observations:
            continue

        kmf = lf.KaplanMeierFitter()
        kmf.fit(observed['time_till_event'], observed['event_occurred'], label=f'{stage} ({out_type})')
        kmf_dict[(stage, out_type)] = kmf

        # Work on a copy so the fitter's own survival_function_ stays untouched.
        survival_prob = kmf.survival_function_.copy()
        survival_prob.columns = ['survival_prob']
        survival_prob['stage_dx'] = stage
        survival_prob['out_type'] = out_type
        survival_prob['time'] = survival_prob.index
        survival_frames.append(survival_prob)

# Concatenate once instead of growing a DataFrame inside the loop.
if survival_frames:
    survival_df = pd.concat(survival_frames, axis=0)
else:
    survival_df = pd.DataFrame(columns=['survival_prob', 'stage_dx', 'out_type', 'time'])

# Dropdown over the outcome labels (as a plain list), starting on 'Overall survival'.
out_options = list(df_melted2['outcome_type'].unique())
selection = alt.selection_single(
    fields=['out_type'],
    bind=alt.binding_select(options=out_options,
                            name='Select Outcome'),
    init={'out_type': 'Overall survival'}
)

# One line per stage for the selected outcome.
alt_chart = alt.Chart(survival_df).mark_line().encode(
    x='time:Q',
    y='survival_prob:Q',
    color=alt.Color('stage_dx:N', legend=alt.Legend(title='NSCLC Stage'))
).add_selection(
    selection
).transform_filter(
    selection
)

alt.data_transformers.enable('default', max_rows=10000)

# display the plot
alt_chart