# Model Setup

In [1]:
import lymph
from scipy.special import factorial
import numpy as np
import pandas as pd
import scipy as sp
import emcee

graph = {
    ('tumor', 'primary')  : ['I','II', 'III', 'IV','V', 'VII'],
    ('lnl'  , 'I') :        ['II'],
    ('lnl'  , 'II') :       ['III'], 
    ('lnl'  , 'III'):       ['IV'], 
    ('lnl'  , 'IV') :       ['V'],
    ('lnl'  , 'V') :        [],
    ('lnl'  , 'VII') :      [],
}

model = lymph.MidlineBilateral(graph = graph,use_mixing= True, trans_symmetric =True)
model.modalities = {'CT': [0.76, 0.81],
                    'MRI': [0.63, 0.81],
                    'PET': [0.86, 0.79],
                    'FNA': [0.98, 0.80],
                    'diagnostic_consensus': [0.86, 0.81],
                    'pathology': [1.0, 1.0],
                    'pCT': [0.86, 0.81],
                    'max_llh': [1.0, 1.0]
                    }


# Time prior with p(early) = 0.3
def binom_pmf(k: np.ndarray, n: int, p: float):
    """Binomial PMF"""
    if p > 1. or p < 0.:
        raise ValueError("Binomial prob must be btw. 0 and 1")
    q = (1. - p)
    binom_coeff = factorial(n) / (factorial(k) * factorial(n - k))
    return binom_coeff * p**k * q**(n - k)

def parametric_binom_pmf(n: int):
    """Return a parametric binomial PMF"""
    def inner(t, p):
        """Parametric binomial PMF"""
        return binom_pmf(t, n, p)
    return inner

max_t = 10
model.diag_time_dists["early"] = sp.stats.binom.pmf(np.arange(max_t+1), max_t, 0.3)
model.diag_time_dists["late"] = parametric_binom_pmf(max_t)
model.modalities = {'treatment_diagnose': [1,0.81]}
# model.patient_data = dataset_full

## Classical analysis example

In [2]:
midline_backend = emcee.backends.HDFBackend(filename = "../lynference/models/samples2.hdf5")
samples_midline_model = midline_backend.get_chain(flat = True)

In [3]:
from sparing_scripts import sample_from_flattened
samples_reduced = sample_from_flattened(samples_midline_model, num_samples = 203, spaced = True,step_size = 89)

In [4]:
model.modalities = {'treatment_diagnose': [1,0.81]}

In [17]:
from sparing_scripts import risk_sampled, levels_to_spare, ci_single

diagnose = {"treatment_diagnose": {'ipsi':{
        "I": 0,
        "II": 0,
        "III": 0,
        "IV": 0,
        "V": 0,
        "VII": 0
    },
    'contra':{
        "I": 0,
        "II": 0,
        "III": 0,
        "IV": 0,
        "V": 0,
        "VII": 0
    }}}

sampled_risks, mean_risk = risk_sampled(samples_reduced, model, 'early', given_diagnoses = diagnose, midline_extension = False)
spared_lnls, total_risk, ranked_combined, treated_lnls,treated_array, treated_ipsi, treated_contra, sampled_total_risk = levels_to_spare(threshold = 0.1, model = model, mean_risks = mean_risk, sampled_risks = sampled_risks, ci = True)


In [18]:
print("to spare: ", spared_lnls)
print("total risk: ", np.round(100*total_risk,2), "%")
print("confidence interval: ", np.round(100*ci_single(sampled_total_risk), 2), "%")

to spare:  [('contra I', 0.00014041503678369102), ('contra V', 0.00025355191154452126), ('contra III', 0.0007148542767304306), ('contra IV', 0.0007899036948606521), ('contra VII', 0.00235240305833678), ('ipsi IV', 0.004841190373359489), ('ipsi V', 0.006446829344689972), ('contra II', 0.008457191928417764), ('ipsi I', 0.008707130663085186), ('ipsi VII', 0.008858032230079348), ('ipsi III', 0.03658636444627596)]
total risk:  7.36 %
confidence interval:  [6.39 8.23] %


## Combination analysis
Here we can also specifically analyze all combinations we encounter in a dataset

In [5]:
#load data and modify it for analysis
dataset_USZ =  pd.read_csv("data/cleanedUSZ.csv", header=[0,1,2]) #import data

maxllh =  dataset_USZ['max_llh']
t_stage = dataset_USZ['info']
ipsi = maxllh.loc[:,'ipsi'].drop(['IIa','IIb','VIII','Ib','IX','VI','X','Ia'],axis = 1)[['I','II','III','IV','V','VII']]
contra = maxllh.loc[:,'contra'].drop(['IIa','IIb','VIII','Ib','IX','VI','X','Ia'],axis = 1)[['I','II','III','IV','V','VII']]
ipsi_header = header = pd.MultiIndex.from_product([ ['ipsi'], ['I','II','III','IV','V','VII']], names=['', ''])
contra_header = pd.MultiIndex.from_product([['contra'], ['I','II','III','IV','V','VII']], names=['', ''])
ipsi.columns = ipsi_header
contra.columns = contra_header

dataset_analyze = pd.concat([t_stage,ipsi,contra],axis = 1)

In [6]:
dataset_analyze

Unnamed: 0_level_0,tumor,tumor,ipsi,ipsi,ipsi,ipsi,ipsi,ipsi,contra,contra,contra,contra,contra,contra
Unnamed: 0_level_1,t_stage,midline_extension,I,II,III,IV,V,VII,I,II,III,IV,V,VII
0,late,True,False,True,False,False,False,False,False,False,False,False,False,False
1,early,False,False,True,False,False,False,False,False,True,False,False,False,False
2,late,True,True,True,True,True,False,True,True,True,True,True,False,False
3,late,True,False,True,True,True,False,True,False,False,False,False,False,False
4,early,False,False,True,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282,late,True,True,True,True,False,False,False,False,False,False,False,False,False
283,late,False,False,True,True,False,False,False,False,False,False,False,False,False
284,late,True,False,True,True,False,True,False,False,True,True,False,False,True
285,late,True,False,True,True,False,False,False,False,True,False,False,False,False


In [7]:
from collections import Counter
from collections import defaultdict
# Convert the DataFrame to a NumPy array for easier manipulation
data = np.array(dataset_analyze)

# we extract all observed diagnoses
entry_combinations_with_indexes = defaultdict(list)
for index, row in enumerate(data):
    combination = tuple(row)
    entry_combinations_with_indexes[combination].append(index)
USZ_counts = []
USZ_combinations = []
USZ_indexes = []
for combination, indexes in entry_combinations_with_indexes.items():
    count = len(indexes)
    USZ_indexes.append(indexes)
    USZ_counts.append(count)
    USZ_combinations.append(combination)

In [8]:
from sparing_scripts import analysis_treated_lnls_combinations
usz_treated_lnls_no_risk, usz_treated_lnls_all, usz_treatment_array, usz_top3_spared, usz_total_risks, usz_treated_ipsi, usz_treated_contra, usz_sampled_risks_array, cis = analysis_treated_lnls_combinations(USZ_combinations, samples_reduced, model)

here we extract some extra information to produce a nice table

In [9]:
lnls = ['I','II', 'III', 'IV','V', 'VII']
t_stage = []
midline_extension = []
invovlvement_ipsi_USZ = []
invovlvement_contra_USZ = []
for diagnose_type in USZ_combinations:
    involved_ipsi = []
    involved_contra = []
    t_stage.append(diagnose_type[0])
    midline_extension.append(diagnose_type[1])
    for lnl_looper, involved_level in enumerate(lnls):
        if diagnose_type[lnl_looper +2] == True:
            involved_ipsi.append(involved_level) 
        if diagnose_type[lnl_looper +8] == True:
            involved_contra.append(involved_level)
    invovlvement_ipsi_USZ.append(involved_ipsi)
    invovlvement_contra_USZ.append(involved_contra)

In [10]:
data_export_usz = pd.DataFrame({'Percentage of patients': np.array(USZ_counts)/287,
                                'T-stage': t_stage,
                                'Midline Extension': midline_extension,
                                'Involvement Ipsi' : invovlvement_ipsi_USZ,
                                'Involvement Contra': invovlvement_contra_USZ,
                                'Treated Ipsi':  usz_treated_ipsi,
                                'Treated Contra': usz_treated_contra,
                                'risk': usz_total_risks,
                                'lower bound': cis[0],
                                'upper bound': cis[1],
                                'top 3 spared lnls risk': usz_top3_spared

})

In [16]:
cis[1]

[array([0.07256023, 0.07660419, 0.07715183, 0.07450056, 0.07404687,
        0.08429885, 0.07651124, 0.07782295, 0.07926604, 0.06826714,
        0.08155964, 0.07478151, 0.07804537, 0.06815774, 0.07671736,
        0.07656285, 0.08250828, 0.07199717, 0.07457798, 0.07910343,
        0.08544164, 0.07166251, 0.07202463, 0.07546939, 0.08006696,
        0.0751578 , 0.07897544, 0.0806369 , 0.0691167 , 0.08317173,
        0.06919964, 0.08447734, 0.076679  , 0.07593773, 0.07901994,
        0.08022335, 0.07207464, 0.08201217, 0.08412937, 0.06773563,
        0.07929362, 0.07442194, 0.08105662, 0.08359956, 0.08102473,
        0.0757746 , 0.08439458, 0.06580116, 0.07943434, 0.07724904,
        0.08004548, 0.0811991 , 0.08616653, 0.07872471, 0.07778236,
        0.07110889, 0.07859072, 0.08222916, 0.08078611, 0.07553369,
        0.07020196, 0.07594819, 0.09534926, 0.08601976, 0.06972487,
        0.06933533, 0.07737844, 0.0868301 , 0.07631395, 0.07292491,
        0.07700687, 0.07970196, 0.07515233, 0.08

In [15]:
cis[0]

[array([0.0677399, 0.0890076]),
 array([0.05384728, 0.07113282]),
 array([0.03225635, 0.07849678]),
 array([0.05499155, 0.07491399]),
 array([0.05107326, 0.06867698]),
 array([0.04317664, 0.06370794]),
 array([0.06386673, 0.08228965]),
 array([0.04958273, 0.07104799]),
 array([0.05475971, 0.07037767]),
 array([0.0580055 , 0.07603846]),
 array([0.06060629, 0.08218963]),
 array([0.06366309, 0.08687115]),
 array([0.05212756, 0.09552651]),
 array([0.04587328, 0.06421359]),
 array([0.04095829, 0.05485793]),
 array([0.06765081, 0.08861297]),
 array([0.05316043, 0.07157994]),
 array([0.05438345, 0.07393734]),
 array([0.04142647, 0.06284653]),
 array([0.04574405, 0.06577785]),
 array([0.04131449, 0.05721894]),
 array([0.06465281, 0.08605247]),
 array([0.05796743, 0.09982337]),
 array([0.02150617, 0.03993672]),
 array([0.04946428, 0.06910742]),
 array([0.0438158 , 0.06137082]),
 array([0.0595774, 0.0817572]),
 array([0.07006441, 0.09454338]),
 array([0.05388961, 0.07310317]),
 array([0.07476305

In [11]:
data_export_usz

Unnamed: 0,Percentage of patients,T-stage,Midline Extension,Involvement Ipsi,Involvement Contra,Treated Ipsi,Treated Contra,risk,lower bound,upper bound,top 3 spared lnls risk
0,0.048780,late,True,[II],[],"[III, II]",[II],0.077791,"[0.06773989730753313, 0.08900760326675584]","[0.07256023225332642, 0.07660418657719364, 0.0...","[(ipsi I, 0.020928523675987565), (ipsi VII, 0...."
1,0.010453,early,False,[II],[II],"[III, II]","[III, II]",0.061750,"[0.05384727578589572, 0.07113281583843743]","[0.06257526436875874, 0.05379753514651602, 0.0...","[(ipsi I, 0.020669208427492723), (ipsi VII, 0...."
2,0.003484,late,True,"[I, II, III, IV, VII]","[I, II, III, IV]","[V, I, II, III, IV, VII]","[I, II, III, IV]",0.053768,"[0.03225635008928569, 0.07849678203610887]","[0.030273157525584536, 0.07266967671537881, 0....","[(contra V, 0.042696622289579954), (contra VII..."
3,0.003484,late,True,"[II, III, IV, VII]",[],"[V, VII, II, III, IV]",[II],0.064973,"[0.054991553311946814, 0.0749139862476322]","[0.06463844751549983, 0.05882231714472115, 0.0...","[(ipsi I, 0.029491262610923226), (contra III, ..."
4,0.010453,early,False,"[II, VII]",[],"[III, VII, II]",[],0.059468,"[0.05107325781136613, 0.06867697955016971]","[0.06180680843090709, 0.05160597176813633, 0.0...","[(ipsi I, 0.021163655070474168), (contra II, 0..."
...,...,...,...,...,...,...,...,...,...,...,...
72,0.003484,early,False,"[II, IV]",[],"[V, III, II, IV]",[],0.057391,"[0.048247231469028345, 0.06637357958707926]","[0.060118314871434375, 0.05311087007545105, 0....","[(ipsi I, 0.022340680849047168), (ipsi VII, 0...."
73,0.006969,late,False,"[II, III, V]",[],"[IV, II, III, V]",[],0.071844,"[0.061415540192388464, 0.08177885702206882]","[0.06767400318588664, 0.07043073691686896, 0.0...","[(ipsi I, 0.028089405023911166), (ipsi VII, 0...."
74,0.003484,late,True,"[II, III]","[II, III, IV]","[I, IV, II, III]","[V, II, III, IV]",0.061302,"[0.05138832205135655, 0.07107104583878229]","[0.055786233405746954, 0.06320486288799503, 0....","[(ipsi VII, 0.02290127826783818), (ipsi V, 0.0..."
75,0.003484,late,False,"[II, V]",[],"[IV, III, II, V]",[],0.063624,"[0.05413075423616103, 0.0729940150301831]","[0.0597916515563008, 0.0630034822634655, 0.060...","[(ipsi I, 0.024800916851586873), (ipsi VII, 0...."


## risk tables
here is an example for calculating the risk of each combination to show up


In [3]:
midline_backend = emcee.backends.HDFBackend(filename = "../lynference/models/samples2.hdf5")
samples_midline_model = midline_backend.get_chain(flat = True)

In [4]:
from sparing_scripts import sample_from_flattened
samples_reduced = sample_from_flattened(samples_midline_model, num_samples = 203, spaced = True,step_size = 89)

In [72]:
sampled_risks_early_no_ext, mean_risk_early_no_ext = risk_sampled(samples_reduced, model, 'early', midline_extension = False, given_diagnoses = None) 
sampled_risks_early_ext, mean_risk_early_ext = risk_sampled(samples_reduced, model, 'early', midline_extension = True, given_diagnoses = None)
sampled_risks_late_no_ext, mean_risk_late_no_ext = risk_sampled(samples_reduced, model, 'late', midline_extension = False, given_diagnoses = None)
sampled_risks_late_ext, mean_risk_late_ext = risk_sampled(samples_reduced, model, 'late', midline_extension = True, given_diagnoses = None)

In [73]:
state_list = model.noext.ipsi.state_list
state_list_names = []
for index, state in enumerate(state_list):
    state_list_names.append(str(state))
df_early_no_ext = pd.DataFrame(mean_risk_early_no_ext, columns=state_list_names, index=state_list_names)
df_early_ext = pd.DataFrame(mean_risk_early_ext, columns=state_list_names, index=state_list_names)
df_late_no_ext = pd.DataFrame(mean_risk_late_no_ext, columns=state_list_names, index=state_list_names)
df_late_ext = pd.DataFrame(mean_risk_late_ext, columns=state_list_names, index=state_list_names)

# df_early_no_ext.to_excel('early_no_ext.xlsx')
# df_early_ext.to_excel('early_ext.xlsx')
# df_late_no_ext.to_excel('late_no_ext.xlsx')
# df_late_ext.to_excel('late_ext.xlsx')