# Model Setup

In [1]:
import lymph
from scipy.special import factorial
import numpy as np
import pandas as pd
import scipy as sp
import emcee

# base symmetric central
graph = {
    ('tumor', 'primary')  : ['I','II', 'III', 'IV','V', 'VII'],
    ('lnl'  , 'I') :        ['II'],
    ('lnl'  , 'II') :       ['III'], 
    ('lnl'  , 'III'):       ['IV'], 
    ('lnl'  , 'IV') :       ['V'],
    ('lnl'  , 'V') :        [],
    ('lnl'  , 'VII') :      [],
}

central = lymph.Bilateral(graph = graph,base_symmetric= True, trans_symmetric= True )
central.modalities = {'CT': [0.76, 0.81],
                    'MRI': [0.63, 0.81],
                    'PET': [0.86, 0.79],
                    'FNA': [0.98, 0.80],
                    'diagnostic_consensus': [0.86, 0.81],
                    'pathology': [1.0, 1.0],
                    'pCT': [0.86, 0.81],
                    'max_llh': [1.0, 1.0]
                    }


# Time prior with p(early) = 0.3
def binom_pmf(k: np.ndarray, n: int, p: float):
    """Binomial PMF"""
    if p > 1. or p < 0.:
        raise ValueError("Binomial prob must be btw. 0 and 1")
    q = (1. - p)
    binom_coeff = factorial(n) / (factorial(k) * factorial(n - k))
    return binom_coeff * p**k * q**(n - k)

def parametric_binom_pmf(n: int):
    """Return a parametric binomial PMF"""
    def inner(t, p):
        """Parametric binomial PMF"""
        return binom_pmf(t, n, p)
    return inner

max_t = 10
central.diag_time_dists["early"] = sp.stats.binom.pmf(np.arange(max_t+1), max_t, 0.3)
central.diag_time_dists["late"] = parametric_binom_pmf(max_t)

## Classical analysis example

In [2]:
midline_backend = emcee.backends.HDFBackend(filename = "data/samples_midline_trial.hdf5")
samples_midline_model = midline_backend.get_chain(flat = True)

In [3]:
#base params ipsi
first_part = samples_midline_model[:,:6]
#transmission + time prior
second_part = samples_midline_model[:,13:]
params_stacked = np.hstack((first_part, second_part))

In [4]:
from sparing_scripts import sample_from_flattened
samples_reduced = sample_from_flattened(params_stacked, num_samples = 203, spaced = True,step_size = 89)

In [5]:
central.modalities = {'treatment_diagnose': [1,0.81]}

In [6]:
from sparing_scripts import risk_sampled, levels_to_spare, ci_single

diagnose = {"treatment_diagnose": {'ipsi':{
        "I": 0,
        "II": 0,
        "III": 0,
        "IV": 0,
        "V": 0,
        "VII": 0
    },
    'contra':{
        "I": 0,
        "II": 0,
        "III": 0,
        "IV": 0,
        "V": 0,
        "VII": 0
    }}}

sampled_risks, mean_risk = risk_sampled(samples_reduced, central, 'early', given_diagnoses = diagnose, midline_extension = None)
spared_lnls, total_risk, ranked_combined, treated_lnls,treated_array, treated_ipsi, treated_contra, sampled_total_risk = levels_to_spare(threshold = 0.1, model = central, mean_risks = mean_risk, sampled_risks = sampled_risks, ci = True)


In [7]:
print('to treat: ', treated_lnls)
print("to spare: ", spared_lnls)
print("total risk: ", np.round(100*total_risk,2), "%")
print("confidence interval: ", np.round(100*ci_single(sampled_total_risk), 2), "%")

to treat:  [('ipsi II', 0.16252581485208106), ('contra II', 0.16252581485208106)]
to spare:  [('ipsi IV', 0.003241350273008568), ('contra IV', 0.003241350273008568), ('ipsi V', 0.004518440233585398), ('contra V', 0.004518440233585398), ('ipsi VII', 0.006236057703053506), ('contra VII', 0.006236057703053506), ('ipsi I', 0.006392306840454845), ('contra I', 0.006392306840454847), ('ipsi III', 0.024019341955639942), ('contra III', 0.024019341955639942)]
total risk:  8.25 %
confidence interval:  [7.26 9.17] %


## Combination analysis
Here we can also specifically analyze all combinations we encounter in a dataset

In [8]:
#load data and modify it for analysis
dataset_USZ =  pd.read_csv("data/cleanedUSZ.csv", header=[0,1,2]) #import data

maxllh =  dataset_USZ['max_llh']
t_stage = pd.DataFrame(dataset_USZ['info']['tumor']['t_stage'])
t_stage.columns = pd.MultiIndex.from_product([['tumor'], t_stage.columns])
ipsi = maxllh.loc[:,'ipsi'].drop(['IIa','IIb','VIII','Ib','IX','VI','X','Ia'],axis = 1)[['I','II','III','IV','V','VII']]
contra = maxllh.loc[:,'contra'].drop(['IIa','IIb','VIII','Ib','IX','VI','X','Ia'],axis = 1)[['I','II','III','IV','V','VII']]
ipsi_header = header = pd.MultiIndex.from_product([ ['ipsi'], ['I','II','III','IV','V','VII']], names=['', ''])
contra_header = pd.MultiIndex.from_product([['contra'], ['I','II','III','IV','V','VII']], names=['', ''])
ipsi.columns = ipsi_header
contra.columns = contra_header
dataset_analyze = pd.concat([t_stage,ipsi,contra],axis = 1)

In [9]:
from collections import Counter
from collections import defaultdict
# Convert the DataFrame to a NumPy array for easier manipulation
data = np.array(dataset_analyze)

# we extract all observed diagnoses
entry_combinations_with_indexes = defaultdict(list)
for index, row in enumerate(data):
    combination = tuple(row)
    entry_combinations_with_indexes[combination].append(index)
USZ_counts = []
USZ_combinations = []
USZ_indexes = []
for combination, indexes in entry_combinations_with_indexes.items():
    count = len(indexes)
    USZ_indexes.append(indexes)
    USZ_counts.append(count)
    USZ_combinations.append(combination)

In [10]:
from sparing_scripts import analysis_treated_lnls_combinations
usz_treated_lnls_no_risk, usz_treated_lnls_all, usz_treatment_array, usz_top3_spared, usz_total_risks, usz_treated_ipsi, usz_treated_contra, usz_sampled_risks_array, lnls_ranked, cis = analysis_treated_lnls_combinations(USZ_combinations, samples_reduced, central)

here we extract some extra information to produce a nice table

In [11]:
lnls = ['I','II', 'III', 'IV','V', 'VII']
t_stage = []
invovlvement_ipsi_USZ = []
invovlvement_contra_USZ = []
for diagnose_type in USZ_combinations:
    involved_ipsi = []
    involved_contra = []
    t_stage.append(diagnose_type[0])
    for lnl_looper, involved_level in enumerate(lnls):
        if diagnose_type[lnl_looper +1] == True:
            involved_ipsi.append(involved_level) 
        if diagnose_type[lnl_looper +7] == True:
            involved_contra.append(involved_level)
    invovlvement_ipsi_USZ.append(involved_ipsi)
    invovlvement_contra_USZ.append(involved_contra)

In [12]:
len(invovlvement_ipsi_USZ)

62

In [13]:
data_export_usz = pd.DataFrame({'T-stage': t_stage,
                                'Involvement Ipsi' : invovlvement_ipsi_USZ,
                                'Involvement Contra': invovlvement_contra_USZ,
                                'Treated Ipsi':  usz_treated_ipsi,
                                'Treated Contra': usz_treated_contra,
                                'risk': usz_total_risks,
                                'lower bound': cis[0],
                                'upper bound': cis[1],
                                'top 3 spared lnls risk': usz_top3_spared,
                                'lnls ranked': lnls_ranked
})

In [14]:
data_export_usz

Unnamed: 0,T-stage,Involvement Ipsi,Involvement Contra,Treated Ipsi,Treated Contra,risk,lower bound,upper bound,top 3 spared lnls risk,lnls ranked
0,late,[II],[],"[III, II]","[III, II]",0.084610,0.070548,0.097803,"[(ipsi I, 0.01832138213431852), (contra VII, 0...","[(contra IV, 0.007289396374694396), (ipsi IV, ..."
1,early,[II],[II],"[III, II]","[I, III, II]",0.073640,0.064256,0.082761,"[(ipsi I, 0.018304995925308473), (contra VII, ...","[(ipsi IV, 0.0077551947910500406), (contra IV,..."
2,late,"[I, II, III, IV, VII]","[I, II, III, IV]","[V, I, II, III, IV, VII]","[V, I, II, III, IV]",0.026659,0.019377,0.034192,"[(contra VII, 0.026658582065937932)]","[(contra VII, 0.026658582065937932), (ipsi V, ..."
3,late,"[II, III, IV, VII]",[],"[I, V, VII, II, III, IV]","[III, II]",0.065232,0.055691,0.074345,"[(contra VII, 0.019915644583170824), (contra I...","[(contra IV, 0.013856027008043248), (contra V,..."
4,early,"[II, VII]",[],"[III, II, VII]","[III, II]",0.071979,0.062148,0.081706,"[(ipsi I, 0.018182964731609512), (contra VII, ...","[(contra IV, 0.007059024635160332), (ipsi IV, ..."
...,...,...,...,...,...,...,...,...,...,...
57,early,"[II, IV]",[],"[V, III, II, IV]","[III, II]",0.072683,0.062857,0.082475,"[(ipsi I, 0.019177607745608685), (contra VII, ...","[(contra IV, 0.007734078732673839), (contra V,..."
58,late,"[II, III, V]",[],"[I, IV, II, III, V]","[III, II]",0.074306,0.062796,0.085538,"[(contra VII, 0.017802732441886737), (ipsi VII...","[(contra IV, 0.011717164313871696), (contra V,..."
59,late,"[II, III]","[II, III, IV]","[I, IV, II, III]","[I, V, II, III, IV]",0.059705,0.047181,0.071293,"[(contra VII, 0.021551926421364775), (ipsi VII...","[(ipsi V, 0.017926380692774975), (ipsi VII, 0...."
60,late,"[II, V]",[],"[IV, III, II, V]","[III, II]",0.082263,0.070822,0.095380,"[(ipsi I, 0.021543930442482633), (contra VII, ...","[(contra IV, 0.009212175780734598), (contra V,..."


## risk tables
here is an example for calculating the risk of each combination to show up


In [15]:
midline_backend = emcee.backends.HDFBackend(filename = "data/samples_midline_trial.hdf5")
samples_midline_model = midline_backend.get_chain(flat = True)

In [16]:
from sparing_scripts import sample_from_flattened
samples_reduced = sample_from_flattened(samples_midline_model, num_samples = 203, spaced = True,step_size = 89)

In [None]:
sampled_risks_early, mean_risk_early = risk_sampled(samples_reduced, central, 'early', given_diagnoses = None) 
sampled_risks_late, mean_risk_late = risk_sampled(samples_reduced, central, 'late', given_diagnoses = None)


In [None]:
state_list = central.ipsi.state_list
state_list_names = []
for index, state in enumerate(state_list):
    state_list_names.append(str(state))
df_early = pd.DataFrame(mean_risk_early, columns=state_list_names, index=state_list_names)
df_late = pd.DataFrame(mean_risk_late, columns=state_list_names, index=state_list_names)

# df_early.to_excel('early_central.xlsx')
# df_late.to_excel('late_central.xlsx')

## Full treatment table

In [21]:
from sparing_scripts import sample_from_flattened
samples_reduced = sample_from_flattened(samples_midline_model, num_samples = 100, spaced = False)
#note to exactly reproduce the trial results, the number of samples should be 203 and spaced = True, and step_size = 89 but for the sake of speed, we use 100 samples here

In [24]:
from itertools import product

# Generate all 2**13 combinations of 13 booleans
combinations = list(product([False, True], repeat=13))
combinations = [('early' if comb[0] == False else 'late',) + comb[1:] for comb in combinations]
print(combinations[0])
print(combinations[1])

('early', False, False, False, False, False, False, False, False, False, False, False, False)
('early', False, False, False, False, False, False, False, False, False, False, False, True)


In [25]:
import multiprocessing as mp

# Function to process a chunk of combinations
def process_combinations(chunk):
    return analysis_treated_lnls_combinations(chunk, samples_reduced, central)

# Divide the combinations into chunks
num_cores = mp.cpu_count() - 1
chunk_size = len(combinations) // num_cores
chunks = [combinations[i:i + chunk_size] for i in range(0, len(combinations), chunk_size)]

# Use multiprocessing to process the chunks
with mp.Pool(num_cores) as pool:
    results = pool.map(process_combinations, chunks)

# Combine the results from all chunks
treated_lnls_no_risk, treated_lnls_all, treatment_array, top3_spared, total_risks, treated_ipsi, treated_contra, sampled_risks_array, lnls_ranked, cis = zip(*results)

# Flatten the results
treated_lnls_no_risk = [item for sublist in treated_lnls_no_risk for item in sublist]
treated_lnls_all = [item for sublist in treated_lnls_all for item in sublist]
treatment_array = np.vstack(treatment_array)
top3_spared = [item for sublist in top3_spared for item in sublist]
total_risks = np.concatenate(total_risks)
treated_ipsi = [item for sublist in treated_ipsi for item in sublist]
treated_contra = [item for sublist in treated_contra for item in sublist]
sampled_risks_array = np.vstack(sampled_risks_array)
lnls_ranked = [item for sublist in lnls_ranked for item in sublist]
cis_lower = []
cis_upper = []
for item in cis:
    cis_lower.append(item[0])
    cis_upper.append(item[1])
flat_lower = [item for sublist in cis_lower for item in sublist]
flat_upper = [item for sublist in cis_upper for item in sublist]

compute the prevalence for each diagnosis


In [26]:
sampled_risks_early, mean_risk_early = risk_sampled(samples_reduced, central, 'early', given_diagnoses = None) 
sampled_risks_late, mean_risk_late = risk_sampled(samples_reduced, central, 'late', given_diagnoses = None)


In [27]:
#generate state list
state_list = np.array(np.meshgrid(*[[0, 1]] * 13)).T.reshape(-1, 13)
state_list = state_list[np.lexsort(np.fliplr(state_list).T)]
# Reshape the risk arrays into 1x4096 arrays
mean_risk_early_flat = mean_risk_early.reshape(-1)
mean_risk_late_flat = mean_risk_late.reshape(-1)
#combine them
full_risks = np.hstack([mean_risk_early_flat, mean_risk_late_flat])/2


generate some columns

In [28]:
lnls = ['I','II', 'III', 'IV','V', 'VII']
t_stage = []
invovlvement_ipsi= []
invovlvement_contra= []
for diagnose_type in combinations:
    involved_ipsi = []
    involved_contra = []
    t_stage.append(diagnose_type[0])
    for lnl_looper, involved_level in enumerate(lnls):
        if diagnose_type[lnl_looper +1] == True:
            involved_ipsi.append(involved_level) 
        if diagnose_type[lnl_looper +7] == True:
            involved_contra.append(involved_level)
    invovlvement_ipsi.append(involved_ipsi)
    invovlvement_contra.append(involved_contra)

Produce and export data frame

In [29]:
data_export = pd.DataFrame({'Percentage of patients': full_risks,
                                'T-stage': t_stage,
                                'Involvement Ipsi' : invovlvement_ipsi,
                                'Involvement Contra': invovlvement_contra,
                                'Treated Ipsi':  treated_ipsi,
                                'Treated Contra': treated_contra,
                                'risk': total_risks,
                                'lower bound': flat_lower,
                                'upper bound': flat_upper,
                                'top 3 spared lnls risk': top3_spared,
                                'lnls ranked': lnls_ranked
})

In [30]:
data_export

Unnamed: 0,Percentage of patients,T-stage,Involvement Ipsi,Involvement Contra,Treated Ipsi,Treated Contra,risk,lower bound,upper bound,top 3 spared lnls risk,lnls ranked
0,4.787616e-02,early,[],[],[II],[II],0.085794,0.076216,0.095853,"[(contra III, 0.02278777077510111), (ipsi III,...","[(ipsi IV, 0.003005773439340514), (contra IV, ..."
1,1.190294e-03,early,[],[VII],"[III, II]","[III, II, VII]",0.060666,0.051158,0.070753,"[(ipsi I, 0.013597188556368862), (contra I, 0....","[(contra IV, 0.0048038806633290175), (ipsi IV,..."
2,8.678618e-04,early,[],[V],"[III, II]","[III, II, V]",0.063463,0.054260,0.073464,"[(contra I, 0.013584515031546843), (ipsi I, 0....","[(ipsi IV, 0.004799554558999676), (contra IV, ..."
3,3.849016e-05,early,[],"[V, VII]","[III, II]","[III, II, V, VII]",0.065639,0.055157,0.076689,"[(contra I, 0.016710539296656977), (ipsi I, 0....","[(ipsi IV, 0.005890416882507717), (contra IV, ..."
4,5.411428e-04,early,[],[IV],"[III, II]","[III, II, IV]",0.066154,0.057597,0.077043,"[(contra I, 0.013573996190652665), (ipsi I, 0....","[(ipsi IV, 0.004796314307986904), (ipsi V, 0.0..."
...,...,...,...,...,...,...,...,...,...,...,...
8187,1.205927e-17,late,"[I, II, III, IV, V, VII]","[I, II, III, V, VII]","[I, II, III, IV, V, VII]","[I, II, III, V, VII]",0.003456,0.001327,0.006184,"[(contra IV, 0.003455709997680996)]","[(contra IV, 0.003455709997680996), (ipsi I, 1..."
8188,1.847068e-16,late,"[I, II, III, IV, V, VII]","[I, II, III, IV]","[I, II, III, IV, V, VII]","[I, II, III, IV]",0.011326,0.007301,0.015593,"[(contra VII, 0.0065086916521072385), (contra ...","[(contra V, 0.004852533696177574), (contra VII..."
8189,7.782549e-18,late,"[I, II, III, IV, V, VII]","[I, II, III, IV, VII]","[I, II, III, IV, V, VII]","[I, II, III, IV, VII]",0.005348,0.003207,0.007934,"[(contra V, 0.005348484432834034)]","[(contra V, 0.005348484432834034), (ipsi I, 1...."
8190,5.956849e-18,late,"[I, II, III, IV, V, VII]","[I, II, III, IV, V]","[I, II, III, IV, V, VII]","[I, II, III, IV, V]",0.007177,0.004334,0.010326,"[(contra VII, 0.007176596842290093)]","[(contra VII, 0.007176596842290093), (ipsi I, ..."


In [None]:
# data_export.to_csv('full_combination_central_table_100_samples.csv', index=False)