In [1]:
import numpy as np
import lymph
import pandas as pd
import scipy as sp
from scipy.special import factorial
import matplotlib.pyplot as plt
import emcee                      # inference and backends for sample storage
from multiprocessing import Pool  # for parallelization of the inference


dataset_full = pd.read_csv("../lynference/data/cleaned.csv", header=[0,1,2]) #import data
dataset_USZ =  pd.read_csv("../lynference/data/cleanedUSZ.csv", header=[0,1,2]) #import data

maxllh =  dataset_USZ['max_llh']
t_stage = dataset_USZ['info']
ipsi = maxllh.loc[:,'ipsi'].drop(['IIa','IIb','VIII','Ib','IX','VI','X','Ia'],axis = 1)[['I','II','III','IV','V','VII']]
contra = maxllh.loc[:,'contra'].drop(['IIa','IIb','VIII','Ib','IX','VI','X','Ia'],axis = 1)[['I','II','III','IV','V','VII']]
ipsi_header = header = pd.MultiIndex.from_product([ ['ipsi'], ['I','II','III','IV','V','VII']], names=['', ''])
contra_header = pd.MultiIndex.from_product([['contra'], ['I','II','III','IV','V','VII']], names=['', ''])
ipsi.columns = ipsi_header
contra.columns = contra_header

dataset_analyze = pd.concat([t_stage,ipsi,contra],axis = 1)

CLB_full = dataset_full.loc[287:]
maxllh_CLB =  CLB_full['max_llh']
t_stage_CLB = CLB_full['info']
ipsi_CLB = maxllh_CLB.loc[:,'ipsi'].drop(['IIa','IIb','VIII','Ib','IX','VI','X','Ia'],axis = 1)[['I','II','III','IV','V','VII']]
contra_CLB = maxllh_CLB.loc[:,'contra'].drop(['IIa','IIb','VIII','Ib','IX','VI','X','Ia'],axis = 1)[['I','II','III','IV','V','VII']]
ipsi_header = header = pd.MultiIndex.from_product([ ['ipsi'], ['I','II','III','IV','V','VII']], names=['', ''])
contra_header = pd.MultiIndex.from_product([['contra'], ['I','II','III','IV','V','VII']], names=['', ''])
ipsi_CLB.columns = ipsi_header
contra_CLB.columns = contra_header

dataset_CLB = pd.concat([t_stage_CLB,ipsi_CLB,contra_CLB],axis = 1)

maxllh_full =  dataset_full['max_llh']
t_stage_full = dataset_full['info']
ipsi_full = maxllh_full.loc[:,'ipsi'].drop(['IIa','IIb','VIII','Ib','IX','VI','X','Ia'],axis = 1)[['I','II','III','IV','V','VII']]
contra_full = maxllh_full.loc[:,'contra'].drop(['IIa','IIb','VIII','Ib','IX','VI','X','Ia'],axis = 1)[['I','II','III','IV','V','VII']]
ipsi_header = header = pd.MultiIndex.from_product([ ['ipsi'], ['I','II','III','IV','V','VII']], names=['', ''])
contra_header = pd.MultiIndex.from_product([['contra'], ['I','II','III','IV','V','VII']], names=['', ''])
ipsi_full.columns = ipsi_header
contra_full.columns = contra_header

analysis_full = pd.concat([t_stage_full,ipsi_full,contra_full],axis = 1)
analysis_full.fillna(False, inplace=True)
(analysis_full['contra']['II'] == True).sum()


73

In [2]:
(CLB_full['max_llh']['contra']['VII'] == True).sum()

0

In [3]:
graph = {
    ('tumor', 'primary')  : ['I','II', 'III', 'IV','V', 'VII'],
    ('lnl'  , 'I') :        [],
    ('lnl'  , 'II') :       ['I','III','V'], 
    ('lnl'  , 'III'):       ['IV','V'], 
    ('lnl'  , 'IV') :       [],
    ('lnl'  , 'V') :        [],
    ('lnl'  , 'VII') :      [],
}

model = lymph.MidlineBilateral(graph = graph,use_mixing= True, trans_symmetric =True)
model.modalities = {'CT': [0.76, 0.81],
                    'MRI': [0.63, 0.81],
                    'PET': [0.86, 0.79],
                    'FNA': [0.98, 0.80],
                    'diagnostic_consensus': [0.86, 0.81],
                    'pathology': [1.0, 1.0],
                    'pCT': [0.86, 0.81],
                    'max_llh': [1.0, 1.0]
                    }


# Time prior with p(early) = 0.3
def binom_pmf(k: np.ndarray, n: int, p: float):
    """Binomial PMF"""
    if p > 1. or p < 0.:
        raise ValueError("Binomial prob must be btw. 0 and 1")
    q = (1. - p)
    binom_coeff = factorial(n) / (factorial(k) * factorial(n - k))
    return binom_coeff * p**k * q**(n - k)

def parametric_binom_pmf(n: int):
    """Return a parametric binomial PMF"""
    def inner(t, p):
        """Parametric binomial PMF"""
        return binom_pmf(t, n, p)
    return inner

max_t = 10
model.diag_time_dists["early"] = sp.stats.binom.pmf(np.arange(max_t+1), max_t, 0.3)
model.diag_time_dists["late"] = parametric_binom_pmf(max_t)
model.patient_data = dataset_full

In [4]:
# base symmetric central
graph = {
    ('tumor', 'primary')  : ['I','II', 'III', 'IV','V', 'VII'],
    ('lnl'  , 'I') :        [],
    ('lnl'  , 'II') :       ['I','III','V'], 
    ('lnl'  , 'III'):       ['IV','V'], 
    ('lnl'  , 'IV') :       [],
    ('lnl'  , 'V') :        [],
    ('lnl'  , 'VII') :      [],
}

central = lymph.Bilateral(graph = graph,base_symmetric= True, trans_symmetric= True )
central.modalities = {'CT': [0.76, 0.81],
                    'MRI': [0.63, 0.81],
                    'PET': [0.86, 0.79],
                    'FNA': [0.98, 0.80],
                    'diagnostic_consensus': [0.86, 0.81],
                    'pathology': [1.0, 1.0],
                    'pCT': [0.86, 0.81],
                    'max_llh': [1.0, 1.0]
                    }


# Time prior with p(early) = 0.3
def binom_pmf(k: np.ndarray, n: int, p: float):
    """Binomial PMF"""
    if p > 1. or p < 0.:
        raise ValueError("Binomial prob must be btw. 0 and 1")
    q = (1. - p)
    binom_coeff = factorial(n) / (factorial(k) * factorial(n - k))
    return binom_coeff * p**k * q**(n - k)

def parametric_binom_pmf(n: int):
    """Return a parametric binomial PMF"""
    def inner(t, p):
        """Parametric binomial PMF"""
        return binom_pmf(t, n, p)
    return inner

max_t = 10
central.diag_time_dists["early"] = sp.stats.binom.pmf(np.arange(max_t+1), max_t, 0.3)
central.diag_time_dists["late"] = parametric_binom_pmf(max_t)
central.patient_data = dataset_full

In [5]:
central.modalities = {'max_llh_diagnose' : [1,0.81]}

In [6]:
backend = emcee.backends.HDFBackend(filename = "../lynference/models/samples.hdf5")
samples = backend.get_chain(flat = True)
model.check_and_assign(samples.mean(axis = 0))
model.modalities = {'max_llh_diagnose' : [1,0.81]}

In [7]:
bilateral_samples = np.zeros((19000,12))
bilateral_samples[:,0:6] = samples[:,0:6]
bilateral_samples[:,6:] = samples[:,13:]
bilateral_samples.mean(axis = 0)

array([0.00979183, 0.3781374 , 0.06128133, 0.00958199, 0.00848541,
       0.02088659, 0.03926396, 0.16041497, 0.01144267, 0.16367662,
       0.04831059, 0.38639099])

In [8]:
def risk_sampled_bilateral(samples, model, t_stage, given_diagnoses, thin = 89):
    sampled_risks = np.zeros(shape=(len(samples[::thin]),64,64), dtype=float)
    for i, sample in enumerate(np.random.permutation(samples[::thin])):
        sampled_risks[i] = model.risk(given_params = sample, t_stage = t_stage, given_diagnoses = given_diagnoses) 
    mean_risk = sampled_risks.mean(axis = 0)
    return sampled_risks, mean_risk

def levels_to_spare_bilateral(threshold, model, risks, sampled_risks):
    """Computes which LNLs to irradiate given the threshold, model and the risk of each state.

    Args:
        threshold (float): Risk threshold we want to apply
        model (lymph.Unilateral): lymph.unilateral object with fully analyzed patients
        risks (ndarray): Array with the risk of each state

    Returns:
        _type_: _description_
    """
    state_list = model.ipsi.state_list
    lnls = ['I', 'II', 'III', 'IV', 'V', 'VII']
    overall_risk_ipsi = {}
    overall_risk_contra = {}
    for index, lnl in enumerate(lnls):
        overall_risk_ipsi[lnl] = risks[np.where((state_list[:,index] == 1))[0]].sum()
        overall_risk_contra[lnl] = risks.T[np.where((state_list[:,index] == 1))[0]].sum()

    combined_dict = {f'ipsi {key}': value for key, value in overall_risk_ipsi.items()}
    combined_dict.update({f'contra {key}': value for key, value in overall_risk_contra.items()})
    ranked_combined = sorted(combined_dict.items(), key = lambda item: item[1])
    total_risk_new = 0
    sampled_total_risks_new = np.zeros(sampled_risks.shape[0])
    looper = 1
    treated_array = np.ones(12)
    contra_lnl_indices = []
    ipsi_lnl_indices = []
    treated_ipsi = []
    treated_contra = []
    while total_risk_new < threshold:
        sampled_total_risks = sampled_total_risks_new
        total_risk = total_risk_new
        if ipsi_lnl_indices != []:
            treated_array[ipsi_lnl_indices] = 0
        if contra_lnl_indices != []:
            treated_array[np.array(contra_lnl_indices)+6] = 0
        lnls_of_interest = ranked_combined[0:looper]
        lnls_of_interest_names = [t[0] for t in lnls_of_interest]
        contra_lnl_indices = []
        ipsi_lnl_indices = []
        for i,lnl_looper in enumerate(lnls_of_interest_names):
            contra_lnl_indices.append(np.where(np.array(lnls) == lnls_of_interest_names[i].split()[1])[0][0]) if lnl_looper.split()[0] == 'contra' else ipsi_lnl_indices.append(np.where(np.array(lnls) == lnls_of_interest_names[i].split()[1])[0][0])
        indices_list_contra = []
        indices_list_ipsi = []
        for index in contra_lnl_indices:
            condition_contra = (state_list[:, index] == 1)
            indices_contra = np.where(condition_contra)[0]
            indices_list_contra.extend(indices_contra)
            unique_contra = np.unique(indices_list_contra)
        for index in ipsi_lnl_indices:
            condition_ipsi = (state_list[:, index] == 1)
            indices_ipsi = np.where(condition_ipsi)[0]
            indices_list_ipsi.extend(indices_ipsi)
            unique_ipsi = np.unique(indices_list_ipsi)    
        if len(ipsi_lnl_indices) == 0:
            total_risk_new = risks.T[unique_contra].sum()
            sampled_total_risks_new = sampled_risks.transpose((0,2,1))[:,unique_contra].sum(axis = (1,2))
        elif len(contra_lnl_indices) == 0:
            total_risk_new = risks[unique_ipsi].sum()
            sampled_total_risks_new = sampled_risks[:,unique_ipsi].sum(axis = (1,2))
        else:
            total_risk_new = 0
            sampled_total_risks_new = np.zeros(sampled_risks.shape[0])
            total_risk_new += risks[unique_ipsi].sum()
            total_risk_new += risks.T[unique_contra][:,[np.setdiff1d(np.array(range(64)),unique_ipsi)]].sum()
            sampled_total_risks_new += sampled_risks[:,unique_ipsi].sum(axis = (1,2))
            sampled_total_risks_new += sampled_risks.transpose((0,2,1))[:,unique_contra][:,:,list(np.setdiff1d(np.array(range(64)),unique_ipsi))].sum(axis = (1,2))

        spared_lnls = lnls_of_interest[:-1]
        treated_lnls = ranked_combined[looper-1:]
        looper += 1
    for to_treat in treated_lnls:
        if to_treat[0].split()[0] == 'ipsi':
            treated_ipsi.append(to_treat[0].split()[1])
        else: 
            treated_contra.append(to_treat[0].split()[1])
    return spared_lnls, total_risk, ranked_combined, treated_lnls, treated_array, treated_ipsi, treated_contra,sampled_total_risks

In [9]:
full_USZ =  pd.read_csv("../lynference/data/2021-usz-oropharynx.csv", header=[0,1,2]) #import data
central_patients = dataset_analyze.loc[full_USZ['tumor']['1']['central']]
central_patients

Unnamed: 0_level_0,tumor,tumor,ipsi,ipsi,ipsi,ipsi,ipsi,ipsi,contra,contra,contra,contra,contra,contra
Unnamed: 0_level_1,t_stage,midline_extension,I,II,III,IV,V,VII,I,II,III,IV,V,VII
18,late,True,False,False,False,False,False,False,False,False,False,False,False,False
79,late,True,False,False,False,False,False,False,False,True,False,False,False,False
87,late,True,False,False,False,False,False,False,False,False,False,False,False,False
91,early,True,False,False,False,False,False,False,False,False,False,False,False,False
144,early,True,False,False,False,False,False,False,False,False,False,False,False,False
150,late,True,False,False,False,False,False,False,False,False,False,False,False,False
178,early,True,False,False,False,False,False,False,False,False,False,False,False,False
183,late,True,False,False,False,False,False,False,False,False,False,False,False,False
207,late,True,False,True,True,False,False,False,False,True,True,False,False,False


In [10]:
def ci_single(sampled_risks, level = 0.95):
    lower = (1-level)/2*100
    upper = 100- lower
    ci = np.percentile(sampled_risks,[lower,upper])
    return ci
diagnose = {'max_llh_diagnose':{
    "ipsi": {
        "I": 0,
        "II": 1,
        "III": 1,
        "IV": 0,
        "V": 0,
        "VII": 0,
    },
    "contra": {
        "I": 0,
        "II": 0,
        "III": 1,
        "IV": 0,
        "V": 0,
        "VII": 0,
    }
}}
sampled_risks, risk = risk_sampled_bilateral(samples = bilateral_samples, model = central, t_stage = 'late', given_diagnoses= diagnose)     
spared_lnls, total_risk, ranked_combined, treated_lnls, treated_array, treated_ipsi, treated_contra, sampled_total_risks = levels_to_spare_bilateral(0.10, central, risk, sampled_risks)
print(treated_lnls)
print(total_risk*100)
print(spared_lnls)
ci_single(sampled_total_risks)*100

[('ipsi V', 0.025064797870341552), ('ipsi I', 0.029081265446687262), ('ipsi IV', 0.05662126683553424), ('contra IV', 0.06006855628615346), ('contra II', 0.682505365057676), ('ipsi II', 0.9999999999999998), ('ipsi III', 0.9999999999999998), ('contra III', 0.9999999999999999)]
8.054970616530472
[('ipsi VII', 0.017820852057412438), ('contra VII', 0.01782085205741244), ('contra I', 0.023617219939726007), ('contra V', 0.024257871587631225)]


array([6.73615544, 9.41561591])

In [11]:
indices = np.where((central.ipsi.state_list[:,3] == 1))[0]

In [12]:
risk[indices].sum()


0.05662126683553427

In [13]:
risk.T[indices].sum()

0.060068556286153464

In [15]:
central.risk().shape

(64, 64)

The following code extracts the risks from our samples. thin was set to 89 which results in 214 samples/risks taken. If this number is changed, some downstream codes need to be adapted.
The mean_risk is simply the mean of the sampled risks. --> used for ranking later on

In [6]:
def risk_sampled(samples, model, t_stage, given_diagnoses, midline_extension, thin = 89):
    sampled_risks = np.zeros(shape=(len(samples[::thin]),64,64), dtype=float)
    for i, sample in enumerate(np.random.permutation(samples[::thin])):
        sampled_risks[i] = model.risk(given_params = sample, t_stage = t_stage, given_diagnoses = given_diagnoses,midline_extension=midline_extension) 
    mean_risk = sampled_risks.mean(axis = 0)
    return sampled_risks, mean_risk


Compute which levels are to be spared based on given mean risks, or sampled risks. The ranking is based on the mean risk of each state (which is equivalent to the mean risk when directly sampling the LNL involvement risk)

In [7]:
def levels_to_spare(threshold, model, risks, sampled_risks = None):
    """Computes which LNLs to irradiate given the threshold, model and the risk of each state.

    Args:
        threshold (float): Risk threshold we want to apply
        model (lymph.Midline): lymph.Midline object with fully analyzed patients
        risks (ndarray): Array with the risk of each state
        sampled_risks (ndarray): Array with sampled risks. i.e. Simply an n times risks.shape dimensional array holding several risks calculations

    Returns:
        _type_: _description_
    """
    sampled_total_risks = None
    #extract the state list. adapt to the model that is used!
    state_list = model.noext.ipsi.state_list
    lnls = []
    for lnl in model.noext.ipsi.lnls:
        lnls.append(lnl.name)
    overall_risk_ipsi = {}
    overall_risk_contra = {}
    #compute the risk for each lnl with the averaged risk array
    for index, lnl in enumerate(lnls):
        overall_risk_ipsi[lnl] = risks[np.where((state_list[:,index] == 1))[0]].sum()
        overall_risk_contra[lnl] = risks.T[np.where((state_list[:,index] == 1))[0]].sum()
    #combine both dictionaries from ipsi and contra to rank the lnls together
    combined_dict = {f'ipsi {key}': value for key, value in overall_risk_ipsi.items()}
    combined_dict.update({f'contra {key}': value for key, value in overall_risk_contra.items()})
    ranked_combined = sorted(combined_dict.items(), key = lambda item: item[1])

    #here we start extracting the risk for each lnl. with every loop we exclude a LNL from irradiation until we hit the threshold
    total_risk_new = 0
    if sampled_risks is not None and sampled_risks.size > 0:

        sampled_total_risks_new = np.zeros(sampled_risks.shape[0])
    looper = 1
    treated_array = np.ones(12)
    contra_lnl_indices = []
    ipsi_lnl_indices = []
    treated_ipsi = []
    treated_contra = []
    while total_risk_new < threshold:
        if sampled_risks is not None and sampled_risks.size > 0:

            sampled_total_risks = sampled_total_risks_new
        total_risk = total_risk_new
        if ipsi_lnl_indices != []:
            treated_array[ipsi_lnl_indices] = 0
        if contra_lnl_indices != []:
            treated_array[np.array(contra_lnl_indices)+6] = 0
        lnls_of_interest = ranked_combined[0:looper]
        lnls_of_interest_names = [t[0] for t in lnls_of_interest]
        contra_lnl_indices = []
        ipsi_lnl_indices = []
        for i,lnl_looper in enumerate(lnls_of_interest_names):
            contra_lnl_indices.append(np.where(np.array(lnls) == lnls_of_interest_names[i].split()[1])[0][0]) if lnl_looper.split()[0] == 'contra' else ipsi_lnl_indices.append(np.where(np.array(lnls) == lnls_of_interest_names[i].split()[1])[0][0])
        indices_list_contra = []
        indices_list_ipsi = []
        for index in contra_lnl_indices:
            condition_contra = (state_list[:, index] == 1)
            indices_contra = np.where(condition_contra)[0]
            indices_list_contra.extend(indices_contra)
            unique_contra = np.unique(indices_list_contra)
        for index in ipsi_lnl_indices:
            condition_ipsi = (state_list[:, index] == 1)
            indices_ipsi = np.where(condition_ipsi)[0]
            indices_list_ipsi.extend(indices_ipsi)
            unique_ipsi = np.unique(indices_list_ipsi)    
        if len(ipsi_lnl_indices) == 0:
            total_risk_new = risks.T[unique_contra].sum()
            if sampled_risks is not None and sampled_risks.size > 0:

                sampled_total_risks_new = sampled_risks.transpose((0,2,1))[:,unique_contra].sum(axis = (1,2))
        elif len(contra_lnl_indices) == 0:
            total_risk_new = risks[unique_ipsi].sum()
            if sampled_risks is not None and sampled_risks.size > 0:

                sampled_total_risks_new = sampled_risks[:,unique_ipsi].sum(axis = (1,2))
        else:
            total_risk_new = 0
            total_risk_new += risks[unique_ipsi].sum()
            total_risk_new += risks.T[unique_contra][:,[np.setdiff1d(np.array(range(64)),unique_ipsi)]].sum()
            if sampled_risks is not None and sampled_risks.size > 0:

                sampled_total_risks_new = np.zeros(sampled_risks.shape[0])
                sampled_total_risks_new += sampled_risks[:,unique_ipsi].sum(axis = (1,2))
                sampled_total_risks_new += sampled_risks.transpose((0,2,1))[:,unique_contra][:,:,list(np.setdiff1d(np.array(range(64)),unique_ipsi))].sum(axis = (1,2))

        spared_lnls = lnls_of_interest[:-1]
        treated_lnls = ranked_combined[looper-1:]
        looper += 1
    for to_treat in treated_lnls:
        if to_treat[0].split()[0] == 'ipsi':
            treated_ipsi.append(to_treat[0].split()[1])
        else: 
            treated_contra.append(to_treat[0].split()[1])
    return spared_lnls, total_risk, ranked_combined, treated_lnls, treated_array, treated_ipsi, treated_contra,sampled_total_risks

In [71]:
diagnose = {'max_llh_diagnose':{
    "ipsi": {
        "I": 0,
        "II": 1,
        "III": 0,
        "IV": 0,
        "V": 0,
        "VII": 0,
    },
    "contra": {
        "I": 0,
        "II": 0,
        "III": 0,
        "IV": 0,
        "V": 0,
        "VII": 0,
    }
}}
involv = {'ipsi':{
        "I": None,
        "II": None,
        "III": True,
        "IV": None,
        "V": None,
        "VII": None,
    },
    "contra":{
        "I": None,
        "II": None,
        "III": None,
        "IV": None,
        "V": None,
        "VII": None,
    }
}
model.risk(given_diagnoses= diagnose, involvement= involv, t_stage = 'early', midline_extension= False)

0.07277896938230138

In [72]:
risk_test = model.risk(given_diagnoses= diagnose, t_stage = 'early', midline_extension= False)

In [74]:
marg_states = {}   # vectors marginalizing over only the states we care about
for side in ["ipsi", "contra"]:
    if isinstance(involv[side], dict):
        involv[side] = np.array(
            [involv[side].get(lnl.name, None) for lnl in side_model.lnls]
        )
    else:
        involv[side] = np.array(involv[side])

    side_model = getattr(model.noext, side)
    marg_states[side] = np.zeros(shape=len(side_model.state_list), dtype=bool)
    for i,state in enumerate(side_model.state_list):
        marg_states[side][i] = np.all(np.equal(
            involv[side], state,
            where=(involv[side] != None),
            out=np.ones_like(state, dtype=bool)
        ))

In [75]:
marg_states["ipsi"] @ risk_test @ marg_states["contra"]

0.07277896938230138

In [76]:
diagnose = {'max_llh_diagnose':{
    "ipsi": {
        "I": 0,
        "II": 1,
        "III": 0,
        "IV": 0,
        "V": 0,
        "VII": 0,
    },
    "contra": {
        "I": 0,
        "II": 0,
        "III": 0,
        "IV": 0,
        "V": 0,
        "VII": 0,
    }
}}
sampled_risks, risk = risk_sampled(samples = samples, model = model, t_stage = 'early', given_diagnoses= diagnose, midline_extension= False)     
spared_lnls, total_risk, ranked_combined, treated_lnls, treated_array, treated_ipsi, treated_contra, sampled_total_risks = levels_to_spare(0.10, model, risk_test, sampled_risks = sampled_risks)
print(treated_lnls)
print(total_risk*100)
print(spared_lnls)


[('ipsi III', 0.07277896938230134), ('ipsi II', 1.0000000000000004)]
6.534802885096467
[('contra V', 0.00010966725001120429), ('contra I', 0.0004831298756013683), ('contra III', 0.0008017051638956271), ('contra IV', 0.001335568020490673), ('contra VII', 0.004077454222696565), ('ipsi IV', 0.00860530278722505), ('ipsi V', 0.010937710429865695), ('contra II', 0.01124515135799644), ('ipsi I', 0.013827957310150005), ('ipsi VII', 0.016958011416240093)]


## Combination analysis

In [None]:
from collections import Counter
from collections import defaultdict


# Sample array with different entry combinations
data = np.array(dataset_analyze)

entry_combinations_with_indexes = defaultdict(list)
for index, row in enumerate(data):
    combination = tuple(row)
    entry_combinations_with_indexes[combination].append(index)
USZ_counts = []
USZ_combinations = []
USZ_indexes = []
# Print the most common combinations, their USZ_counts, and indexes
for combination, indexes in entry_combinations_with_indexes.items():
    count = len(indexes)
    USZ_indexes.append(indexes)
    # print(f"Combination: {combination}, Count: {count}, Indexes: {indexes}")
    USZ_counts.append(count)
    USZ_combinations.append(combination)

lnls = ['I','II', 'III', 'IV','V', 'VII']
t_stage = []
midline_extension = []
invovlvement_ipsi_USZ = []
invovlvement_contra_USZ = []
for diagnose_type in USZ_combinations:
    involved_ipsi = []
    involved_contra = []
    t_stage.append(diagnose_type[0])
    midline_extension.append(diagnose_type[1])
    for lnl_looper, involved_level in enumerate(lnls):
        if diagnose_type[lnl_looper +2] == True:
            involved_ipsi.append(involved_level) 
        if diagnose_type[lnl_looper +8] == True:
            involved_contra.append(involved_level)
    invovlvement_ipsi_USZ.append(involved_ipsi)
    invovlvement_contra_USZ.append(involved_contra)

In [41]:
# df2 = pd.DataFrame(USZ_combinations)
# df2.to_csv('involvement_combinations_USZ.csv')

Here we havethe code to compute the treated lnls. The code samples the risk of involvement given a diagnose and outputs all samples. Which allows us to build confidence intervals

In [31]:
def analysis_treated_lnls_sampled(combinations):
    treatment_array = np.zeros((len(combinations),12))
    top3_spared = []
    diagnose_looper = {'max_llh_diagnose':{
        "ipsi": {
            "I": 0,
            "II": 0,
            "III": 0,
            "IV": 0,
            "V": 0,
            "VII": 0,
        },
        "contra": {
            "I": 0,
            "II": 0,
            "III": 0,
            "IV": 0,
            "V": 0,
            "VII": 0,
        }
    }}
    treated_lnls_all = []
    treated_lnls_no_risk = []
    total_risks = np.zeros(len(combinations))
    sampled_risks_array = np.zeros((len(combinations),214))
    treated_ipsi_all = []
    treated_contra_all = []
    for index, pattern in enumerate(combinations):
        treated_looper = set()
        stage = pattern[0]
        midline_extension = pattern[1]
        counter_ipsi = 0
        for lnl_ipsi, status in diagnose_looper['max_llh_diagnose']['ipsi'].items():
            diagnose_looper['max_llh_diagnose']['ipsi'][lnl_ipsi] = pattern[2+counter_ipsi]
            counter_ipsi += 1
        counter_contra = 0
        for lnl_contra, status in diagnose_looper['max_llh_diagnose']['contra'].items():
            diagnose_looper['max_llh_diagnose']['contra'][lnl_contra] = pattern[8+counter_contra]
            counter_contra += 1
        sampled_risks, mean_risk = risk_sampled(samples = samples, model = model, t_stage = stage, given_diagnoses=diagnose_looper,midline_extension=midline_extension)     
        spared_lnls, total_risk, ranked_combined, treated_lnls, treated_array, treated_ipsi, treated_contra, sampled_total_risks =levels_to_spare(0.10, model, mean_risk, sampled_risks)
        for i in treated_lnls:
            treated_looper.add(i[0])
        treated_lnls_all.append(treated_lnls)
        treated_lnls_no_risk.append(treated_looper)
        treatment_array[index] = treated_array
        total_risks[index] = total_risk
        sampled_risks_array[index] = sampled_total_risks
        top3_spared.append(spared_lnls[::-1][:3])
        treated_ipsi_all.append(treated_ipsi)
        treated_contra_all.append(treated_contra)
    return treated_lnls_no_risk, treated_lnls_all, treatment_array, top3_spared, total_risks, treated_ipsi_all, treated_contra_all, sampled_risks_array





def count_number_treatments(treated_lnls_no_risk):
    set_counts = {}
    # Iterate through the list and update the counts in the dictionary
    for value in treated_lnls_no_risk:
        frozen_set = frozenset(value)  # Convert the set to a frozenset
        if frozen_set in set_counts:
            set_counts[frozen_set] += 1
        else:
            set_counts[frozen_set] = 1
    return set_counts


In [32]:
usz_treated_lnls_no_risk, usz_treated_lnls_all, usz_treatment_array, usz_top3_spared, usz_total_risks, usz_treated_ipsi, usz_treated_contra, usz_sampled_risks_array = analysis_treated_lnls_sampled(USZ_combinations)
usz_set_counts = count_number_treatments(usz_treated_lnls_no_risk)
len(usz_set_counts)

39

The following code computes the CIs for our sampled risks by taking the equal tailed interval

In [222]:
def ci_calculator(sampled_risks,level = 0.95):
    lower = (1-level)/2*100
    upper = 100- lower
    ci = np.zeros((77,2))
    for index in range(77):
        ci[index] = np.percentile(sampled_risks[index],[lower,upper])
    return ci


In [11]:
ci = ci_calculator(usz_sampled_risks_array)

In [12]:
df = pd.DataFrame(usz_treatment_array)
df.to_csv('treatment_array_010_new.csv')

data_export_usz = pd.DataFrame({'Percentage of patients': np.array(USZ_counts)/287,
                                'T-stage': t_stage,
                                'Midline Extension': midline_extension,
                                'Involvement Ipsi' : invovlvement_ipsi_USZ,
                                'Involvement Contra': invovlvement_contra_USZ,
                                'Treated Ipsi':  usz_treated_ipsi,
                                'Treated Contra': usz_treated_contra,
                                'risk': usz_total_risks,
                                'lower bound': ci.T[0],
                                'upper bound': ci.T[1],
                                'top 3 spared lnls risk': usz_top3_spared

})
data_export_usz.to_csv('analyzed_usz_data_010_new.csv', sep = ';', index = False)

In [13]:
number_of_repetitions = []
for key, value in usz_set_counts.items():
    number_of_repetitions.append(value)
usz_set_counts[frozenset({'ipsi II'})]
asdf = (np.array(USZ_combinations) == ['late',False,False,True,False,False,False,False,False,False,False,False,False,False,])
# Define the condition (e.g., all 'True' values)
condition = (asdf[:, 1:] == 'True').all(axis=1)

# Find indices where the condition is met
indices = np.where(condition)

print(indices)

(array([], dtype=int64),)


## Here we repeat the analysis for the CLB dataset


In [49]:
# Sample array with different entry combinations
data_CLB = np.array(dataset_CLB)

entry_combinations_with_indexes = defaultdict(list)
for index, row in enumerate(data_CLB):
    combination = tuple(row)
    entry_combinations_with_indexes[combination].append(index)
CLB_counts = []
CLB_combinations = []
CLB_indexes = []
# Print the most common combinations, their CLB_counts, and indexes
for combination, indexes in entry_combinations_with_indexes.items():
    count = len(indexes)
    CLB_indexes.append(indexes)
    # print(f"Combination: {combination}, Count: {count}, Indexes: {indexes}")
    CLB_counts.append(count)
    CLB_combinations.append(combination)

lnls = ['I','II', 'III', 'IV','V', 'VII']
t_stage = []
midline_extension = []
involvement_ipsi_CLB = []
involvement_contra_CLB = []
for diagnose_type in CLB_combinations:
    involved_ipsi = []
    involved_contra = []
    t_stage.append(diagnose_type[0])
    midline_extension.append(diagnose_type[1])
    for lnl_looper, involved_level in enumerate(lnls):
        if diagnose_type[lnl_looper +2] == True:
            involved_ipsi.append(involved_level) 
        if diagnose_type[lnl_looper +8] == True:
            involved_contra.append(involved_level)
    involvement_ipsi_CLB.append(involved_ipsi)
    involvement_contra_CLB.append(involved_contra)

In [48]:
clb_treated_lnls_no_risk, clb_treated_lnls_all, clb_treatment_array, clb_top3_spared, clb_total_risks, clb_treated_ipsi, clb_treated_contra, clb_sampled_risks_array = analysis_treated_lnls_sampled(CLB_combinations)
clb_set_counts = count_number_treatments(clb_treated_lnls_no_risk)
len(clb_set_counts)

39

In [64]:
len(clb_treated_contra)

69

In [53]:
def ci_calculator(sampled_risks,level = 0.95):
    lower = (1-level)/2*100
    upper = 100- lower
    ci = np.zeros((69,2))
    for index in range(69):
        ci[index] = np.percentile(sampled_risks[index],[lower,upper])
    return ci


In [65]:
ci_clb = ci_calculator(clb_sampled_risks_array)
df = pd.DataFrame(clb_treatment_array)
df.to_csv('treatment_array_010_new.csv')

data_export_clb = pd.DataFrame({'Percentage of patients': np.array(CLB_counts)/263,
                                'T-stage': t_stage,
                                'Midline Extension': midline_extension,
                                'Involvement Ipsi' : involvement_ipsi_CLB,
                                'Involvement Contra': involvement_contra_CLB,
                                'Treated Ipsi':  clb_treated_ipsi,
                                'Treated Contra': clb_treated_contra,
                                'risk': clb_total_risks,
                                'lower bound': ci_clb.T[0],
                                'upper bound': ci_clb.T[1],
                                'top 3 spared lnls risk': clb_top3_spared

})
data_export_clb.to_csv('analyzed_clb_data_010_new.csv', sep = ';', index = False)

## Combined USZ and CLB analysis

In [121]:
# Sample array with different entry combinations
data_full = np.array(analysis_full)

entry_combinations_with_indexes = defaultdict(list)
for index, row in enumerate(data_full):
    combination = tuple(row)
    entry_combinations_with_indexes[combination].append(index)
full_counts = []
full_combinations = []
full_indexes = []
# Print the most common combinations, their full_counts, and indexes
for combination, indexes in entry_combinations_with_indexes.items():
    count = len(indexes)
    full_indexes.append(indexes)
    # print(f"Combination: {combination}, Count: {count}, Indexes: {indexes}")
    full_counts.append(count)
    full_combinations.append(combination)

lnls = ['I','II', 'III', 'IV','V', 'VII']
t_stage = []
midline_extension = []
involvement_ipsi_full = []
involvement_contra_full = []
for diagnose_type in full_combinations:
    involved_ipsi = []
    involved_contra = []
    t_stage.append(diagnose_type[0])
    midline_extension.append(diagnose_type[1])
    for lnl_looper, involved_level in enumerate(lnls):
        if diagnose_type[lnl_looper +2] == True:
            involved_ipsi.append(involved_level) 
        if diagnose_type[lnl_looper +8] == True:
            involved_contra.append(involved_level)
    involvement_ipsi_full.append(involved_ipsi)
    involvement_contra_full.append(involved_contra)

In [122]:
full_treated_lnls_no_risk, full_treated_lnls_all, full_treatment_array, full_top3_spared, full_total_risks, full_treated_ipsi, full_treated_contra, full_sampled_risks_array = analysis_treated_lnls_sampled(full_combinations)
full_set_counts = count_number_treatments(full_treated_lnls_no_risk)
len(full_set_counts)

49

In [123]:
len(full_treated_ipsi)

112

In [124]:
def ci_calculator(sampled_risks,level = 0.95):
    lower = (1-level)/2*100
    upper = 100- lower
    ci = np.zeros((112,2))
    for index in range(112):
        ci[index] = np.percentile(sampled_risks[index],[lower,upper])
    return ci


In [125]:
ci_full = ci_calculator(full_sampled_risks_array)
df = pd.DataFrame(full_treatment_array)
df.to_csv('treatment_array_010_new.csv')

data_export_full = pd.DataFrame({'Percentage of patients': np.array(full_counts)/550,
                                'T-stage': t_stage,
                                'Midline Extension': midline_extension,
                                'Involvement Ipsi' : involvement_ipsi_full,
                                'Involvement Contra': involvement_contra_full,
                                'Treated Ipsi':  full_treated_ipsi,
                                'Treated Contra': full_treated_contra,
                                'risk': full_total_risks,
                                'lower bound': ci_full.T[0],
                                'upper bound': ci_full.T[1],
                                'top 3 spared lnls risk': full_top3_spared

})
data_export_full.to_csv('analyzed_full_data_010_new.csv', sep = ';', index = False)