In [1]:
import numpy as np
import pandas as pd
rng = np.random.default_rng(12345)

from lymph.models import Unilateral
from lymixture import LymphMixture
from lymixture.utils import binom_pmf, late_binomial, normalize

dataset = pd.read_csv('../../data/mixture_model_data_combined.csv', header = [0,1,2],)
dataset_staging = dataset.copy()
dataset_staging['tumor','1','t_stage'] = dataset_staging['tumor','1','t_stage'].replace([0,1,2], 'early')
dataset_staging['tumor','1','t_stage'] = dataset_staging['tumor','1','t_stage'].replace([3,4], 'late')
dataset_staging = dataset_staging[~(dataset_staging['tumor']['1']['subsite'].str.startswith(('C00.4')))]

In [2]:
dataset_staging_main_groups = dataset_staging.copy()

# Reduce to only main subsite
dataset_staging_main_groups.loc[:, ('tumor', '1', 'subsite')] = (
    dataset_staging_main_groups.loc[:, ('tumor', '1', 'subsite')].str.replace(r'\..*', '', regex=True)
)

dataset_staging_main_groups['tumor']['1']['subsite'].value_counts()


subsite
C09    452
C12    227
C01    212
C10    169
C13    165
C02    158
C04     99
C05     61
C06     46
C03     45
Name: count, dtype: int64

In [3]:
graph = {
    ("tumor", "T"): ["II", "III"],
    ("lnl", "II"): ["III"],
    ("lnl", "III"): [],
}
num_components = 3

mixture = LymphMixture(
    model_cls=Unilateral,
    model_kwargs={"graph_dict": graph},
    num_components=num_components,
)
mixture.load_patient_data(
    dataset_staging_main_groups,
    split_by=("tumor", "1", "subsite"),
    mapping=lambda x: x,
)


In [4]:
mixture.subgroups.keys()

dict_keys(['C01', 'C02', 'C03', 'C04', 'C05', 'C06', 'C09', 'C10', 'C12', 'C13'])

Set the diagnostic modality

In [5]:
mixture.set_modality("diagnostic_consensus", 1., 1.)
mixture.get_all_modalities()

{'diagnostic_consensus': Clinical(spec=1.0, sens=1.0, is_trinary=False)}

Fix the distribution over diagnosis times for early T-stage (T1 & T2) to be a binomial distribution with a parameters $p=0.3$.

The late T-stage's diagnosis time distribution is a binomial one with a free model parameter than needs to be learned as well.

In [6]:
mixture.set_distribution("early", binom_pmf(np.arange(11), 10, 0.3))
mixture.set_distribution("late", late_binomial)
mixture.get_all_distributions()

{'early': Distribution([0.0282475249, 0.121060821, 0.23347444050000002, 0.266827932, 0.20012094900000002, 0.1029193452, 0.03675690900000001, 0.009001692, 0.0014467005000000002, 0.00013778100000000004, 5.904900000000001e-06]),
 'late': Distribution([0.0009765625, 0.009765625, 0.0439453125, 0.1171875, 0.205078125, 0.24609375, 0.205078125, 0.1171875, 0.0439453125, 0.009765625, 0.0009765625])}

Initialize random model parameters and latent variables/responsibilities.

In [7]:
from lymixture.em import expectation, maximization

params = {k: rng.uniform() for k in mixture.get_params()}
mixture.set_params(**params)
mixture.normalize_mixture_coefs()
latent = normalize(rng.uniform(size=mixture.get_resps().shape).T, axis=0).T

In [8]:
mixture._mixture_coefs

Unnamed: 0,C01,C02,C03,C04,C05,C06,C09,C10,C12,C13
0,0.190133,0.260473,0.388398,0.146347,0.431519,0.401678,0.245872,0.465496,0.508445,0.107701
1,0.356791,0.172286,0.052968,0.125313,0.218147,0.198405,0.263873,0.400198,0.147293,0.145405
2,0.453076,0.567241,0.558635,0.72834,0.350334,0.399917,0.490255,0.134306,0.344262,0.746894


In [10]:
def to_numpy(params: dict[str, float]) -> np.ndarray:
    return np.array([p for p in params.values()])

Iterate the computation of the expectation value of the latent variables (E-step) and the maximization of the (complete) data log-likelihood w.r.t. the model parameters (M-step).

In [18]:
def check_convergence(params_history, likelihood_history, steps_back_list):
    current_params = params_history[-1]
    current_likelihood = likelihood_history[-1]
    for steps_back in steps_back_list:
        previous_params = params_history[-steps_back - 1]
        if np.allclose(to_numpy(current_params), to_numpy(previous_params)):
            print('stopped due to parameter similarity')
            return True  # Return True if any of the steps is close
        elif (np.isclose(current_likelihood, likelihood_history[-steps_back - 1],rtol = 0, atol = 0.01)) and np.all(current_likelihood >= np.array(likelihood_history)):
            print('stopped due to likelihood similarity')
            return True
    return False

In [19]:
is_converged = False
count = 0
params_history = []
likelihood_history = []
params_history.append(params.copy())
likelihood_history.append(mixture.likelihood(use_complete=False))
# Number of steps to look back for convergence
look_back_steps = 3

while not is_converged:
    print(count)
    print(likelihood_history[-1])
    old_params = params
    latent = expectation(mixture, params)
    params = maximization(mixture, latent)
    
    # Append current params and likelihood to history
    params_history.append(params.copy())
    likelihood_history.append(mixture.likelihood(use_complete=False))
    
    # Check if converged
    if count >= 3:  # Ensure enough history is available
        is_converged = check_convergence(params_history, likelihood_history,list(range(1,look_back_steps+1)))
    count += 1

0
-1834.5252758119436
1
-1834.5247304769914
2
-1834.5186238878496
3
-1834.5124548081726
stopped due to likelihood similarity


In [20]:
likelihood_history

[-2029.544782475618,
 -1999.8295700905965,
 -1978.7182912846233,
 -1961.608214584008,
 -1944.906006504008,
 -1929.11411849683,
 -1914.9556484538593,
 -1902.7961326004379,
 -1892.6245655407909,
 -1884.2662224559454,
 -1877.5291731617756,
 -1872.2108769279107,
 -1868.0772908617757,
 -1864.8790876763478,
 -1862.3807361787308,
 -1860.3809993038467,
 -1858.72228781427,
 -1857.2880501764073,
 -1855.9945139813158,
 -1854.7833058001324,
 -1853.6222123858179,
 -1852.4954833632955,
 -1851.3982178474157,
 -1850.3270873504705,
 -1849.3395244330086,
 -1848.4009404154713,
 -1847.5109472451206,
 -1846.6817055267588,
 -1845.9171746367288,
 -1845.2144520974578,
 -1844.5715700382314,
 -1843.9810228167646,
 -1843.4372267921772,
 -1842.9325011940598,
 -1842.4606412819887,
 -1842.0174182403414,
 -1841.5965360765795,
 -1841.1973161457165,
 -1840.8147071250928,
 -1840.4514170532643,
 -1840.104803580725,
 -1839.7741292571275,
 -1839.4606554056077,
 -1839.1625089874087,
 -1838.8795617195492,
 -1838.61081066078

In [13]:
likelihood_history

[-1999.8456029715994,
 -3481.8567868822524,
 -3444.0676357482275,
 -3412.258869495842,
 -3366.0193197466733,
 -3310.8710682181763,
 -3252.251166799644,
 -3193.953062417877,
 -3138.0933589710403,
 -3085.702630577298,
 -3037.4914037110393,
 -2994.1472638534265,
 -2956.445543687523,
 -2924.4128916995346,
 -2897.7699294889526,
 -2876.016886376408,
 -2858.3703732538997,
 -2844.2792949587542,
 -2832.961801727201,
 -2823.6607536685756,
 -2815.6136512171292,
 -2808.9110655973927,
 -2803.1837312441176,
 -2797.9333893151033,
 -2793.4524790156343,
 -2789.4560584063274,
 -2785.8473287913007,
 -2782.5958661999966,
 -2779.523224384362,
 -2776.7295446673515,
 -2774.1687234255596,
 -2771.871072266926,
 -2769.6295019244612,
 -2767.6903652093565,
 -2765.9383243336033,
 -2764.274668654775,
 -2762.6383034305645,
 -2761.0018107547944,
 -2759.305157757133,
 -2757.6079130675953,
 -2755.7734226944913,
 -2753.8180613363525,
 -2751.795435258011,
 -2749.707132594692,
 -2747.551514579277,
 -2745.3337333358045,
 -

In [135]:
params_history

[{'0_TtoII_spread': 0.8871230015148194,
  '0_TtoIII_spread': 6.610696135189607e-05,
  '0_IItoIII_spread': 0.16058087757136857,
  '0_C01_coef': 0.7353554599062526,
  '0_C02_coef': 0.23116548059175568,
  '0_C03_coef': 6.610696135189607e-05,
  '0_C04_coef': 3.850014114876546e-05,
  '0_C05_coef': 0.25779181410318286,
  '0_C06_coef': 0.009152031913958254,
  '0_C09_coef': 0.6046086377786678,
  '0_C10_coef': 0.33469106700801887,
  '0_C12_coef': 0.1153243962207408,
  '0_C13_coef': 0.1602474430856042,
  '1_TtoII_spread': 0.3496612945194357,
  '1_TtoIII_spread': 0.38513965585697396,
  '1_IItoIII_spread': 8.291381946198457e-06,
  '1_C01_coef': 0.11115709123859119,
  '1_C02_coef': 0.047024048609263125,
  '1_C03_coef': 0.0,
  '1_C04_coef': 0.09484258273851447,
  '1_C05_coef': 0.06322970019036828,
  '1_C06_coef': 0.02507105607128742,
  '1_C09_coef': 0.11792220213179472,
  '1_C10_coef': 0.25396319002243684,
  '1_C12_coef': 0.6415532747071483,
  '1_C13_coef': 0.4750361713097032,
  '2_TtoII_spread': 0.

In [136]:
params

{'0_TtoII_spread': 0.9999374531230925,
 '0_TtoIII_spread': 0.005524409093925671,
 '0_IItoIII_spread': 0.14535863390696943,
 '0_C01_coef': 0.7135949740356992,
 '0_C02_coef': 0.07519997731310805,
 '0_C03_coef': 6.610696135189607e-05,
 '0_C04_coef': 0.22059440074063127,
 '0_C05_coef': 0.2746828369407964,
 '0_C06_coef': 0.03126776011898311,
 '0_C09_coef': 0.5831542686134161,
 '0_C10_coef': 0.29459764419633216,
 '0_C12_coef': 6.610696135189606e-05,
 '0_C13_coef': 0.05688958141977362,
 '1_TtoII_spread': 0.31840664989309136,
 '1_TtoIII_spread': 0.25530558731417297,
 '1_IItoIII_spread': 6.610696135189607e-05,
 '1_C01_coef': 0.1522428712603201,
 '1_C02_coef': 0.20688465008228274,
 '1_C03_coef': 0.0,
 '1_C04_coef': 0.03953989490111362,
 '1_C05_coef': 0.1078522813535922,
 '1_C06_coef': 0.03610856474130164,
 '1_C09_coef': 0.1676090939477366,
 '1_C10_coef': 0.3602096442580276,
 '1_C12_coef': 0.8551453280846267,
 '1_C13_coef': 0.6861957398737709,
 '2_TtoII_spread': 0.06255274400275875,
 '2_TtoIII_sp

In [21]:
mixture.get_resps()

Unnamed: 0,0,1,2
0,0.947263,0.052680,0.000058
1,0.724477,0.275519,0.000004
2,0.947263,0.052680,0.000058
3,0.724477,0.275519,0.000004
4,0.938981,0.060999,0.000020
...,...,...,...
1629,0.092014,0.897119,0.010867
1630,0.054937,0.271921,0.673142
1631,0.271362,0.386895,0.341743
1632,0.054752,0.193157,0.752091


In [22]:
mixture.get_mixture_coefs()

Unnamed: 0,C01,C02,C03,C04,C05,C06,C09,C10,C12,C13
0,0.837629,0.155789,4.586617e-12,3e-06,0.344971,0.062947,0.715279,0.366739,0.055954,0.120277
1,0.162277,0.173772,4.8732070000000005e-62,0.16177,0.105783,0.019277,0.158784,0.329656,0.765535,0.595671
2,9.4e-05,0.670438,1.0,0.838227,0.549246,0.917776,0.125937,0.303605,0.178511,0.284052


In [23]:
mixture.patient_data['tumor']['1']['subsite'] == 'C01'

0        True
1        True
2        True
3        True
4        True
        ...  
1629    False
1630    False
1631    False
1632    False
1633    False
Name: subsite, Length: 1634, dtype: bool

In [24]:
mixture.get_mixture_coefs()

Unnamed: 0,C01,C02,C03,C04,C05,C06,C09,C10,C12,C13
0,0.837629,0.155789,4.586617e-12,3e-06,0.344971,0.062947,0.715279,0.366739,0.055954,0.120277
1,0.162277,0.173772,4.8732070000000005e-62,0.16177,0.105783,0.019277,0.158784,0.329656,0.765535,0.595671
2,9.4e-05,0.670438,1.0,0.838227,0.549246,0.917776,0.125937,0.303605,0.178511,0.284052


In [26]:
subgroup = 'C01'
print('true percentages for', subgroup)
print(mixture.subgroups[subgroup].patient_data['_model']['diagnostic_consensus'].value_counts()/mixture.subgroups[subgroup].patient_data['_model']['diagnostic_consensus'].value_counts().sum())
mixture.state_dist(subgroup=subgroup)
df = pd.DataFrame(np.array([mixture.state_dist(subgroup=subgroup)]), columns = [str(col) for col in mixture.components[0].graph.state_list])
print('predicted percentages for', subgroup)
print(df*100)

true percentages for C01
II     III  
True   False    0.518868
       True     0.325472
False  False    0.127358
       True     0.028302
Name: count, dtype: float64
predicted percentages for C01
       [0 0]    [0 1]      [1 0]      [1 1]
0  16.863058  3.16968  52.799912  27.167351


In [97]:
mixture.set_modality("diagnose", 1., 0.81 )

print(mixture.risk(subgroup='C01', involvement = {'II': True, 'III': None}, given_diagnosis={'diagnose':{'II': False, 'III': False}}))
mixture.del_modality("diagnose")

0.4161934109072689


In [98]:
mixture.risk(subgroup='C01', involvement = {'II': None, 'III': True}, given_diagnosis=None)

0.27878726348443644