In [156]:
import ipywidgets as widgets
from IPython.display import display

import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Data

## Age group dataset

In [157]:
# Age-group incidence table; any row containing a missing value is discarded.
ag_df = pd.read_csv("./final_dataset/age_group_incidence.csv").dropna(axis=0, how="any")

### Regions

In [158]:
# Distinct region names found in the age-group dataset.
regions = ag_df["region"].unique()

regions

array(['Вінницька', 'Волинська', 'Дніпропетровська', 'Донецька',
       'Житомирська', 'Закарпатська', 'Запорізька', 'Івано-Франківська',
       'Київська', 'Кіровоградська', 'Луганська', 'Львівська',
       'Миколаївська', 'Одеська', 'Полтавська', 'Рівненська', 'Сумська',
       'Тернопільська', 'Харківська', 'Херсонська', 'Хмельницька',
       'Черкаська', 'Чернівецька', 'Чернігівська'], dtype=object)

### Age groups

In [159]:
# Distinct age-group labels found in the age-group dataset.
age_groups = ag_df["age_group"].unique()

age_groups

array(['до 1 р.', '1-4 р.', '5-9 р.', '10-14р', '15-19р', 'в т.ч. 15-17р',
       '20-24р', '25-29р', '30-34р', '35-39р', '40-44р', '45-49р',
       '50-54р', '55-59р', '60-64р', '65-69р', '70-74р', '75-79р',
       '80-84р', '85 та старші'], dtype=object)

### Region–age group combinations

In [160]:
# Cartesian product of age groups and regions, age group varying slowest.
models_data_groups = [
    (age_group, region)
    for age_group in age_groups
    for region in regions
]

models_data_groups

[('до 1 р.', 'Вінницька'),
 ('до 1 р.', 'Волинська'),
 ('до 1 р.', 'Дніпропетровська'),
 ('до 1 р.', 'Донецька'),
 ('до 1 р.', 'Житомирська'),
 ('до 1 р.', 'Закарпатська'),
 ('до 1 р.', 'Запорізька'),
 ('до 1 р.', 'Івано-Франківська'),
 ('до 1 р.', 'Київська'),
 ('до 1 р.', 'Кіровоградська'),
 ('до 1 р.', 'Луганська'),
 ('до 1 р.', 'Львівська'),
 ('до 1 р.', 'Миколаївська'),
 ('до 1 р.', 'Одеська'),
 ('до 1 р.', 'Полтавська'),
 ('до 1 р.', 'Рівненська'),
 ('до 1 р.', 'Сумська'),
 ('до 1 р.', 'Тернопільська'),
 ('до 1 р.', 'Харківська'),
 ('до 1 р.', 'Херсонська'),
 ('до 1 р.', 'Хмельницька'),
 ('до 1 р.', 'Черкаська'),
 ('до 1 р.', 'Чернівецька'),
 ('до 1 р.', 'Чернігівська'),
 ('1-4 р.', 'Вінницька'),
 ('1-4 р.', 'Волинська'),
 ('1-4 р.', 'Дніпропетровська'),
 ('1-4 р.', 'Донецька'),
 ('1-4 р.', 'Житомирська'),
 ('1-4 р.', 'Закарпатська'),
 ('1-4 р.', 'Запорізька'),
 ('1-4 р.', 'Івано-Франківська'),
 ('1-4 р.', 'Київська'),
 ('1-4 р.', 'Кіровоградська'),
 ('1-4 р.', 'Луганська'),
 ('1

### Cancer statistics categories

In [161]:
# Every distinct category label present in the age-group dataset.
all_categories = ag_df["category"].unique().tolist()

# Labels of the two "all sites" (C00-C97) total rows; the trailing letter
# distinguishes the male ('ч') row from the female ('ж') row.
total_male_category = "Злоякiснi новоутворення-всього         C00-C97    ч"
total_female_category = "C00-C97    ж"

# Site-specific categories: everything except the two totals.
# list.remove raises ValueError if a total label is missing from the data.
localization_categories = list(all_categories)
localization_categories.remove(total_male_category)
localization_categories.remove(total_female_category)

# Split the site-specific categories by the trailing sex marker of the label.
male_localization_categories = [
    label for label in localization_categories if label.endswith('ч')
]
female_localization_categories = [
    label for label in localization_categories if label.endswith('ж')
]

## Stage dataset

In [162]:
# Stage-level dataset. It is tagged with placeholder "all" values for
# age_group and category before rows with missing values are discarded.
st_df = (
    pd.read_csv("./final_dataset/stage_incidence_features.csv")
    .assign(age_group="all", category="all")
    .dropna()
)

st_df.head()

Unnamed: 0,year,region,mtumors,syncmtumors,insitu,ncervix,nhospotal_pht,nbeds_pht,ybeds_pht,nill_pht,...,cpi,population_pht,tincidence_pht,mtumors_pht,syncmtumors_pht,insitu_pht,insitu_pti,mtumors_pti,age_group,category
0,2010,Вінницька,356,112,113,103,6.6036,753.7223,752.6318,22315.1106,...,109.1,100000.0,338.4813,21.56781,6.785378,6.845962,20.225524,63.719348,all,all
1,2010,Волинська,177,50,81,77,5.0161,738.2327,737.9433,22185.9521,...,109.1,100000.0,274.4377,17.073982,4.823159,7.813517,28.471002,62.214411,all,all
2,2010,Дніпропетровська,767,212,174,150,4.6193,885.5064,890.841,24009.1361,...,109.1,100000.0,366.8625,22.858124,6.318021,5.185546,14.13485,62.307067,all,all
3,2010,Донецька,906,234,57,24,3.8731,719.4061,720.727,20000.7836,...,109.1,100000.0,337.7855,20.283249,5.23872,1.276098,3.777837,60.04772,all,all
4,2010,Житомирська,285,94,33,30,5.3661,670.452,671.6107,21998.0729,...,109.1,100000.0,307.7344,22.164344,7.310345,2.566398,8.339651,72.024261,all,all


# Model selection

In [163]:
def select_best_model_for(
    df: pd.DataFrame, target: str, ommit: list[str]
) -> "sm.regression.linear_model.RegressionResultsWrapper":
    """Fit an OLS model for ``target`` and prune regressors by backward AIC elimination.

    The candidate regressors are the numeric columns of ``df`` that are neither
    ``target`` nor listed in ``ommit``, plus the intercept added by
    ``sm.add_constant``. Starting from the full model, every pass refits the
    model once per remaining column with that column removed. The removal that
    yields the lowest AIC is applied if it improves on the best AIC seen so
    far; the search stops as soon as no single removal improves it. The
    intercept column takes part in the search like any other column.

    Args:
        df: Data for one modelling group; must contain ``target``.
        target: Name of the dependent-variable column.
        ommit: Column names to exclude from the regressors (names that are
            absent from ``df`` are ignored).

    Returns:
        The fitted statsmodels OLS results object with the lowest AIC found.

    NOTE(review): when a group has no more rows than regressors the fit is
    saturated (zero residual degrees of freedom) and AIC is not a meaningful
    selection criterion -- consider capping the regressor count for small groups.
    """
    predictors = df.drop(columns=list(ommit) + [target], errors="ignore")
    predictors = predictors.select_dtypes(include=["number"])

    y = df[target]
    X = sm.add_constant(predictors)

    # The full model is the baseline: a column is only dropped when doing so
    # beats it. (Starting from +inf would force a drop on the first pass even
    # when the full model has the lowest AIC.)
    best_model = sm.OLS(y, X).fit()
    best_aic = best_model.aic

    # Keep at least one regressor; an empty design matrix is not a usable model.
    while len(X.columns) > 1:
        to_drop = None

        for col in X.columns:
            candidate = sm.OLS(y, X.drop(columns=col)).fit()
            if candidate.aic < best_aic:
                best_aic = candidate.aic
                best_model = candidate
                to_drop = col

        if to_drop is None:
            break

        X = X.drop(columns=to_drop)

    return best_model


In [164]:
class Model:
    """A fitted OLS result together with the data slice it was fitted on.

    Attributes are plain instance fields:
        name: Human-readable title printed at the top of the summary.
        ols: Fitted statsmodels results object (must provide ``summary()``).
        identifier: ``(category, region, age_group)``; any component may be
            the wildcard ``'__all__'``, meaning "do not filter on this column".
    """

    def __init__(self, name: str,
                 ols: "sm.regression.linear_model.RegressionResultsWrapper",
                 identifier: tuple[str, str, str]):
        self.name = name
        self.ols = ols
        self.identifier = identifier

    def display_summary(self):
        """Print the statsmodels summary with a custom four-line header.

        The first line of the statsmodels summary (its title) is replaced by
        the model name centred in 78 columns, followed by the category, region
        and age group of this model.
        """
        original_summary = str(self.ols.summary())
        # Everything after the first newline, i.e. the summary minus its title.
        without_title = original_summary.partition('\n')[2]

        # Single quotes inside the f-strings keep this valid on Python < 3.12,
        # where reusing the enclosing quote type is a SyntaxError.
        header = (
            f"{self.name:^78}\n"
            f"{'Category:':<20} {self.identifier[0]:>57}\n"
            f"{'Region:':<20} {self.identifier[1]:>57}\n"
            f"{'Age group:':<20} {self.identifier[2]:>57}\n"
        )
        print(header + without_title)

    def get_df(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return the rows of ``df`` that match this model's identifier.

        Each identifier component equal to ``'__all__'`` is skipped, so any
        combination of wildcards is supported (with all three set, the whole
        frame is returned).
        """
        category, region, age_group = self.identifier

        mask = pd.Series(True, index=df.index)
        for column, value in (("category", category),
                              ("region", region),
                              ("age_group", age_group)):
            if value != '__all__':
                mask &= df[column] == value

        return df[mask]


def filter_models(models: "dict[tuple, Model]", categories: list[str] = None, regions: list[str] = None, age_groups: list[str] = None) -> list:
    """Return the models whose identifier matches every supplied filter.

    Args:
        models: Mapping of ``(category, region, age_group)`` to model.
        categories, regions, age_groups: Allowed values for the respective
            identifier component. ``None`` or an empty collection means
            "no restriction". Any iterable of labels is accepted, including
            the NumPy arrays returned by ``Series.unique()``; a bare string is
            treated as a single label.

    Returns:
        The matching models, in the iteration order of ``models``.
    """
    def _as_allowed_set(values):
        # None / empty -> no filter. Converting to a set avoids evaluating the
        # truth value of a NumPy array, which raises for multi-element arrays.
        if values is None:
            return None
        if isinstance(values, str):
            values = [values]
        allowed = set(values)
        return allowed or None

    allowed_per_component = (
        _as_allowed_set(categories),
        _as_allowed_set(regions),
        _as_allowed_set(age_groups),
    )

    filtered_models = []
    for model_identifier, model in models.items():
        category, region, age_group = model_identifier
        components = (category, region, age_group)
        if all(allowed is None or value in allowed
               for value, allowed in zip(components, allowed_per_component)):
            filtered_models.append(model)

    return filtered_models


### Level-level models

In [165]:
import multiprocessing
import multiprocessing.queues

def worker(queue, models_identifiers, target, ommit, df):
    """Fit one model per identifier and push each result onto ``queue``.

    Intended to run in a child process. For every
    ``(category, region, age_group)`` in ``models_identifiers`` the matching
    rows of ``df`` are selected, ``select_best_model_for`` is run on them, and
    ``(identifier, Model)`` is put on ``queue``.

    An identifier component equal to ``'__all__'`` is a wildcard: no filter is
    applied on that column. Any combination of wildcards is supported.

    Args:
        queue: Queue receiving ``((category, region, age_group), Model)`` items.
        models_identifiers: Iterable of identifier triples to process.
        target: Dependent-variable column passed to ``select_best_model_for``.
        ommit: Columns excluded from the regressors.
        df: Full dataset with ``category``, ``region`` and ``age_group`` columns.
    """
    for category, region, age_group in models_identifiers:
        # Build the row mask one column at a time so that several wildcards
        # can be combined in the same identifier.
        mask = pd.Series(True, index=df.index)
        for column, value in (("category", category),
                              ("region", region),
                              ("age_group", age_group)):
            if value != '__all__':
                mask &= df[column] == value

        filtered_df = df[mask]
        best_model = select_best_model_for(filtered_df, target, ommit)
        model_identifier = (category, region, age_group)
        model = Model("Level-level Incidence model",
                      ols=best_model,
                      identifier=model_identifier)
        queue.put((model_identifier, model))

def create_models_for(categories: list[str],
                      regions: list[str],
                      age_groups: list[str],
                      df: pd.DataFrame,
                      target: str,
                      ommit: list,
                      njobs: int = 4) -> "dict[tuple, Model]":
    """Fit one model per (category, region, age group) in parallel worker processes.

    The identifier triples are split into contiguous chunks, one per process,
    and each process runs ``worker`` on its chunk. Results are collected from
    a shared queue while a progress bar is updated.

    Args:
        categories, regions, age_groups: Values whose Cartesian product defines
            the models to fit.
        df: Full dataset handed to every worker.
        target: Dependent-variable column.
        ommit: Columns excluded from the regressors.
        njobs: Maximum number of worker processes (clamped to at least 1 and
            at most the number of models).

    Returns:
        Mapping of ``(category, region, age_group)`` to ``Model``, ordered
        category-major, then region, then age group, regardless of the order
        in which workers finish. Identifiers whose worker died before
        delivering a result are absent; a ``RuntimeWarning`` reports how many.

    NOTE(review): with the "spawn" start method, child processes must be able
    to import ``worker``; functions defined only inside a notebook may not be
    importable there -- confirm on the target platform.
    """
    import warnings

    # dict.fromkeys removes duplicate triples while preserving order.
    model_identifiers = list(dict.fromkeys(
        (category, region, age_group)
        for category in categories
        for region in regions
        for age_group in age_groups
    ))
    total = len(model_identifiers)

    # The bar spans every model to fit, including the category dimension.
    progress = widgets.FloatProgress(value=0, min=0, max=total, step=1)
    display(progress)

    if total == 0:
        return {}

    njobs = max(1, min(int(njobs), total))
    # Ceiling division so the remainder is not silently dropped.
    chunk_size = -(-total // njobs)
    chunks = [model_identifiers[start:start + chunk_size]
              for start in range(0, total, chunk_size)]

    q = multiprocessing.Queue()
    processes = [multiprocessing.Process(target=worker,
                                         args=(q, chunk, target, ommit, df))
                 for chunk in chunks]

    for p in processes:
        p.start()

    collected = {}
    while len(collected) < total:
        # Sample liveness BEFORE the blocking get: if nobody was alive and the
        # queue still times out, no further result can arrive.
        workers_alive = any(p.is_alive() for p in processes)
        try:
            # A blocking get with a timeout avoids spinning a CPU core.
            identifier, model = q.get(timeout=0.1)
        except multiprocessing.queues.Empty:
            if not workers_alive:
                break
            continue
        collected[identifier] = model
        progress.value += 1

    # Join only after the queue has been drained; joining a process that still
    # has queued items pending can deadlock.
    for p in processes:
        p.join()

    missing = total - len(collected)
    if missing:
        warnings.warn(
            f"{missing} of {total} models were not produced; "
            "a worker process probably terminated with an error.",
            RuntimeWarning,
            stacklevel=2,
        )

    # Re-key in request order so positional access downstream is deterministic.
    return {identifier: collected[identifier]
            for identifier in model_identifiers
            if identifier in collected}

#### Male age region models

In [166]:
# One level-level incidence model per (region, age group) pair for the
# male total category.
male_age_region_model = create_models_for(
    categories=[total_male_category],
    regions=regions,
    age_groups=age_groups,
    df=ag_df,
    target="incidence",
    ommit=["age_group", "year", "gdp", "region", "category"],
    njobs=12,
)

FloatProgress(value=0.0, max=480.0)

#### Female age region models

In [167]:
# One level-level incidence model per (region, age group) pair for the
# female total category.
female_age_region_model = create_models_for(
    categories=[total_female_category],
    regions=regions,
    age_groups=age_groups,
    df=ag_df,
    target="incidence",
    ommit=["age_group", "year", "gdp", "region", "category"],
    njobs=12,
)

FloatProgress(value=0.0, max=480.0)

In [168]:
dnipro_models = filter_models(female_age_region_model, regions=['Дніпропетровська'])
# Pick the model by its age-group label instead of a positional index: the
# position of a model in the list depends on the order in which results were
# stored, whereas the label is stable.
next(model for model in dnipro_models if model.identifier[2] == '45-49р').display_summary()

                         Level-level Incidence model                          
Category:                                                         C00-C97    ж
Region:                                                       Дніпропетровська
Age group:                                                              45-49р
Dep. Variable:              incidence   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                    nan
Method:                 Least Squares   F-statistic:                       nan
Date:                Wed, 26 Mar 2025   Prob (F-statistic):                nan
Time:                        11:16:30   Log-Likelihood:                 282.86
No. Observations:                  13   AIC:                            -539.7
Df Residuals:                       0   BIC:                            -532.4
Df Model:                          12                                         
Covariance Type:            nonrobust               

  return hypotest_fun_in(*args, **kwds)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid


#### Multiple tumor detection model

In [169]:
# Time index: number of years elapsed since 2008 (e.g. 2010 -> 2).
st_df['t'] = st_df['year'].sub(2008)

In [170]:
# Columns excluded from the regressors of the stage-level models below.
# Each name appears once ('mtumors_pti' was previously listed twice).
ommit = ["age_group", "year", "region", "category", 'mtumors_pht', 'syncmtumors_pht',
         'insitu_pht', 'insitu_pti', 'mtumors_pti', 'ncervix', 'mtumors', 'syncmtumors', 'insitu',
         'gdp', 'tincidence', 'tincidence_pht']

In [171]:
# AIC-selected OLS model for mtumors_pti on the stage-level dataset.
mtumors_model = select_best_model_for(st_df, target="mtumors_pti", ommit=ommit)

In [172]:
# Plain-text regression summary of the selected mtumors_pti model.
print(mtumors_model.summary())

                            OLS Regression Results                            
Dep. Variable:            mtumors_pti   R-squared:                       0.657
Model:                            OLS   Adj. R-squared:                  0.637
Method:                 Least Squares   F-statistic:                     33.32
Date:                Wed, 26 Mar 2025   Prob (F-statistic):           1.34e-58
Time:                        11:16:31   Log-Likelihood:                -1123.6
No. Observations:                 314   AIC:                             2283.
Df Residuals:                     296   BIC:                             2351.
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
nhospotal_pht           

#### In situ model

In [173]:
# AIC-selected OLS model for insitu_pti on the stage-level dataset.
insitu_model = select_best_model_for(st_df, target="insitu_pti", ommit=ommit)

In [174]:
# Plain-text regression summary of the selected insitu_pti model.
print(insitu_model.summary())

                            OLS Regression Results                            
Dep. Variable:             insitu_pti   R-squared:                       0.270
Model:                            OLS   Adj. R-squared:                  0.231
Method:                 Least Squares   F-statistic:                     6.861
Date:                Wed, 26 Mar 2025   Prob (F-statistic):           2.35e-13
Time:                        11:16:31   Log-Likelihood:                -1039.8
No. Observations:                 314   AIC:                             2114.
Df Residuals:                     297   BIC:                             2177.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
ybeds_pht              -0.0333    

In [175]:
# Mean and standard deviation of dvisits_pht, for judging the scale of its coefficient.
st_df["dvisits_pht"].mean(), st_df["dvisits_pht"].std()

(np.float64(453315.0943923567), np.float64(171419.2128427116))

In [176]:
# Mean and standard deviation of the insitu_pti target.
st_df["insitu_pti"].mean(), st_df["insitu_pti"].std()

(np.float64(10.883736282298965), np.float64(7.7783324683339705))