In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [40]:
age_grups_incidence = pd.read_csv("./final_dataset/age_group_incidence.csv")

  age_grups_incidence = pd.read_csv("./final_dataset/age_group_incidence.csv")


In [41]:
age_groups = ["до 1 р.", "1-4 р.", "5-9 р.", "10-14р", "15-19р", 
              "в т.ч. 15-17р",	"20-24р",	"25-29р", "30-34р",
              "35-39р", "40-44р", "45-49р", "50-54р", "55-59р", 
              "60-64р", "65-69р",	"70-74р",	"75-79р",	"80-84р",	"85 та старші"]

category = "Злоякiснi новоутворення-всього         C00-C97    ч"
female_category = "C00-C97    ж"
age_group_df = [age_grups_incidence[(age_grups_incidence["age_group"] == group)] for group in age_groups]

Drop missing values for the time being

In [42]:
age_grups_incidence = age_grups_incidence.dropna()

gdp_numeric = pd.to_numeric(age_grups_incidence["gdp"], errors="coerce")
mask_convertible = gdp_numeric.notna()

age_grups_incidence = age_grups_incidence[mask_convertible]

age_grups_incidence["gdp"] = age_grups_incidence.gdp.astype("float64")

age_grups_incidence["normalized_gdp"] = age_grups_incidence["gdp"]/age_grups_incidence["cpi"]

# Splitting datasets to compare linear and log-linear models

In [36]:
train_idx, test_idx = train_test_split(age_grups_incidence.index, test_size=0.3, random_state=42)


X_train = age_grups_incidence.loc[train_idx]
X_test = age_grups_incidence.loc[test_idx]

X_train = X_train[X_train['category'] == category]

# Linear-Linear models

In [33]:
def fit_models(predicted: str, ommited_vars: list[str], df: pd.DataFrame) -> dict[str, sm.OLS]:
    models = {}

    for group_name, group_data in df.groupby("age_group"):
        predictors = group_data.copy().drop(columns=ommited_vars + [predicted], errors="ignore")
        
        predictors = predictors.select_dtypes(include=["number"])
        
        X = predictors.copy()
        y = group_data[predicted]
        
        X = sm.add_constant(X)
        
        best_model = sm.OLS(y, X).fit()

        best_aic = np.inf

        to_drop = None

        while len(X.columns) > 0:
            aic_not_changed = True
                
            for col in X.columns:
                temp_X = X.drop(col, axis = 1)
                temp_model = sm.OLS(y, temp_X).fit()
                if temp_model.aic < best_aic:
                    best_aic = temp_model.aic
                    best_model = temp_model
                    to_drop = col
                    aic_not_changed = False

            if aic_not_changed:
                print(group_name)
                break
            X = X.drop(to_drop, axis = 1)

        models[group_name] = best_model

    return models


In [34]:
models = fit_models("incidence", ["age_group", "year", "gdp", "region", "category"], X_train)

1-4 р.
10-14р
15-19р
20-24р
25-29р
30-34р
35-39р
40-44р
45-49р
5-9 р.
50-54р
55-59р
60-64р
65-69р
70-74р
75-79р
80-84р
85 та старші
в т.ч. 15-17р
до 1 р.


In [38]:
for model_name, model in models.items():
    print(model_name, "Adj.R^2:", model.rsquared_adj)

1-4 р. Adj.R^2: 0.8399617550381775
10-14р Adj.R^2: 0.8109931649348991
15-19р Adj.R^2: 0.551638424583592
20-24р Adj.R^2: 0.760496623023493
25-29р Adj.R^2: 0.9688442780483942
30-34р Adj.R^2: 0.8958466221064671
35-39р Adj.R^2: 0.9005294795611296
40-44р Adj.R^2: 0.9300027390101416
45-49р Adj.R^2: 0.9571242557593831
5-9 р. Adj.R^2: 0.8040157905646982
50-54р Adj.R^2: 0.9921152620612783
55-59р Adj.R^2: 0.971426924796913
60-64р Adj.R^2: 0.9590662349281486
65-69р Adj.R^2: 0.9246875582523965
70-74р Adj.R^2: 0.9413368992139806
75-79р Adj.R^2: 0.9789078346123222
80-84р Adj.R^2: 0.9217272777401181
85 та старші Adj.R^2: 0.8898131585012284
в т.ч. 15-17р Adj.R^2: 0.8065992458888289
до 1 р. Adj.R^2: 0.3633115479133603


# Log-linear models

In [54]:
X_train_log = X_train.copy()
X_train_log = X_train_log[X_train_log["incidence"] > 0]
X_train_log["log_incidence"] = np.log(X_train_log["incidence"])
log_models = fit_models("log_incidence", ["incidence","age_group", "year", "gdp", "region", "category_x"], X_train_log)


1-4 р.
10-14р
15-19р
20-24р
25-29р
30-34р
35-39р
40-44р
45-49р
5-9 р.
50-54р
55-59р
60-64р
65-69р
70-74р
75-79р
80-84р
85 та старші
в т.ч. 15-17р
до 1 р.


In [55]:
for model_name, model in log_models.items():
    print(f"{model_name}: Adj.R^2: {model.rsquared_adj}")

1-4 р.: Adj.R^2: 0.38085763334584843
10-14р: Adj.R^2: 0.8363385415060975
15-19р: Adj.R^2: 0.44334261618083304
20-24р: Adj.R^2: 0.6154241541547694
25-29р: Adj.R^2: 0.703001455246745
30-34р: Adj.R^2: 0.7224048762118533
35-39р: Adj.R^2: 0.7906096706919444
40-44р: Adj.R^2: 0.8386931635037717
45-49р: Adj.R^2: 0.8637301170871957
5-9 р.: Adj.R^2: 0.8977463776219217
50-54р: Adj.R^2: 0.8644715911037174
55-59р: Adj.R^2: 0.8868788362394489
60-64р: Adj.R^2: 0.854081066065308
65-69р: Adj.R^2: 0.839428010644362
70-74р: Adj.R^2: 0.8326152441864939
75-79р: Adj.R^2: 0.7884707043376957
80-84р: Adj.R^2: 0.7900275464014315
85 та старші: Adj.R^2: 0.6987589739105924
в т.ч. 15-17р: Adj.R^2: 0.8509779951842005
до 1 р.: Adj.R^2: 0.765762903809819


In [197]:
X_test = X_test[X_test["incidence"] > 0]

is_linear_better_models = {}
is_log_better_models = {}

for group, group_data in X_test.groupby("age_group"):
    lin_X_test_curr_group = group_data[models[group].model.exog_names[1:]].copy()
    log_X_test_curr_group = group_data[log_models[group].model.exog_names[1:]].copy()

    y_lin_pred = models[group].predict(sm.add_constant(lin_X_test_curr_group))+1
    y_log_pred = log_models[group].predict(sm.add_constant(log_X_test_curr_group))
    log_of_lin_preds = np.log(y_lin_pred)

    lin_X_test_curr_group["log(lin)-log"] = log_of_lin_preds - y_log_pred

    lin_X_test_curr_group = lin_X_test_curr_group["log(lin)-log"].fillna(0)

    is_linear_better_models[group] = sm.OLS(group_data["incidence"], sm.add_constant(lin_X_test_curr_group)).fit()

    exp_of_log_preds = np.exp(y_log_pred)
    log_X_test_curr_group["lin-exp(log)"] = y_lin_pred - exp_of_log_preds

    is_log_better_models[group] = sm.OLS(np.log(group_data["incidence"]+1), sm.add_constant(log_X_test_curr_group)).fit()

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [195]:
p_values_is_linear_better = [int(is_linear_better_models[group].pvalues.get("log(lin)-log")>0.05) for group in age_groups]

for i, group in enumerate(age_groups):
    if p_values_is_linear_better[i] == 0:
        print(f"for group {group} log model may be bettr")

for group 5-9 р. log model may be bettr
for group 15-19р log model may be bettr
for group 20-24р log model may be bettr
for group 25-29р log model may be bettr
for group 30-34р log model may be bettr
for group 35-39р log model may be bettr
for group 40-44р log model may be bettr
for group 45-49р log model may be bettr
for group 50-54р log model may be bettr
for group 55-59р log model may be bettr
for group 60-64р log model may be bettr
for group 65-69р log model may be bettr
for group 70-74р log model may be bettr
for group 75-79р log model may be bettr
for group 80-84р log model may be bettr
for group 85 та старші log model may be bettr


In [203]:

p_values_is_log_better = [int(is_log_better_models[group].pvalues.get("lin-exp(log)")>0.05) for group in age_groups]

for i, group in enumerate(age_groups):
    if p_values_is_log_better[i] == 0:
        print(f"for group {group} lin model may be bettr, pvalue is {is_log_better_models[group].pvalues.get('lin-exp(log)')}")

for group 20-24р lin model may be bettr, pvalue is 0.017489980681450582
for group 25-29р lin model may be bettr, pvalue is 0.024364614747046
for group 40-44р lin model may be bettr, pvalue is 0.026840706680787488
for group 65-69р lin model may be bettr, pvalue is 0.018143841212603016
for group 85 та старші lin model may be bettr, pvalue is 0.003107229831960195
