In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels.api as sm

In [29]:
age_grups_incidence = pd.read_csv("./final_dataset/age_group_incidence.csv").drop("category_y", axis=1)

  age_grups_incidence = pd.read_csv("./final_dataset/age_group_incidence.csv").drop("category_y", axis=1)


In [30]:
age_groups = ["до 1 р.", "1-4 р.", "5-9 р.", "10-14р", "15-19р", 
              "в т.ч. 15-17р",	"20-24р",	"25-29р", "30-34р",
              "35-39р", "40-44р", "45-49р", "50-54р", "55-59р", 
              "60-64р", "65-69р",	"70-74р",	"75-79р",	"80-84р",	"85 та старші"]

age_group_df = [age_grups_incidence[age_grups_incidence["age_group"] == group] for group in age_groups]

Drop missing values for the time being

In [31]:

age_grups_incidence = age_grups_incidence.dropna()

gdp_numeric = pd.to_numeric(age_grups_incidence["gdp"], errors="coerce")
mask_convertible = gdp_numeric.notna()

age_grups_incidence = age_grups_incidence[mask_convertible]

age_grups_incidence["gdp"] = age_grups_incidence.gdp.astype("float64")

age_grups_incidence["normalized_gdp"] = age_grups_incidence["gdp"]/age_grups_incidence["cpi"]

In [32]:
age_grups_incidence.columns

Index(['Unnamed: 0.1', 'year', 'category_x', 'region', 'age_group',
       'incidence', 'nhospotal', 'nbeds', 'ybeds', 'nill', 'nvillage_ill',
       'bed_days', 'dvisits', 'hvisits', 'ndoctors', 'nnursing', 'nx_ray',
       'nflurography', 'nradiology', 'nradlab', 'nсt', 'ncardiogram',
       'ndiaglab', 'nbacter', 'nbiochem', 'ncyto', 'nimun', 'nphysic',
       'nendoscop', 'nultrasound', 'ndialysis', 'gdp', 'Unnamed: 0',
       'air_pollution', 'polluted_dumps', 'not_cleaned_dumps',
       'dumps_not_cleaned_enough', 'num_clearing_plants', 'cpi', 'population',
       'normalized_gdp'],
      dtype='object')

# Linear-Linear models

In [33]:
def fit_models(predicted: str, ommited_vars: list[str], df: pd.DataFrame) -> dict[str, sm.OLS]:
    models = {}

    for group_name, group_data in age_grups_incidence.groupby("age_group"):
        predictors = group_data.drop(columns=ommited_vars + [predicted], errors="ignore")
        
        predictors = predictors.select_dtypes(include=["number"])
        
        X = predictors.copy()
        y = group_data[predicted]
        
        X = sm.add_constant(X)
        
        best_model = sm.OLS(y, X).fit()

        best_aic = np.inf

        to_drop = None

        while len(X.columns) > 0:
            aic_not_changed = True
                
            for col in X.columns:
                temp_X = X.drop(col, axis = 1)
                temp_model = sm.OLS(y, temp_X).fit()
                if temp_model.aic < best_aic:
                    best_aic = temp_model.aic
                    best_model = temp_model
                    to_drop = col
                    aic_not_changed = False

            if aic_not_changed:
                print(group_name)
                break
            X = X.drop(to_drop, axis = 1)

        models[group_name] = best_model

    return models


In [34]:
models = fit_models("incidence", ["age_group", "year", "gdp", "region", "category_x"], age_grups_incidence)

1-4 р.
10-14р
15-19р
20-24р
25-29р
30-34р
35-39р
40-44р
45-49р
5-9 р.
50-54р
55-59р
60-64р
65-69р
70-74р
75-79р
80-84р
85 та старші
в т.ч. 15-17р
до 1 р.


In [37]:
for group in age_groups:
    print(group, models[group].rsquared_adj)

до 1 р. 0.01881337090705959
1-4 р. 0.01809259839590094
5-9 р. 0.09260676928989509
10-14р 0.01693226616288357
15-19р 0.09949528969813404
в т.ч. 15-17р 0.08173806983148535
20-24р 0.10571406438968112
25-29р 0.1007244062530499
30-34р 0.09014301220770915
35-39р 0.016283856629521143
40-44р 0.08871551869297789
45-49р 0.018377612701082735
50-54р 0.10212980661735627
55-59р 0.02172017200412768
60-64р 0.10525527406093105
65-69р 0.02107320916785449
70-74р 0.0309124275568865
75-79р 0.10162135639280945
80-84р 0.09666135106793639
85 та старші 0.02033757538858716


We have adjusted $R^2$ as big as 0.1.

# Log-linear models