# Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, root_mean_squared_error, median_absolute_error, mean_absolute_error
from sklearn.model_selection import train_test_split

In [2]:
import sys
sys.path.append('../..')
from utils.estimators import *

In [3]:
# Turn off pandas' chained-assignment (SettingWithCopy) warning for this notebook.
pd.set_option('mode.chained_assignment', None)

# Load data

In [4]:
# Long-format census table; later cells read its census_year, sex, yob,
# geo_code, age_band, religion and population columns.
df = pd.read_csv(filepath_or_buffer='../../processed_data/scotland.csv')

### Replace Pagan with Other religion
Pagan was only added as a category in 2021, so it is merged into Other religion to keep the categories consistent across census years.

In [5]:
# Fold the 'Pagan' label into 'Other religion'.
df['religion'] = df['religion'].replace('Pagan', 'Other religion')

# Lexically sorted cohort labels put the last label at the end; rotate it to
# the front so the categorical order starts with it.
# NOTE(review): presumably that last label is the open-ended 'pre-1937' cohort — confirm.
birth_cohorts = sorted(df['yob'].unique())
yob_order = birth_cohorts[-1:] + birth_cohorts[:-1]

df['yob'] = pd.Categorical(df['yob'], categories=yob_order)
df = df.sort_values('yob')

In [6]:
# Alphabetically sorted religion labels; used below to select the pivoted columns.
religion_cols = sorted(pd.unique(df['religion']))

In [7]:
# Wide table: one row per (census_year, sex, yob, geo_code), one column per
# religion, cells holding the summed population.
qwe = pd.pivot_table(
    df,
    values='population',
    index=['census_year', 'sex', 'yob', 'geo_code'],
    columns='religion',
    aggfunc='sum',
    observed=True,
)

In [8]:
# Keep only the rows for the 2011 and 2021 census_year values.
qwe = qwe.loc[[2011, 2021], :]

## Add age_as_of columns

### 2011

In [9]:
# yob -> age_band lookup taken from the distinct pairs in the 2011 census rows.
age_bands_2011 = dict(
    df.loc[df['census_year'] == 2011, ['yob', 'age_band']]
    .drop_duplicates()
    .itertuples(index=False, name=None)
)
# The 'pre-1937' cohort is pinned to the open-ended top band.
age_bands_2011['pre-1937'] = '75+'

In [10]:
# Age band each row's cohort had in 2011, as an ordered-by-age categorical:
# five-year bands '0-4' ... '70-74' followed by the open-ended '75+'.
qwe['age_2011'] = pd.Categorical(
    qwe.index.get_level_values('yob').map(age_bands_2011),
    categories=[f'{start}-{start + 4}' for start in range(0, 75, 5)] + ['75+'],
)

### 2021

In [11]:
# yob -> age_band lookup taken from the distinct pairs in the 2021 census rows.
age_bands_2021 = dict(
    df.loc[df['census_year'] == 2021, ['yob', 'age_band']]
    .drop_duplicates()
    .itertuples(index=False, name=None)
)

In [12]:
# The three oldest cohorts are all pinned to the open-ended top band for 2021.
age_bands_2021.update({
    'pre-1937': '75+',
    '1937-1941': '75+',
    '1942-1946': '75+',
})

In [13]:
# Age band each row's cohort had in 2021, sharing the category list of age_2011
# so both columns use the same band labels and ordering.
qwe['age_2021'] = pd.Categorical(
    values=qwe.index.get_level_values('yob').map(age_bands_2021),
    categories=qwe['age_2011'].cat.categories,
)

## Build datasets

In [14]:
def _religion_shares(census_year, age_col):
    """Return each (sex, age band, geo_code) group's religion mix as row shares.

    Parameters
    ----------
    census_year : int
        Value of the ``census_year`` index level of ``qwe`` to select.
    age_col : str
        Column of ``qwe`` ('age_2011' or 'age_2021') used as the age grouping.

    Returns
    -------
    pandas.DataFrame
        Indexed by (sex, ``age_col``, geo_code) with one column per entry of
        ``religion_cols``; every row is divided by its own total so it sums
        to 1 (rows whose total is 0 become NaN).
    """
    counts = (
        qwe.loc[census_year]
        .groupby(['sex', age_col, 'geo_code'], observed=True)[religion_cols]
        .sum()
    )
    # Vectorised row normalisation; replaces a per-row Python lambda.
    return counts.div(counts.sum(axis='columns'), axis='index')


# Training pair: the same cohorts (keyed by their 2011 age band) observed in
# the 2011 rows (X) and in the 2021 rows (Y).
df_train_X = _religion_shares(2011, 'age_2011')
df_train_Y = _religion_shares(2021, 'age_2011')
# Prediction input: the 2021 rows keyed by the age band held in 2021.
df_predict_X = _religion_shares(2021, 'age_2021')

assert df_train_X.index.equals(df_train_Y.index), \
    'training X and Y must cover identical (sex, age_2011, geo_code) rows'
assert df_predict_X.isna().sum().sum() == 0, \
    'prediction input contains NaN shares'

# Fit models

In [15]:
# Estimator classes fitted once per (sex, age band) group.
# The combined order of both lists is checked against the 'Estimator' column
# of the ensemble-weights file further down, so do not reorder casually.
group_estimators = [
    LinearTrendEstimator,
    ExponentialEstimator,
    BasicTransitionMatrixEstimator,
    # not in use: OddsMultiplierEstimator, TransitionMatrixEstimator
]

# Estimator classes fitted once on the full training frame.
individual_estimators = [
    IndividualLinearEstimator,
    IndividualExponentialEstimator,
    IndividualBasicTransitionMatrixEstimator,
    # not in use: IndividualOddsRatioEstimator
]

In [16]:
# {estimator class name: {(sex, age_2011): fitted estimator}} — one model per
# estimator class and per (sex, age band) slice of the training data.
group_models = {
    est.__name__: {
        cohort: est().fit(cohort_X, df_train_Y.loc[cohort])
        for cohort, cohort_X in df_train_X.groupby(level=['sex', 'age_2011'], observed=True)
    }
    for est in group_estimators
}

In [17]:
# One fitted instance of each individual estimator, trained on the full frames,
# kept in the same order as individual_estimators.
individual_models = [
    estimator_cls().fit(df_train_X, df_train_Y)
    for estimator_cls in individual_estimators
]

## Make ensemble predictions for over tens
### NB: the forecast horizon is counted from 2022 because Scotland held its census in 2022, not 2021

In [19]:
# Per-estimator ensemble weights; the 'Estimator' and 'Model Weight' columns are used below.
ensemble_weights = pd.read_csv(filepath_or_buffer='../../model_selection/optimal_weights_for_over_10_model.csv')

In [20]:
# The weights are applied positionally to the stacked model predictions, so the
# weights file must list the estimators in exactly the order they are fitted
# (group estimators first, then individual ones).
# Comparing plain lists keeps the check well-defined when the lengths differ;
# a list-vs-ndarray comparison does not reduce to a clean assertion in that case.
expected_estimator_order = [e.__name__ for e in group_estimators + individual_estimators]
weights_estimator_order = ensemble_weights['Estimator'].tolist()

assert expected_estimator_order == weights_estimator_order, (
    f'ensemble weights order {weights_estimator_order} does not match '
    f'fitted estimator order {expected_estimator_order}'
)

In [21]:
%%time

predictions = {}

for yr in range(2022, 2032):
    print(f'Processing {yr}...')
    # 2022 not 2021
    years_ahead = yr - 2022
    preds = []
    
    for model in group_models:
        ps = []
        for (sex, age), group in df_predict_X.groupby(level=['sex', 'age_2021'], observed=True):
            # power here is divided by 11 because the period between the last two censuses was 11 years
            p = group_models[model][(sex, age)].predict(df_predict_X.loc[(sex, age)], power=years_ahead/11).values
            ps.append(p)
        preds.append(np.concatenate(ps))
    
    for model in individual_models:
        preds.append(model.predict(df_predict_X, power=years_ahead/10))
    
    preds = np.stack(preds, axis=-1)
    ensemble_preds = np.dot(preds, ensemble_weights['Model Weight'])
    predictions[yr] = ensemble_preds

Processing 2022...
Processing 2023...
Processing 2024...
Processing 2025...
Processing 2026...
Processing 2027...
Processing 2028...
Processing 2029...
Processing 2030...
Processing 2031...
CPU times: user 44.4 s, sys: 15.5 s, total: 59.8 s
Wall time: 18.1 s


## Build age band maps

In [22]:
# Closed five-year bands '0-4' ... '70-74'; the open-ended '75+' band is handled separately.
ages_2021 = [f'{lower}-{lower + 4}' for lower in range(0, 75, 5)]

# {forecast year: {census age band: that band's label shifted to the forecast year}}
# NOTE(review): the shift is measured from 2021, whereas the prediction horizon
# in the ensemble cell is measured from 2022 — confirm this offset is intended.
age_band_maps = {}

for yr in range(2022, 2032):
    years_ahead = yr - 2021
    shifted_labels = {}
    for band in ages_2021:
        lower_edge, upper_edge = (int(edge) for edge in band.split("-"))
        shifted_labels[band] = f'{lower_edge + years_ahead}-{upper_edge + years_ahead}'
    shifted_labels['75+'] = f'{75 + years_ahead}+'
    age_band_maps[yr] = shifted_labels

In [23]:
# {forecast year: DataFrame of predicted religion shares indexed by
#  (year, sex, age_band, geo_code)}, with age_band relabelled for that year.
prediction_dfs = {}

for yr in range(2022, 2032):
    yearly = pd.DataFrame(
        data=predictions[yr].reshape(-1, len(religion_cols)),
        index=df_predict_X.index,
        columns=df_predict_X.columns,
    ).reset_index()
    yearly['year'] = yr
    # Translate the census age band into the band label for the forecast year.
    yearly['age_band'] = yearly['age_2021'].map(age_band_maps[yr])
    yearly = yearly.set_index(['year', 'sex', 'age_band', 'geo_code'])
    prediction_dfs[yr] = yearly.drop(columns='age_2021')

## Make predictions for under tens

In [24]:
# Chosen estimator and parent cohort for each (sex_of_child, age_of_child) combination.
optimal_under_10_models = pd.read_csv(filepath_or_buffer='../../model_selection/optimal_under_10_models.csv')

In [25]:
# {(sex_of_child, age_of_child): estimator fitted to map the parent cohort's
#  religion mix onto the child cohort's mix, both taken from df_predict_X}
u10_models = {}
cols = ['sex_of_child', 'age_of_child', 'sex_of_parent', 'age_of_parent', 'estimator']

for sex_of_child, age_of_child, sex_of_parent, age_of_parent, estimator in (
    optimal_under_10_models[cols].itertuples(index=False, name=None)
):
    # The file stores the estimator's class name; resolve it in the notebook
    # namespace (at cell top level locals() and globals() are the same dict).
    estimator_cls = globals()[estimator]
    u10_models[(sex_of_child, age_of_child)] = estimator_cls().fit(
        df_predict_X.loc[(sex_of_parent, age_of_parent)],
        df_predict_X.loc[(sex_of_child, age_of_child)],
    )

In [26]:
# {(year, sex_of_child, age band label): predicted religion mix}
u10_prediction_dfs = {}

for yr in range(2022, 2032):
    years_ahead = yr - 2021

    # Each spec: (child band the model was keyed on, parent census band,
    #             lowest age in the output label, cap on the label's upper edge).
    # NOTE(review): the parent cohort is hard-coded to 'female' with bands
    # '25-29' / '30-34', while the models were fitted with the parent columns
    # from the model-selection file — confirm the two agree.
    band_specs = [('0-4', '25-29', 0, 5)]
    if years_ahead > 5:
        # The second child band only appears once more than five years have passed.
        band_specs.append(('5-9', '30-34', 5, 10))

    for child_band, parent_band, lowest_age, cap in band_specs:
        label = f'{lowest_age}-{min(years_ahead, cap)-1}'
        for sex_of_child in ['female', 'male']:
            parent_mix = prediction_dfs[yr].loc[(yr, 'female', age_band_maps[yr][parent_band])]
            u10_prediction_dfs[(yr, sex_of_child, label)] = (
                u10_models[(sex_of_child, child_band)].predict(parent_mix)
            )

## Combine with over 10 predictions

In [27]:
# Give every under-10 prediction the same (year, sex, age_band, geo_code)
# index layout as the over-10 frames. Re-indexed copies are built instead of
# mutating the stored frames, so re-running this cell does not nest the index
# a second time.
u10_indexed = [
    frame.set_axis(
        pd.MultiIndex.from_product([[yr], [sex], [age_band], frame.index]),
        axis='index',
    )
    for (yr, sex, age_band), frame in u10_prediction_dfs.items()
]

# Under-10 and over-10 predictions stacked into one sorted table.
all_predictions_pct = (
    pd.concat([*u10_indexed, *prediction_dfs.values()])
    .sort_index()
    .rename_axis(index=['year', 'sex', 'age_band', 'geo_code'])
)

# Write to CSV

In [28]:
# Persist the combined table next to the notebook.
all_predictions_pct.to_csv(path_or_buf='sc_religious_mix_prediction.csv')