# Imports

In [1]:
import pandas as pd

# Load data

In [2]:
# Processed England & Wales data. The cells below read its 'census_year',
# 'sex', 'geo_code', 'yob', 'age_band' and 'population' columns.
df = pd.read_csv('../../processed_data/england_wales.csv')

In [3]:
# ONS population projection tables, one per country code, indexed by
# (sex, age) with every age above 94 pooled into a single '95+' row.
ons = {}

for country in ['EN', 'WA']:
    workbook = pd.read_excel(
        f'../../raw_data/ew/ONS_population_projection_2022_{country}.xlsx',
        sheet_name='Population',
        usecols=range(12),
    )
    ons[country] = (
        # Keep only the per-sex rows.
        workbook.loc[workbook['Sex'].isin(['Females', 'Males'])]
        .rename(columns={'Sex': 'sex', 'Age': 'age'})
        .assign(
            sex=lambda t: t['sex'].replace({'Females': 'female', 'Males': 'male'}),
            # Give the two text labels a numeric stand-in so every age can be
            # compared as an integer, then pool everything above 94.
            age=lambda t: (
                t['age']
                .replace({'105 - 109': '105', '110 and over': '110'})
                .map(lambda x: '95+' if int(str(x)) > 94 else x)
            ),
        )
        .groupby(['sex', 'age'])
        .sum()
    )

In [4]:
# ons['EN'].sum()

In [5]:
# ons['WA'].sum()

# Survival rate analysis

In [6]:
# Population of every (sex, geo_code, yob) cohort at the 2011 and 2021
# censuses, one column per census year. Cohorts missing from either census
# are dropped.
is_2011_or_2021 = df['census_year'].isin([2011, 2021])
cohort_totals = (
    df.loc[is_2011_or_2021]
    .groupby(['sex', 'geo_code', 'yob', 'census_year'], observed=True)['population']
    .sum()
)
sr = cohort_totals.unstack('census_year').dropna().reset_index()

In [7]:
# Birth-cohort -> age-band lookup taken from the 2011 rows, with the
# 'pre-1927' cohort pinned to the open-ended '85+' band.
pairs_2011 = df.loc[df['census_year'] == 2011, ['yob', 'age_band']].drop_duplicates()
yob_to_age_band_2011 = {
    **dict(pairs_2011.values),
    'pre-1927': '85+',
}

In [8]:
# Birth-cohort -> age-band lookup taken from the 2021 rows. The three oldest
# cohorts are all pinned to the open-ended '85+' band.
pairs_2021 = df.loc[df['census_year'] == 2021, ['yob', 'age_band']].drop_duplicates()
yob_to_age_band_2021 = {
    **dict(pairs_2021.values),
    'pre-1927': '85+',
    '1927-1931': '85+',
    '1932-1936': '85+',
}

In [9]:
# Ten-year cohort change: 2021 population as a fraction of the same cohort's
# 2011 population.
sr['survival_rate'] = sr[2021].div(sr[2011])

In [10]:
# Label every cohort with the age band it occupied at each census.
for census_year, lookup in ((2011, yob_to_age_band_2011), (2021, yob_to_age_band_2021)):
    sr[f'age_band_{census_year}'] = sr['yob'].map(lookup)

In [11]:
# Survival rate keyed by sex, 2011 age band and area.
survival_keys = ['sex', 'age_band_2011', 'geo_code']
survival_rates = (
    sr[survival_keys + ['survival_rate']]
    .set_index(survival_keys)
    .sort_index()
)

# Each (sex, age band, area) combination must appear exactly once.
assert not survival_rates.index.duplicated().any()

In [12]:
# 2021 population per (sex, birth cohort, area).
pops_2021 = (
    df.loc[df['census_year'].eq(2021)]
    .groupby(['sex', 'yob', 'geo_code'], observed=True)['population']
    .sum()
    .reset_index()
)

In [13]:
# Collapse the 2021 cohorts into their 2021 age bands and total the population
# per (sex, age band, area).
pops_2021 = (
    pops_2021
    .assign(age_band_2021=pops_2021['yob'].map(yob_to_age_band_2021))
    .groupby(['sex', 'age_band_2021', 'geo_code'])['population']
    .sum()
    .sort_index()
)

# The survival rates are keyed by 2011 age band and the populations by 2021 age
# band. Once the key values are confirmed to line up, adopt the population
# index's level names so the two can be joined on their indexes.
assert survival_rates.index.equals(pops_2021.index)
survival_rates.index = survival_rates.index.set_names(pops_2021.index.names)

# Project 2021 age bands

In [14]:
dfs = []

# 2021 population and survival rate side by side, one row per
# (sex, 2021 age band, area).
cohorts_2021 = (
    survival_rates
    .join(pops_2021)
    .rename(columns={'population': 'population_2021'})
    .reset_index()
)
ages_2021 = ['0-4','5-9',
             '10-14','15-19','20-24','25-29','30-34','35-39','40-44',
             '45-49','50-54','55-59','60-64','65-69','70-74','75-79','80-84']

for yr in range(2022,2032):
    years_ahead = yr - 2021

    # Shift both ends of every closed band by the elapsed years
    # (e.g. '0-4' -> '1-5' one year ahead). The open band moves its lower bound.
    age_from_age_2021 = {}
    for band in ages_2021:
        low, high = (int(part) for part in band.split('-'))
        age_from_age_2021[band] = f'{low + years_ahead}-{high + years_ahead}'
    age_from_age_2021['85+'] = f'{85 + years_ahead}+'

    # The ten-year survival rate is applied pro rata: rate ** (years / 10).
    projected = cohorts_2021.assign(
        population=cohorts_2021['population_2021'] * (cohorts_2021['survival_rate'] ** (years_ahead / 10)),
        age_band=cohorts_2021['age_band_2021'].map(age_from_age_2021),
        year=yr,
    )
    dfs.append(projected[['sex','geo_code','population','year','age_band']])

population_projections = pd.concat(dfs)

# Newborn projections

In [15]:
# 2021 populations of the '0-4' band, selected on the age_band_2021 level of
# pops_2021. The next cell scales these to size the post-2021 birth cohorts.
newborns = pops_2021.loc[:, '0-4', :]

In [16]:
def _newborn_band(base, year, first_age, n_ages):
    """Build the projected rows for one band of children born after 2021.

    base      -- 2021 '0-4' populations; each single year of age is taken to
                 be one fifth of this population
    year      -- projection year stamped on every row
    first_age -- youngest age in the band
    n_ages    -- number of single-year ages the band covers

    Returns `base` scaled by n_ages / 5 as a frame (index levels turned into
    columns) with 'year' and 'age_band' columns added.
    """
    band = (base * n_ages / 5).reset_index()
    band['year'] = year
    band['age_band'] = f'{first_age}-{first_age + n_ages - 1}'
    return band


dfs = []

# NOTE: the per-band frames are built inside the helper rather than in a
# notebook-level `df`, so the census data loaded into `df` at the top of the
# notebook is not overwritten and earlier cells stay re-runnable.
for yr in range(2022,2032):
    years_ahead = yr - 2021
    # Children born since 2021 occupy ages 0..years_ahead-1, capped at age 4.
    dfs.append(_newborn_band(newborns, yr, 0, min(years_ahead, 5)))
    if years_ahead > 5:
        # Beyond five years ahead the earliest of those births have reached
        # age 5 and above, capped at age 9.
        dfs.append(_newborn_band(newborns, yr, 5, min(years_ahead, 10) - 5))

newborn_projections = pd.concat(dfs)

# Combined basic projection

In [17]:
# Stack the newborn bands on top of the aged-forward 2021 cohorts under a
# fresh 0..n-1 index. The unadjusted figure is kept as 'basic_projection'.
stacked = pd.concat([newborn_projections, population_projections], ignore_index=True)
combined = stacked.rename(columns={'population': 'basic_projection'})

In [18]:
# Derive the country from the first character of the area code
# ('E' -> 'EN', 'W' -> 'WA').
combined['country'] = combined['geo_code'].str.get(0).map({'E': 'EN', 'W': 'WA'})

# Calculate ratios

In [19]:
# Basic projection summed over areas, per country, year, sex and age band.
total_keys = ['country', 'year', 'sex', 'age_band']
combined_totals = (
    combined
    .groupby(total_keys)['basic_projection']
    .sum()
)

In [20]:
def _ons_age_bounds(age_band):
    """Return the (first, last) ONS age labels spanned by `age_band`.

    Closed bands such as '6-10' give integer bounds. Open bands such as '86+'
    run up to the pooled '95+' label; their lower bound is an integer when
    below 95, otherwise the band label is used unchanged.
    """
    head = age_band.split('-')[0]
    if '+' in age_band:
        start = int(head[:2]) if int(head[:2]) < 95 else head
        return start, '95+'
    return int(head), int(age_band.split('-')[1])


# ONS total for exactly the same (country, year, sex, age band) cells as the
# basic projection.
res = {}
for key in combined_totals.index:
    country, year, sex, age_band = key
    first_age, last_age = _ons_age_bounds(age_band)
    res[key] = ons[country].loc[(sex, first_age):(sex, last_age), year].sum()

ons_totals = pd.Series(res, name='ons_total')
ons_totals.index.names = ['country', 'year', 'sex', 'age_band']

In [21]:
# The ONS totals must cover exactly the same cells, in the same order, as the
# basic projection totals before the two are placed side by side.
assert combined_totals.index.equals(ons_totals.index)

In [22]:
# Basic projection total and ONS total side by side, one row per
# (country, year, sex, age band).
forecast_ratios = pd.concat(
    [combined_totals, ons_totals],
    axis=1,
)

In [23]:
# Factor that scales each basic projection total onto the matching ONS total.
forecast_ratios['forecast_ratio'] = forecast_ratios['ons_total'].div(forecast_ratios['basic_projection'])

# Apply ratios

In [24]:
# Attach each row's scaling factor by looking up its
# (country, year, sex, age band) cell.
ratio_keys = ['country', 'year', 'sex', 'age_band']
final = combined.join(
    forecast_ratios['forecast_ratio'],
    on=ratio_keys,
)

In [25]:
# Area-level projection rescaled so that each cell sums to the ONS total.
final['projection'] = final['basic_projection'].mul(final['forecast_ratio'])

In [26]:
# Sanity check: for every projected year, the rescaled projections summed over
# all rows must equal (after rounding) the England + Wales ONS totals.
for yr in final['year'].unique():
    projected_sum = final.loc[final['year'] == yr, 'projection'].sum().round()
    ons_sum = ons['EN'][yr].sum() + ons['WA'][yr].sum()
    assert projected_sum == ons_sum

# Write to CSV

In [27]:
# Persist only the published columns, in this order, without the row index.
output_cols = ['year', 'sex', 'age_band', 'geo_code', 'projection']
output = final.loc[:, output_cols]
output.to_csv('ew_population_projection.csv', index=False)