# Imports

In [1]:
import string
import pandas as pd
import numpy as np

# Load data

## 2011

In [2]:
# Positional selectors applied to every 2011 workbook after it is read with
# header=8 / index_col=0: the rows to keep and the columns to keep.
rownums = np.array([2, 9, 16, 23, 30, 37, *range(38, 51)])
# minus one because column 0 is used as the index
colnums = np.array([12, *range(14, 21), 22, *range(24, 31)]) - 1

# Two-level header (sex, religion) laid over the 16 selected columns.
religions = ['Catholic', 'Presbyterian', 'Church of Ireland', 'Methodist', 'Other Christian', 'Other religions', 'No religion', 'Not stated']
col_index = pd.MultiIndex.from_product([['male', 'female'], religions])

# One workbook per code: '95AA', '95BB', ..., '95ZZ'. The same codes are used
# to build the file paths and as the outer index level of the combined table.
lgd_codes = ['95' + letter * 2 for letter in string.ascii_uppercase]

dfs = [
    pd.read_excel(f'../raw_data/ni/census_2011/DC2114NI_{code}.xlsx', header=8, index_col=0)
    .iloc[rownums, colnums]
    .set_axis(col_index, axis=1)
    for code in lgd_codes
]

# Reshape wide -> long: the first stack moves the inner column level (religion)
# into the row index, the second moves the remaining level (sex).
ni_2011 = (
    pd.concat(dfs, keys=lgd_codes)
    .stack(future_stack=True)
    .stack()
    .reset_index()
    .set_axis(['lgd_code', 'age', 'religion', 'sex', 'count'], axis=1)
)

## 2021

In [3]:
# Read the 2021 extract, keep six of its columns by position, and give them
# short snake_case names.
ni_2021 = (
    pd.read_csv('../raw_data/ni/census_2021.csv')
    .iloc[:, [0, 1, 3, 5, 7, 8]]
    .set_axis(['lgd_code', 'lgd_name', 'religion', 'age', 'sex', 'count'], axis=1)
)

# Harmonise categories

In [4]:
# Translate 2021 religion labels to the shorter labels used in the 2011 table.
# Labels without an entry here pass through unchanged.
col_map = {
    'Other religions ': 'Other religions',  # note: the key has a trailing space
    'Religion not stated': 'Not stated',
    'Methodist Church in Ireland': 'Methodist',
    'Presbyterian Church in Ireland': 'Presbyterian',
    'Other Christian (including Christian related)': 'Other Christian'
}

ni_2021['religion'] = ni_2021['religion'].map(lambda label: col_map.get(label, label))
ni_2021['sex'] = ni_2021['sex'].str.lower()
# Drop the last six characters of every 2021 age label
# (presumably a ' years' suffix -- TODO confirm against the raw file).
ni_2021['age'] = ni_2021['age'].str[:-6]

# The 2011 and 2021 categories are paired purely by order of first appearance.
# zip() silently stops at the shorter input, so a count mismatch would leave
# unmapped (NaN) 2011 rows or unused 2021 labels. Fail loudly instead.
ages_2011, ages_2021 = ni_2011['age'].unique(), ni_2021['age'].unique()
if len(ages_2011) != len(ages_2021):
    raise ValueError(
        f'Cannot pair age bands by position: 2011 has {len(ages_2011)}, 2021 has {len(ages_2021)}'
    )
age_map = dict(zip(ages_2011, ages_2021))

lgd_codes_2011, lgd_names_2021 = ni_2011['lgd_code'].unique(), ni_2021['lgd_name'].unique()
if len(lgd_codes_2011) != len(lgd_names_2021):
    raise ValueError(
        f'Cannot pair districts by position: 2011 has {len(lgd_codes_2011)} codes, '
        f'2021 has {len(lgd_names_2021)} names'
    )
lgd_map = dict(zip(lgd_codes_2011, lgd_names_2021))

# Relabel 2011 ages with the 2021 wording and attach a district name to each
# 2011 district code.
ni_2011['age'] = ni_2011['age'].map(age_map)
ni_2011['lgd_name'] = ni_2011['lgd_code'].map(lgd_map)

# Join 2011 with 2021

In [5]:
# Stack both censuses into one long table. The concat keys (2011 / 2021) form
# the outer index level; it is moved into a column, renamed to 'census_year',
# and the leftover per-table index is replaced by a fresh 0..n-1 range.
master_df = (
    pd.concat([ni_2011, ni_2021], keys=[2011, 2021])
    .reset_index(level=0)
    .rename(columns={'level_0': 'census_year'})
    .reset_index(drop=True)
)

# Calculate year of birth column

In [6]:
# Upper age assumed for open-ended bands (labels ending in '+').
MAX_AGE = 120

# Split each age label on '-' or '+' once: element 0 is the lower bound,
# element 1 is the upper bound (an empty string for open-ended 'N+' bands).
age_bounds = master_df['age'].str.split('[-+]', regex=True)

to_age = age_bounds.str[1]
# Fill the missing upper bound with a *string* so the Series keeps a single
# type; assigning the int 120 only works for object dtype and is rejected by
# the pandas string dtype.
to_age.loc[to_age == ''] = str(MAX_AGE)

# Earliest possible birth year comes from the oldest age in the band, latest
# from the youngest.
from_yob = master_df['census_year'] - to_age.astype(int)
to_yob = master_df['census_year'] - age_bounds.str[0].astype(int)

# Label of the form '<earliest>-<latest>', e.g. '1891-1921'.
yob = from_yob.astype(str).str.cat(to_yob.astype(str), sep='-')

# Collapse the oldest cohorts into one shared label. For example, '1891-1921'
# is what a '90+' band yields for 2011 (2011-120 .. 2011-90) and '1901-1931'
# what it yields for 2021. Labels not listed here are kept as computed.
yob_map = {
    '1891-1921': 'pre-1932',
    '1901-1931': 'pre-1932',
    '1922-1926': 'pre-1932',
    '1927-1931': 'pre-1932',
}

master_df['yob'] = yob.map(lambda label: yob_map.get(label, label))

# Standardize column names

In [7]:
# Switch to the generic output column names. Reassign instead of using
# inplace=True so the cell does not mutate a frame that other cells built.
master_df = master_df.rename(columns={
    'lgd_code': 'geo_code',
    'lgd_name': 'geography',
    'age': 'age_band',
    'count': 'population',
})

In [8]:
# master_df.columns

In [9]:
# Keep only the output columns, in their final order.
output_columns = [
    'census_year',
    'geo_code',
    'geography',
    'sex',
    'religion',
    'age_band',
    'yob',
    'population',
]
master_df = master_df.loc[:, output_columns]

# Write to CSV

In [10]:
# Write the combined table to disk without the row index.
output_path = '../processed_data/northern_ireland.csv'
master_df.to_csv(path_or_buf=output_path, index=False)