# Imports

In [1]:
import pandas as pd
import numpy as np

# Load data

## 2021 (read from the `census_2022_sex_age_CA.xlsx` workbook)

In [2]:
# Read every worksheet of the workbook at once: sheet_name=None makes
# read_excel return a {sheet name: DataFrame} mapping, and index_col=0
# supplies the row index (named 'age' further down).
sheets_2021 = pd.read_excel(
    '../raw_data/sc/census_2022_sex_age_CA.xlsx',
    sheet_name=None,
    header=None,
    skiprows=34,
    skipfooter=5,
    usecols=range(2, 16),
    index_col=0,
)

# Every sheet is labelled as 21 'female' rows followed by 21 'male' rows.
sex_labels_2021 = ['female'] * 21 + ['male'] * 21

# Drop the first data column of each sheet, tag its rows with the sheet name
# and sex, stack all sheets, then discard rows containing any missing value.
sc_2021 = pd.concat([
    sheet.iloc[:, 1:].assign(ca=sheet_name, sex=sex_labels_2021)
    for sheet_name, sheet in sheets_2021.items()
]).dropna()

# Name the twelve religion columns (positional) plus the two tag columns.
sc_2021.columns = [
    'Church of Scotland',
    'Roman Catholic',
    'Other Christian',
    'Buddhist',
    'Hindu',
    'Jewish',
    'Muslim',
    'Sikh',
    'Pagan',
    'Other religion',
    'No religion',
    'Religion not stated',
    'ca',
    'sex',
]

sc_2021.index.name = 'age'

# Wide -> long: one row per (ca, age, sex, religion) with the count in 'value'.
# ca_name keeps the text after the first '.' of the sheet name minus its
# leading character (presumably a space — verify against the workbook); the
# part before the '.' is discarded. Rows whose age label is 'Total' are removed.
sc_2021 = (
    sc_2021
    .reset_index()
    .melt(id_vars=['ca', 'age', 'sex'], var_name='religion')
    .assign(ca_name=lambda long: long['ca'].str.split('.').str[1].str[1:])
    .drop(columns='ca')
    .loc[lambda long: long['age'] != 'Total']
)

## 2011

In [3]:
# Load all worksheets of the 2011 workbook as a {sheet name: DataFrame}
# mapping (sheet_name=None); index_col=0 supplies the row index, which is
# named 'age' further down.
sheets_2011 = pd.read_excel(
    '../raw_data/sc/census_2011_sex_age_CA.xlsx',
    sheet_name=None,
    header=None,
    skiprows=35,
    skipfooter=2,
    usecols=range(2, 15),
    index_col=0,
)

# Every sheet is labelled as 21 'male' rows followed by 21 'female' rows.
sex_labels_2011 = ['male'] * 21 + ['female'] * 21

# Per sheet: drop the first data column and attach the sheet name and sex;
# then stack the sheets and remove rows with any missing value.
tagged_2011 = []
for sheet_name, sheet in sheets_2011.items():
    tagged_2011.append(sheet.iloc[:, 1:].assign(ca=sheet_name, sex=sex_labels_2011))
sc_2011 = pd.concat(tagged_2011).dropna()

# Name the eleven religion columns (positional) plus the two tag columns.
sc_2011.columns = [
    'Church of Scotland',
    'Roman Catholic',
    'Other Christian',
    'Buddhist',
    'Hindu',
    'Jewish',
    'Muslim',
    'Sikh',
    'Other religion',
    'No religion',
    'Religion not stated',
    'ca',
    'sex',
]

sc_2011.index.name = 'age'

# Reshape to long format: one row per (ca, age, sex, religion), count in 'value'.
sc_2011 = sc_2011.reset_index().melt(
    id_vars=['ca', 'age', 'sex'],
    var_name='religion',
)

# ca_name is the text after the first '.' of the sheet name minus its leading
# character (presumably a space — verify against the workbook); the part
# before the '.' is not kept.
sc_2011 = (
    sc_2011
    .assign(ca_name=lambda long: long['ca'].str.split('.').str[1].str[1:])
    .drop(columns='ca')
)

# Remove the rows whose age label is 'Total'.
is_total_row = sc_2011['age'] == 'Total'
sc_2011 = sc_2011.loc[~is_total_row]

# Join 2011 with 2021

In [4]:
# Stack the two censuses. The concat keys form the outermost index level,
# which is pulled out as a column ('level_0'), renamed to census_year, and the
# remaining per-frame row index is discarded.
# NOTE(review): the rows keyed 2021 are read from a file named
# census_2022_sex_age_CA.xlsx — confirm that labelling them 2021 is intended,
# since census_year feeds the year-of-birth calculation further down.
master_df = (
    pd.concat([sc_2011, sc_2021], keys=[2011, 2021])
    .reset_index(level=0)
    .rename(columns={'level_0': 'census_year'})
    .reset_index(drop=True)
)

# Harmonise categories

In [5]:
# Normalise age labels to a compact form: 'A to B' -> 'A-B' and
# 'A and over' -> 'A+'. The order matters: spaces are stripped before the
# 'andover' suffix is matched.
# regex=False is passed explicitly because the default of `regex` differs
# between pandas versions; all three patterns are meant as literal text.
master_df['age'] = (
    master_df['age']
    .str.replace('to', '-', regex=False)
    .str.replace(' ', '', regex=False)
    .str.replace('andover', '+', regex=False)
)

In [6]:
# Council-area spellings that get rewritten to a single canonical name.
# Names that are not keys of this mapping are left exactly as they are.
ca_map = dict([
    ('Argyll & Bute', 'Argyll and Bute'),
    ('Edinburgh, City of', 'City of Edinburgh'),
    ('Dumfries & Galloway', 'Dumfries and Galloway'),
    ('Eilean Siar', 'Na h-Eileanan Siar'),
    ('Perth & Kinross', 'Perth and Kinross'),
])

master_df['ca_name'] = master_df['ca_name'].map(
    lambda name: ca_map.get(name, name)
)

In [7]:
# Convert ca_name to a categorical whose categories are the distinct names in
# order of first appearance.
ca_name_order = master_df['ca_name'].unique()
master_df['ca_name'] = pd.Categorical(master_df['ca_name'], categories=ca_name_order)

In [8]:
# Provisional integer code per council area, taken from the categorical's
# internal codes. This column is later renamed to geo_code and then replaced
# by the values looked up in geo_codes.
master_df = master_df.assign(ca_code=master_df['ca_name'].cat.codes)

# Map age to 5 year buckets

In [9]:
# The three sub-bands between 15 and 19 are folded into one '15-19' band;
# every other age label is carried over unchanged.
age_map = {
    '15': '15-19',
    '16-17': '15-19',
    '18-19': '15-19',
}

master_df['age_band'] = master_df['age'].map(lambda label: age_map.get(label, label))

# Calculate year of birth column

In [10]:
# Split each band label once into its age bounds:
# '15-19' -> ['15', '19'], '85+' -> ['85', ''].
age_bounds = master_df['age_band'].str.split('[-+]', regex=True)
from_age = age_bounds.str[0].astype(int)

# Open-ended bands ('NN+') have an empty upper bound; cap them at age 120.
# The placeholder is written as a string so the Series never mixes str and
# int values before the integer cast.
to_age = age_bounds.str[1]
to_age.loc[to_age == ''] = '120'
to_age = to_age.astype(int)

# The earliest birth year comes from the upper age bound, the latest from the
# lower age bound.
from_yob = master_df['census_year'] - to_age
to_yob = master_df['census_year'] - from_age

# Label each cohort as '<earliest>-<latest>'.
yob = from_yob.astype(str).str.cat(to_yob.astype(str), '-')

# Birth-year ranges listed here are collapsed into a single 'pre-1937' bucket;
# ranges that are not listed keep their '<earliest>-<latest>' label.
yob_map = {
    '1891-1926': 'pre-1937',
    '1891-1921': 'pre-1937',
    '1901-1931': 'pre-1937',
    '1922-1926': 'pre-1937',
    '1927-1931': 'pre-1937',
    '1901-1936': 'pre-1937',
    '1932-1936': 'pre-1937',
}

master_df['yob'] = yob.apply(lambda x: yob_map.get(x, x))

# Standardize column names

In [11]:
# Rename to the standard output column names. The result is rebound instead
# of using inplace=True, so the frame is not mutated behind its name.
master_df = master_df.rename(
    columns={
        'ca_code': 'geo_code',
        'value': 'population',
        'ca_name': 'geography',
    }
)

In [12]:
# Display the column index to check the names after the rename.
master_df.columns

Index(['census_year', 'age', 'sex', 'religion', 'population', 'geography',
       'geo_code', 'age_band', 'yob'],
      dtype='object')

In [13]:
# Put the columns into their final output order.
output_columns = [
    'census_year',
    'geo_code',
    'geography',
    'sex',
    'religion',
    'age',
    'age_band',
    'yob',
    'population',
]
master_df = master_df[output_columns]

# Geo code map

In [14]:
# Lookup from council-area name to its 'S12…' area code. The pairs are listed
# code-first in ascending code order and inverted into a name -> code mapping.
geo_codes = {
    name: code
    for code, name in [
        ('S12000005', 'Clackmannanshire'),
        ('S12000006', 'Dumfries and Galloway'),
        ('S12000008', 'East Ayrshire'),
        ('S12000010', 'East Lothian'),
        ('S12000011', 'East Renfrewshire'),
        ('S12000013', 'Na h-Eileanan Siar'),
        ('S12000014', 'Falkirk'),
        ('S12000017', 'Highland'),
        ('S12000018', 'Inverclyde'),
        ('S12000019', 'Midlothian'),
        ('S12000020', 'Moray'),
        ('S12000021', 'North Ayrshire'),
        ('S12000023', 'Orkney Islands'),
        ('S12000026', 'Scottish Borders'),
        ('S12000027', 'Shetland Islands'),
        ('S12000028', 'South Ayrshire'),
        ('S12000029', 'South Lanarkshire'),
        ('S12000030', 'Stirling'),
        ('S12000033', 'Aberdeen City'),
        ('S12000034', 'Aberdeenshire'),
        ('S12000035', 'Argyll and Bute'),
        ('S12000036', 'City of Edinburgh'),
        ('S12000038', 'Renfrewshire'),
        ('S12000039', 'West Dunbartonshire'),
        ('S12000040', 'West Lothian'),
        ('S12000041', 'Angus'),
        ('S12000042', 'Dundee City'),
        ('S12000045', 'East Dunbartonshire'),
        ('S12000047', 'Fife'),
        ('S12000048', 'Perth and Kinross'),
        ('S12000049', 'Glasgow City'),
        ('S12000050', 'North Lanarkshire'),
    ]
}

In [15]:
# Replace the provisional geo_code values with the codes looked up in
# geo_codes. .assign builds a new frame instead of writing a column into the
# column-subset selection made earlier, which can otherwise trigger pandas'
# SettingWithCopyWarning; the column keeps its position.
master_df = master_df.assign(geo_code=master_df['geography'].map(geo_codes))

# Fail loudly if a geography has no entry in geo_codes, rather than relying
# on someone reading the displayed count.
unmapped_geographies = master_df.loc[master_df['geo_code'].isna(), 'geography'].unique()
assert len(unmapped_geographies) == 0, (
    f'geographies missing from geo_codes: {list(unmapped_geographies)}'
)

# Number of rows without a code (displayed as the cell output).
master_df['geo_code'].isna().sum()

0

# Write to CSV

In [16]:
# Write the final table to disk; index=False keeps the row index out of the file.
output_path = '../processed_data/scotland.csv'
master_df.to_csv(output_path, index=False)