# Characteristic Probabilities

### Importing Data

Start by importing necessary libraries.

In [1]:
import os
import pathlib

import pandas as pd
# import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt

Import the `Country of Birth` sample dataset (Census 2021 table TS004, region level) and clean it up.

In [9]:
# Folder holding the census extracts, relative to the notebook's working directory.
root = pathlib.Path("data")
df = pd.read_csv(root / "census2021-ts004" / "census2021-ts004-rgn.csv")

# Keep only the rows whose total is greater than 1 and drop the 1st and 3rd CSV
# columns by position (presumably the date and geography-code columns -- confirm
# against the CSV). Resetting the index guarantees that the label used for the
# appended "Total" row below can never collide with a surviving CSV row label.
df = df.loc[
    df["Country of birth: Total; measures: Value"] > 1,
    df.columns.drop(df.columns[[0, 2]]),
].reset_index(drop=True)

# Every measure column is named "Country of birth: <name>; measures: Value";
# strip that wrapper so only <name> is left. The first two columns get explicit
# names. Assigning a complete list replaces the column Index in one step instead
# of mutating `df.columns.values`, which pandas does not support.
birth_prefix = "Country of birth: "
birth_suffix = "; measures: Value"
names = ["Region", "Total Population"] + [
    name[len(birth_prefix):-len(birth_suffix)] for name in df.columns[2:]
]
df.columns = names

# Append a "Total" row holding the sum of every numeric column, in column order.
birth_totals = df.iloc[:, 1:].sum()
df.loc[len(df.index)] = ["Total", *birth_totals]

df_birth_country = df


Import the `Education` sample dataset (Census 2021 table TS067, highest level of qualification, region level) and clean it up.

In [3]:
df = pd.read_csv(root / "census2021-ts067" / "census2021-ts067-rgn.csv")

# Keep only the rows whose total is greater than 1 and drop the 1st and 3rd CSV
# columns by position (presumably the date and geography-code columns -- confirm
# against the CSV). Resetting the index guarantees that the label used for the
# appended "Total" row below can never collide with a surviving CSV row label.
df = df.loc[
    df["Highest level of qualification: Total: All usual residents aged 16 years and over"] > 1,
    df.columns.drop(df.columns[[0, 2]]),
].reset_index(drop=True)

# Every measure column starts with "Highest level of qualification: "; strip
# that prefix. The first column gets an explicit name. Assigning a complete list
# replaces the column Index in one step instead of mutating `df.columns.values`,
# which pandas does not support.
education_prefix = "Highest level of qualification: "
names = ["Region"] + [name[len(education_prefix):] for name in df.columns[1:]]
df.columns = names

# Append a "Total" row holding the sum of every numeric column, in column order.
education_totals = df.iloc[:, 1:].sum()
df.loc[len(df.index)] = ["Total", *education_totals]

df_education = df

### Probability Function

Function to calculate probabilities: each region's count as a fraction (0–1, not a percentage) of the total across all regions, for every column.

In [4]:
def prob_calc(dataframe):
    """Express every count as a fraction of its column's total.

    The first column is treated as the row label and is passed through
    unchanged. Every other column is divided by the value it holds in the
    LAST row of ``dataframe``, which is expected to be the "Total" row, so
    that row becomes 1.0 and each other row becomes its share of the total.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Label column first, numeric count columns after it, totals in the
        final row. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and column order, whose numeric
        columns are floats.

    Raises
    ------
    ValueError
        If a column after the first cannot be converted to float.
    """
    # Nothing to normalise; also avoids indexing the last row of an empty frame.
    if dataframe.empty:
        return dataframe.copy()

    labels = dataframe.iloc[:, :1]
    counts = dataframe.iloc[:, 1:].astype(float)

    # Totals come from the last row by position rather than a fixed index
    # label, so the function works for any number of regions.
    totals = counts.iloc[-1]

    # DataFrame / Series aligns on column labels: each column is divided by
    # its own total in a single vectorised step.
    shares = counts / totals

    return pd.concat([labels, shares], axis=1)

### Testing

Run both sample datasets through the function.

In [5]:
# Show each region's share of every country-of-birth column, relative to the "Total" row.
prob_calc(df_birth_country)

Unnamed: 0,Region,Total Population,Europe,Europe: United Kingdom,Europe: EU countries,Europe: EU countries: European Union EU14,Europe: EU countries: European Union EU8,Europe: EU countries: European Union EU2,Europe: EU countries: All other EU countries,Europe: Non-EU countries,Europe: Non-EU countries: All other non-EU countries,Africa,Middle East and Asia,The Americas and the Caribbean,Antarctica and Oceania (including Australasia) and Other,British Overseas
0,North East,0.044415,0.047214,0.049776,0.016943,0.018565,0.016608,0.013033,0.021347,0.013673,0.013673,0.016707,0.021653,0.011672,0.01723,0.012943
1,North West,0.124458,0.128162,0.132151,0.083912,0.086339,0.094629,0.063441,0.056568,0.054993,0.054993,0.081386,0.1051,0.055164,0.058158,0.089181
2,Yorkshire and The Humber,0.091963,0.095265,0.097994,0.065988,0.048235,0.097429,0.054742,0.047349,0.037926,0.037926,0.052748,0.072885,0.036868,0.04477,0.06135
3,East Midlands,0.081883,0.084147,0.084928,0.077169,0.053771,0.109491,0.077107,0.061824,0.057543,0.057543,0.068907,0.0634,0.040319,0.040603,0.079841
4,West Midlands,0.099849,0.100032,0.101823,0.08257,0.072817,0.090638,0.096115,0.050989,0.049694,0.049694,0.083564,0.115456,0.066018,0.043153,0.131282
5,East,0.106298,0.108968,0.108728,0.114199,0.100655,0.12635,0.12598,0.104361,0.094574,0.094574,0.087059,0.074846,0.098997,0.09274,0.070927
6,London,0.147652,0.122677,0.105366,0.30815,0.35728,0.210692,0.351918,0.397574,0.488006,0.488006,0.394472,0.34619,0.467913,0.372235,0.284184
7,South East,0.155679,0.157156,0.157645,0.153351,0.160909,0.149931,0.141697,0.153124,0.136465,0.136465,0.147958,0.136171,0.141887,0.202525,0.164754
8,South West,0.095661,0.100682,0.103258,0.07261,0.07576,0.075486,0.058729,0.08125,0.04978,0.04978,0.049076,0.043361,0.065411,0.102256,0.090508
9,Wales,0.052141,0.055697,0.058332,0.025109,0.02567,0.028747,0.017237,0.025614,0.017345,0.017345,0.018124,0.020939,0.015752,0.026329,0.015029


In [6]:
# Show each region's share of every qualification-level column, relative to the "Total" row.
prob_calc(df_education)

Unnamed: 0,Region,Total: All usual residents aged 16 years and over,No qualifications,Level 1 and entry level qualifications,Level 2 qualifications,Apprenticeship,Level 3 qualifications,Level 4 qualifications and above,Other qualifications
0,North East,0.044866,0.050114,0.044648,0.046454,0.055888,0.049165,0.037922,0.040696
1,North West,0.12407,0.133073,0.124611,0.126147,0.136631,0.129103,0.114545,0.114288
2,Yorkshire and The Humber,0.091839,0.104179,0.09597,0.093539,0.105673,0.094469,0.080146,0.088217
3,East Midlands,0.082321,0.088237,0.088708,0.085631,0.092355,0.088914,0.070905,0.084985
4,West Midlands,0.098861,0.115016,0.105725,0.102671,0.097745,0.099564,0.086133,0.103764
5,East,0.106005,0.105603,0.118604,0.114362,0.109702,0.104936,0.099028,0.109036
6,London,0.146274,0.130417,0.11653,0.108958,0.087876,0.114019,0.202083,0.16276
7,South East,0.155552,0.13166,0.157958,0.161208,0.149313,0.15969,0.164626,0.15263
8,South West,0.097513,0.083963,0.099583,0.104455,0.109486,0.10657,0.095458,0.092313
9,Wales,0.052699,0.057738,0.047663,0.056574,0.055329,0.053571,0.049154,0.051312
