# Characteristic Probabilities

### Importing/Cleaning Data

Start by importing necessary libraries.

In [1]:
import os
import pathlib

import pandas as pd
# import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt

Import `Country of Birth` sample dataset and clean it up.

In [2]:
# Load the regional "Country of birth" table from the local data directory.
root = pathlib.Path("data")
df = pd.read_csv(root / "census2021-ts004" / "census2021-ts004-rgn.csv")

# Keep only rows whose total is greater than 1, and drop the first and third
# CSV columns; what remains is one label column, the total, and the counts.
birth_total_col = "Country of birth: Total; measures: Value"
df = df.loc[df[birth_total_col] > 1, df.columns.drop(df.columns[[0, 2]])]

# Strip the fixed-width prefix/suffix seen in `birth_total_col` from every
# header. Slicing is by length (not by matching), so the label column is
# sliced too; it is given an explicit name just below.
# NOTE(review): assumes every count column shares this prefix and suffix.
birth_prefix = "Country of birth: "
birth_suffix = "; measures: Value"
names = [name[len(birth_prefix):-len(birth_suffix)] for name in df.columns]

# Name the first two columns explicitly. This is done on the list *before*
# assigning it, because a pandas Index is immutable and must not be edited
# through `df.columns.values`.
names[0] = "Region"
names[1] = "Total Population"
df.columns = names

df_birth_country = df
df_birth_country

Unnamed: 0,Region,Total Population,Europe,Europe: United Kingdom,Europe: EU countries,Europe: EU countries: European Union EU14,Europe: EU countries: European Union EU8,Europe: EU countries: European Union EU2,Europe: EU countries: All other EU countries,Europe: Non-EU countries,Europe: Non-EU countries: All other non-EU countries,Africa,Middle East and Asia,The Americas and the Caribbean,Antarctica and Oceania (including Australasia) and Other,British Overseas
0,North East,2647013,2536430,2467870,61727,30080,20355,8981,2311,6833,6833,26473,71693,9171,2973,273
1,North West,7417399,6885187,6551993,305712,139892,115979,43717,6124,27482,27482,128962,347989,43345,10035,1881
2,Yorkshire and The Humber,5480774,5117879,4858514,240412,78153,119411,37722,5126,18953,18953,83583,241324,28969,7725,1294
3,East Midlands,4880054,4520575,4210674,281145,87123,134195,53134,6693,28756,28756,109189,209919,31681,7006,1684
4,West Midlands,5950759,5373979,5048322,300823,117983,111088,66232,5520,24834,24834,132413,382278,51874,7446,2769
5,East,6335074,5854022,5390705,416055,163088,154857,86812,11298,47262,47262,137951,247816,77787,16002,1496
6,London,8799726,6590522,5223986,1122663,578889,258229,242504,43041,243873,243873,625071,1146247,367664,64228,5994
7,South East,9278065,8442839,7815950,558693,260715,183759,97642,16577,68196,68196,234451,450867,111488,34945,3475
8,South West,5701186,5408903,5119492,264534,122751,92517,40470,8796,24877,24877,77765,143568,51397,17644,1909
9,Wales,3107494,2992209,2892064,91477,41593,35233,11878,2773,8668,8668,28719,69329,12377,4543,317


Import `Education` sample dataset and clean it up.

In [3]:
# Load the regional "Highest level of qualification" table.
df = pd.read_csv(root / "census2021-ts067" / "census2021-ts067-rgn.csv")

# Keep only rows whose total is greater than 1, and drop the first and third
# CSV columns; what remains is one label column, the total, and the counts.
edu_total_col = (
    "Highest level of qualification: Total: All usual residents aged 16 years and over"
)
df = df.loc[df[edu_total_col] > 1, df.columns.drop(df.columns[[0, 2]])]

# Strip the fixed-width prefix seen in `edu_total_col` from every header.
# Slicing is by length (not by matching), so the label column is sliced too;
# it is given an explicit name just below.
# NOTE(review): assumes every count column shares this prefix.
edu_prefix = "Highest level of qualification: "
names = [name[len(edu_prefix):] for name in df.columns]

# Name the first column explicitly. This is done on the list *before*
# assigning it, because a pandas Index is immutable and must not be edited
# through `df.columns.values`.
names[0] = "Region"
df.columns = names

df_education = df
df_education

Unnamed: 0,Region,Total: All usual residents aged 16 years and over,No qualifications,Level 1 and entry level qualifications,Level 2 qualifications,Apprenticeship,Level 3 qualifications,Level 4 qualifications and above,Other qualifications
0,North East,2178960,442379,208919,301648,144765,404417,622419,54413
1,North West,6025635,1174699,583083,819133,353909,1061950,1880049,152812
2,Yorkshire and The Humber,4460299,919640,449065,607396,273719,777070,1315456,117953
3,East Midlands,3998045,778906,415084,556046,239223,731371,1163784,113631
4,West Midlands,4801326,1015300,494710,666695,253185,818973,1413723,138740
5,East,5148281,932209,554976,742611,284157,863162,1625376,145790
6,London,7103985,1151250,545269,707518,227622,937875,3316829,217622
7,South East,7554580,1162221,739122,1046800,386759,1313552,2702048,204078
8,South West,4735840,741183,465970,678280,283597,876607,1566774,123429
9,Wales,2559416,509682,223025,367360,143316,440652,806773,68608


### Probability Function

Function to calculate probabilities as a proportion (between 0 and 1) of the total in each region.

In [4]:
def prob_calc(dataframe):
    """Convert per-category counts into proportions of each row's total.

    The frame is read positionally: column 0 is a label, column 1 holds the
    row total, and every column from position 2 onward holds a count that is
    divided by that row's total.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame laid out as described above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and column labels. The first two
        columns are carried over unchanged; the remaining columns contain
        ``count / total`` for each row.

    Raises
    ------
    IndexError
        If ``dataframe`` has fewer than two columns.
    """
    total = dataframe.iloc[:, 1]

    # Divide every count column by the row total in one vectorised step.
    # `total` comes from the same frame, so rows line up for any index
    # (not only a default 0..n-1 RangeIndex).
    shares = dataframe.iloc[:, 2:].div(total, axis=0)

    # Assemble a fresh frame instead of writing into the caller's data, so
    # calling this twice on the same input yields the same result.
    return pd.concat([dataframe.iloc[:, :2], shares], axis=1)

### Testing

Run `Country of Birth` sample dataset through the function.

In [5]:
# Show each birth-country count divided by the region's "Total Population".
prob_calc(df_birth_country)

Unnamed: 0,Region,Total Population,Europe,Europe: United Kingdom,Europe: EU countries,Europe: EU countries: European Union EU14,Europe: EU countries: European Union EU8,Europe: EU countries: European Union EU2,Europe: EU countries: All other EU countries,Europe: Non-EU countries,Europe: Non-EU countries: All other non-EU countries,Africa,Middle East and Asia,The Americas and the Caribbean,Antarctica and Oceania (including Australasia) and Other,British Overseas
0,North East,2647013,0.958223,0.932323,0.023319,0.011364,0.00769,0.003393,0.000873,0.002581,0.002581,0.010001,0.027084,0.003465,0.001123,0.000103
1,North West,7417399,0.928248,0.883328,0.041216,0.01886,0.015636,0.005894,0.000826,0.003705,0.003705,0.017386,0.046915,0.005844,0.001353,0.000254
2,Yorkshire and The Humber,5480774,0.933788,0.886465,0.043865,0.014259,0.021787,0.006883,0.000935,0.003458,0.003458,0.01525,0.044031,0.005286,0.001409,0.000236
3,East Midlands,4880054,0.926337,0.862833,0.057611,0.017853,0.027499,0.010888,0.001372,0.005893,0.005893,0.022375,0.043016,0.006492,0.001436,0.000345
4,West Midlands,5950759,0.903075,0.848349,0.050552,0.019827,0.018668,0.01113,0.000928,0.004173,0.004173,0.022251,0.06424,0.008717,0.001251,0.000465
5,East,6335074,0.924065,0.85093,0.065675,0.025744,0.024444,0.013703,0.001783,0.00746,0.00746,0.021776,0.039118,0.012279,0.002526,0.000236
6,London,8799726,0.748946,0.593653,0.127579,0.065785,0.029345,0.027558,0.004891,0.027714,0.027714,0.071033,0.130259,0.041781,0.007299,0.000681
7,South East,9278065,0.909978,0.842412,0.060217,0.0281,0.019806,0.010524,0.001787,0.00735,0.00735,0.025269,0.048595,0.012016,0.003766,0.000375
8,South West,5701186,0.948733,0.89797,0.0464,0.021531,0.016228,0.007099,0.001543,0.004363,0.004363,0.01364,0.025182,0.009015,0.003095,0.000335
9,Wales,3107494,0.962901,0.930674,0.029438,0.013385,0.011338,0.003822,0.000892,0.002789,0.002789,0.009242,0.02231,0.003983,0.001462,0.000102


Run `Education` sample dataset through the function.

In [6]:
# Show each qualification-level count divided by the region's total column.
prob_calc(df_education)

Unnamed: 0,Region,Total: All usual residents aged 16 years and over,No qualifications,Level 1 and entry level qualifications,Level 2 qualifications,Apprenticeship,Level 3 qualifications,Level 4 qualifications and above,Other qualifications
0,North East,2178960,0.203023,0.09588,0.138437,0.066438,0.185601,0.28565,0.024972
1,North West,6025635,0.19495,0.096767,0.135941,0.058734,0.176239,0.312008,0.02536
2,Yorkshire and The Humber,4460299,0.206183,0.10068,0.136178,0.061368,0.174219,0.294926,0.026445
3,East Midlands,3998045,0.194822,0.103822,0.139079,0.059835,0.182932,0.291088,0.028422
4,West Midlands,4801326,0.211462,0.103036,0.138856,0.052732,0.170572,0.294444,0.028896
5,East,5148281,0.181072,0.107798,0.144244,0.055195,0.16766,0.315712,0.028318
6,London,7103985,0.162057,0.076755,0.099595,0.032041,0.132021,0.466897,0.030634
7,South East,7554580,0.153843,0.097838,0.138565,0.051195,0.173875,0.35767,0.027014
8,South West,4735840,0.156505,0.098392,0.143223,0.059883,0.185101,0.330833,0.026063
9,Wales,2559416,0.19914,0.087139,0.143533,0.055996,0.172169,0.315218,0.026806
