# Characteristic Probabilities

### Importing Data

Start by importing necessary libraries.

In [6]:
import os
import pathlib

import pandas as pd
# import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt

# suppress pandas performance warnings
import warnings

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

Import `Country of Birth` sample dataset and clean it up.

In [8]:
# Load the region-level "Country of birth" table (census2021-ts004).
root = pathlib.Path("data")
df = pd.read_csv(root / "census2021-ts004" / "census2021-ts004-rgn.csv")

# Keep only rows whose total is greater than 1 and drop the columns at
# positions 0 and 2 (presumably date / geography-code columns -- TODO confirm
# against the CSV). The index is reset so that the label used for the appended
# "Total" row below can never collide with a surviving original row label.
df = df.loc[
    df["Country of birth: Total; measures: Value"] > 1,
    df.columns.drop(df.columns[[0, 2]]),
].reset_index(drop=True)

# Trim the 18-character "Country of birth: " prefix and the 17-character
# "; measures: Value" suffix from every header, then name the first two
# columns explicitly (the slice leaves them as "" and "Total").
names = [name[18:-17] for name in df.columns]
names[0] = "Region"
names[1] = "Total Population"
df.columns = names

# Append a "Total" row holding the sum of every value column. Summing by
# position keeps the totals aligned with the columns without listing each
# column name by hand.
df.loc[len(df.index)] = ["Total", *df.iloc[:, 1:].sum()]

df_birth_country = df


Import `Education` sample dataset and clean it up.

In [9]:
# Load the region-level "Highest level of qualification" table (census2021-ts067).
df = pd.read_csv(root / "census2021-ts067" / "census2021-ts067-rgn.csv")

# Keep only rows whose total is greater than 1 and drop the columns at
# positions 0 and 2 (presumably date / geography-code columns -- TODO confirm
# against the CSV). The index is reset so that the label used for the appended
# "Total" row below can never collide with a surviving original row label.
df = df.loc[
    df["Highest level of qualification: Total: All usual residents aged 16 years and over"] > 1,
    df.columns.drop(df.columns[[0, 2]]),
].reset_index(drop=True)

# Trim the 32-character "Highest level of qualification: " prefix from every
# header, then name the first column explicitly (the slice leaves it empty).
names = [name[32:] for name in df.columns]
names[0] = "Region"
df.columns = names

# Append a "Total" row holding the sum of every value column. Summing by
# position keeps the totals aligned with the columns without listing each
# column name by hand.
df.loc[len(df.index)] = ["Total", *df.iloc[:, 1:].sum()]

df_education = df

### Probability Function

Function to calculate probabilities: each region's value expressed as a fraction of the column total (the `Total` row) across all regions.

In [10]:
def prob_calc(dataframe):
    """Convert a table of counts into per-column shares of the total.

    The first column is treated as a label column and is passed through
    unchanged. Every other column is divided by its value in the LAST row,
    which is expected to hold the column totals. The totals row itself is
    dropped from the result.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Label column first, numeric count columns after it, totals in the
        final row. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as the input minus the
        final row; count columns are float shares of their column total.
    """
    # Nothing to normalise; mirrors the old loop, which did no work here.
    if len(dataframe) == 0:
        return dataframe.copy()

    labels = dataframe.iloc[:, :1]
    # Cast up front so object-dtype count columns still divide numerically.
    counts = dataframe.iloc[:, 1:].astype(float)

    # Use the last row as the totals instead of a hard-coded row label, and
    # divide positionally (ndarray) so column labels never need to align.
    totals = counts.iloc[-1].to_numpy()
    shares = counts.div(totals, axis="columns")

    # Build a fresh frame rather than writing into the caller's data.
    return pd.concat([labels, shares], axis=1).iloc[:-1]

### Testing

Run both sample datasets through the function.

In [12]:
# Display each region's share of the total for every country-of-birth column.
prob_calc(df_birth_country)

Unnamed: 0,Region,Total Population,Europe,Europe: United Kingdom,Europe: EU countries,Europe: EU countries: European Union EU14,Europe: EU countries: European Union EU8,Europe: EU countries: European Union EU2,Europe: EU countries: All other EU countries,Europe: Non-EU countries,Europe: Non-EU countries: All other non-EU countries,Africa,Middle East and Asia,The Americas and the Caribbean,Antarctica and Oceania (including Australasia) and Other,British Overseas
0,North East,0.044415,0.047214,0.049776,0.016943,0.018565,0.016608,0.013033,0.021347,0.013673,0.013673,0.016707,0.021653,0.011672,0.01723,0.012943
1,North West,0.124458,0.128162,0.132151,0.083912,0.086339,0.094629,0.063441,0.056568,0.054993,0.054993,0.081386,0.1051,0.055164,0.058158,0.089181
2,Yorkshire and The Humber,0.091963,0.095265,0.097994,0.065988,0.048235,0.097429,0.054742,0.047349,0.037926,0.037926,0.052748,0.072885,0.036868,0.04477,0.06135
3,East Midlands,0.081883,0.084147,0.084928,0.077169,0.053771,0.109491,0.077107,0.061824,0.057543,0.057543,0.068907,0.0634,0.040319,0.040603,0.079841
4,West Midlands,0.099849,0.100032,0.101823,0.08257,0.072817,0.090638,0.096115,0.050989,0.049694,0.049694,0.083564,0.115456,0.066018,0.043153,0.131282
5,East,0.106298,0.108968,0.108728,0.114199,0.100655,0.12635,0.12598,0.104361,0.094574,0.094574,0.087059,0.074846,0.098997,0.09274,0.070927
6,London,0.147652,0.122677,0.105366,0.30815,0.35728,0.210692,0.351918,0.397574,0.488006,0.488006,0.394472,0.34619,0.467913,0.372235,0.284184
7,South East,0.155679,0.157156,0.157645,0.153351,0.160909,0.149931,0.141697,0.153124,0.136465,0.136465,0.147958,0.136171,0.141887,0.202525,0.164754
8,South West,0.095661,0.100682,0.103258,0.07261,0.07576,0.075486,0.058729,0.08125,0.04978,0.04978,0.049076,0.043361,0.065411,0.102256,0.090508
9,Wales,0.052141,0.055697,0.058332,0.025109,0.02567,0.028747,0.017237,0.025614,0.017345,0.017345,0.018124,0.020939,0.015752,0.026329,0.015029


In [13]:
# Display each region's share of the total for every qualification column.
prob_calc(df_education)

Unnamed: 0,Region,Total: All usual residents aged 16 years and over,No qualifications,Level 1 and entry level qualifications,Level 2 qualifications,Apprenticeship,Level 3 qualifications,Level 4 qualifications and above,Other qualifications
0,North East,0.044866,0.050114,0.044648,0.046454,0.055888,0.049165,0.037922,0.040696
1,North West,0.12407,0.133073,0.124611,0.126147,0.136631,0.129103,0.114545,0.114288
2,Yorkshire and The Humber,0.091839,0.104179,0.09597,0.093539,0.105673,0.094469,0.080146,0.088217
3,East Midlands,0.082321,0.088237,0.088708,0.085631,0.092355,0.088914,0.070905,0.084985
4,West Midlands,0.098861,0.115016,0.105725,0.102671,0.097745,0.099564,0.086133,0.103764
5,East,0.106005,0.105603,0.118604,0.114362,0.109702,0.104936,0.099028,0.109036
6,London,0.146274,0.130417,0.11653,0.108958,0.087876,0.114019,0.202083,0.16276
7,South East,0.155552,0.13166,0.157958,0.161208,0.149313,0.15969,0.164626,0.15263
8,South West,0.097513,0.083963,0.099583,0.104455,0.109486,0.10657,0.095458,0.092313
9,Wales,0.052699,0.057738,0.047663,0.056574,0.055329,0.053571,0.049154,0.051312


This function takes two data tables as inputs and outputs a single table containing the product of every pair of probability columns (one column from each table).

In [11]:
def combineprobabilitytables(inputdata1, inputdata2):
    """Multiply every probability column of one table by every column of another.

    Both inputs are first passed through ``prob_calc``. For each pair
    (column from table 1, column from table 2) the two columns are multiplied
    row by row and stored under the name ``"<col1> and <col2>"``.

    Rows are matched by index label, not by the value in "Region", so both
    tables must list their regions in the same order.

    Parameters
    ----------
    inputdata1, inputdata2 : pandas.DataFrame
        Tables accepted by ``prob_calc``; each must contain a "Region" column.

    Returns
    -------
    pandas.DataFrame
        The "Region" column of the first table followed by one column per
        pair of characteristics.

    Raises
    ------
    ValueError
        If either table has no "Region" column.
    """
    df1 = prob_calc(inputdata1)
    df2 = prob_calc(inputdata2)

    # lists of the characteristics (every column except "Region") in each table
    df1_names = df1.columns.values.tolist()
    df1_names.remove("Region")

    df2_names = df2.columns.values.tolist()
    df2_names.remove("Region")

    # Build every product column first and assemble the frame once; inserting
    # the columns one at a time fragments the DataFrame and triggers pandas'
    # PerformanceWarning.
    products = {
        col1 + " and " + col2: df1[col1] * df2[col2]
        for col1 in df1_names
        for col2 in df2_names
    }

    # index=df1.index keeps the rows of the first table, as before
    combined = pd.DataFrame(products, index=df1.index)

    return pd.concat([df1[["Region"]], combined], axis=1)

In [12]:
# Display the product of every qualification share with every country-of-birth share, per region.
combineprobabilitytables(df_education, df_birth_country)

Unnamed: 0,Region,Total: All usual residents aged 16 years and over and Total Population,Total: All usual residents aged 16 years and over and Europe,Total: All usual residents aged 16 years and over and Europe: United Kingdom,Total: All usual residents aged 16 years and over and Europe: EU countries,Total: All usual residents aged 16 years and over and Europe: EU countries: European Union EU14,Total: All usual residents aged 16 years and over and Europe: EU countries: European Union EU8,Total: All usual residents aged 16 years and over and Europe: EU countries: European Union EU2,Total: All usual residents aged 16 years and over and Europe: EU countries: All other EU countries,Total: All usual residents aged 16 years and over and Europe: Non-EU countries,...,Other qualifications and Europe: EU countries: European Union EU8,Other qualifications and Europe: EU countries: European Union EU2,Other qualifications and Europe: EU countries: All other EU countries,Other qualifications and Europe: Non-EU countries,Other qualifications and Europe: Non-EU countries: All other non-EU countries,Other qualifications and Africa,Other qualifications and Middle East and Asia,Other qualifications and The Americas and the Caribbean,Other qualifications and Antarctica and Oceania (including Australasia) and Other,Other qualifications and British Overseas
0,North East,0.001993,0.002118,0.002233,0.00076,0.000833,0.000745,0.000585,0.000958,0.000613,...,0.000676,0.00053,0.000869,0.000556,0.000556,0.00068,0.000881,0.000475,0.000701,0.000527
1,North West,0.015442,0.015901,0.016396,0.010411,0.010712,0.011741,0.007871,0.007018,0.006823,...,0.010815,0.007251,0.006465,0.006285,0.006285,0.009301,0.012012,0.006305,0.006647,0.010192
2,Yorkshire and The Humber,0.008446,0.008749,0.009,0.00606,0.00443,0.008948,0.005027,0.004349,0.003483,...,0.008595,0.004829,0.004177,0.003346,0.003346,0.004653,0.00643,0.003252,0.00395,0.005412
3,East Midlands,0.006741,0.006927,0.006991,0.006353,0.004426,0.009013,0.006348,0.005089,0.004737,...,0.009305,0.006553,0.005254,0.00489,0.00489,0.005856,0.005388,0.003427,0.003451,0.006785
4,West Midlands,0.009871,0.009889,0.010066,0.008163,0.007199,0.008961,0.009502,0.005041,0.004913,...,0.009405,0.009973,0.005291,0.005156,0.005156,0.008671,0.01198,0.00685,0.004478,0.013622
5,East,0.011268,0.011551,0.011526,0.012106,0.01067,0.013394,0.013355,0.011063,0.010025,...,0.013777,0.013736,0.011379,0.010312,0.010312,0.009493,0.008161,0.010794,0.010112,0.007734
6,London,0.021598,0.017944,0.015412,0.045074,0.052261,0.030819,0.051476,0.058155,0.071382,...,0.034292,0.057278,0.064709,0.079428,0.079428,0.064204,0.056346,0.076157,0.060585,0.046254
7,South East,0.024216,0.024446,0.024522,0.023854,0.02503,0.023322,0.022041,0.023819,0.021227,...,0.022884,0.021627,0.023371,0.020829,0.020829,0.022583,0.020784,0.021656,0.030911,0.025146
8,South West,0.009328,0.009818,0.010069,0.00708,0.007388,0.007361,0.005727,0.007923,0.004854,...,0.006968,0.005421,0.0075,0.004595,0.004595,0.00453,0.004003,0.006038,0.00944,0.008355
9,Wales,0.002748,0.002935,0.003074,0.001323,0.001353,0.001515,0.000908,0.00135,0.000914,...,0.001475,0.000884,0.001314,0.00089,0.00089,0.00093,0.001074,0.000808,0.001351,0.000771
