# Characteristic Probabilities

### Importing Data

Start by importing necessary libraries.

In [24]:
import os
import pathlib

import pandas as pd
# import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt

#supresses performance warnings
import warnings

#warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

Import `Country of Birth` sample dataset and clean it up.

In [64]:
root = pathlib.Path("data")
df = pd.read_csv(root / "census2021-ts004" / "census2021-ts004-lsoa.csv")

df = (
    df.loc[
        df["Country of birth: Total; measures: Value"] > 1,
        df.columns.drop(
            df.columns[[0, 2]]
        )
    ]
)

names = list(df.columns)

for name in names:
    names[names.index(name)] = name[18:-17]

df.columns = names

df.columns.values[0] = "Region"
df.columns.values[1] = "Total Population"



df_birth_country = df


Import `Education` sample dataset and clean it up.

In [65]:
df = pd.read_csv(root / "census2021-ts067" / "census2021-ts067-lsoa.csv")

df = (
    df.loc[
        df["Highest level of qualification: Total: All usual residents aged 16 years and over"] > 1,
        df.columns.drop(
            df.columns[[0, 2]]
        )
    ]
)

names = list(df.columns)

for name in names:
    names[names.index(name)] = name[32:]

df.columns = names

df.columns.values[0] = "Region"



df_education = df

In [61]:
df_education.head()

Unnamed: 0,Region,Total: All usual residents aged 16 years and over,No qualifications,Level 1 and entry level qualifications,Level 2 qualifications,Apprenticeship,Level 3 qualifications,Level 4 qualifications and above,Other qualifications
0,City of London 001A,1353,32,27,53,12,86,1127,16
1,City of London 001B,1298,23,28,52,4,87,1086,18
2,City of London 001C,1505,132,61,90,12,120,1055,35
3,City of London 001E,961,197,64,120,32,131,387,30
4,Barking and Dagenham 016A,1433,295,158,170,39,204,501,66


### Probability Function

Function to calculate probailities as a percent of the total in each region.

In [63]:
def prob_calc(dataframe):
    probabilities = pd.DataFrame(dataframe)
    total = dataframe.loc[:,dataframe.columns[1]]
    
    count = 2

    # Iterates through columns in dataframe
    for num in range(len(dataframe.columns)-2):
        column = dataframe.loc[:,dataframe.columns[count]]
        cell_count = 0
        # Iterates through items in each column
        for row in column:
            #Divides current cell by the item at the start of the row
            probabilities.loc[:, dataframe.columns[count]].at[cell_count] = row / total[cell_count]
            cell_count += 1
        count += 1
    
    return probabilities

### Testing

Run both sample datasets through the function.

In [66]:
prob_calc(df_birth_country)

Unnamed: 0,Region,Total Population,Europe,Europe: United Kingdom,Europe: EU countries,Europe: EU countries: European Union EU14,Europe: EU countries: European Union EU8,Europe: EU countries: European Union EU2,Europe: EU countries: All other EU countries,Europe: Non-EU countries,Europe: Non-EU countries: All other non-EU countries,Africa,Middle East and Asia,The Americas and the Caribbean,Antarctica and Oceania (including Australasia) and Other,British Overseas
0,Hartlepool 001A,2283,0.988173,0.977661,0.008760,0.003942,0.003504,0.000438,0.000876,0.001752,0.001752,0.002628,0.007008,0.000438,0.001752,0.0
1,Hartlepool 001B,1344,0.991815,0.985119,0.006696,0.004464,0.001488,0.000000,0.000744,0.000000,0.000000,0.000744,0.005952,0.000000,0.001488,0.0
2,Hartlepool 001C,1070,0.989720,0.981308,0.007477,0.005607,0.001869,0.000000,0.000000,0.000935,0.000935,0.000000,0.007477,0.000935,0.001869,0.0
3,Hartlepool 001D,1323,0.994709,0.990930,0.003023,0.001512,0.001512,0.000000,0.000000,0.000756,0.000756,0.001512,0.002268,0.001512,0.000000,0.0
4,Hartlepool 001F,1955,0.987724,0.976471,0.009719,0.003581,0.003581,0.001535,0.001023,0.001535,0.001535,0.002558,0.006650,0.002046,0.001023,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35667,Merthyr Tydfil 008B,1738,0.976985,0.911392,0.064442,0.005178,0.050058,0.009206,0.000000,0.001151,0.001151,0.001726,0.019563,0.001726,0.000000,0.0
35668,Merthyr Tydfil 008C,1913,0.991636,0.900157,0.091479,0.006796,0.072138,0.012546,0.000000,0.000000,0.000000,0.002614,0.005227,0.000523,0.000000,0.0
35669,Merthyr Tydfil 008D,1252,0.988019,0.942492,0.038339,0.006390,0.028754,0.002396,0.000799,0.007188,0.007188,0.001597,0.008786,0.000799,0.000799,0.0
35670,Merthyr Tydfil 008E,1301,0.989239,0.926211,0.063028,0.006149,0.053036,0.000769,0.003075,0.000000,0.000000,0.001537,0.008455,0.000769,0.000000,0.0


In [68]:
prob_calc(df_education)

Unnamed: 0,Region,Total: All usual residents aged 16 years and over,No qualifications,Level 1 and entry level qualifications,Level 2 qualifications,Apprenticeship,Level 3 qualifications,Level 4 qualifications and above,Other qualifications
0,City of London 001A,1353,0.023651,0.019956,0.039172,0.008869,0.063562,0.832964,0.011826
1,City of London 001B,1298,0.017720,0.021572,0.040062,0.003082,0.067026,0.836672,0.013867
2,City of London 001C,1505,0.087708,0.040532,0.059801,0.007973,0.079734,0.700997,0.023256
3,City of London 001E,961,0.204995,0.066597,0.124870,0.033299,0.136316,0.402706,0.031217
4,Barking and Dagenham 016A,1433,0.205862,0.110258,0.118632,0.027216,0.142359,0.349616,0.046057
...,...,...,...,...,...,...,...,...,...
35667,Vale of Glamorgan 005G,1278,0.095462,0.068858,0.112676,0.037559,0.136933,0.521127,0.027387
35668,Vale of Glamorgan 005H,912,0.093202,0.061404,0.099781,0.035088,0.141447,0.557018,0.012061
35669,Vale of Glamorgan 014G,1087,0.207912,0.112236,0.157314,0.068077,0.183993,0.234591,0.035879
35670,Vale of Glamorgan 014H,1302,0.095238,0.073733,0.168203,0.036866,0.232719,0.372504,0.020737


This function takes 2 data tables as inputs, and outputs one table of all combined probabilities

In [77]:
def combineprobabilitytables(inputdata1, inputdata2):
    df1 = prob_calc(inputdata1)
    df2 = prob_calc(inputdata2)

    
    #create lists of characteristics in each table
    df1_names = df1.columns.values.tolist()
    del df1_names[:1]
    
    df2_names = df2.columns.values.tolist()
    del df2_names[:1]
    
    
    #create a new table to store the combined probabilities, our results
    dfprobability = df1.copy()
    #drop leftover columns from the copying of df1
    dfprobability = dfprobability.drop(columns = df1_names)

    del df1_names[:1]
    del df2_names[:1]
    
    #nested for loop iterates through columns of both tables and multiplies them together 
    #output is a series which is then added to the reults table
    for col1 in df1_names:
        for col2 in df2_names:
            s = df1[col1] * df2[col2]
            dfprobability[col1 +" and "+ col2] = s
            #dfprobability = pd.concat([dfprobability,s.rename(col1+" and " +col2)], axis=1) 
            
    
    
    
    #output result
    return dfprobability

In [76]:
combineprobabilitytables(df_education, df_birth_country)

Unnamed: 0,Region,No qualifications and Europe,No qualifications and Europe: United Kingdom,No qualifications and Europe: EU countries,No qualifications and Europe: EU countries: European Union EU14,No qualifications and Europe: EU countries: European Union EU8,No qualifications and Europe: EU countries: European Union EU2,No qualifications and Europe: EU countries: All other EU countries,No qualifications and Europe: Non-EU countries,No qualifications and Europe: Non-EU countries: All other non-EU countries,...,Other qualifications and Europe: EU countries: European Union EU8,Other qualifications and Europe: EU countries: European Union EU2,Other qualifications and Europe: EU countries: All other EU countries,Other qualifications and Europe: Non-EU countries,Other qualifications and Europe: Non-EU countries: All other non-EU countries,Other qualifications and Africa,Other qualifications and Middle East and Asia,Other qualifications and The Americas and the Caribbean,Other qualifications and Antarctica and Oceania (including Australasia) and Other,Other qualifications and British Overseas
0,City of London 001A,0.023371,0.023123,0.000207,0.000093,0.000083,0.000010,0.000021,0.000041,0.000041,...,0.000041,0.000005,0.000010,0.000021,0.000021,0.000031,0.000083,0.000005,0.000021,0.0
1,City of London 001B,0.017575,0.017456,0.000119,0.000079,0.000026,0.000000,0.000013,0.000000,0.000000,...,0.000021,0.000000,0.000010,0.000000,0.000000,0.000010,0.000083,0.000000,0.000021,0.0
2,City of London 001C,0.086806,0.086068,0.000656,0.000492,0.000164,0.000000,0.000000,0.000082,0.000082,...,0.000043,0.000000,0.000000,0.000022,0.000022,0.000000,0.000174,0.000022,0.000043,0.0
3,City of London 001E,0.203910,0.203135,0.000620,0.000310,0.000310,0.000000,0.000000,0.000155,0.000155,...,0.000047,0.000000,0.000000,0.000024,0.000024,0.000047,0.000071,0.000047,0.000000,0.0
4,Barking and Dagenham 016A,0.203335,0.201018,0.002001,0.000737,0.000737,0.000316,0.000211,0.000316,0.000316,...,0.000165,0.000071,0.000047,0.000071,0.000071,0.000118,0.000306,0.000094,0.000047,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35667,Vale of Glamorgan 005G,0.093265,0.087003,0.006152,0.000494,0.004779,0.000879,0.000000,0.000110,0.000110,...,0.001371,0.000252,0.000000,0.000032,0.000032,0.000047,0.000536,0.000047,0.000000,0.0
35668,Vale of Glamorgan 005H,0.092422,0.083896,0.008526,0.000633,0.006723,0.001169,0.000000,0.000000,0.000000,...,0.000870,0.000151,0.000000,0.000000,0.000000,0.000032,0.000063,0.000006,0.000000,0.0
35669,Vale of Glamorgan 014G,0.205421,0.195955,0.007971,0.001329,0.005978,0.000498,0.000166,0.001495,0.001495,...,0.001032,0.000086,0.000029,0.000258,0.000258,0.000057,0.000315,0.000029,0.000029,0.0
35670,Vale of Glamorgan 014H,0.094213,0.088211,0.006003,0.000586,0.005051,0.000073,0.000293,0.000000,0.000000,...,0.001100,0.000016,0.000064,0.000000,0.000000,0.000032,0.000175,0.000016,0.000000,0.0
