# functions library

In [1]:
import os, pathlib
import pandas as pd
import warnings

In [2]:
# Base directory for all census files; read_csv calls below build paths relative to it.
root = pathlib.Path("data")

In [3]:
# Module-level copy of the code catalogue. import_data and import_all_data
# re-read the same file into a local variable that is also named `codes`.
codes = pd.read_csv(root / "census_codes.csv")

In [4]:
# Geography level name -> short region code used in the data file names.
region_sizes = {
    "Country": "ctry",
    "Region": "rgn",
    "Upper-Tier Local Authority": "utla",      # 153 in England
    "Lower-Tier Local Authority": "ltla",      # 296 in England
    "Middle-Layer Super Output Area": "msoa",  # 2000-6000 households; 5000-15000 persons
    "Lower-Layer Super Output Area": "lsoa",   # 400-1200 households; 1000-3000 persons
    "Output Area": "oa",                       # 40-250 households; 100-625 persons
}

# Number of rows in a table -> region code; factor_in_age looks up
# len(df.index) here to work out which geography level a table uses.
region_indices = {
    3: "ctry",
    10: "rgn",
    174: "utla",
    331: "ltla",
    7264: "msoa",
    35672: "lsoa",
    188800: "oa",
}

# All accepted region codes, in the order they are declared in region_sizes.
regions = [*region_sizes.values()]


In [5]:
def import_data(region : str, target_groups : list = None, target_codes: list = None):
    '''Imports target census data

    Parameters:
        region, should be string of one of the following: ctry, rgn, utla, ltla, msoa, lsoa, oa
        target_groups, should be a list of strings of the descriptions of the data values as found at https://www.nomisweb.co.uk/census/2021/bulk
        target_codes, should be a list of strings of the codes of the data values as found at https://www.nomisweb.co.uk/census/2021/bulk
    Note: at least one of the two target parameters must be passed and non-empty

    Returns dictionary of "Data type code" : Pandas DataFrame (the "date" column is dropped)

    Raises ValueError if no targets are given, if the region code is unknown,
    or if none of the entries of a supplied target list are found in census_codes.csv.
    Emits a warning for each partially invalid target list and for each data file that is missing.
    '''
    # Empty lists are rejected here as well as None: there is nothing to import either way.
    if not target_groups and not target_codes:
        raise ValueError("Need specified groups to import")

    region = str(region)
    if region not in regions:
        regions_error = '\n'.join(f"{key}  :  {val}" for key, val in region_sizes.items())
        raise ValueError(f"Region code must be one of the following: \nRegion  :  Region Code\n{regions_error}")

    codes = pd.read_csv(root / "census_codes.csv")
    # Keep only the part of the file name before the first "." (used as the folder name below).
    codes["Filename"] = (
        codes["Filename"].str.split(".")
        .apply(lambda x: x[0])
    )

    # Always defined, so the group-only call path has a list to append to.
    valid_target_groups = []
    valid_target_codes = []

    if target_groups:
        valid_target_groups = [group for group in target_groups if group in codes["Description"].values]
        if not valid_target_groups:
            raise ValueError("Inputs not found in code database, ensure the group names are as found on https://www.nomisweb.co.uk/census/2021/bulk")
        erroneous_groups = [group for group in target_groups if group not in valid_target_groups]
        if erroneous_groups:
            output_erroneous_groups = '\n'.join(map(str, erroneous_groups))
            warnings.warn(f"The following groups were not imported as they are invalid: {output_erroneous_groups}\nEnsure the group names are as found on https://www.nomisweb.co.uk/census/2021/bulk")

    if target_codes:
        valid_target_codes = [code for code in target_codes if code in codes["Code"].values]
        if not valid_target_codes:
            raise ValueError("Inputs not found in code database, ensure codes are in the form 'TSXXX' where X's are digits, as found on https://www.nomisweb.co.uk/census/2021/bulk")
        erroneous_codes = [code for code in target_codes if code not in valid_target_codes]
        if erroneous_codes:
            output_erroneous_codes = '\n'.join(map(str, erroneous_codes))
            warnings.warn(f"The following codes were not imported as they are invalid: {output_erroneous_codes}\nEnsure codes are in the form 'TSXXX' where X's are digits, as found on https://www.nomisweb.co.uk/census/2021/bulk")

    # Translate each valid description into its code; skip codes already
    # requested explicitly so the same file is not read twice.
    for group in valid_target_groups:
        group_code = codes.loc[codes["Description"] == group, "Code"].item()
        if group_code not in valid_target_codes:
            valid_target_codes.append(group_code)

    data = {}
    for code in valid_target_codes:
        folder = codes.loc[codes["Code"] == code, "Filename"].item()
        try:
            data[code] = pd.read_csv(root / folder / f"{folder}-{region}.csv").drop(columns="date")
        except FileNotFoundError:
            # A missing file is reported but does not stop the remaining imports.
            warnings.warn(f"File {folder}-{region}.csv not found in data/{folder}")

    return data



In [6]:
def import_all_data(region : str):
    '''Imports every census table listed in census_codes.csv for one geography level

    Parameters:
        region, should be string of one of the following: ctry, rgn, utla, ltla, msoa, lsoa, oa

    Returns dictionary of "Data type code" : Pandas DataFrame (the "date" column is dropped).
    Tables whose file is missing are skipped with a warning.
    '''
    region = str(region)
    if region not in regions:
        table_rows = [f"{key}  :  {val}" for key, val in region_sizes.items()]
        regions_error = '\n'.join(table_rows)
        raise ValueError(f"Region code must be one of the following: \nRegion  :  Region Code\n{regions_error}")

    catalogue = pd.read_csv(root / "census_codes.csv")
    # Folder names are the file names with everything from the first "." removed.
    name_parts = catalogue["Filename"].str.split(".")
    catalogue["Filename"] = name_parts.apply(lambda parts: parts[0])

    loaded = {}
    for code in catalogue["Code"].values:
        matches_code = catalogue["Code"] == code
        folder = catalogue.loc[matches_code, "Filename"].item()
        csv_path = root / folder / f"{folder}-{region}.csv"
        try:
            table = pd.read_csv(csv_path)
        except FileNotFoundError:
            warnings.warn(f"File {folder}-{region}.csv not found in data/{folder}")
            continue
        loaded[code] = table.drop(columns="date")

    return loaded



In [7]:
def cleanup_all(data : dict, remove_geography : bool = True, remove_geography_code : bool = True):
    '''
    Drops the geography columns from every dataframe in the dictionary

    Parameters:
        data, data to be cleaned, dictionary of "Data type code" : Pandas DataFrame
        remove_geography, whether to remove the "geography" column, bool
        remove_geography_code, whether to remove the "geography code" column, bool

    Returns the same dictionary, with each value replaced by its cleaned dataframe.
    A dataframe that lacks one of the columns is left as it is for that column.
    '''
    # Work out once which labels are to be removed, in the original order.
    unwanted = []
    if remove_geography:
        unwanted.append("geography")
    if remove_geography_code:
        unwanted.append("geography code")

    for code, frame in data.items():
        for label in unwanted:
            try:
                frame = frame.drop(columns=label)
            except KeyError:
                # Column not present in this table: nothing to remove.
                continue
        data[code] = frame
    return data



In [8]:
def cleanup(dataframe, columns : list):
    '''Removes specified columns from dataframe

    Parameters:
        dataframe, Pandas DataFrame to be cleaned
        columns, list of strings of column names to be removed (an empty list removes nothing)

    Returns a dataframe without the listed columns; the input dataframe is not modified.
    Raises ValueError if columns is not a list or contains a non-string entry.
    Emits a warning for each listed column that does not exist.
    '''
    if not isinstance(columns, list):
        raise ValueError("Parameter 'columns' must be a list")
    # Every entry must be a string, not just at least one of them.
    if not all(isinstance(column, str) for column in columns):
        raise ValueError("Parameter 'columns' must be a list of strings")
    for column in columns:
        try:
            dataframe = dataframe.drop(columns=column)
        except KeyError:
            # Missing columns are reported, and the remaining ones are still removed.
            warnings.warn(f"Column {column} does not exist ")
    return dataframe

In [9]:
def factor_in_age(df):
    '''Replaces a table's total with the TS004 total and records the difference

    The geography level is inferred from the number of rows in df (via region_indices)
    and the TS004 table for that level is imported. A "Not Accounted For" column is
    added holding (TS004 total - df total); df's own total column then takes the
    TS004 values and is renamed to "Total".

    Parameters:
        df, Pandas DataFrame with a column whose name contains "Total"; modified in place

    Returns the same (modified) dataframe.
    NOTE(review): local names in the original treated TS004 as age data, while the sample
    output elsewhere in this notebook shows TS004 with country-of-birth columns - confirm
    which table is intended.
    '''
    def _first_total_label(frame):
        # First column whose name contains "Total"; IndexError if there is none.
        return [label for label in list(frame.columns) if "Total" in label][0]

    region_code = region_indices[len(df.index)]
    reference = import_data(region_code, target_codes=["TS004"])["TS004"]
    reference_total = reference[_first_total_label(reference)]

    own_total_label = _first_total_label(df)
    df["Not Accounted For"] = reference_total - df[own_total_label]
    df[own_total_label] = reference_total
    df.rename(columns={own_total_label:"Total"}, inplace=True)

    return df

   

In [10]:
def prob_calc(dataframe):
    '''Converts a table of counts into a table of proportions of the row total

    The first two columns (region label and total) are carried over unchanged;
    every later column is divided, row by row, by the value in the second column.

    Parameters:
        dataframe, Pandas DataFrame laid out as [label, total, count, count, ...]

    Returns a new Pandas DataFrame with the same columns and index; the input is not modified.
    Raises IndexError if the dataframe has fewer than two columns.
    '''
    total = dataframe.iloc[:, 1]
    leading = dataframe.iloc[:, :2]
    # Vectorised row-wise division. Building a fresh frame (instead of writing
    # cell by cell into a frame that aliases the input) keeps the caller's counts
    # intact and works for any index, not just 0..n-1.
    shares = dataframe.iloc[:, 2:].div(total, axis=0)
    return pd.concat([leading, shares], axis=1)

In [48]:
def combineprobabilitytables(inputdata1, inputdata2):
    '''Builds a table of pairwise products of the proportions from two count tables

    Both inputs are first passed through prob_calc. For every pair of value columns
    (one from each table) the two proportion columns are multiplied row by row and
    stored under the name "<col1> and <col2>".

    Parameters:
        inputdata1, Pandas DataFrame laid out as [label, total, count, ...]
        inputdata2, Pandas DataFrame with the same layout and rows

    Returns Pandas DataFrame with columns "Region" and "Total Population"
    (the first two columns of the first table) followed by one column per pair.
    '''
    df1 = prob_calc(inputdata1)
    df2 = prob_calc(inputdata2)

    # Column names of each table; the first two are label and total, the rest are values.
    df1_names = df1.columns.values.tolist()
    df2_names = df2.columns.values.tolist()

    leading = pd.DataFrame({
        "Region": df1[df1_names[0]],
        "Total Population": df1[df1_names[1]],
    })

    # Collect every product first and concatenate once: calling pd.concat inside
    # the nested loop would copy the growing result on every iteration.
    pieces = [leading]
    for col1 in df1_names[2:]:
        for col2 in df2_names[2:]:
            product = df1[col1] * df2[col2]
            pieces.append(product.rename(col1 + " and " + col2))

    return pd.concat(pieces, axis=1)

In [49]:
# Load tables TS004 and TS037 at region level ("rgn"); None is target_groups,
# the list is target_codes. create_prob_table reads this module-level `data`.
data = import_data("rgn", None, ["TS004", "TS037"])
# Keep the "geography" column, drop only "geography code"
# (remove_geography=False, remove_geography_code=True).
data = cleanup_all(data, False, True)

# Notebook cell output: the combined table for the two datasets.
combineprobabilitytables(data["TS004"], data["TS037"])

Unnamed: 0,Region,Total Population,Country of birth: Europe; measures: Value and General health: Very good health,Country of birth: Europe; measures: Value and General health: Good health,Country of birth: Europe; measures: Value and General health: Fair health,Country of birth: Europe; measures: Value and General health: Bad health,Country of birth: Europe; measures: Value and General health: Very bad health,Country of birth: Europe: United Kingdom; measures: Value and General health: Very good health,Country of birth: Europe: United Kingdom; measures: Value and General health: Good health,Country of birth: Europe: United Kingdom; measures: Value and General health: Fair health,...,Country of birth: Antarctica and Oceania (including Australasia) and Other; measures: Value and General health: Very good health,Country of birth: Antarctica and Oceania (including Australasia) and Other; measures: Value and General health: Good health,Country of birth: Antarctica and Oceania (including Australasia) and Other; measures: Value and General health: Fair health,Country of birth: Antarctica and Oceania (including Australasia) and Other; measures: Value and General health: Bad health,Country of birth: Antarctica and Oceania (including Australasia) and Other; measures: Value and General health: Very bad health,Country of birth: British Overseas ; measures: Value and General health: Very good health,Country of birth: British Overseas ; measures: Value and General health: Good health,Country of birth: British Overseas ; measures: Value and General health: Fair health,Country of birth: British Overseas ; measures: Value and General health: Bad health,Country of birth: British Overseas ; measures: Value and General health: Very bad health
0,North East,2647013,0.427175,0.32059,0.143242,0.051804,0.015414,0.415628,0.311924,0.13937,...,0.000501,0.000376,0.000168,6.1e-05,1.8e-05,4.6e-05,3.5e-05,1.5e-05,6e-06,2e-06
1,North West,7417399,0.443078,0.303584,0.124034,0.044244,0.013308,0.421636,0.288893,0.118032,...,0.000646,0.000442,0.000181,6.4e-05,1.9e-05,0.000121,8.3e-05,3.4e-05,1.2e-05,4e-06
2,Yorkshire and The Humber,5480774,0.431241,0.320402,0.127919,0.041943,0.012282,0.409387,0.304165,0.121436,...,0.000651,0.000484,0.000193,6.3e-05,1.9e-05,0.000109,8.1e-05,3.2e-05,1.1e-05,3e-06
3,East Midlands,4880054,0.428388,0.322631,0.125818,0.038491,0.011009,0.39902,0.300514,0.117193,...,0.000664,0.0005,0.000195,6e-05,1.7e-05,0.00016,0.00012,4.7e-05,1.4e-05,4e-06
4,West Midlands,5950759,0.418999,0.310489,0.122302,0.03945,0.011834,0.393608,0.291674,0.114891,...,0.000581,0.00043,0.000169,5.5e-05,1.6e-05,0.000216,0.00016,6.3e-05,2e-05,6e-06
5,East,6335074,0.446153,0.32011,0.115472,0.03307,0.009259,0.410843,0.294775,0.106333,...,0.00122,0.000875,0.000316,9e-05,2.5e-05,0.000114,8.2e-05,3e-05,8e-06,2e-06
6,London,8799726,0.40107,0.238512,0.07736,0.02416,0.007845,0.317908,0.189057,0.06132,...,0.003909,0.002324,0.000754,0.000235,7.6e-05,0.000365,0.000217,7e-05,2.2e-05,7e-06
7,South East,9278065,0.454765,0.309519,0.107123,0.030123,0.008448,0.420998,0.286537,0.099169,...,0.001882,0.001281,0.000443,0.000125,3.5e-05,0.000187,0.000127,4.4e-05,1.2e-05,3e-06
8,South West,5701186,0.45157,0.324549,0.124507,0.03744,0.010667,0.427408,0.307184,0.117845,...,0.001473,0.001059,0.000406,0.000122,3.5e-05,0.000159,0.000115,4.4e-05,1.3e-05,4e-06
9,Wales,3107494,0.445113,0.311564,0.139332,0.050849,0.016043,0.430216,0.301136,0.134669,...,0.000676,0.000473,0.000212,7.7e-05,2.4e-05,4.7e-05,3.3e-05,1.5e-05,5e-06,2e-06


In [39]:
# Takes a list of characteristic codes as input.
# Assumes the data has been imported via the import_data or import_all_data functions.
# Assumes the geography code column has been removed by calling cleanup_all(data, False, True).
# Assumes factor_in_age has been called where applicable ---> find a way to do this
# automatically by checking the header of the total column?

def create_prob_table(list):
    '''Combines the tables for several characteristic codes into one probability table

    Parameters:
        list, sequence of at least two codes that are keys of the module-level `data` dictionary

    Returns Pandas DataFrame as produced by combineprobabilitytables, with one
    column per combination of one value column from each table.
    Raises ValueError if fewer than two codes are given.
    '''
    # The parameter name shadows the builtin `list`; it is kept so existing
    # callers keep working, and aliased so the body reads clearly.
    table_codes = list
    if len(table_codes) < 2:
        raise ValueError("Need at least two characteristic codes to build a combined probability table")

    dfprobability = combineprobabilitytables(data[table_codes[0]], data[table_codes[1]])

    for code in table_codes[2:]:
        # combineprobabilitytables divides both inputs by their second column.
        # The running table already holds proportions, so scale it back by the
        # population first; otherwise it would be divided by the population twice.
        population = dfprobability.iloc[:, 1]
        rescaled = pd.concat(
            [dfprobability.iloc[:, :2], dfprobability.iloc[:, 2:].mul(population, axis=0)],
            axis=1,
        )
        dfprobability = combineprobabilitytables(rescaled, data[code])

    return dfprobability

In [46]:
def risk_assessment(dataframe):
    '''Returns a styled view of the dataframe with risk labels on small values

    Values of 10 or less are shown as "High risk: <value>", values of 100 or less
    as "Low risk: <value>", and anything else (including values that cannot be
    compared with a number) is shown unchanged. The view is captioned "Risk Level"
    and given a row-wise "autumn" background gradient.

    Parameters:
        dataframe, Pandas DataFrame to display

    Returns the result of dataframe.style.pipe(...), i.e. the styler object.
    '''
    def _label_cell(value):
        try:
            if value <= 10:
                return f"High risk: {value}"
            if value <= 100:
                return f"Low risk: {value}"
        except TypeError:
            # Non-numeric cells (e.g. region names) are displayed as they are.
            pass
        return value

    def _decorate(styler):
        styler.set_caption("Risk Level")
        styler.format(_label_cell)
        styler.background_gradient(axis=1, cmap="autumn")
        return styler

    styled = dataframe.style.pipe(_decorate)
    return styled