In [46]:
import os, pathlib
import pandas as pd
import warnings

In [47]:
root = pathlib.Path("data")

In [48]:
codes = pd.read_csv(root / "census_codes.csv")
codes

Unnamed: 0,Code,Description,Filename
0,TS001,Number of usual residents in households and co...,census2021-ts001.zip
1,TS002,Legal partnership status,census2021-ts002.zip
2,TS003,Household composition,census2021-ts003.zip
3,TS004,Country of birth,census2021-ts004.zip
4,TS005,Passports held,census2021-ts005.zip
...,...,...,...
72,TS039,Provision of unpaid care,census2021-ts039.zip
73,TS040,Number of disabled people in the household,census2021-ts040.zip
74,TS037ASP,General health - age-standardised proportions,census2021-ts037asp.zip
75,TS038ASP,Disability - age-standardised proportions,census2021-ts038asp.zip


In [49]:
codes["Filename"] = (
    codes["Filename"].str.split(".")
    .apply(lambda x: x[0])
)
codes


Unnamed: 0,Code,Description,Filename
0,TS001,Number of usual residents in households and co...,census2021-ts001
1,TS002,Legal partnership status,census2021-ts002
2,TS003,Household composition,census2021-ts003
3,TS004,Country of birth,census2021-ts004
4,TS005,Passports held,census2021-ts005
...,...,...,...
72,TS039,Provision of unpaid care,census2021-ts039
73,TS040,Number of disabled people in the household,census2021-ts040
74,TS037ASP,General health - age-standardised proportions,census2021-ts037asp
75,TS038ASP,Disability - age-standardised proportions,census2021-ts038asp


In [50]:
region_sizes = {"Country" : "ctry",
                "Region" : "rgn",
                "Upper-Tier Local Authority" : "utla",      #153 in England
                "Lower-Tier Local Authority" : "ltla",      #296 in England
                "Middle-Layer Super Output Area" : "msoa",  #2000-6000 households; 5000-15000 persons
                "Lower-Layer Super Output Area" : "lsoa",   #400-1200 households; 1000-3000 persons
                "Output Area" : "oa"}                       #40-250 households; 100-625 persons

regions = list(region_sizes.values())
regions

['ctry', 'rgn', 'utla', 'ltla', 'msoa', 'lsoa', 'oa']

In [110]:
def import_data(region : str, wanted_groups : list = None, wanted_codes: list = None): 
    if wanted_groups == None and wanted_codes == None:
        raise ValueError("Need specified groups to import")
    
    region = str(region)
    if region not in regions:
        regions_error = '\n'.join(f"{key}  :  {val}" for key, val in region_sizes.items())
        raise ValueError(f"Region code must be one of the following: \nRegion  :  Region Code\n{regions_error}")
    
    codes = pd.read_csv(root / "census_codes.csv")
    codes["Filename"] = (
    codes["Filename"].str.split(".")
    .apply(lambda x: x[0])
    )

    if wanted_groups:
        valid_wanted_groups = [group for group in wanted_groups if group in codes["Description"].values]
        if len(valid_wanted_groups) == 0:
            raise ValueError("Inputs not found in code database, ensure the group names are as found on https://www.nomisweb.co.uk/census/2021/bulk")
        erroneous_groups = [group for group in wanted_groups if group not in valid_wanted_groups]
        output_erroneous_groups = '\n'.join(erroneous_groups)
        if len(erroneous_groups) > 0:
            warnings.warn(f"The following groups were not imported as they are invalid: {output_erroneous_groups}\nEnsure the group names are as found on https://www.nomisweb.co.uk/census/2021/bulk")
        
    if wanted_codes:
        valid_wanted_codes = [code for code in wanted_codes if code in codes["Code"].values]
        if len(valid_wanted_codes) == 0:
            raise ValueError("Inputs not found in code database, ensure codes are in the form 'TSXXX' where X's are digits, as found on https://www.nomisweb.co.uk/census/2021/bulk")
        erroneous_codes = [code for code in wanted_codes if code not in valid_wanted_codes]
        output_erroneous_codes = '\n'.join(erroneous_codes)
        if len(erroneous_codes) > 0:
            warnings.warn(f"The following codes were not imported as they are invalid: {output_erroneous_codes}\nEnsure codes are in the form 'TSXXX' where X's are digits, as found on https://www.nomisweb.co.uk/census/2021/bulk")
    
    if wanted_groups:
        wanted_codes = wanted_codes if wanted_codes is not None else []
        for group in valid_wanted_groups:
            valid_wanted_codes.append(codes.loc[codes["Description"] == group,"Code"].item())

    data = {}
    for code in valid_wanted_codes:
        folder = codes.loc[codes["Code"] == code, "Filename"].item()
        try:
            data.update({code : pd.read_csv(root / folder / f"{folder}-{region}.csv").drop(columns="date")})
        except FileNotFoundError:
            warnings.warn(f"File {folder}-{region}.csv not found in data/{folder}")
    
    return data

def import_all_data(region : str):
    region = str(region)
    if region not in regions:
        regions_error = '\n'.join(f"{key}  :  {val}" for key, val in region_sizes.items())
        raise ValueError(f"Region code must be one of the following: \nRegion  :  Region Code\n{regions_error}")

    codes = pd.read_csv(root / "census_codes.csv")
    codes["Filename"] = (
    codes["Filename"].str.split(".")
    .apply(lambda x: x[0])
    )
    
    data = {}
    for code in codes["Code"].values:
        folder = codes.loc[codes["Code"] == code, "Filename"].item()
        try:
            data.update({code : pd.read_csv(root / folder / f"{folder}-{region}.csv").drop(columns="date")})
        except FileNotFoundError:
            warnings.warn(f"File {folder}-{region}.csv not found in data/{folder}")
    
    return data

def cleanup(data : dict, remove_geography : bool = True, remove_geography_code : bool = True):
    for key in data.keys():
        dataframe = data[key]
        try:
            if remove_geography:
                dataframe = dataframe.drop(columns="geography")
        except KeyError:
            pass
        try:
            if remove_geography_code:
                dataframe = dataframe.drop(columns="geography code")
        except KeyError:
            pass
        data[key] = dataframe
    return data

In [111]:
data = import_data("ctry", wanted_codes=["TS004"])
data["TS004"]

Unnamed: 0,geography,geography code,Country of birth: Total; measures: Value,Country of birth: Europe; measures: Value,Country of birth: Europe: United Kingdom; measures: Value,Country of birth: Europe: EU countries; measures: Value,Country of birth: Europe: EU countries: European Union EU14; measures: Value,Country of birth: Europe: EU countries: European Union EU8; measures: Value,Country of birth: Europe: EU countries: European Union EU2; measures: Value,Country of birth: Europe: EU countries: All other EU countries; measures: Value,Country of birth: Europe: Non-EU countries; measures: Value,Country of birth: Europe: Non-EU countries: All other non-EU countries; measures: Value,Country of birth: Africa; measures: Value,Country of birth: Middle East and Asia; measures: Value,Country of birth: The Americas and the Caribbean; measures: Value,Country of birth: Antarctica and Oceania (including Australasia) and Other; measures: Value,Country of birth: British Overseas ; measures: Value
0,England and Wales,K04000001,59597542,53722544,49579570,3643242,1620266,1225623,689094,108259,499732,499732,1584575,3311030,785754,172547,21092
1,England,E92000001,56490049,50730336,46687506,3551766,1578673,1190391,677216,105486,491064,491064,1555856,3241701,773377,168004,20775
2,Wales,W92000004,3107494,2992209,2892064,91477,41593,35233,11878,2773,8668,8668,28719,69329,12377,4543,317


In [112]:
data = cleanup(data, remove_geography_code=False)


Unnamed: 0,Country of birth: Total; measures: Value,Country of birth: Europe; measures: Value,Country of birth: Europe: United Kingdom; measures: Value,Country of birth: Europe: EU countries; measures: Value,Country of birth: Europe: EU countries: European Union EU14; measures: Value,Country of birth: Europe: EU countries: European Union EU8; measures: Value,Country of birth: Europe: EU countries: European Union EU2; measures: Value,Country of birth: Europe: EU countries: All other EU countries; measures: Value,Country of birth: Europe: Non-EU countries; measures: Value,Country of birth: Europe: Non-EU countries: All other non-EU countries; measures: Value,Country of birth: Africa; measures: Value,Country of birth: Middle East and Asia; measures: Value,Country of birth: The Americas and the Caribbean; measures: Value,Country of birth: Antarctica and Oceania (including Australasia) and Other; measures: Value,Country of birth: British Overseas ; measures: Value
0,59597542,53722544,49579570,3643242,1620266,1225623,689094,108259,499732,499732,1584575,3311030,785754,172547,21092
1,56490049,50730336,46687506,3551766,1578673,1190391,677216,105486,491064,491064,1555856,3241701,773377,168004,20775
2,3107494,2992209,2892064,91477,41593,35233,11878,2773,8668,8668,28719,69329,12377,4543,317
