# Filter raw data

Run this notebook locally, once, to create the filtered versions of the raw data, so that the data files are small enough for our commits to be pushed to GitHub.

We filtered the columns down to only those necessary for the analysis, and kept only the English postcodes.

To save more space, we downcast the numerical columns. All files are saved in the folder `data/filtered`.

We were able to cut down data size from 1.14GB to 88MB.

In [1]:
import pandas as pd 
import numpy as np 
import os

In [2]:
# Create the folder for the filtered outputs.
# makedirs also creates the parent "data" folder when it is missing, and
# exist_ok=True makes the cell safe to re-run (no separate existence check).
os.makedirs('data/filtered', exist_ok=True)

In [3]:
def multi_assign(df, transform_fn, condition):
    """Return a copy of ``df`` with ``transform_fn`` applied to selected columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is never modified.
    transform_fn : callable
        Called with each selected column and must return its replacement.
    condition : callable
        Called once with the copied frame and must return the labels of the
        columns to transform.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every selected column has been replaced by the
        result of ``transform_fn``; all other columns are left as they were.
    """
    result = df.copy()

    # Compute every replacement from the untouched copy first, so no
    # transform can observe a column that has already been replaced.
    replacements = {
        col: transform_fn(result[col])
        for col in condition(result)
    }

    # Assign by label instead of ``DataFrame.assign(**replacements)``:
    # keyword arguments must be strings, so the keyword form raises a
    # TypeError for integer (or other non-string) column labels, and it
    # also makes a second full copy of the frame.
    for col, values in replacements.items():
        result[col] = values

    return result
           
def downcast_all(df, target_type, inital_type=None):
    """Downcast every column of one numeric kind to a smaller dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is never modified.
    target_type : str
        Passed as ``downcast=`` to ``pd.to_numeric``. Use "float",
        "integer", "signed" or "unsigned" (note: "integer", not "int").
    inital_type : str, optional
        dtype selector handed to ``DataFrame.select_dtypes`` to choose which
        columns are converted. Defaults to ``target_type``, except that
        "signed" and "unsigned" default to "integer".

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected columns passed through
        ``pd.to_numeric(..., downcast=target_type)``.
    """
    if inital_type is None:
        # "signed"/"unsigned" are downcast modes, not dtype names that
        # select_dtypes understands, so those modes look for integer columns.
        default_selectors = {"signed": "integer", "unsigned": "integer"}
        inital_type = default_selectors.get(target_type, target_type)

    def to_smaller(column):
        return pd.to_numeric(column, downcast=target_type)

    def matching_columns(frame):
        return list(frame.select_dtypes(include=[inital_type]).columns)

    # multi_assign works on its own copy of df, so no extra copy is made here.
    return multi_assign(df, to_smaller, matching_columns)

## Postcode data
This raw data has been sourced from the UK ONS. We keep only the English postcodes and only the columns needed for our analysis.

In [4]:

# Columns kept in the filtered output
pcd_columns = ['pcds', 'lsoa11', 'lat', 'long']

# Read the raw data, loading only the columns we keep plus 'ctry' (needed for
# the country filter) instead of every column of the file, so the columns
# that would be dropped anyway never reach memory.
pcd_raw = pd.read_csv(
    "data/raw/NSPL_NOV_2019_UK/NSPL_NOV_2019_UK_RAW.csv",
    usecols=pcd_columns + ['ctry'],
    low_memory=False,
)

# Keep only England postcodes and the needed columns, reset the index to the
# default one, then downcast all numerical columns (floats, then integers).
pcd = (
    pcd_raw
    .loc[pcd_raw['ctry'] == 'E92000001', pcd_columns]
    .reset_index(drop=True)
    .pipe(downcast_all, "float")
    .pipe(downcast_all, "integer")
)

# Write into pickle file
pcd.to_pickle("data/filtered/postcode_filtered.pickle")


## Flood data
This data has been sourced from https://www.getthedata.com/flood-map/PE.

In [5]:
# Load the raw flood-risk file
flood_raw = pd.read_csv('data/raw/open_flood_risk_by_postcode.csv')

# Give the ten columns our own names.
# NOTE(review): this overwrites the names parsed from the file's first row;
# if the CSV has no header row, that first record is lost - confirm.
flood_raw.columns = [
    'postcode', 'id', 'flood_risk', 'suitability', 'date',
    'risk_for_insurance', 'easting', 'northing', 'latitude', 'longitude',
]

# Encode the two columns of interest as integers to save space.
# Risk labels map to 1..4; any other value falls back to 0.
risk_codes = {'Very Low': 1, 'Low': 2, 'Medium': 3, 'High': 4}
flood_raw['flood_risk_int'] = np.select(
    condlist=[flood_raw['flood_risk'] == label for label in risk_codes],
    choicelist=list(risk_codes.values()),
    default=0,
).astype(int)

# 1 when the insurance flag is exactly 'Yes', otherwise 0
flood_raw['risk_for_insurance_int'] = (
    flood_raw['risk_for_insurance'].eq('Yes').astype(int)
)

# Keep only the necessary columns, restore the default index and downcast
# the numerical columns (floats first, then integers)
flood = (
    flood_raw[['postcode', 'flood_risk_int', 'risk_for_insurance_int']]
    .reset_index(drop=True)
    .pipe(downcast_all, "float")
    .pipe(downcast_all, "integer")
)

# Save the filtered frame
flood.to_pickle("data/filtered/flood_risk_filtered.pickle")

## Elevation data
The raw data was sourced from https://data.world/

In [6]:
# Load the raw elevation file, supplying the two column names ourselves,
# and downcast the numerical columns (floats first, then integers)
elevation_raw = (
    pd.read_csv(
        'data/raw/open_postcode_elevation.csv',
        names=['postcode', 'elevation'],
    )
    .pipe(downcast_all, "float")
    .pipe(downcast_all, "integer")
)

# Save the filtered frame
elevation_raw.to_pickle("data/filtered/elevation_filtered.pickle")

## Pets data
The raw data was sourced from https://data.world/

In [7]:
# Load the two raw pet files
cats_raw = pd.read_csv('data/raw/cat-population-per-postcode-district-1.csv')
dogs_raw = pd.read_csv(
    'data/raw/dogs-per-household-per-postcode-district-lower-95th-percentile-1.csv'
)

# Give both frames our own column names; 'district' is the shared key
cats_raw.columns = ['district', 'estimated_cat_population']
dogs_raw.columns = ['district', 'dog_per_household_lower95']

# The cat population is stored as text with comma separators:
# strip the commas and convert to float
cats_raw['estimated_cat_population'] = (
    cats_raw['estimated_cat_population']
    .str.replace(',', '')
    .astype(float)
)

# Join the two frames on district (inner join: only districts present in
# both are kept), then downcast the numerical columns
pets = (
    pd.merge(cats_raw, dogs_raw, how='inner', on='district')
    .pipe(downcast_all, "float")
    .pipe(downcast_all, "integer")
)

# Save the filtered frame
pets.to_pickle("data/filtered/pets_filtered.pickle")

## Index of Multiple Deprivation data
The raw data was sourced from https://www.gov.uk/government/statistics/english-indices-of-deprivation-2019

In [8]:
# Load the domains sheet, discard the descriptive columns we do not need,
# and downcast the numerical columns (floats first, then integers)
imd_raw = (
    pd.read_excel(
        "data/raw/File_2_-_IoD2019_Domains_of_Deprivation.xlsx",
        sheet_name='IoD2019 Domains',
    )
    .drop(columns=[
        'LSOA name (2011)',
        'Local Authority District code (2019)',
        'Local Authority District name (2019)',
    ])
    .pipe(downcast_all, "float")
    .pipe(downcast_all, "integer")
)

# Save the filtered frame
imd_raw.to_pickle("data/filtered/imd_filtered.pickle")