# Check of postcode quality

In [1]:
# packages import
import pandas as pd 
import numpy as np 

## Data import
First, we import the filtered tables

In [2]:
# Load the five pre-filtered tables, in the same order as the names on the left.
# NOTE: the recorded run of this cell failed with
# "ValueError: unsupported pickle protocol: 5", i.e. the kernel that executed
# it could not read the pickle protocol these files were written with.
data, pets, flood, imd, elevation = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'data/filtered/postcode_filtered.pickle',
        'data/filtered/pets_filtered.pickle',
        'data/filtered/flood_risk_filtered.pickle',
        'data/filtered/imd_filtered.pickle',
        'data/filtered/elevation_filtered.pickle',
    )
)

ValueError: unsupported pickle protocol: 5

In [None]:
# Remember the initial number of postcode rows; it is compared against the
# row count after the joins to make sure no rows were added or lost.
rows_starting = len(data)

## Join data
### Create district and sector data
From the postcode data, we create district and sector datasets to make sure we have the full list of districts and sectors to join other tables.

In [108]:
# Let's create the district and sector columns.
# 'district' is the part of the postcode before the first space (e.g. "PE2").
# `n` is passed by keyword because pandas >= 2.0 no longer accepts it
# positionally; taking the first split element directly also removes the need
# for a temporary column and an in-place drop.
data['district'] = data['pcds'].str.split(' ', n=1).str[0]

# 'sector' (as named in this notebook) is the first two characters of the
# district with any digits removed, i.e. its leading letters (e.g. "PE").
data['sector'] = data['district'].str[0:2].str.replace(
    pat=r"[0-9]+",
    repl="",
    regex=True
)

In [86]:
# One row per distinct (district, sector) pair found in the postcode table
district_data = (
    data[['district', 'sector']]
    .drop_duplicates()
    .reset_index(drop=True)
    .copy()
)

In [89]:
# One row per distinct LSOA code found in the postcode table
lsoa_data = (
    data[['lsoa11']]
    .drop_duplicates()
    .reset_index(drop=True)
    .copy()
)

### Create full data
Now, let's create full tables, as most of them do not cover all postcodes/districts/sectors.
#### Pets

In [87]:
# join the district data to pets (left join keeps districts without pet figures)
district_pets = district_data.merge(
    pets,
    on='district',
    how='left'
)

has_pets = district_pets['estimated_cat_population'].notna()

# districts that do contain values
pets_existing = district_pets[has_pets].copy()
pets_existing['pets_value_from'] = 'raw_district'

# districts with missing values: keep BOTH keys. Selecting only 'sector' here
# would lose the 'district' column, and the sector-averaged rows could then
# never be matched when the table is joined back on 'district'.
pets_missing = district_pets.loc[~has_pets, ['district', 'sector']]

# group the existing values by sector and average the numeric columns only
# (the frame also holds string columns, which cannot be averaged)
pets_sector_avg = pets_existing.groupby('sector', as_index=False).mean(numeric_only=True)

# inner join: districts whose sector has no value at all are left out
pets_missing = pets_missing.merge(
    pets_sector_avg,
    how='inner',
    on='sector'
)
pets_missing['pets_value_from'] = 'avg_sector'

# put data back together with a clean, unique index
district_data = pd.concat(
    [pets_existing, pets_missing],
    ignore_index=True
)

# clean memory
del pets, district_pets, has_pets, pets_existing, pets_missing, pets_sector_avg

#### IMD

In [90]:
# Preview the first five rows of the IMD table to check its column names
imd.head(n=5)

Unnamed: 0,lsoa,imd_global_rank,imd_global_decile,imd_income_rank,imd_income_decile,imd_employment_rank,imd_employment_decile,imd_education_rank,imd_education_decile,imd_health_rank,imd_health_decile,imd_crime_rank,imd_crime_decile,imd_services_rank,imd_services_decile,imd_living_environment_rank,imd_living_environment_decile
0,E01000001,29199,9,32831,10,32742,10,32842,10,32113,10,32662,10,7319,3,7789,3
1,E01000002,30379,10,29901,10,31190,10,32832,10,29705,10,32789,10,11707,4,13070,4
2,E01000003,14915,5,18510,6,15103,5,26386,9,17600,6,29363,9,2157,1,4092,2
3,E01000005,8678,3,6029,2,7833,3,12370,4,17907,6,31059,10,2217,1,9397,3
4,E01000006,14486,5,14023,5,21692,7,17511,6,21581,7,18848,6,1033,1,10629,4


In [97]:
# Attach the IMD figures to the LSOA table.
# The IMD table names its key 'lsoa', so it is renamed to 'lsoa11' for the
# join; the inner join keeps only the LSOAs present in both tables.
lsoa_data = pd.merge(
    lsoa_data,
    imd.rename(columns={'lsoa': 'lsoa11'}),
    how='inner',
    on='lsoa11',
)

# the IMD table is not needed any more
del imd

#### Flood

In [109]:
# merge flood data on all postcodes (left join: postcodes without flood data
# are kept with missing values)
flood_pcd = data[['pcds', 'district', 'sector']].merge(
    flood,
    how='left',
    left_on='pcds',
    right_on='postcode'
)

# CREATE SUB TABLES BY DISTRICT, OR SECTOR

# rows where a flood value exists at postcode level
flood_known = flood_pcd[flood_pcd['flood_risk_int'].notna()]
risk_cols = ['flood_risk_int', 'risk_for_insurance_int']

# average by district. numeric_only=True is required because the frame also
# holds string columns (postcode, district, sector), which cannot be averaged.
flood_dst = flood_known.groupby('district').mean(numeric_only=True)
# round to nearest int so the averages stay on the integer risk scale
flood_dst[risk_cols] = flood_dst[risk_cols].round(0)

# average by sector, same treatment
flood_sct = flood_known.groupby('sector').mean(numeric_only=True)
flood_sct[risk_cols] = flood_sct[risk_cols].round(0)

# temporaries of this cell
del flood_known, risk_cols

In [None]:
### CREATE THE SUBSETS OF OUTPUT TABLE
# Fallback cascade: postcode value -> district average -> sector average.
# Every subset keeps the three key columns so it can be joined back on 'pcds'.
flood_keys = ["pcds", "district", "sector"]

# by postcode, when value exist by postcode
# (.copy() so that adding the label column does not write into a slice)
has_pcd_value = flood_pcd['flood_risk_int'].notna()
flood_pcd_e = flood_pcd[has_pcd_value].copy()
flood_pcd_e['flood_value_from'] = 'raw_postcode'

# when value missing by postcode
flood_pcd_m = flood_pcd[~has_pcd_value]

# join flood_pcd_m to district values
flood_pcd_m_dst = flood_pcd_m[flood_keys].merge(
    flood_dst,
    on='district',
    how='left'
)

# by district, when value exist averaged by district
has_dst_value = flood_pcd_m_dst['flood_risk_int'].notna()
flood_pcd_m_dst_e = flood_pcd_m_dst[has_dst_value].copy()
flood_pcd_m_dst_e['flood_value_from'] = 'average_district'

# when value is missing by district
flood_pcd_m_dst_m = flood_pcd_m_dst[~has_dst_value]

# join the remaining rows to sector values.
# All key columns are kept: selecting only "sector" would drop 'pcds' and the
# sector-filled rows could no longer be joined back to their postcode.
flood_pcd_m_dst_m_sct = flood_pcd_m_dst_m[flood_keys].merge(
    flood_sct,
    on="sector",
    how="left"
)

# by sector, when value exist averaged by sector
has_sct_value = flood_pcd_m_dst_m_sct['flood_risk_int'].notna()
flood_pcd_m_dst_m_sct_e = flood_pcd_m_dst_m_sct[has_sct_value].copy()
flood_pcd_m_dst_m_sct_e['flood_value_from'] = 'average_sector'

# when value is missing by sector (these rows are not part of flood_final)
flood_pcd_m_dst_m_sct_m = flood_pcd_m_dst_m_sct[~has_sct_value]

print(f"We have {flood_pcd_m_dst_m_sct_m.shape[0]} rows with missing sector values of flood")

# group together
flood_final = pd.concat(
    [
        flood_pcd_e,
        flood_pcd_m_dst_e,
        flood_pcd_m_dst_m_sct_e
    ],
    ignore_index=True
)

# delete un-necessary
del (flood_pcd, flood_dst, flood_sct, flood_pcd_e, flood_pcd_m, flood_pcd_m_dst,
     flood_pcd_m_dst_e, flood_pcd_m_dst_m, flood_pcd_m_dst_m_sct,
     flood_pcd_m_dst_m_sct_e, flood_pcd_m_dst_m_sct_m,
     flood_keys, has_pcd_value, has_dst_value, has_sct_value)

#### Elevation

In [None]:
# merge on all postcodes
elev_pcd = data[['pcds', 'district','sector']].merge(
    flood,
    how = 'left',
    left_on = 'pcds',
    right_on = 'postcode'
)

# CREATE SUB TABLES BY DISTRICT, OR SECTOR

# take the subdata where values are not missing and group by district
elev_dst = elev_pcd[ ~elev_pcd['flood_risk_int'].isna()].\
    groupby(['district']).agg(np.mean)
# round to nearest int
elev_dst['flood_risk_int'] = np.round(elev_dst['flood_risk_int'],0)
elev_dst['risk_for_insurance_int'] = np.round(elev_dst['risk_for_insurance_int'],0)

# take the subdata where values are not missing and group by sector
elev_sct = elev_pcd[ ~elev_pcd['flood_risk_int'].isna()].\
    groupby(['sector']).agg(np.mean)
# round to nearest int
elev_sct['flood_risk_int'] = np.round(elev_sct['flood_risk_int'],0)
elev_sct['risk_for_insurance_int'] = np.round(elev_sct['risk_for_insurance_int'],0)


In [None]:
### CREATE THE SUBSETS OF OUTPUT TABLE

# by postcode, when value exist by postcode
elev_pcd_e = elev_pcd[ ~elev_pcd['flood_risk_int'].isna()]
elev_pcd_e['flood_value_from'] = 'raw_postcode'

# when value missing by postcode
elev_pcd_m = elev_pcd[ elev_pcd['flood_risk_int'].isna()]

# join flood_pcd_m to district values
elev_pcd_m_dst = elev_pcd_m[["pcds","district", "sector"]].merge(
    elev_dst,
    on = 'district',
    how = 'left'
)

# by district, when value exist averaged by district
elev_pcd_m_dst_e = elev_pcd_m_dst[ ~elev_pcd_m_dst['flood_risk_int'].isna()]
elev_pcd_m_dst_e['flood_value_from'] = 'average_district'

# when value is missing by district
elev_pcd_m_dst_m = elev_pcd_m_dst[ elev_pcd_m_dst['flood_risk_int'].isna()]

# join flood_sct to sector values
elev_pcd_m_dst_m_sct = elev_pcd_m_dst_m[["sector"]].merge(
    elev_sct,
    on = "sector",
    how = "left"
)

# by sector, when value exist averaged by sector
elev_pcd_m_dst_m_sct_e = elev_pcd_m_dst_m_sct[ ~elev_pcd_m_dst_m_sct['flood_risk_int'].isna()]
elev_pcd_m_dst_m_sct_e['flood_value_from'] = 'average_sector'

# when value is missing by sector
elev_pcd_m_dst_m_sct_m = elev_pcd_m_dst_m_sct[ elev_pcd_m_dst_m_sct['flood_risk_int'].isna()]

print(f"We have {elev_pcd_m_dst_m_sct_m.shape[0]} rows with missing sector values of flood")

# group together
elev_final = pd.concat(
    [
        elev_pcd_e,
        elev_pcd_m_dst_e,
        elev_pcd_m_dst_m_sct_e
    ]
)

# delete un-necessary 
del (elev_pcd, elev_dst, elev_sct, elev_pcd_e, elev_pcd_m, elev_pcd_m_dst,
     elev_pcd_m_dst_e, elev_pcd_m_dst_m, elev_pcd_m_dst_m_sct,
     elev_pcd_m_dst_m_sct_e, elev_pcd_m_dst_m_sct_m)

#### Full join

In [None]:
# Left-join every enriched table back onto the postcode list.
# The chain is wrapped in parentheses (the previous "./" continuation was a
# syntax error). Columns that `data` already carries, and the source tables'
# own postcode key, are dropped from the right-hand tables first; otherwise the
# merges would produce *_x / *_y duplicates and break the later lookups on
# 'district' and 'sector'.
duplicate_keys = ['district', 'sector', 'postcode']

data = (
    data[['pcds', 'lsoa11', 'district', 'sector', 'lat', 'long']]
    .merge(
        district_data.drop(columns=['sector'], errors='ignore'),
        on='district',
        how='left'
    )
    .merge(
        lsoa_data,
        on='lsoa11',
        how='left'
    )
    .merge(
        flood_final.drop(columns=duplicate_keys, errors='ignore'),
        on='pcds',
        how='left'
    )
    .merge(
        elev_final.drop(columns=duplicate_keys, errors='ignore'),
        on='pcds',
        how='left'
    )
)

del duplicate_keys

In [None]:
# sanity check: the joins must not have added or removed any postcode row
len(data) == rows_starting

## Analysis

### postcode input

In [None]:
# Postcode to report on; it is matched exactly against the 'pcds' column
my_postcode = 'PE2 6SX'

### Results

In [None]:
# get postcode line (kept as a DataFrame, as the following cells expect)
get_data = data[data["pcds"] == my_postcode]
if get_data.empty:
    raise ValueError(f"Postcode {my_postcode!r} was not found in the data")

# get district: all rows that share the postcode's district.
# The district is extracted as a scalar first, because comparing the full
# column with a one-row Series raises
# "Can only compare identically-labeled Series objects".
my_district = data[data['district'] == get_data['district'].iloc[0]]

In [None]:
# results
# labels for the integer flood-risk scale
dict_floor = {
    0 : 'None',
    1 : 'Very Low',
    2 : 'Low',
    3 : 'Medium',
    4 : 'High'
}

if get_data.empty:
    raise ValueError(f"Postcode {my_postcode!r} was not found in the data")

# Work on the single matching row so the report shows scalar values instead
# of pandas Series reprs.
row = get_data.iloc[0]

# NOTE(review): the pets section tests 'estimated_cat_population'; confirm
# that these two column names exist in the pets table.
cat_col = 'cats_by_district'
dog_col = 'dogs_by_household'


def _district_percentile(column):
    """Return the percentage of districts whose `column` value is <= this postcode's."""
    if pd.isna(row[column]):
        return float('nan')
    # one value per district, since `data` repeats it for every postcode
    by_district = data.drop_duplicates('district')[column].dropna()
    return 100 * (by_district <= row[column]).mean()


cat_pct = _district_percentile(cat_col)
dog_pct = _district_percentile(dog_col)

# dict_floor is keyed by integers, so it is applied to the integer risk column
# NOTE(review): previously read 'flood_risk'; confirm 'flood_risk_int' is the intended column
flood_label = dict_floor.get(row['flood_risk_int'], 'Unknown')

print(f"""GENERAL INFO FOR POSTCODE {my_postcode}:

The LSOA is {row['lsoa11']}

The district is {row['district']}

The sector is {row['sector']}

The risk of flooding is set to '{flood_label}'. ({row['flood_value_from']})

The postcode is at {row['elevation']}m above the sea level. ({row['elev_value_from']})

We have {row[cat_col]} cats in the district. ({row['pets_value_from']})
This is the {cat_pct:.0f}th quantile in the cats' values distribution.

We have on avg {row[dog_col]} dogs by house in the district. ({row['pets_value_from']})
This is the {dog_pct:.0f}th quantile in the dogs' values distribution.
""")

In [None]:
# histogram of cat by district and red vertical line where our value is

# Same with dogs

In [None]:
# map where the postcode is


In [None]:
if get_data.empty:
    raise ValueError(f"Postcode {my_postcode!r} was not found in the data")

# Read the single matching row so that ranks are printed as plain values
# rather than pandas Series reprs.
imd_row = get_data.iloc[0]

print(f"""
Since the 1970s the Ministry of Housing, Communities and Local Government and its predecessors
have calculated local measures of deprivation in England.
There are 32844 areas in England, all listed from 1 (worst) to 32844 (best).
There are 7 domains of deprivation, which combine to create the Global one:
- Income
- Employment
- Education
- Health
- Crime
- Barriers to Housing and Services
- Living Environment

Globally the postcode's IMD ranking is {imd_row['imd_global_rank']}
""")
# add gauge IMD / 32844 * 100

# one line per domain; the names match the 'imd_<domain>_rank' columns
for domain in [
    'income',
    'employment',
    'education',
    'health',
    'crime',
    'services',
    'living_environment']:
    print(f"The {domain} IMD ranking is {imd_row['imd_' + domain + '_rank']}")

    # add gauge