# Check of postode quality

In [4]:
# packages import
import pandas as pd 
import numpy as np 

## Data import
First, we import the filtered tables

In [62]:
data = pd.read_csv('data/filtered/postcode_filtered.zip')
pets = pd.read_csv('data/filtered/pets_filtered.zip')
flood = pd.read_csv('data/filtered/flood_risk_filtered.zip')
imd = pd.read_csv('data/filtered/imd_filtered.zip')
elevation = pd.read_csv('data/filtered/elevation_filtered.zip')

In [63]:
rows_starting = data.shape[0]

## Join data
### Create district and sector data
From the postcode data, we create district and sector datasets to make sure we have the full list of districts and sectors to join other tables.

In [64]:
# Let's create the district and sector columns
data[['district', 'dis1']] = data['pcds'].str.split(pat = ' ',n = 1, expand=True)
data.drop(['dis1'], axis = 1, inplace = True)

data['sector'] = data['district'].str[0:2]
data['sector'] = data['sector'].str.replace(
    pat = r"[0-9]+", 
    repl = "",
    regex = True
)

In [35]:
# create data with one row per district
district_data = data[['district','sector']].\
    drop_duplicates().\
    reset_index(drop = True).copy()

In [36]:
# create data with one row per sector
lsoa_data = data[['lsoa11']].\
    drop_duplicates().\
    reset_index(drop = True).copy()

### Create full data
Now, let's create full tables, as most of them do not cover all postcodes/districts/sectors.
#### Pets

In [38]:
# join the district data to pets
pets_pcds = district_data.merge(
    pets,
    on = 'district',
    how = 'left'
)

# let's take the districts that do contain values
pets_existing = pets_pcds[~pets_pcds['estimated_cat_population'].isna()].copy()
pets_existing['pets_value_from'] = 'raw_district'

# rows that have missing values
pets_missing = pets_pcds[pets_pcds['estimated_cat_population'].isna()].copy()

# # group the existing values by sector and average the values
pets_sector_avg = pets_existing.groupby(['sector'])\
    .agg({
        'estimated_cat_population': 'mean',
        'dog_per_household_lower95': 'mean'
        }
    )
pets_missing = pets_missing[['district','sector']].merge(
    pets_sector_avg,
    how = 'inner',
    on = 'sector'
)
pets_missing['pets_value_from'] = 'avg_sector'


# put data back together
pets_pcds = pd.concat(
    [pets_existing,pets_missing]
)

# clean memory
del pets_existing, pets_missing, pets_sector_avg

  district sector  estimated_cat_population  dog_per_household_lower95  \
0      AL1     AL                   2570.54                   0.168922   
1     AL10     AL                   4027.15                   0.308025   
2      AL2     AL                   7389.91                   0.797166   
3      AL3     AL                   4061.57                   0.357930   
4      AL4     AL                   3089.77                   0.293759   

  pets_value_from  
0    raw_district  
1    raw_district  
2    raw_district  
3    raw_district  
4    raw_district  


#### IMD

In [41]:
imd.rename(
    columns = {'lsoa':'lsoa11'},
    inplace = True
)

# join the lsoa data to IMD data
lsoa_data = lsoa_data.merge(
    imd,
    on = 'lsoa11',
    how = 'inner'
)

del imd

#### Flood

In [46]:
# merge on all postcodes
flood_pcd = data[['pcds', 'district','sector']].merge(
    flood,
    how = 'left',
    left_on = 'pcds',
    right_on = 'postcode'
)

# CREATE SUB TABLES BY DISTRICT, OR SECTOR

# take the subdata where values are not missing and group by district
flood_dst = flood_pcd[ ~flood_pcd['flood_risk_int'].isna()]\
    .groupby(['district'])\
    .agg({
        'flood_risk_int': 'mean',
        'risk_for_insurance_int': 'mean'
        }
    )
# round to nearest int
flood_dst['flood_risk_int'] = np.round(flood_dst['flood_risk_int'],0)
flood_dst['risk_for_insurance_int'] = np.round(flood_dst['risk_for_insurance_int'],0)

# take the subdata where values are not missing and group by sector
flood_sct = flood_pcd[ ~flood_pcd['flood_risk_int'].isna()]\
    .groupby(['sector'])\
    .agg({
        'flood_risk_int': 'mean',
        'risk_for_insurance_int': 'mean'
        }
    )
# round to nearest int
flood_sct['flood_risk_int'] = np.round(flood_sct['flood_risk_int'],0)
flood_sct['risk_for_insurance_int'] = np.round(flood_sct['risk_for_insurance_int'],0)

In [47]:
### CREATE THE SUBSETS OF OUTPUT TABLE

# by postcode, when value exist by postcode
flood_pcd_e = flood_pcd[ ~flood_pcd['flood_risk_int'].isna()].copy()
flood_pcd_e['flood_value_from'] = 'raw_postcode'

# when value missing by postcode
flood_pcd_m = flood_pcd[ flood_pcd['flood_risk_int'].isna()].copy()

# join flood_pcd_m to district values
flood_pcd_m_dst = flood_pcd_m[["pcds","district", "sector"]].merge(
    flood_dst,
    on = 'district',
    how = 'left'
)

# by district, when value exist averaged by district
flood_pcd_m_dst_e = flood_pcd_m_dst[ ~flood_pcd_m_dst['flood_risk_int'].isna()].copy()
flood_pcd_m_dst_e['flood_value_from'] = 'average_district'

# when value is missing by district
flood_pcd_m_dst_m = flood_pcd_m_dst[ flood_pcd_m_dst['flood_risk_int'].isna()].copy()

# join flood_sct to sector values
flood_pcd_m_dst_m_sct = flood_pcd_m_dst_m[["sector"]].merge(
    flood_sct,
    on = "sector",
    how = "left"
)

# by sector, when value exist averaged by sector
flood_pcd_m_dst_m_sct_e = flood_pcd_m_dst_m_sct[ ~flood_pcd_m_dst_m_sct['flood_risk_int'].isna()].copy()
flood_pcd_m_dst_m_sct_e['flood_value_from'] = 'average_sector'

# when value is missing by sector
flood_pcd_m_dst_m_sct_m = flood_pcd_m_dst_m_sct[ flood_pcd_m_dst_m_sct['flood_risk_int'].isna()].copy()

print(f"We have {flood_pcd_m_dst_m_sct_m.shape[0]} rows with missing sector values of flood")

# group together
flood_final = pd.concat(
    [
        flood_pcd_e,
        flood_pcd_m_dst_e,
        flood_pcd_m_dst_m_sct_e
    ]
)

# delete un-necessary 
del (flood_dst, flood_sct, flood_pcd_e, flood_pcd_m, flood_pcd_m_dst,
     flood_pcd_m_dst_e, flood_pcd_m_dst_m, flood_pcd_m_dst_m_sct,
     flood_pcd_m_dst_m_sct_e, flood_pcd_m_dst_m_sct_m)

We have 0 rows with missing sector values of flood


#### Elevation

In [60]:
elevation.head()

Unnamed: 0,postcode,elevation
0,AB10 1AB,20
1,AB10 1AF,20
2,AB10 1AG,20
3,AB10 1AH,10
4,AB10 1AL,20


In [66]:
# merge on all postcodes
elev_pcd = data[['pcds', 'district','sector']].merge(
    elevation,
    how = 'left',
    left_on = 'pcds',
    right_on = 'postcode'
)

# CREATE SUB TABLES BY DISTRICT, OR SECTOR

# take the subdata where values are not missing and group by district
elev_dst = elev_pcd[ ~elev_pcd['elevation'].isna()]\
    .groupby(['district'])\
    .agg({
        'elevation': 'mean'
        }
    )
# round to nearest int
elev_dst['elevation'] = np.round(elev_dst['elevation'],0)

# take the subdata where values are not missing and group by sector
elev_sct = elev_pcd[ ~elev_pcd['elevation'].isna()]\
    .groupby(['sector'])\
    .agg({
        'elevation': 'mean'
        }
    )
# round to nearest int
elev_sct['elevation'] = np.round(elev_sct['elevation'],0)


In [68]:
### CREATE THE SUBSETS OF OUTPUT TABLE

# by postcode, when value exist by postcode
elev_pcd_e = elev_pcd[ ~elev_pcd['elevation'].isna()].copy()
elev_pcd_e['elevation_value_from'] = 'raw_postcode'

# when value missing by postcode
elev_pcd_m = elev_pcd[ elev_pcd['elevation'].isna()].copy()

# join elev_pcd_m to district values
elev_pcd_m_dst = elev_pcd_m[["pcds","district", "sector"]].merge(
    elev_dst,
    on = 'district',
    how = 'left'
)

# by district, when value exist averaged by district
elev_pcd_m_dst_e = elev_pcd_m_dst[ ~elev_pcd_m_dst['elevation'].isna()].copy()
elev_pcd_m_dst_e['elevation_value_from'] = 'average_district'

# when value is missing by district
elev_pcd_m_dst_m = elev_pcd_m_dst[ elev_pcd_m_dst['elevation'].isna()].copy()

# join elev_sct to sector values
elev_pcd_m_dst_m_sct = elev_pcd_m_dst_m[["sector"]].merge(
    elev_sct,
    on = "sector",
    how = "left"
)

# by sector, when value exist averaged by sector
elev_pcd_m_dst_m_sct_e = elev_pcd_m_dst_m_sct[ ~elev_pcd_m_dst_m_sct['elevation'].isna()].copy()
elev_pcd_m_dst_m_sct_e['elevation_value_from'] = 'average_sector'

# when value is missing by sector
elev_pcd_m_dst_m_sct_m = elev_pcd_m_dst_m_sct[ elev_pcd_m_dst_m_sct['elevation'].isna()].copy()

print(f"We have {elev_pcd_m_dst_m_sct_m.shape[0]} rows with missing sector values of elevation")

# group together
elev_final = pd.concat(
    [
        elev_pcd_e,
        elev_pcd_m_dst_e,
        elev_pcd_m_dst_m_sct_e
    ]
)

# delete un-necessary 
del (elev_dst, elev_sct, elev_pcd_e, elev_pcd_m, elev_pcd_m_dst,
     elev_pcd_m_dst_e, elev_pcd_m_dst_m, elev_pcd_m_dst_m_sct,
     elev_pcd_m_dst_m_sct_e, elev_pcd_m_dst_m_sct_m)

We have 0 rows with missing sector values of flood


#### Full join

In [69]:
data.head()

Unnamed: 0,pcds,lsoa11,lat,long,district,sector
0,AL1 1AA,E01023743,51.749084,-0.341337,AL1,AL
1,AL1 1AB,E01023667,51.74783,-0.301865,AL1,AL
2,AL1 1AD,E01023667,51.74783,-0.301865,AL1,AL
3,AL1 1AE,E01023667,51.74783,-0.301865,AL1,AL
4,AL1 1AF,E01023743,51.749084,-0.341337,AL1,AL


In [53]:
pets_pcds.head()

Unnamed: 0,district,sector,estimated_cat_population,dog_per_household_lower95,pets_value_from
0,AL1,AL,2570.54,0.168922,raw_district
1,AL10,AL,4027.15,0.308025,raw_district
2,AL2,AL,7389.91,0.797166,raw_district
3,AL3,AL,4061.57,0.35793,raw_district
4,AL4,AL,3089.77,0.293759,raw_district


In [74]:
data = data\
    .merge(
        pets_pcds[['district','estimated_cat_population','dog_per_household_lower95','pets_value_from']],
        on = 'district',
        how = 'left'
    )\
    .merge(
        lsoa_data,
        on = 'lsoa11',
        how = 'left'
    )\
    .merge(
        flood_final[['pcds','flood_risk_int','risk_for_insurance_int','flood_value_from']],
        on = 'pcds',
        how = 'left'
    )\
    .merge(
        elev_final[['pcds','elevation','elevation_value_from']],
        on = 'pcds',
        how = 'left'
    )

In [73]:
# check number of rows didn't change
data.shape[0] - rows_starting == 0

True

In [76]:
# check data quality
print("Describe:")
print(data.describe())


Describe:
                lat          long  estimated_cat_population  \
count  2.191924e+06  2.191924e+06              2.191924e+06   
mean   5.232197e+01 -1.218408e+00              4.810207e+03   
std    1.153853e+00  1.228830e+00              3.405291e+03   
min    4.989198e+01 -6.352647e+00              0.000000e+00   
25%    5.145023e+01 -2.118069e+00              2.359460e+03   
50%    5.203282e+01 -1.254042e+00              4.072930e+03   
75%    5.338127e+01 -1.992340e-01              6.498840e+03   
max    5.579742e+01  1.760443e+00              2.354445e+04   

       dog_per_household_lower95  imd_global_rank  imd_global_decile  \
count               2.191924e+06     2.191924e+06       2.191924e+06   
mean                4.108048e-01     1.631906e+04       5.472304e+00   
std                 3.085774e-01     9.300788e+03       2.817771e+00   
min                 0.000000e+00     1.000000e+00       1.000000e+00   
25%                 1.955238e-01     8.428000e+03       3.0000

In [79]:

print("Missing values:")
data.isna().sum()[data.isna().sum()>0]

Missing values:


flood_risk_int            76450
risk_for_insurance_int    76450
flood_value_from          76450
elevation                 76450
elevation_value_from      76450
dtype: int64

In [None]:
lsoa_data.head()

Unnamed: 0,lsoa11,imd_global_rank,imd_global_decile,imd_income_rank,imd_income_decile,imd_employment_rank,imd_employment_decile,imd_education_rank,imd_education_decile,imd_health_rank,imd_health_decile,imd_crime_rank,imd_crime_decile,imd_services_rank,imd_services_decile,imd_living_environment_rank,imd_living_environment_decile
0,E01023743,30564,10,31326,10,29785,10,32763,10,32740,10,14230,5,18610,6,18321,6
1,E01023667,30628,10,26110,8,29898,10,30669,10,32485,10,25090,8,23013,8,14879,5
2,E01023741,30070,10,29518,9,29644,10,31705,10,32056,10,8865,3,21913,7,24526,8
3,E01023684,28935,9,29266,9,29438,9,32336,10,32592,10,20634,7,5832,2,23889,8
4,E01023726,29172,9,31253,10,32254,10,28162,9,32460,10,10627,4,22364,7,11422,4


## Analysis

### postcode input

In [None]:
my_postcode = "PE2 6SX"

### Results

In [None]:
# get postcode line
get_data = data[data["pcds"] == my_postcode]

# get district
my_district = data[data['district'] == get_data['district']]

In [None]:
# results
dict_floor = {
    0 : 'None',
    1 : 'Very Low',
    2 : 'Low',
    3 : 'Medium',
    4 : 'High'
}

print(f"""GENERAL INFO FOR POSTCODE {my_postcode}:

The LSOA is {get_data['lsoa11']}

The district is {get_data['district']}

The sector is {get_data['sector']}

The risk of flooding is set to '{get_data['flood_risk'].map(dict_floor)}'. ({get_data['flood_value_from']})

The postcode is at {get_data['elevation']}m above the sea level. ({get_data['elev_value_from']})

We have {get_data['cats_by_district']} cats in the district. ({get_data['pets_value_from']})
This is the {}th quantile in the cats' values distribution.

We have on avg {get_data['dogs_by_household']} dogs by house in the district. ({get_data['pets_value_from']})
This is the {}th quantile in the dogs' values distribution.
""")

In [None]:
# histogram of cat by district and red vertical line where our value is

# Same with dogs

In [None]:
# map where the postcode is


In [None]:
print(f"""
Since the 1970s the Ministry of Housing, Communities and Local Government and its predecessors
have calculated local measures of deprivation in England.
There are 32844 areas in England, all listed from 1 (worst) to 32844 (best).
There are 7 domains of deprivation, which combine to create the Global one:
- Income
- Employment
- Education
- Health
- Crime
- Barriers to Housing and Services
- Living Environment

Globally the postcode's IMD ranking is {get_data['imd_global_rank']}
""")
# add gauge IMD / 32844 * 100

for cat in [
    'income',
    'employment',
    'education',
    'health',
    'crime',
    'services',
    'living_environment']:
    print(f"The {cat} IMD ranking is {get_data['imd_' + cat + '_rank']}")
    
    # add gauge