# Check of postcode quality

In [1]:
# packages import
import pandas as pd 
import numpy as np 

## Data import
First, we import the filtered tables

In [107]:
# Load the filtered source tables in one pass; the names on the left line up
# positionally with the pickle paths below.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
data, pets, flood, imd, elevation = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'data/filtered/postcode_filtered.pickle',
        'data/filtered/pets_filtered.pickle',
        'data/filtered/flood_risk_filtered.pickle',
        'data/filtered/imd_filtered.pickle',
        'data/filtered/elevation_filtered.pickle',
    )
)

## Join data
### Create district and sector data
From the postcode data, we create a district dataset (with its sector) and an LSOA dataset, so that we have the full list of districts, sectors and LSOAs to join the other tables onto.

In [108]:
# Derive the "district" from the full postcode: everything before the first
# space, e.g. "AL1 1AA" -> "AL1".
# `n` is passed by keyword because pandas >= 2.0 makes it keyword-only.
# Taking element 0 of the split (instead of expand=True into two columns)
# removes the need for the throw-away 'dis1' column and avoids a shape error
# when no postcode contains a space.
data['district'] = data['pcds'].str.split(' ', n=1).str[0]

# "sector" here is the first two characters of the district with any digits
# stripped, i.e. its leading letters: "AL1" -> "AL", "E14" -> "E".
data['sector'] = data['district'].str[0:2].str.replace(
    pat = r"[0-9]+",
    repl = "",
    regex = True
)

In [86]:
# Lookup table with one row per distinct (district, sector) pair found in the
# postcode data, re-indexed from 0.
district_data = (
    data[['district', 'sector']]
    .drop_duplicates()
    .reset_index(drop=True)
    .copy()
)

In [89]:
# Lookup table with one row per distinct LSOA code (column 'lsoa11') found in
# the postcode data, re-indexed from 0.
lsoa_data = (
    data[['lsoa11']]
    .drop_duplicates()
    .reset_index(drop=True)
    .copy()
)

### Create full data
Now, let's create full tables, as most of them do not cover all postcodes/districts/sectors.
#### Pets

In [87]:
# Attach the pet figures to every district; districts absent from `pets` get
# NaN in the pet columns.
# NOTE: this cell rebinds `district_data` and deletes `pets`, so it is meant to
# be run once per kernel session - re-run from the data import to repeat it.
district_data = district_data.merge(
    pets,
    on = 'district',
    how = 'left'
)

# districts that come with a raw value
has_raw_value = district_data['estimated_cat_population'].notna()
pets_existing = district_data[has_raw_value].copy()
pets_existing['pets_value_from'] = 'raw_district'

# districts without a raw value: keep BOTH identifying columns, otherwise the
# imputed rows lose their district and cannot be joined back later
pets_missing = district_data.loc[~has_raw_value, ['district', 'sector']].copy()

# sector-level average of the numeric pet columns only; averaging the string
# columns ('district', 'pets_value_from') raises in pandas >= 2.0
pets_sector_avg = pets_existing.groupby('sector').mean(numeric_only = True)

# fill the missing districts with the average of their sector
# (inner join: districts whose sector has no raw value at all are dropped)
pets_missing = pets_missing.merge(
    pets_sector_avg,
    how = 'inner',
    on = 'sector'
)
pets_missing['pets_value_from'] = 'avg_sector'

# put data back together with a clean 0..n-1 index (the two parts would
# otherwise carry overlapping index labels)
district_data = pd.concat(
    [pets_existing, pets_missing],
    ignore_index = True
)

# clean memory
del pets, pets_existing, pets_missing, pets_sector_avg

#### IMD

In [90]:
# Preview the first rows of the IMD table.
imd.head()

Unnamed: 0,lsoa,imd_global_rank,imd_global_decile,imd_income_rank,imd_income_decile,imd_employment_rank,imd_employment_decile,imd_education_rank,imd_education_decile,imd_health_rank,imd_health_decile,imd_crime_rank,imd_crime_decile,imd_services_rank,imd_services_decile,imd_living_environment_rank,imd_living_environment_decile
0,E01000001,29199,9,32831,10,32742,10,32842,10,32113,10,32662,10,7319,3,7789,3
1,E01000002,30379,10,29901,10,31190,10,32832,10,29705,10,32789,10,11707,4,13070,4
2,E01000003,14915,5,18510,6,15103,5,26386,9,17600,6,29363,9,2157,1,4092,2
3,E01000005,8678,3,6029,2,7833,3,12370,4,17907,6,31059,10,2217,1,9397,3
4,E01000006,14486,5,14023,5,21692,7,17511,6,21581,7,18848,6,1033,1,10629,4


In [97]:
# Align the IMD key name with the postcode table ('lsoa' -> 'lsoa11') and keep
# only the LSOAs that appear in both tables (inner join).
lsoa_data = pd.merge(
    lsoa_data,
    imd.rename(columns={'lsoa': 'lsoa11'}),
    how='inner',
    on='lsoa11',
)

# the standalone IMD table is not needed once it has been joined
del imd

#### Flood

In [109]:

# Left-join the flood table onto every postcode (with its district and sector);
# postcodes that have no flood record keep NaN in the flood columns.
flood_pcd = pd.merge(
    data[['pcds', 'district', 'sector']],
    flood,
    left_on='pcds',
    right_on='postcode',
    how='left',
)

In [112]:
# the two flood scores that are averaged per geography
_FLOOD_SCORE_COLS = ['flood_risk_int', 'risk_for_insurance_int']


def _mean_flood_scores(flood_by_postcode, level):
    """Average the flood scores per `level`, rounded to the nearest integer.

    Parameters
    ----------
    flood_by_postcode : pd.DataFrame
        Postcode-level frame holding the column named by `level` and the two
        columns listed in `_FLOOD_SCORE_COLS`.
    level : str
        Column to group by ('district' or 'sector').

    Returns
    -------
    pd.DataFrame
        Indexed by `level`, with one column per flood score.
    """
    # only postcodes that actually have a flood record contribute to the mean
    known = flood_by_postcode[flood_by_postcode['flood_risk_int'].notna()]
    # select the numeric score columns explicitly: averaging the string columns
    # (postcode, district, ...) raises in pandas >= 2.0
    return known.groupby(level)[_FLOOD_SCORE_COLS].mean().round(0)


# one aggregate per geography (the sector aggregate was previously computed
# twice by two identical copy-pasted blocks)
flood_dst = _mean_flood_scores(flood_pcd, 'district')
flood_sct = _mean_flood_scores(flood_pcd, 'sector')

flood_dst.head()

Unnamed: 0_level_0,flood_risk_int,risk_for_insurance_int
district,Unnamed: 1_level_1,Unnamed: 2_level_1
AL1,0.0,0.0
AL10,0.0,0.0
AL2,0.0,0.0
AL3,0.0,0.0
AL4,0.0,0.0


#### Elevation

## Analysis

### postcode input

### Results