# Common Agricultural Policy (CAP) Data


In [None]:
%matplotlib inline

from collections import OrderedDict
import json
import os

import pandas as pd

In [None]:
# Parse the workbook once and pick the five sheets out by position, instead of
# re-opening and re-parsing the same file for every sheet. With a list for
# sheet_name, read_excel returns a dict keyed by the requested positions.
cap_sheets = pd.read_excel(
    'input/2016_All_CAP_Search_Results_Data_P14.xlsx',
    sheet_name=[0, 1, 2, 3, 4])
DAERA, SGRPID, WG, RPA, RPA2 = (cap_sheets[position] for position in range(5))

In [None]:
# Preview the first rows of the last sheet loaded (sheet position 4).
RPA2.head()

In [None]:
# Number of missing PayingAgencyLink values in the RPA2 sheet, followed by the
# frequency of each value that is present.
[RPA2.PayingAgencyLink.isna().sum(), RPA2.PayingAgencyLink.value_counts()]

In [None]:
# Stack the five sheets into a single frame. ignore_index=True renumbers the
# rows 0..n-1; without it each sheet's own 0-based row labels repeat in the
# result, which makes label-based selection and alignment on raw_cap ambiguous.
raw_cap = pd.concat([DAERA, SGRPID, WG, RPA, RPA2], ignore_index=True)
raw_cap.shape

In [None]:
# Non-null value count for each column of the combined frame.
raw_cap.count()

## Postcode District Validation

Check the supplied postcode prefixes against a list of all valid postcode districts. 

In [None]:
# Load the postcode reference table that the CAP postcode prefixes are checked
# against, and show its dimensions.
ukpostcodes = pd.read_csv('../postcodes/input/ukpostcodes.csv.gz')
ukpostcodes.shape

In [None]:
# Derive the district (everything before the whitespace) and the sector
# (district plus the first digit after the whitespace) from each full postcode.
# regex=True must be explicit: since pandas 2.0 str.replace treats the pattern
# as a literal string by default, which would leave both columns unchanged.
ukpostcodes['district'] = ukpostcodes['postcode'].str.replace(
    r'^(.+)\s.+$', r'\1', regex=True)
ukpostcodes['sector'] = ukpostcodes['postcode'].str.replace(
    r'^(.+)\s([0-9]).+$', r'\1 \2', regex=True)

In [None]:
# Preview the reference table, now including the district and sector columns.
ukpostcodes.head()

In [None]:
# Distinct postcode districts in the reference table, and how many there are.
postcode_districts = ukpostcodes['district'].unique()
len(postcode_districts)

In [None]:
# Distinct postcode sectors in the reference table, and how many there are.
postcode_sectors = ukpostcodes['sector'].unique()
len(postcode_sectors)

In [None]:
# Inner-join the known districts onto the CAP rows using the prefix exactly as
# supplied; the shape shows how many rows match before any clean-up.
(
    pd.DataFrame({'district': postcode_districts})
    .merge(raw_cap, left_on='district', right_on='PostcodePrefix_F202B')
    .shape
)

In [None]:
# Normalise the supplied prefix (upper-case, no surrounding whitespace) into a
# separate column, then repeat the inner join to see how many rows match now.
# The join reuses postcode_districts rather than recomputing the unique
# districts from ukpostcodes, in line with the other matching cells.
raw_cap['postcode_district'] = raw_cap['PostcodePrefix_F202B'].str.upper().str.strip()
(
    pd.DataFrame({'district': postcode_districts})
    .merge(raw_cap, left_on='district', right_on='postcode_district')
    .shape
)

In [None]:
# Rows whose normalised prefix is a postcode sector rather than a district.
# NOTE(review): this displays every matching row; consider .head() if the
# selection turns out to be large.
raw_cap[raw_cap['postcode_district'].isin(postcode_sectors)]

In [None]:
def coarsen_sectors(frame=None, sectors=None):
    """Replace sector-level values in ``postcode_district`` with their district.

    A value such as ``'AB10 1'`` (district, whitespace, one digit) that appears
    in ``sectors`` is rewritten in place to ``'AB10'``. Values that are not
    listed in ``sectors`` are left untouched.

    Parameters
    ----------
    frame : DataFrame, optional
        Frame with a ``postcode_district`` column; modified in place.
        Defaults to the notebook-level ``raw_cap``.
    sectors : list-like, optional
        Known postcode sectors. Defaults to the notebook-level
        ``postcode_sectors``.

    Returns
    -------
    None
    """
    if frame is None:
        frame = raw_cap
    if sectors is None:
        sectors = postcode_sectors

    is_sector = frame['postcode_district'].isin(sectors)
    if not is_sector.any():
        return

    # regex=True must be explicit: since pandas 2.0 str.replace treats the
    # pattern as a literal string by default, which would change nothing here.
    coarsened = frame.loc[is_sector, 'postcode_district'].str.replace(
        r'^(.+)\s[0-9]$', r'\1', regex=True)
    # Assign the bare values (same row order as the mask) so the write does not
    # depend on index alignment, which is unsafe if row labels repeat.
    frame.loc[is_sector, 'postcode_district'] = coarsened.values
# Coarsen sector-level prefixes in raw_cap, then repeat the inner join against
# the known districts to see how many rows match afterwards.
coarsen_sectors()
(
    pd.DataFrame({'district': postcode_districts})
    .merge(raw_cap, left_on='district', right_on='postcode_district')
    .shape
)

In [None]:
def find_unmatched_districts(frame=None, districts=None):
    """List the distinct prefix / town pairs that match no known district.

    Rows whose ``postcode_district`` is not in ``districts`` are selected, and
    their ``PostcodePrefix_F202B`` and ``TownCity_F202C`` values are joined as
    ``'<prefix> / <town>'``. Missing values are rendered as empty strings, and
    an empty selection yields an empty frame instead of raising.

    Parameters
    ----------
    frame : DataFrame, optional
        Frame with ``postcode_district``, ``PostcodePrefix_F202B`` and
        ``TownCity_F202C`` columns. Defaults to the notebook-level ``raw_cap``.
    districts : list-like, optional
        Known postcode districts. Defaults to the notebook-level
        ``postcode_districts``.

    Returns
    -------
    DataFrame
        A single ``unmatched`` column of distinct labels, sorted ascending.
    """
    if frame is None:
        frame = raw_cap
    if districts is None:
        districts = postcode_districts

    unmatched = frame[~frame['postcode_district'].isin(districts)]
    pairs = unmatched[['PostcodePrefix_F202B', 'TownCity_F202C']]
    # Blank out missing cells and coerce everything to text so the
    # concatenation below cannot fail on NaN or non-string values.
    pairs = pairs.astype(object).where(pairs.notna(), '').astype(str)
    # Vectorised concatenation; unlike a row-wise apply this also works when
    # no rows are unmatched.
    labels = pairs['PostcodePrefix_F202B'] + ' / ' + pairs['TownCity_F202C']
    return pd.DataFrame({'unmatched': labels.unique()}).sort_values('unmatched')
# Show the distinct prefix / town pairs that still match no known district.
find_unmatched_districts()

OK, neither of these is something we should expect to match!

In [None]:
# Keep only the rows whose district is recognised. The .copy() makes cap an
# independent frame, so columns added to it later are not written into a
# selection of raw_cap.
cap = raw_cap[raw_cap['postcode_district'].isin(postcode_districts)].copy()

In [None]:
# Write the validated rows, restricted to a fixed set of columns, to a pickle
# (compression is inferred from the .gz suffix).
cap.loc[:, [
    'Year',
    'BeneficiaryCode',
    'BeneficiaryName_F201',
    'OtherEAGFTotal',
    'DirectEAGFTotal',
    'RuralDevelopmentTotal',
    'postcode_district',
]].to_pickle('output/cap_2016.pkl.gz')

## Aggregation to Postcode District

In [None]:
# Per paying agency and postcode district: sum each payment column, count the
# rows in the group, and keep the largest Year.
# The aggregations are named by string ('sum', 'size', 'max') instead of the
# Python builtins: passing builtin sum/max to aggregate is deprecated in
# pandas, and the string names keep the NaN-skipping groupby implementations.
cap_by_district = cap.groupby(['PayingAgencyLink', 'postcode_district']).aggregate(OrderedDict([
    ('OtherEAGFTotal', 'sum'),
    ('DirectEAGFTotal', 'sum'),
    ('RuralDevelopmentTotal', 'sum'),
    ('Total', ['sum', 'size']),
    ('Year', 'max')
]))
cap_by_district = cap_by_district.reset_index()
# Flatten the two-level column index produced by the list aggregation on Total.
cap_by_district.columns = [
    'agency',
    'postcode_district',
    'otherEAGF',
    'directEAGF',
    'ruralDevelopment',
    'total',
    'count',
    'year'
]
PROPERTY_COLUMNS = [
    'otherEAGF', 'directEAGF', 'ruralDevelopment', 'total', 'count'
]
for column in PROPERTY_COLUMNS:
    rounded = cap_by_district[column].round()
    # Fail loudly instead of storing a wrapped or undefined value if a figure
    # is missing or too large for the 32-bit integer dtype used below.
    assert rounded.between(-2**31, 2**31 - 1).all(), \
        '%s has values that do not fit in int32' % column
    cap_by_district[column] = rounded.astype('int32')

cap_by_district.shape

In [None]:
# Preview the per-district aggregates.
cap_by_district.head()

In [None]:
# Distinct agency values present in the per-district aggregates.
cap_by_district.agency.unique()

In [None]:
# Summary statistics for the numeric columns of the per-district aggregates.
cap_by_district.describe()

## Aggregation to Postcode Area

In [None]:
# Reduce each district to its leading one or two capital letters (the
# postcode area). regex=True must be explicit: since pandas 2.0 str.replace
# treats the pattern as a literal string by default, which would leave the
# district unchanged.
cap['postcode_area'] = \
    cap['postcode_district'].str.replace(r'^([A-Z]{1,2}).+$', r'\1', regex=True)
cap.head()

In [None]:
# Per postcode area: sum each payment column, count the rows in the group, and
# keep the largest Year.
# The earlier blanket groupby(...).sum() was removed: its result was
# overwritten immediately, and it summed every column, text ones included.
# Aggregations are named by string because passing builtin sum/max to
# aggregate is deprecated in pandas.
cap_by_area = cap.groupby('postcode_area').aggregate(OrderedDict([
    ('OtherEAGFTotal', 'sum'),
    ('DirectEAGFTotal', 'sum'),
    ('RuralDevelopmentTotal', 'sum'),
    ('Total', ['sum', 'size']),
    ('Year', 'max')
]))
cap_by_area = cap_by_area.reset_index()
# Flatten the two-level column index produced by the list aggregation on Total.
cap_by_area.columns = [
    'postcode_area',
    'otherEAGF',
    'directEAGF',
    'ruralDevelopment',
    'total',
    'count',
    'year'
]
for column in PROPERTY_COLUMNS:
    rounded = cap_by_area[column].round()
    # Replaces the manual "still a 32-bit integer?" print: stop here if any
    # figure is missing or does not fit the int32 dtype used for storage.
    assert rounded.between(-2**31, 2**31 - 1).all(), \
        '%s has values that do not fit in int32' % column
    cap_by_area[column] = rounded.astype('int32')
cap_by_area.head()

In [None]:
# Summary statistics for the numeric columns of the per-area aggregates.
cap_by_area.describe()

In [None]:
# Persist the per-area aggregates as a pickle (compression is inferred from
# the .gz suffix).
cap_by_area.to_pickle('output/cap_by_area_2016.pkl.gz')