# Common Agricultural Policy (CAP) Data 2015

These data come from <http://cap-payments.defra.gov.uk/>.

In [None]:
%matplotlib inline

from collections import OrderedDict
import json
import os

import pandas as pd

In [None]:
# The per-agency CSV exports that together make up the 2015 CAP data set.
CAP_FILES = [
    'DAERA-Table 1.csv',
    'RPA-Table 1.csv',
    'RPA2-Table 1.csv',
    'SGRPID-Table 1.csv',
    'WG-Table 1.csv'
]
# Read every export from input/2015 and stack them into a single table.
# The original per-file row index is kept, so index labels may repeat.
raw_cap = pd.concat(
    [pd.read_csv(os.path.join('input/2015', name)) for name in CAP_FILES]
)
raw_cap.shape

In [None]:
# Show the column labels of the combined CAP table.
raw_cap.columns

## Postcode District Validation

Check the supplied postcode prefixes against a list of all valid postcode districts. 

In [None]:
# Load the UK postcode list from the sibling postcodes project
# (a gzip-compressed CSV); the cell displays its (rows, columns) shape.
ukpostcodes = pd.read_csv('../postcodes/input/ukpostcodes.csv.gz')
ukpostcodes.shape

In [None]:
# Derive two coarser keys from each full postcode:
#   district - everything before the last whitespace-separated part
#   sector   - the district plus the first digit of the part after it
# regex=True is passed explicitly: from pandas 2.0 the default for
# Series.str.replace is literal matching, which would leave the postcode
# unchanged and silently break every later district/sector comparison.
ukpostcodes['district'] = ukpostcodes['postcode'].str.replace(
    r'^(.+)\s.+$', r'\1', regex=True)
ukpostcodes['sector'] = ukpostcodes['postcode'].str.replace(
    r'^(.+)\s([0-9]).+$', r'\1 \2', regex=True)

In [None]:
# Preview the first rows, including the derived district and sector columns.
ukpostcodes.head()

In [None]:
# Distinct postcode districts found in the postcode list; the cell shows
# how many there are.
postcode_districts = ukpostcodes['district'].unique()
len(postcode_districts)

In [None]:
# Distinct postcode sectors found in the postcode list; the cell shows
# how many there are.
postcode_sectors = ukpostcodes['sector'].unique()
len(postcode_sectors)

Note: these counts are a bit low compared to the figures at <https://www.ons.gov.uk/methodology/geography/ukgeographies/postalgeography>. Possibly that's because this dataset doesn't contain some of the special postcodes, but it might also be out of date.

In [None]:
# Inner-join the known districts against the raw, un-normalised prefix
# column; the resulting shape shows how many CAP rows match as supplied.
pd.DataFrame({'district': postcode_districts}).merge(
    raw_cap,
    left_on='district',
    right_on='PostcodePrefix_F202B',
).shape

In [None]:
# Normalise the supplied prefix (upper-case, then trim surrounding
# whitespace) into a new column used for all later matching.
raw_cap['postcode_district'] = (
    raw_cap['PostcodePrefix_F202B']
    .str.upper()
    .str.strip()
)
# Repeat the inner join on the normalised column to see how many rows
# match a known district now.
pd.DataFrame({'district': ukpostcodes['district'].unique()}).merge(
    raw_cap,
    left_on='district',
    right_on='postcode_district',
).shape

So, most of them match valid districts after upper casing and stripping spaces. Some look like they may be sectors rather than districts. For matching to districts, we can make them less granular.

In [None]:
# Show the CAP rows whose normalised prefix is actually a known sector.
raw_cap[raw_cap['postcode_district'].isin(postcode_sectors)]

In [None]:
def coarsen_sectors():
    """Reduce sector-level prefixes in ``raw_cap`` to their district.

    Rows whose ``postcode_district`` value is a known postcode sector have
    the trailing whitespace and single sector digit stripped, leaving only
    the district part. The module-level ``raw_cap`` frame is modified in
    place; nothing is returned.
    """
    is_sector = raw_cap['postcode_district'].isin(postcode_sectors)
    sectors = raw_cap['postcode_district'][is_sector]
    # regex=True must be explicit: from pandas 2.0 str.replace defaults to
    # literal matching, which would leave every sector untouched.
    raw_cap.loc[is_sector, 'postcode_district'] = sectors.str.replace(
        r'^(.+)\s[0-9]$', r'\1', regex=True)
# Apply the sector -> district coarsening, then re-run the inner join to
# see how many CAP rows now match a known district.
coarsen_sectors()
pd.DataFrame({'district': postcode_districts}).merge(
    raw_cap,
    left_on='district',
    right_on='postcode_district',
).shape

In [None]:
def find_unmatched_districts():
    """List the distinct prefix / town pairs that match no known district.

    Selects the rows of the module-level ``raw_cap`` whose
    ``postcode_district`` is not in ``postcode_districts`` and formats each
    as ``"<PostcodePrefix_F202B> / <TownCity_F202C>"``.

    Returns:
        A one-column DataFrame (``unmatched``) of the unique labels, sorted
        alphabetically. It is empty when every row matches.
    """
    unmatched = raw_cap[~raw_cap['postcode_district'].isin(postcode_districts)]
    # Missing prefixes or towns become empty strings so that one blank
    # field cannot abort the whole listing with a TypeError.
    prefixes = unmatched['PostcodePrefix_F202B'].fillna('').astype(str)
    towns = unmatched['TownCity_F202C'].fillna('').astype(str)
    # Column-wise concatenation also copes with an empty selection, where a
    # row-wise apply would hand back a DataFrame without .unique().
    labels = prefixes + ' / ' + towns
    return pd.DataFrame({
        'unmatched': labels.unique()
    }).sort_values('unmatched')
find_unmatched_districts()

Many look like digit transpositions or entry errors (or things not in the UK). We could probably salvage a few of them, but let's carry on for now.

In [None]:
# Keep only the rows that matched a known district; .copy() makes `cap`
# an independent frame so later column assignments do not touch raw_cap.
cap = raw_cap[raw_cap['postcode_district'].isin(postcode_districts)].copy()

In [None]:
# Non-null value count per column of the matched rows.
cap.count()

In [None]:
# Persist the per-beneficiary rows, restricted to the identifying columns,
# the three payment totals and the matched district.
cap.loc[:, [
    'Year',
    'BeneficiaryCode',
    'BeneficiaryName_F201',
    'OtherEAGFTotal',
    'DirectEAGFTotal',
    'RuralDevelopmentTotal',
    'postcode_district',
]].to_pickle('output/cap_2015.pkl.gz')

## Aggregation to Postcode District

In [None]:
# Aggregate payments per paying agency and postcode district: sum each
# payment stream, sum and count the overall total, keep the latest year.
# Aggregations are given by name ('sum', 'size', 'max') instead of the
# Python builtins sum/len/max: recent pandas warns that builtin callables
# will stop being mapped to the groupby implementations.
cap_by_district = cap.groupby(['PayingAgencyLink', 'postcode_district']).aggregate(OrderedDict([
    ('OtherEAGFTotal', 'sum'),
    ('DirectEAGFTotal', 'sum'),
    ('RuralDevelopmentTotal', 'sum'),
    ('Total', ['sum', 'size']),
    ('Year', 'max')
])).reset_index()
# Flatten the two-level column index into short names; the order below
# must follow the group keys and then the aggregation spec above.
cap_by_district.columns = [
    'agency',
    'postcode_district',
    'otherEAGF',
    'directEAGF',
    'ruralDevelopment',
    'total',
    'count',
    'year'
]
# Numeric columns that are rounded to whole numbers and later exported as
# GeoJSON feature properties.
PROPERTY_COLUMNS = [
    'otherEAGF', 'directEAGF', 'ruralDevelopment', 'total', 'count'
]
for column in PROPERTY_COLUMNS:
    # NOTE: int32 tops out at 2**31 - 1; larger sums would not fit.
    cap_by_district[column] = cap_by_district[column].round().astype('int32')

cap_by_district.shape

In [None]:
# Preview the per-district aggregate.
cap_by_district.head()

In [None]:
# Distinct paying agencies present in the aggregate.
cap_by_district.agency.unique()

In [None]:
# Summary statistics for the numeric columns of the per-district aggregate.
cap_by_district.describe()

## Aggregation to Postcode Area

In [None]:
# The postcode area is the leading one or two letters of the district.
# regex=True must be explicit: from pandas 2.0 str.replace defaults to
# literal matching, which would copy the district through unchanged.
cap['postcode_area'] = cap['postcode_district'].str.replace(
    r'^([A-Z]{1,2}).+$', r'\1', regex=True)
cap.head()

In [None]:
# Aggregate payments per postcode area: sum each payment stream, sum and
# count the overall total, keep the latest year.
# (A previous blanket groupby(...).sum() was removed: its result was
# overwritten immediately and it also summed every non-numeric column.)
# Aggregations are given by name rather than as Python builtins, which
# recent pandas warns about.
cap_by_area = cap.groupby('postcode_area').aggregate(OrderedDict([
    ('OtherEAGFTotal', 'sum'),
    ('DirectEAGFTotal', 'sum'),
    ('RuralDevelopmentTotal', 'sum'),
    ('Total', ['sum', 'size']),
    ('Year', 'max')
])).reset_index()
# Flatten the two-level column index into short names; the order below
# must follow the group key and then the aggregation spec above.
cap_by_area.columns = [
    'postcode_area',
    'otherEAGF',
    'directEAGF',
    'ruralDevelopment',
    'total',
    'count',
    'year'
]
# Eyeball the largest total before narrowing: it has to stay below
# 2**31 - 1 to survive the int32 cast that follows.
print(cap_by_area['total'].max()) # still a 32-bit integer?
for column in PROPERTY_COLUMNS:
    cap_by_area[column] = cap_by_area[column].round().astype('int32')
cap_by_area.head()

In [None]:
# Summary statistics for the numeric columns of the per-area aggregate.
cap_by_area.describe()

In [None]:
# Persist the per-area aggregate to output/cap_by_area.pkl.gz.
cap_by_area.to_pickle('output/cap_by_area.pkl.gz')

## Save Data for Map

In [None]:
# Parse the simplified postcode-area boundaries (GeoJSON) produced by the
# sibling postcodes project; the cell shows the number of features.
with open('../postcodes/output/postcode-area-boundaries-simplified.geo.json') as file:
    postcode_areas_json = json.loads(file.read())
len(postcode_areas_json['features'])

In [None]:
# One row per boundary feature: the area name taken from the feature's
# properties, alongside its geometry object.
postcode_areas_features = pd.DataFrame({
    'name': [
        item['properties']['name']
        for item in postcode_areas_json['features']
    ],
    'geometry': [
        item['geometry']
        for item in postcode_areas_json['features']
    ],
})
postcode_areas_features.head()

In [None]:
# Attach the per-area payment aggregates to the boundary features.
# validate='m:1' makes pandas raise if cap_by_area holds the same
# postcode_area more than once.
postcode_areas_data = postcode_areas_features.merge(
    cap_by_area,
    left_on='name',
    right_on='postcode_area',
    validate='m:1',
)
postcode_areas_data.head()

In [None]:
# Load the spreadsheet of postcode area names from the postcodes project.
# NOTE(review): presumably this supplies the 'postcode_area_name' column
# read when the GeoJSON is built - confirm against the spreadsheet.
postcode_area_names = pd.read_excel('../postcodes/input/postcode-area-names.xlsx')
postcode_area_names.head()

In [None]:
# (rows, columns) of the area-name lookup table.
postcode_area_names.shape

In [None]:
# Add the human-readable area names. No join key is given, so pandas joins
# on whatever column names the two frames share; validate='1:1' makes it
# raise if either side repeats a key.
# NOTE(review): the shared key is presumably 'postcode_area' - confirm.
postcode_areas_data = postcode_areas_data.merge(
    postcode_area_names,
    validate='1:1',
)
postcode_areas_data.head()

In [None]:
def make_cap_by_area_geo_json(data, property_columns=None):
    """Build a GeoJSON FeatureCollection from the per-area table.

    Args:
        data: DataFrame with one row per postcode area. Each row must hold
            'geometry', 'postcode_area', 'postcode_area_name' and every
            column named in ``property_columns``.
        property_columns: Names of the columns copied into each feature's
            properties. Defaults to the module-level ``PROPERTY_COLUMNS``.

    Returns:
        A dict of the form ``{'type': 'FeatureCollection', 'features':
        [...]}`` with one feature per row, in row order. An empty ``data``
        gives an empty feature list.
    """
    if property_columns is None:
        property_columns = PROPERTY_COLUMNS

    def to_native(value):
        # numpy scalars expose .item(), which returns the plain Python
        # value; json cannot serialise the numpy types themselves.
        return value.item() if hasattr(value, 'item') else value

    def make_feature(row):
        properties = {
            column: to_native(row[column]) for column in property_columns
        }
        properties['postcodeArea'] = row['postcode_area']
        properties['postcodeAreaName'] = row['postcode_area_name']
        return {
            'type': 'Feature',
            'geometry': row['geometry'],
            'properties': properties
        }

    # Explicit row iteration rather than DataFrame.apply(axis=1): on an
    # empty frame apply returns the frame itself, and list() of that would
    # yield column names instead of features.
    features = [make_feature(row) for _, row in data.iterrows()]
    return {'type': 'FeatureCollection', 'features': features}
# Serialise the merged per-area table as GeoJSON for the map; keys are
# sorted so repeated runs produce a stable file.
with open('output/cap_by_area.geo.json', 'w') as file:
    file.write(
        json.dumps(make_cap_by_area_geo_json(postcode_areas_data), sort_keys=True)
    )