# Build Aggregate Data

In [None]:
%matplotlib inline

import json

import pandas as pd

# Raise pandas' display limits for the number of columns and the column width
# used when frames are rendered in this notebook.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 100)

## Load Cleaned Data

### CAP

The FAQs say:

> I understand that some CAP payments are jointly paid for by EU and UK funding. How are such payments shown on the website?
> These are known as co-financed payments and mostly concern Rural Development measures. For these payment types, the website shows the total amounts received by the beneficiary, not just the EU financed element. (http://cap-payments.defra.gov.uk/FAQs.aspx)

According to https://www.instituteforgovernment.org.uk/explainers/common-agricultural-policy, in 2015 the UK contribution to rural development was €250M out of €959M. We therefore scale the published rural development amounts by (959 − 250) / 959 to estimate the EU-funded part.

In [None]:
# Read one pickled CAP extract per year (2014 up to and including 2017) and
# stack them into a single frame.
cap_years = range(2014, 2018)
cap_frames = [
    pd.read_pickle('../cap/output/cap_by_area_{}.pkl.gz'.format(year))
    for year in cap_years
]
raw_cap_by_area = pd.concat(cap_frames)
raw_cap_by_area.head()

In [None]:
# Fraction of rural development money kept after removing the UK's own
# contribution (250 out of 959).
eu_rural_share = (959 - 250) / 959

cap_by_area = raw_cap_by_area.copy()
# EAGF is the sum of the two EAGF components; EAFRD is the scaled rural
# development amount.
cap_by_area['eagf'] = cap_by_area.otherEAGF + cap_by_area.directEAGF
cap_by_area['eafrd'] = cap_by_area.ruralDevelopment * eu_rural_share
cap_by_area = cap_by_area.drop(
    ['otherEAGF', 'directEAGF', 'ruralDevelopment', 'total'], axis=1)
cap_by_area['funds'] = 'CAP'
# Recompute the total from the adjusted components.
cap_by_area['total'] = cap_by_area.eagf + cap_by_area.eafrd
cap_by_area.head()

### CORDIS

In [None]:
def add_postcode_area(df):
    """Add a ``postcode_area`` column derived from ``df['postcode']``.

    The area is the leading one or two capital letters of the postcode.
    Values that do not match that pattern (including missing values) are
    left as they are.

    The frame is modified in place and also returned.
    """
    # regex=True is required: since pandas 2.0 ``str.replace`` treats the
    # pattern as a literal string by default, which would leave every
    # postcode unchanged.
    df['postcode_area'] = df['postcode'].str.replace(
        r'^([A-Z]{1,2}).+$', r'\1', regex=True)
    return df

In [None]:
# Load the FP7 organisations and projects, then attach each organisation row
# to its project (validated as one project to many organisations).
fp7_organizations = pd.read_pickle('../cordis/output/fp7_organizations.pkl.gz')
fp7_projects = pd.read_pickle('../cordis/output/fp7_projects.pkl.gz')
fp7 = fp7_projects.merge(
    fp7_organizations,
    left_on='rcn', right_on='project_rcn', validate='1:m'
)
add_postcode_area(fp7)
# Row identifier: programme prefix, project rcn and organisation id.
fp7_project_part = fp7.project_rcn.astype('str')
fp7_organization_part = fp7.organization_id.astype('str')
fp7['my_eu_id'] = 'fp7_' + fp7_project_part + '_' + fp7_organization_part
fp7['funds'] = 'FP7'
fp7.head()

In [None]:
# Load the H2020 organisations and projects, then attach each organisation
# row to its project (validated as one project to many organisations).
h2020_organizations = pd.read_pickle('../cordis/output/h2020_organizations.pkl.gz')
h2020_projects = pd.read_pickle('../cordis/output/h2020_projects.pkl.gz')
h2020 = h2020_projects.merge(
    h2020_organizations,
    left_on='rcn', right_on='project_rcn', validate='1:m'
)
add_postcode_area(h2020)
# Row identifier: programme prefix, project rcn and organisation id.
h2020_project_part = h2020.project_rcn.astype('str')
h2020_organization_part = h2020.organization_id.astype('str')
h2020['my_eu_id'] = 'h2020_' + h2020_project_part + '_' + h2020_organization_part
h2020['funds'] = 'H2020'
h2020.head()

We only care about a common set of cordis columns (no briefs etc.)

In [None]:
# Columns shared by the FP7 and H2020 frames that the aggregation needs.
cordis_columns = [
    'postcode_area', 'postcode', 'my_eu_id',
    'start_date', 'end_date',
    'max_contribution_eur', 'contribution_eur', 'eur_gbp',
    'acronym', 'title', 'name', 'num_organizations', 'funds'
]
# ignore_index=True gives the combined frame a fresh unique index. Without
# it the row labels of the two inputs are both kept and therefore repeat,
# which makes any later label-aligned assignment on `cordis` ambiguous.
cordis = pd.concat(
    [fp7[cordis_columns], h2020[cordis_columns]],
    ignore_index=True
)
cordis.shape

In [None]:
# Preview the first rows of the combined FP7 + H2020 frame.
cordis.head()

#### Estimate Missing Contributions

For some records, we don't know the contribution. For aggregation purposes, estimate it by dividing the max contribution (for the project as a whole) by the number of organisations.

In [None]:
# Total number of rows next to the number of rows with no contribution.
[len(cordis), cordis.contribution_eur.isna().sum()]

In [None]:
def estimate_missing_cordis_contributions():
    """Add ``estimated_contribution_eur`` to the module-level ``cordis`` frame.

    Rows with a ``contribution_eur`` keep that value. Rows where it is missing
    get ``max_contribution_eur / num_organizations`` instead.
    """
    reported = cordis.contribution_eur
    equal_share = cordis.max_contribution_eur / cordis.num_organizations
    # `reported` and `equal_share` carry the frame's own index, so `where`
    # combines them row by row. Assigning a filtered Series through a boolean
    # mask would align on labels instead, which is unreliable if the index
    # contains repeated labels.
    cordis['estimated_contribution_eur'] = reported.where(
        reported.notna(), equal_share)
# Fill in cordis.estimated_contribution_eur, then preview rows that have no
# contribution_eur of their own.
estimate_missing_cordis_contributions()
cordis[cordis.contribution_eur.isna()].head()

In [None]:
# Convert both EUR amounts to GBP with the per-row eur_gbp rate.
for gbp_column, eur_column in [
    ('contribution_gbp', 'contribution_eur'),
    ('estimated_contribution_gbp', 'estimated_contribution_eur'),
]:
    cordis[gbp_column] = cordis[eur_column] * cordis.eur_gbp

### Creative Europe

In [None]:
# Load Creative Europe organisations and projects and join them on the
# project number (validated as one project to many organisations).
creative_organisations = pd.read_pickle('../creative/output/creative_europe_organisations.pkl.gz')
creative_projects = pd.read_pickle('../creative/output/creative_europe_projects.pkl.gz')
creative = creative_projects.merge(
    creative_organisations,
    on='project_number', validate='1:m'
)
creative.shape

In [None]:
add_postcode_area(creative)
# Exactly one distinct value is expected in `funds` before it is overwritten.
assert len(creative.funds.unique()) == 1
creative['funds'] = 'CE'
# Partner numbers are formatted without decimals; a missing number formats
# as 'nan' and is relabelled 'coordinator'.
creative_partner_label = creative.partner_number.apply('{:.0f}'.format)
creative_partner_label = creative_partner_label.str.replace(
    'nan', 'coordinator', regex=False)
creative['my_eu_id'] = (
    'creative_' + creative.project_number + '_' + creative_partner_label
)
# Every row must end up with a distinct identifier.
assert len(creative) == len(creative.my_eu_id.unique())
creative.head()

#### Estimate Missing Contributions

We don't know the contribution per-organisation for the Creative Europe data. For aggregation purposes, estimate it by dividing the max contribution (for the project as a whole) by the number of organisations.

In [None]:
# Total number of rows next to the number of rows with no max contribution.
[len(creative), creative.max_contribution_eur.isna().sum()]

In [None]:
# Equal split of the project's max contribution across its organisations.
creative['estimated_contribution_eur'] = creative.max_contribution_eur.div(
    creative.num_organisations)
# GBP versions of the estimated and the max amounts.
for gbp_column, eur_column in [
    ('estimated_contribution_gbp', 'estimated_contribution_eur'),
    ('max_contribution_gbp', 'max_contribution_eur'),
]:
    creative[gbp_column] = creative[eur_column] * creative.eur_gbp
creative.head()

### ESIF (ESF/ERDF)

In [None]:
# Read the four ESIF extracts and stack them; sort=True orders the columns
# of the combined frame.
esif_datasets = ['england_2014_2020', 'ni_2014_2020', 'scotland', 'wales']
esif_frames = [
    pd.read_pickle('../esif/output/esif_{}.pkl.gz'.format(dataset))
    for dataset in esif_datasets
]
esif = pd.concat(esif_frames, sort=True)
add_postcode_area(esif)
esif.head()

### Erasmus

In [None]:
# Load Erasmus mobility organisations and projects and join them on the
# project identifier (validated as one project to many organisations).
erasmus_organisations = pd.read_pickle('../erasmus/output/erasmus_mobility_organisations.pkl.gz')
erasmus_projects = pd.read_pickle('../erasmus/output/erasmus_mobility_projects.pkl.gz')
erasmus = erasmus_projects.merge(
    erasmus_organisations,
    on='project_identifier', validate='1:m'
)
erasmus.shape

In [None]:
add_postcode_area(erasmus)
# Exactly one distinct value is expected in `funds` before it is overwritten.
assert len(erasmus.funds.unique()) == 1
erasmus['funds'] = 'Erasmus'
# Partner numbers are formatted without decimals; a missing number formats
# as 'nan' and is relabelled 'coordinator'.
erasmus_partner_label = erasmus.partner_number.apply('{:.0f}'.format)
erasmus_partner_label = erasmus_partner_label.str.replace(
    'nan', 'coordinator', regex=False)
erasmus['my_eu_id'] = (
    'erasmus_' + erasmus.project_identifier + '_' + erasmus_partner_label
)
# Every row must end up with a distinct identifier.
assert len(erasmus) == len(erasmus.my_eu_id.unique())
erasmus.head()

#### Estimate Missing Contributions

We don't know the contribution per-organisation for the Erasmus data. For aggregation purposes, estimate it by dividing the max contribution (for the project as a whole) by the number of organisations.

In [None]:
# Total number of rows next to the number of rows with no max contribution.
[len(erasmus), erasmus.max_contribution_eur.isna().sum()]

In [None]:
# Equal split of the project's max contribution across its organisations.
erasmus['estimated_contribution_eur'] = erasmus.max_contribution_eur.div(
    erasmus.num_organisations)
# GBP versions of the estimated and the max amounts.
for gbp_column, eur_column in [
    ('estimated_contribution_gbp', 'estimated_contribution_eur'),
    ('max_contribution_gbp', 'max_contribution_eur'),
]:
    erasmus[gbp_column] = erasmus[eur_column] * erasmus.eur_gbp
erasmus.head()

#### Estimate Start and End Dates

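We don't have start and end dates for Erasmus projects. As a placeholder, use 1 January and 31 December of the call year. These placeholder dates are not good enough to spread funding across years, so Erasmus is left out of the annual aggregates below.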

In [None]:
# Placeholder dates: the first and last day of the call year.
call_year_text = erasmus.call_year.apply(str)
erasmus['start_date'] = pd.to_datetime(call_year_text + '-01-01')
erasmus['end_date'] = pd.to_datetime(call_year_text + '-12-31')
erasmus.head()

## Idea 2: Aggregate over each Area

### Time Ranges

In [None]:
# (earliest, latest) pair per data source: the CAP year first, then the
# project start dates of the other sources.
start_date_sources = [fp7_projects, h2020_projects, creative, esif]
[(cap_by_area.year.min(), cap_by_area.year.max())] + [
    (source.start_date.min(), source.start_date.max())
    for source in start_date_sources
]

### Aggregate by Year

In [None]:
# Keep only the columns needed to combine CAP with the other funds.
cap_annual_total = cap_by_area[['funds', 'year', 'total']].copy()
# Select `total` explicitly: a bare groupby(...).sum() on recent pandas
# versions also "sums" the string `funds` column by concatenating it.
cap_annual_total.groupby('year')[['total']].sum()

In [None]:
def find_annual_sum(df, keys, column):
    """Spread each row's ``column`` amount over calendar years.

    Each row's amount is divided evenly over the days from ``start_date``
    (inclusive) to ``end_date`` (exclusive) and the daily amounts are summed
    per calendar year.

    Parameters
    ----------
    df : DataFrame with ``start_date``, ``end_date``, ``column`` and ``keys``.
    keys : list of column names copied unchanged onto every output row.
    column : name of the numeric column to distribute.

    Returns
    -------
    DataFrame with columns ``['year'] + keys + [column]``, one row per input
    row and calendar year it covers.

    A row whose date range contains no days (``end_date`` not after
    ``start_date``) has its whole amount assigned to the year of
    ``start_date`` rather than being divided by zero.
    """
    def reweight(row):
        # Daily timestamps from start up to, but excluding, the end date.
        # Filtering explicitly avoids date_range's `closed=` argument, which
        # was removed in pandas 2.0 in favour of `inclusive=`.
        end = pd.Timestamp(row.end_date)
        days = pd.date_range(row.start_date, end)
        days = days[days < end]
        if len(days) == 0:
            # Empty range: keep the amount instead of dividing by zero.
            days = pd.DatetimeIndex([pd.Timestamp(row.start_date)])
        daily = pd.DataFrame({
            'year_start': days,
            column: row[column] / len(days)
        })
        # 'YS' (year start) replaces the deprecated 'AS' alias.
        annual = daily.resample('YS', on='year_start').sum()
        for key in keys:
            annual[key] = row[key]
        return annual

    output_columns = ['year'] + keys + [column]
    frames = [reweight(row) for _, row in df.iterrows()]
    if not frames:
        # Nothing to concatenate for an empty input frame.
        return pd.DataFrame(columns=output_columns)
    result = pd.concat(frames).reset_index()
    result['year'] = result.year_start.dt.year
    return result[output_columns]

# Spread each CORDIS row's estimated GBP contribution across calendar years.
cordis_annual = find_annual_sum(cordis, ['postcode_area', 'my_eu_id', 'funds'], 'estimated_contribution_gbp')
cordis_annual.shape

In [None]:
# Preview the per-year CORDIS rows.
cordis_annual.head()

In [None]:
def find_fund_annual_totals(annual, column):
    """Sum ``column`` per fund and year.

    Returns a frame with the columns ``funds``, ``year`` and ``total``, one
    row per (funds, year) combination present in ``annual``.
    """
    grouped = annual.groupby(['funds', 'year'])[column]
    return grouped.sum().reset_index().rename(columns={column: 'total'})
# Per-fund, per-year CORDIS totals of the estimated GBP contribution.
cordis_annual_total = find_fund_annual_totals(cordis_annual, 'estimated_contribution_gbp')
cordis_annual_total.head()

In [None]:
# `find_fund_annual_totals` is defined and `cordis_annual_total` is computed
# in the preceding cell. Neither is repeated here, so the helper has a single
# definition in the notebook; this cell only previews the result.
cordis_annual_total.head()

In [None]:
# Spread the estimated Creative Europe contributions across years, then
# total them per fund and year.
creative_amount_column = 'estimated_contribution_gbp'
creative_annual = find_annual_sum(
    creative,
    ['funds', 'postcode_area', 'my_eu_id'],
    creative_amount_column
)
creative_annual_total = find_fund_annual_totals(creative_annual, creative_amount_column)
creative_annual_total

In [None]:
# Spread the ESIF EU investment across years, then total it per fund and year.
esif_amount_column = 'eu_investment'
esif_annual = find_annual_sum(
    esif,
    ['funds', 'postcode_area', 'my_eu_id'],
    esif_amount_column
)
esif_annual_total = find_fund_annual_totals(esif_annual, esif_amount_column)
esif_annual_total.head()

In [None]:
# Disabled: Erasmus only has placeholder dates derived from the call year,
# so it is not spread across years yet.
# erasmus_annual = find_annual_sum(erasmus, ['funds', 'postcode_area', 'my_eu_id'], 'estimated_contribution_gbp')
# erasmus_annual_total = find_fund_annual_totals(erasmus_annual, 'estimated_contribution_gbp')
# erasmus_annual_total.head()

In [None]:
# TODO: add erasmus_annual_total when we have start and end dates
# Stack the per-fund annual totals, then reshape to one row per year and one
# column per fund.
fund_annual_totals = pd.concat([
    cap_annual_total,
    cordis_annual_total,
    creative_annual_total,
    esif_annual_total
])
annual_totals = (
    fund_annual_totals
    .groupby(['funds', 'year']).sum()
    .reset_index()
    .pivot(index='year', columns='funds', values='total')
)
annual_totals

In [None]:
# Stacked bar chart: one bar per year, one segment per fund.
annual_totals.plot.bar(stacked=True)

### Aggregate by Area and Year

In [None]:
# Reminder of the CAP columns before combining with the other funds.
cap_by_area.head()

In [None]:
def find_fund_annual_area_totals(annual, column):
    """Sum ``column`` per fund, postcode area and year.

    Returns a frame with the columns ``funds``, ``postcode_area``, ``year``
    and ``total``, one row per combination present in ``annual``.
    """
    grouped = annual.groupby(['funds', 'postcode_area', 'year'])[column]
    return grouped.sum().reset_index().rename(columns={column: 'total'})
# Per-fund, per-area, per-year CORDIS totals of the estimated GBP contribution.
cordis_annual_area_total = find_fund_annual_area_totals(cordis_annual, 'estimated_contribution_gbp')
cordis_annual_area_total.head()

In [None]:
# Per-fund, per-area, per-year Creative Europe totals of the estimated GBP contribution.
creative_annual_area_total = find_fund_annual_area_totals(creative_annual, 'estimated_contribution_gbp')
creative_annual_area_total.head()

In [None]:
# Per-fund, per-area, per-year ESIF totals of the EU investment.
esif_annual_area_total = find_fund_annual_area_totals(esif_annual, 'eu_investment')
esif_annual_area_total.head()

In [None]:
# Disabled: depends on erasmus_annual, which is not computed in this notebook.
# erasmus_annual_area_total = find_fund_annual_area_totals(erasmus_annual, 'estimated_contribution_gbp')
# erasmus_annual_area_total.head()

In [None]:
# TODO: add erasmus_annual_area_total when start and end dates available
# CAP is already per area and year, so it only needs its columns selected
# before being stacked with the other funds.
cap_annual_area_total = cap_by_area[['funds', 'postcode_area', 'year', 'total']]
annual_area_total = pd.concat([
    cap_annual_area_total,
    cordis_annual_area_total,
    creative_annual_area_total,
    esif_annual_area_total
])
annual_area_total.shape

In [None]:
# Preview the combined per-fund, per-area, per-year totals.
annual_area_total.head()

In [None]:
# Mean annual total per postcode area over 2014-2017: sum the four years,
# divide by 4, and round to whole numbers.
in_window = annual_area_total.year.between(2014, 2017)
window_totals = annual_area_total[in_window].groupby(['postcode_area']).total.sum()
area_mean_2014_2017 = (window_totals / 4).reset_index()
area_mean_2014_2017['total'] = area_mean_2014_2017.total.round().astype('int32')
area_mean_2014_2017.sort_values('total', ascending=False).head()

In [None]:
# Sum of the per-area annual means across all areas.
area_mean_2014_2017.total.sum()

Northern Ireland receives a lot of CAP funding:

In [None]:
# CAP rows for the BT postcode area.
cap_by_area[cap_by_area.postcode_area == 'BT']

In [None]:
# Total per fund for the BT postcode area, summed over all years present.
annual_area_total[annual_area_total.postcode_area == 'BT'].groupby(['funds']).total.sum()

### Top Projects by Area
#### CORDIS

In [None]:
# Number of largest projects kept per postcode area.
TOP_N = 3
cordis_top_columns = [
    'postcode_area', 'postcode', 'my_eu_id', 'start_date', 'end_date',
    'contribution_gbp', 'acronym', 'title', 'name'
]
# Only rows with a contribution are ranked; within each area the largest
# contributions come first and the first TOP_N rows are kept.
cordis_top = (
    cordis.loc[cordis.contribution_gbp.notna(), cordis_top_columns]
    .sort_values(['postcode_area', 'contribution_gbp'], ascending=[True, False])
    .groupby('postcode_area')
    .head(TOP_N)
)
cordis_top['contribution_gbp'] = cordis_top.contribution_gbp.round().astype('int32')
cordis_top

#### Creative Europe

In [None]:
creative_top_columns = [
    'postcode_area', 'postcode', 'my_eu_id',
    'start_date', 'end_date', 'max_contribution_gbp',
    'project', 'organisation_name', 'num_organisations', 'num_countries'
]
# Largest max contribution first within each area, then keep TOP_N per area.
creative_ranked = creative.sort_values(
    ['postcode_area', 'max_contribution_gbp'], ascending=[True, False])
creative_top = creative_ranked.groupby('postcode_area').head(TOP_N)[creative_top_columns]
creative_top['max_contribution_gbp'] = creative_top.max_contribution_gbp.round().astype('int32')
creative_top.head()

#### ESIF

In [None]:
esif_top_columns = [
    'postcode_area', 'postcode', 'my_eu_id',
    'start_date', 'end_date', 'eu_investment',
    'project', 'beneficiary'
]
# Largest EU investment first within each area, then keep TOP_N per area.
esif_ranked = esif.sort_values(
    ['postcode_area', 'eu_investment'], ascending=[True, False])
esif_top = esif_ranked.groupby('postcode_area').head(TOP_N)[esif_top_columns]
esif_top['eu_investment'] = esif_top.eu_investment.round().astype('int32')
esif_top.head()

#### Erasmus

In [None]:
erasmus_top_columns = [
    'postcode_area', 'postcode', 'my_eu_id',
    'start_date', 'end_date', 'max_contribution_gbp',
    'project', 'organisation_name', 'num_organisations', 'num_countries'
]
# Largest max contribution first within each area, then keep TOP_N per area.
erasmus_ranked = erasmus.sort_values(
    ['postcode_area', 'max_contribution_gbp'], ascending=[True, False])
erasmus_top = erasmus_ranked.groupby('postcode_area').head(TOP_N)[erasmus_top_columns]
erasmus_top['max_contribution_gbp'] = erasmus_top.max_contribution_gbp.round().astype('int32')
erasmus_top.head()

### Counts and Totals of Projects by Area

For CORDIS and ESIF, the totals are redundant, because they're already included in totalAmounts, but for Creative Europe and Erasmus, the amounts are estimated, which makes the UI confusing, so also include those.

#### CORDIS

In [None]:
# Number of CORDIS rows per postcode area; no separate total is reported
# for this kind, so `total` is NaN.
cordis_counts = cordis.groupby('postcode_area').my_eu_id.count()
area_cordis_projects = cordis_counts.reset_index().rename(columns={'my_eu_id': 'count'})
area_cordis_projects['kind'] = 'cordis'
area_cordis_projects['total'] = float('nan')
area_cordis_projects.head()

#### Creative Europe

In [None]:
# Sum of max_contribution_gbp per postcode area for Creative Europe.
creative.groupby('postcode_area').max_contribution_gbp.sum()

In [None]:
# Per postcode area: number of Creative Europe rows and the rounded sum of
# their max contributions in GBP.
creative_by_area = creative.groupby('postcode_area')
creative_counts = creative_by_area.my_eu_id.count().\
    reset_index().rename(columns={'my_eu_id': 'count'})
creative_counts['kind'] = 'creative'
creative_totals = creative_by_area.max_contribution_gbp.sum().\
    reset_index().rename(columns={'max_contribution_gbp': 'total'})
area_creative_projects = pd.merge(
    creative_counts, creative_totals,
    on='postcode_area', validate='1:1'
)
area_creative_projects['total'] = area_creative_projects.total.round().astype('int32')
area_creative_projects.shape

In [None]:
# Preview the per-area Creative Europe counts and totals.
area_creative_projects.head()

In [None]:
# Cross-check: total row count, sum of the rounded per-area totals, and the
# unrounded overall sum of max_contribution_gbp.
[
    area_creative_projects['count'].sum(),
    area_creative_projects.total.sum(),
    creative.max_contribution_gbp.sum()
]

#### ESIF

In [None]:
# Number of ESIF rows per postcode area; no separate total is reported for
# this kind, so `total` is NaN.
esif_counts = esif.groupby('postcode_area').my_eu_id.count()
area_esif_projects = esif_counts.reset_index().rename(columns={'my_eu_id': 'count'})
area_esif_projects['kind'] = 'esif'
area_esif_projects['total'] = float('nan')
area_esif_projects.head()

#### Erasmus

In [None]:
# Sum of max_contribution_gbp per postcode area for Erasmus.
erasmus.groupby('postcode_area').max_contribution_gbp.sum()

In [None]:
# Per postcode area: number of Erasmus rows and the rounded sum of their max
# contributions in GBP.
erasmus_by_area = erasmus.groupby('postcode_area')
erasmus_counts = erasmus_by_area.my_eu_id.count().\
    reset_index().rename(columns={'my_eu_id': 'count'})
erasmus_counts['kind'] = 'erasmus'
erasmus_totals = erasmus_by_area.max_contribution_gbp.sum().\
    reset_index().rename(columns={'max_contribution_gbp': 'total'})
area_erasmus_projects = pd.merge(
    erasmus_counts, erasmus_totals,
    on='postcode_area', validate='1:1'
)
area_erasmus_projects['total'] = area_erasmus_projects.total.round().astype('int32')
area_erasmus_projects.shape

In [None]:
# Preview the per-area Erasmus counts and totals.
area_erasmus_projects.head()

In [None]:
# Cross-check: total row count, sum of the rounded per-area totals, and the
# unrounded overall sum of max_contribution_gbp.
[
    area_erasmus_projects['count'].sum(),
    area_erasmus_projects.total.sum(),
    erasmus.max_contribution_gbp.sum()
]

#### Build Project List

In [None]:
# One row per (postcode area, kind) with the row count and, where computed,
# the total.
project_frames = [
    area_cordis_projects,
    area_creative_projects,
    area_esif_projects,
    area_erasmus_projects
]
projects = pd.concat(project_frames).sort_values(['postcode_area', 'kind'])
projects = projects[['postcode_area', 'kind', 'count', 'total']]
projects.head()

### Save for Map

In [None]:
# Load the postcode area boundaries and show how many features the file has.
boundaries_path = '../postcodes/output/postcode-area-boundaries-simplified.geo.json'
with open(boundaries_path) as boundaries_file:
    postcode_areas_json = json.load(boundaries_file)
len(postcode_areas_json['features'])

In [None]:
# One row per GeoJSON feature: its `name` property and its geometry object.
boundary_features = postcode_areas_json['features']
postcode_areas_features = pd.DataFrame.from_dict({
    'name': [feature['properties']['name'] for feature in boundary_features],
    'geometry': [feature['geometry'] for feature in boundary_features],
})
postcode_areas_features.head()

In [None]:
# Attach the 2014-2017 mean total to each boundary feature; several features
# may share one postcode area (validated as many-to-one).
postcode_areas_data = postcode_areas_features.merge(
    area_mean_2014_2017,
    left_on='name', right_on='postcode_area', validate='m:1'
)
postcode_areas_data.head()

In [None]:
def make_area_geo_json(data):
    """Build a GeoJSON FeatureCollection dict from ``data``.

    ``data`` must have the columns ``geometry``, ``total`` and
    ``postcode_area``. Each row becomes one Feature whose properties are
    ``total`` and ``postcodeArea``. An empty frame yields an empty feature
    list.
    """
    def make_feature(geometry, total, postcode_area):
        return {
            'type': 'Feature',
            'geometry': geometry,
            'properties': {
                'total': total,
                'postcodeArea': postcode_area
            }
        }
    # Series.tolist() converts numeric values to plain Python numbers, which
    # json.dump can serialise. A row-wise DataFrame.apply can hand over numpy
    # scalars instead, which json.dump rejects.
    features = [
        make_feature(geometry, total, postcode_area)
        for geometry, total, postcode_area in zip(
            data['geometry'].tolist(),
            data['total'].tolist(),
            data['postcode_area'].tolist()
        )
    ]
    return { 'type': 'FeatureCollection', 'features': features }
# Write the per-area GeoJSON used by the map, with keys sorted in the output.
with open('output/postcode_areas.geo.json', 'w') as geo_json_file:
    area_geo_json = make_area_geo_json(postcode_areas_data)
    json.dump(area_geo_json, geo_json_file, sort_keys=True)

### Save Area Summaries

- Separate JSON file from the geojson? Probably decreases first load time.
- One JSON file for the whole map, or one per area? Let's see how big the whole dataset is. Probably not worth splitting it.
- For CAP, just the numbers by year; 4 per year.
- For CORDIS, we want a total amount and count, plus the top (say) 20 projects by amount; the UI can just say 'browse the map for more'. Include each project's postcode and ID for lookup. For each project, we also want the amount, name and organisation, and maybe the start and end year.
- For Creative, similar
- For ESIF, similar
- For FTS, not sure; maybe the same, but we don't have as much data.

In [None]:
# Distinct fund labels present in the combined totals.
annual_area_total.funds.unique()

In [None]:
# Sort, fix the column order, store year and total as 32-bit integers, and
# drop rows whose rounded total is not positive.
annual_area_total_sorted = annual_area_total.sort_values(['postcode_area', 'year', 'funds'])
annual_area_total_rounded = annual_area_total_sorted[['postcode_area', 'year', 'funds', 'total']]
annual_area_total_rounded['year'] = annual_area_total_rounded.year.astype('int32')
annual_area_total_rounded['total'] = annual_area_total_rounded.total.round().astype('int32')
has_positive_total = annual_area_total_rounded.total > 0
annual_area_total_rounded = annual_area_total_rounded[has_positive_total]
annual_area_total_rounded.head()

In [None]:
# CAP amounts without the `funds` label, rounded to 32-bit integers.
cap_by_area_rounded = cap_by_area.drop(columns='funds').copy()
for amount_column in ['eagf', 'eafrd', 'total']:
    cap_by_area_rounded[amount_column] = \
        cap_by_area_rounded[amount_column].round().astype('int32')
cap_by_area_rounded.head()

In [None]:
# Serialise through pandas rather than the json module directly: pandas
# writes NaN as null, so the output stays valid JSON. The text is then
# parsed back into plain dicts and lists.
def to_pandas_json(df):
    """Return ``df`` as a dict in pandas' 'split' layout, without the index."""
    json_text = df.to_json(orient='split', date_unit='s', index=False)
    return json.loads(json_text)

# Write all per-area summaries into one JSON file. The "top" frames drop
# their postcode_area column before being serialised.
with open('output/postcode_areas.data.json', 'w') as data_file:
    area_summaries = {
        'totals': to_pandas_json(annual_area_total_rounded),
        'projects': to_pandas_json(projects),
        'cap': to_pandas_json(cap_by_area_rounded),
    }
    top_frames = [
        ('cordis', cordis_top),
        ('creative', creative_top),
        ('erasmus', erasmus_top),
        ('esif', esif_top),
    ]
    for summary_key, top_frame in top_frames:
        area_summaries[summary_key] = to_pandas_json(top_frame.drop(columns='postcode_area'))
    json.dump(area_summaries, data_file, sort_keys=True)