In [None]:
import json

import pandas as pd
import numpy as np

# Widen the notebook display so wide frames and long text cells stay readable.
pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 100

## Load Data and Restrict to UK

In [None]:
# Read the 2017 export and show its dimensions.
raw_2017_path = 'input/export_2017_en.xlsx'
fts_bgue_2017 = pd.read_excel(raw_2017_path)
fts_bgue_2017.shape

In [None]:
# Column headers exactly as read from the spreadsheet.
fts_bgue_2017.columns

In [None]:
# Map the export's verbose headers onto snake_case names that are easier to use.
snake_case_names = {
    'Country / Territory': 'country',
    'Reference of the Legal Commitment (LC)': 'reference_of_lc',
    'Commitment position key': 'commitment_position_key',
    'Type': 'type',
    'Year': 'year',
    'Name of beneficiary': 'beneficiary',
    'Coordinator': 'coordinator',
    'VAT Number of beneficiary': 'beneficiary_vat',
    'Postal code': 'raw_postcode',
    'Amount': 'amount',
    'Source of (estimated) detailed amount': 'source_of_amount',
    'Geographical Zone': 'geographical_zone',
    'Expense Type': 'expense_type',
    'Total amount': 'total_amount',
    'Subject of grant or contract': 'subject_of_grant_or_contract',
    'Responsible Department': 'responsible_department',
    'Budget line name and number': 'budget_line_name_and_number',
    'Action Type': 'action_type',
    'Funding Type': 'funding_type',
    'LE Acct Group Code': 'le_acct_group_code',
    'LE Acct Group Desc': 'le_acct_group_desc',
}
# `index=str` additionally passes every row label through `str`.
fts_bgue_2017 = fts_bgue_2017.rename(index=str, columns=snake_case_names)

In [None]:
# `total_amount` comes up as zero rather than NA when the cell is unmerged,
# so check how many genuine NAs there are.
fts_bgue_2017['total_amount'].isna().sum()

In [None]:
# Is it safe to replace zeros with NaNs and then fill?
# Compare the per-key sum of `total_amount` (groups kept in order of first
# appearance) with the non-zero `total_amount` entries. The cell displays
# [number of keys, number of non-zero totals, number of positions that agree].
# The string 'sum' is used instead of the builtin `sum`, whose implicit
# translation to the pandas reduction is deprecated.
cps_total_amounts = fts_bgue_2017.groupby('commitment_position_key', sort=False).aggregate({'total_amount': 'sum'})
nonzero_total_amounts = fts_bgue_2017.total_amount[fts_bgue_2017.total_amount > 0]
key_totals = cps_total_amounts.total_amount.values
nonzero_values = nonzero_total_amounts.values
# Arrays of different lengths cannot be compared element-wise; report zero
# matches in that case instead of failing on the comparison.
if key_totals.shape == nonzero_values.shape:
    n_matching = int((key_totals == nonzero_values).sum())
else:
    n_matching = 0
[
    key_totals.shape[0],
    nonzero_values.shape[0],
    n_matching
]

In [None]:
# Mark zero totals as missing.
is_zero_total = fts_bgue_2017['total_amount'] == 0
fts_bgue_2017.loc[is_zero_total, 'total_amount'] = np.nan

In [None]:
# Inspect the rows belonging to a single legal-commitment reference.
fts_bgue_2017.loc[fts_bgue_2017['reference_of_lc'] == 'JAG.105092']

In [None]:
# Forward-fill the columns that were merged cells in the original 2017
# spreadsheet: those cells are read in as NaN (or were set to NaN above for
# `total_amount`), so each value is carried down to the rows that follow it.
merged_cell_columns = [
    'commitment_position_key',
    'year',
    'expense_type',
    'total_amount',
    'subject_of_grant_or_contract',
    'responsible_department',
    'budget_line_name_and_number',
    'action_type',
    'funding_type',
]
# `.ffill()` is the supported spelling; `fillna(method='ffill')` is deprecated.
fts_bgue_2017[merged_cell_columns] = fts_bgue_2017[merged_cell_columns].ffill()


In [None]:
# Filter just UK funding.
# Take an explicit copy: `fts` is assigned into later in the notebook, and
# doing that on a slice of `fts_bgue_2017` triggers SettingWithCopyWarning.
fts = fts_bgue_2017[fts_bgue_2017.country == 'United Kingdom'].copy()
fts.shape

## Columns

### `reference_of_lc`

In [None]:
# Is this column being used at all? List its distinct values.
fts_bgue_2017['reference_of_lc'].unique()

Yes, it is populated in the 2017 data. Interestingly, it wasn't there in the 2016 data.

### `year`


In [None]:
# Number of UK rows with no year.
fts['year'].isna().sum()

In [None]:
# Distinct years present.
fts['year'].unique()

### `beneficiary`

Many seem to have asterisks; we may want to get rid of content after the asterisk.

In [None]:
# Number of rows with no beneficiary name.
fts['beneficiary'].isna().sum()

In [None]:
# Rows whose name differs from its whitespace-stripped form.
stripped_names = fts['beneficiary'].str.strip()
fts['beneficiary'].ne(stripped_names).sum()

In [None]:
# Rows whose name differs from its upper-cased form.
upper_names = fts['beneficiary'].str.upper()
fts['beneficiary'].ne(upper_names).sum()

In [None]:
# Frame shape next to the number of distinct beneficiary names.
distinct_beneficiaries = fts['beneficiary'].unique()
[fts.shape, len(distinct_beneficiaries)]

In [None]:
# How many names contain a literal asterisk?
has_asterisk = fts['beneficiary'].str.contains('*', regex=False)
has_asterisk.sum()

In [None]:
# First twenty distinct names that contain a literal asterisk.
has_asterisk = fts['beneficiary'].str.contains('*', regex=False)
fts.loc[has_asterisk, 'beneficiary'].unique()[:20]

### `Type`


In [None]:
# Number of rows with no type.
fts['type'].isna().sum()

In [None]:
# Distinct type values.
fts['type'].unique()

### `Coordinator`

In [None]:
# Number of rows with no coordinator value.
fts['coordinator'].isna().sum()

In [None]:
# Distinct coordinator values.
fts['coordinator'].unique()

### `amount`

Probably what we are most interested in. Missing for some, where only a total amount is known but not the per-beneficiary amount.

In [None]:
# Frame shape next to the number of rows with no per-beneficiary amount.
missing_amounts = fts['amount'].isna().sum()
[fts.shape, missing_amounts]

In [None]:
# Summary statistics for the per-beneficiary amount.
fts['amount'].describe()

In [None]:
# Number of rows whose amount is exactly zero.
fts['amount'].eq(0).sum()

In [None]:
# A few of the zero-amount rows.
is_zero_amount = fts['amount'].eq(0)
fts[is_zero_amount].head()

### `source_of_amount`
Unlike in the 2016 data, this column is populated.

In [None]:
# Number of rows with no source-of-amount value.
fts['source_of_amount'].isna().sum()

In [None]:
# Distinct source-of-amount values.
fts['source_of_amount'].unique()

### `NUTS2`


In [None]:
# Number of rows that do have a NUTS2 value.
fts['NUTS2'].notna().sum()

In [None]:
# A few of the rows that carry a NUTS2 value.
has_nuts2 = fts['NUTS2'].notna()
fts[has_nuts2].head()

### `geographical_zone`

Present for some international aid projects. May be worth including.

In [None]:
# Number of rows with no geographical zone.
fts['geographical_zone'].isna().sum()

In [None]:
# Distinct geographical zones, taken from the sorted column.
sorted_zones = fts['geographical_zone'].sort_values()
sorted_zones.unique()

In [None]:
# A few of the rows that carry a geographical zone.
has_zone = fts['geographical_zone'].notna()
fts[has_zone].head()

### `expense_type`

In [None]:
# Number of rows with no expense type.
fts['expense_type'].isna().sum()

In [None]:
# Distinct expense types.
fts['expense_type'].unique()

### `total_amount`


In [None]:
# Number of UK rows with no total amount.
fts['total_amount'].isna().sum()

In [None]:
# Rows where the per-beneficiary amount is larger than the total amount.
fts['amount'].gt(fts['total_amount']).sum()

In [None]:
# Distribution of each row's amount as a fraction of its total amount.
amount_share = fts['amount'].div(fts['total_amount'])
amount_share.describe()

In [None]:
# Manual override of `total_amount` for one beneficiary / commitment pair.
# NOTE(review): the reason for the 200,000 figure is not recorded in this notebook — confirm.
matches_vat = fts['beneficiary_vat'] == 'GB823847609'
matches_key = fts['commitment_position_key'] == 'JAG.105092.1'
fts.loc[matches_vat & matches_key, 'total_amount'] = 200_000

### `subject_of_grant_or_contract`

In [None]:
# Number of rows with no subject.
fts['subject_of_grant_or_contract'].isna().sum()

In [None]:
# Frame shape next to the shape of the array of distinct subjects.
distinct_subjects = fts['subject_of_grant_or_contract'].unique()
[fts.shape, distinct_subjects.shape]

In [None]:
# A few rows whose subject appears more than once, ordered by subject.
is_repeated_subject = fts['subject_of_grant_or_contract'].duplicated(keep=False)
(
    fts[is_repeated_subject]
    .sort_values('subject_of_grant_or_contract')
    .head()
)

### `responsible_department`

In [None]:
# Number of rows with no responsible department.
fts['responsible_department'].isna().sum()

In [None]:
# Shape of the array of distinct responsible departments.
fts['responsible_department'].unique().shape

In [None]:
# Ten responsible departments with the most rows.
(
    fts.groupby('responsible_department')
    .size()
    .reset_index(name='size')
    .sort_values('size', ascending=False)
    .head(10)
)

### `budget_line_name_and_number`

In [None]:
# Number of rows with no budget line.
fts['budget_line_name_and_number'].isna().sum()

In [None]:
# Shape of the array of distinct budget lines.
fts['budget_line_name_and_number'].unique().shape

In [None]:
# Ten budget lines with the most rows.
(
    fts.groupby('budget_line_name_and_number')
    .size()
    .reset_index(name='size')
    .sort_values('size', ascending=False)
    .head(10)
)

In [None]:
# Ten (department, budget line) pairs with the most rows.
(
    fts.groupby(['responsible_department', 'budget_line_name_and_number'])
    .size()
    .reset_index(name='size')
    .sort_values('size', ascending=False)
    .head(10)
)

### `action_type`

In [None]:
# Number of rows with no action type.
fts['action_type'].isna().sum()

In [None]:
# Shape of the array of distinct action types.
fts['action_type'].unique().shape

In [None]:
# Twenty action types with the most rows.
(
    fts.groupby(['action_type'])
    .size()
    .reset_index(name='size')
    .sort_values('size', ascending=False)
    .head(20)
)

### `funding_type`

In [None]:
# Number of rows with no funding type.
fts['funding_type'].isna().sum()

In [None]:
# Distinct funding types.
fts['funding_type'].unique()

### Drop uninteresting columns

In [None]:
# Drop the columns this notebook treats as uninteresting.
# Each label is listed once ('commitment_position_key' used to appear twice).
uninteresting_columns = [
    'reference_of_lc', 'commitment_position_key', 'year',
    'type', 'beneficiary_vat', 'Address', 'City', 'country',
    'source_of_amount',
    'le_acct_group_code', 'le_acct_group_desc'
]
clean_fts = fts.drop(uninteresting_columns, axis=1)
clean_fts.head()

### `postcode`

In [None]:
# Load the postcode reference table and show its dimensions.
ukpostcodes_path = '../postcodes/input/ukpostcodes.csv.gz'
ukpostcodes = pd.read_csv(ukpostcodes_path)
ukpostcodes.shape

In [None]:
# How many raw postcodes already match the reference table as-is?
clean_fts['raw_postcode'].isin(ukpostcodes['postcode']).sum()

In [None]:
# Normalise the raw postcodes so they can be matched against
# `ukpostcodes.postcode`: upper-case, strip, remove every character that is not
# A-Z or 0-9, then insert one space before the trailing digit + two letters.
# `regex=True` is required: from pandas 2.0 `str.replace` treats the pattern as
# a literal string by default, which would leave the postcodes untouched.
clean_fts['postcode'] = (
    clean_fts.raw_postcode
    .str.upper()
    .str.strip()
    .str.replace(r'[^A-Z0-9]', '', regex=True)
    .str.replace(r'^(\S+)([0-9][A-Z]{2})$', r'\1 \2', regex=True)
)

In [None]:
# How many normalised postcodes match the reference table?
clean_fts['postcode'].isin(ukpostcodes['postcode']).sum()

In [None]:
# Distinct raw postcodes whose normalised form is still not in the reference table.
is_unmatched = ~clean_fts['postcode'].isin(ukpostcodes['postcode'])
clean_fts.loc[is_unmatched, 'raw_postcode'].unique()

### Overlaps with other datasets

### `creative europe`

There is overlap here between the two datasets, so we will remove these rows.

In [None]:
# Rows whose action type contains 'Creative'.
# `na=False` counts a missing action type as "no match" so the mask stays
# boolean; `regex=False` makes this a plain substring test.
is_creative = fts.action_type.str.contains('Creative', regex=False, na=False)
fts_creative = fts[is_creative]
fts_creative.shape

In [None]:
# Distinct action types among the selected rows.
fts_creative['action_type'].unique()

In [None]:
# Remove the rows whose action type contains 'Creative'.
# `na=False` keeps rows with a missing action type (they are not a match) and
# keeps the mask boolean so it can be negated; `regex=False` makes this a plain
# substring test.
is_creative = fts.action_type.str.contains('Creative', regex=False, na=False)
fts = fts[~is_creative]
fts.shape

### `horizon 2020`

These rows overlap with existing data, so we remove them.

In [None]:
# Rows whose action type contains 'Horizon 2020'.
# `na=False` counts a missing action type as "no match" so the mask stays
# boolean; `regex=False` makes this a plain substring test.
is_h2020 = fts.action_type.str.contains('Horizon 2020', regex=False, na=False)
fts_h2020 = fts[is_h2020]
fts_h2020.shape

In [None]:
# Remove the rows whose action type contains 'Horizon 2020'.
# `na=False` keeps rows with a missing action type (they are not a match) and
# keeps the mask boolean so it can be negated; `regex=False` makes this a plain
# substring test.
is_h2020 = fts.action_type.str.contains('Horizon 2020', regex=False, na=False)
fts = fts[~is_h2020]
fts.shape

### `FP7`

Research: Framework Programme n°7. Again, these rows overlap with existing data, so we remove them.

In [None]:
# Rows whose action type contains 'n°7'.
# `na=False` counts a missing action type as "no match" so the mask stays
# boolean; `regex=False` makes this a plain substring test.
is_fp7 = fts.action_type.str.contains('n°7', regex=False, na=False)
fts_fp7 = fts[is_fp7]
fts_fp7.shape

In [None]:
# Distinct action types among the selected rows.
fts_fp7['action_type'].unique()

In [None]:
# Remove the rows whose action type contains 'n°7'.
# `na=False` keeps rows with a missing action type (they are not a match) and
# keeps the mask boolean so it can be negated; `regex=False` makes this a plain
# substring test.
is_fp7 = fts.action_type.str.contains('n°7', regex=False, na=False)
fts = fts[~is_fp7]
fts.shape

### `Erasmus +`


In [None]:
# Rows whose action type contains 'Erasmus'.
# `na=False` counts a missing action type as "no match" so the mask stays
# boolean; `regex=False` makes this a plain substring test.
is_erasmus = fts.action_type.str.contains('Erasmus', regex=False, na=False)
fts_erasmus = fts[is_erasmus]
fts_erasmus.shape

In [None]:
# Distinct action types among the selected rows.
fts_erasmus['action_type'].unique()

In [None]:
# Remove the rows whose action type contains 'Erasmus'.
# `na=False` keeps rows with a missing action type (they are not a match) and
# keeps the mask boolean so it can be negated; `regex=False` makes this a plain
# substring test.
is_erasmus = fts.action_type.str.contains('Erasmus', regex=False, na=False)
fts = fts[~is_erasmus]
fts.shape

## Filter projects with funding < €1000

In [None]:
# Shape of the subset whose total amount is below 1000.
is_below_threshold = fts['total_amount'].lt(1000)
fts[is_below_threshold].shape

In [None]:
# Keep rows whose total amount is at least 1000. A missing total amount fails
# the comparison, so those rows are dropped as well.
meets_threshold = fts['total_amount'].ge(1000)
fts = fts[meets_threshold]
fts.shape

## Convert to GBP

All we have is a year, so just use the average annual exchange rate.

In [None]:
# Load the exchange-rate table and show its last rows.
exchange_rates_path = '../exchange_rates/output/exchange_rates.pkl.gz'
eur_gbp = pd.read_pickle(exchange_rates_path)
eur_gbp.tail()

In [None]:
def find_average_eur_gbp_rate(rates=None, year=2017):
    """Return the day-weighted average exchange rate for one calendar year.

    Every day of `year` gets the same weight. The weights are summed per month
    and multiplied by that month's rate, so longer months count for more.

    Parameters
    ----------
    rates : DataFrame, optional
        Monthly rates keyed by `month_start` with a `rate` column, one row per
        month. Defaults to the notebook-level `eur_gbp` frame.
    year : int, optional
        Calendar year to average over. Defaults to 2017.

    Returns
    -------
    float
        Weighted average of `rate` over the months of `year`.

    Raises
    ------
    ValueError
        If `rates` does not cover every month of `year`.
    pandas.errors.MergeError
        If `month_start` is not unique on either side of the merge.
    """
    if rates is None:
        rates = eur_gbp
    # Spell out both end points rather than passing `closed='left'`, which
    # pandas 2.0 removed; this yields the same days, 1 January to 31 December.
    days = pd.date_range(f'{year}-01-01', f'{year}-12-31')
    daily = pd.DataFrame({
        'month_start': days,
        'weight': 1.0 / days.shape[0]
    })
    monthly = daily.resample('MS', on='month_start').sum()
    n_months = monthly.shape[0]
    monthly = pd.merge(monthly, rates, on='month_start', validate='1:1')
    # The merge is an inner join: a month missing from `rates` would silently
    # drop its weight and understate the average, so fail loudly instead.
    if monthly.shape[0] != n_months:
        raise ValueError(
            'exchange rates do not cover every month of {}'.format(year))
    return (monthly.weight * monthly.rate).sum()
# Store the single annual average rate on every row and preview the column.
annual_rate = find_average_eur_gbp_rate()
clean_fts['eur_gbp'] = annual_rate
clean_fts['eur_gbp'].head()

## Output

In [None]:
# Build the output frame from rows that have a matchable postcode, have a
# positive amount, and are still present in `fts`.
# `clean_fts` was derived from `fts` before the overlap and minimum-funding
# filters were applied to `fts`. Both frames share the same row labels, so
# index membership carries those later filters over; without it the removed
# rows would still reach the output.
has_known_postcode = clean_fts.postcode.isin(ukpostcodes.postcode)
has_positive_amount = clean_fts.amount > 0
still_in_fts = clean_fts.index.isin(fts.index)
# Chained `drop` / `rename` return a new frame, so no `inplace=True` is needed.
output_fts = (
    clean_fts[has_known_postcode & has_positive_amount & still_in_fts]
    .drop(['raw_postcode', 'NUTS2'], axis=1)
    .rename(columns={'total_amount': 'total_amount_eur'})
)
output_fts.shape

In [None]:
# Preview the first rows of the output frame.
output_fts.head()

In [None]:
# Build an identifier per row from a fixed prefix plus the row label.
row_labels = output_fts.index.map(str)
output_fts['my_eu_id'] = 'fts_2017_' + row_labels
output_fts['my_eu_id'].head()

In [None]:
# Write the output frame to disk.
output_path = 'output/fts_2017.pkl.gz'
output_fts.to_pickle(output_path)