# Scotland ESF and ERDF

In [None]:
import json
import re

import pandas as pd

In [None]:
# Read the operations spreadsheet, skipping the first 7 rows of the sheet.
raw_esif = pd.read_excel(
    'input/ESF and ERDF operations funding - approved to May 2018.xlsx',
    skiprows=7,
)
raw_esif.shape

In [None]:
# Keep only real data rows: drop rows with no programme value, and rows whose
# 'Operation Name' cell merely repeats the column header text.
has_programme = raw_esif['Programme '].notnull()
repeats_header = raw_esif['Operation Name'] == 'Operation Name'
raw_esif = raw_esif[has_programme & ~repeats_header]
raw_esif.shape

In [None]:
# Inspect the column labels of the filtered frame.
raw_esif.columns

In [None]:
# Map the spreadsheet headers (including the 'Unnamed: N' ones) to snake_case
# column names. index=str additionally converts the row labels to strings.
column_names = {
    'Programme ': 'funds',
    'Investment Priority': 'investment_priority',
    'Operation Name': 'project',
    'Recipient organisation1': 'beneficiary',
    'Description': 'summary',
    'Start and end dates': 'start_and_end_dates',
    'Total eligible expenditure': 'project_cost',
    'ESF/ ERDF Grant requested': 'eu_investment',
    'Union co-financing rate': 'prop_eu_financed',
    'Postcode ': 'raw_postcode',
    'Categories of Intervention': 'nuts_area',
    'Unnamed: 11': 'category',
    'Unnamed: 12': 'intervention_field_2',
    'Unnamed: 13': 'intervention_field_3',
    'Unnamed: 14': 'form_of_finance',
    'Unnamed: 15': 'territory_type',
    'Unnamed: 16': 'economic_activity',
}
esif = raw_esif.rename(index=str, columns=column_names)
esif.head()

In [None]:
# Inspect the dtype of every renamed column.
esif.dtypes

### Beneficiary

In [None]:
# Show beneficiaries whose value changes when surrounding whitespace is stripped
# (missing values are listed too, because NaN never compares equal to itself).
stripped_beneficiary = esif.beneficiary.str.strip()
esif.beneficiary[esif.beneficiary != stripped_beneficiary]

In [None]:
# Remove leading and trailing whitespace from the beneficiary names.
esif['beneficiary'] = esif['beneficiary'].str.strip()

In [None]:
# Number of distinct project names (before whitespace is stripped from them).
esif.project.unique().shape

### Project

In the absence of any IDs, do we have any duplicates? Sometimes the same project gets both ERDF and ESF funding. 

In [None]:
# Project names with stray leading/trailing whitespace (there are lots).
# This lookup is not the cell's last expression, so it is evaluated but not shown.
esif.project[esif.project != esif.project.str.strip()]
# Strip the whitespace so that equal names compare equal.
esif['project'] = esif['project'].str.strip()

In [None]:
# Number of distinct project names after stripping whitespace.
esif.project.unique().shape

In [None]:
# List every row that shares its beneficiary, project and fund with another
# row (keep=False marks all members of a duplicate group), ordered by project.
duplicate_key = ['beneficiary', 'project', 'funds']
is_repeated = esif.duplicated(duplicate_key, keep=False)
esif[is_repeated].sort_values('project')

Doesn't look like there are actually any duplicates

### Summary


In [None]:
# Count rows with a missing summary.
esif.summary.isna().sum()

In [None]:
# Count summaries that change when stripped (lots). This is not the cell's
# last expression, so the count is computed but not displayed.
(esif.summary.str.strip() != esif.summary).sum()
# Remove the surrounding whitespace.
esif['summary'] = esif['summary'].str.strip()

### Funds

In [None]:
# Count rows with a missing fund.
esif.funds.isna().sum()

In [None]:
# Distinct fund values present in the data.
esif.funds.unique()

### Project Cost

In [None]:
# Count rows with a missing project cost.
esif.project_cost.isna().sum()

In [None]:
# Work on the stripped text form of the column, then list every value that
# contains a character other than a digit or '.', i.e. anything that may not
# convert cleanly to a number.
non_numeric_rx = re.compile(r'.*[^0-9.].*')
esif['project_cost'] = esif['project_cost'].map(str).str.strip()
project_cost_bad = esif['project_cost'].str.match(non_numeric_rx)
esif['project_cost'][project_cost_bad]

In [None]:
# Convert the cleaned text values to floating point numbers.
esif['project_cost'] = esif['project_cost'].astype(float)

### EU Investment


In [None]:
# Count rows with a missing EU investment amount.
esif.eu_investment.isna().sum()

In [None]:
# Same check as for project_cost: stringify and strip, then list every value
# containing a character other than a digit or '.'.
non_numeric_rx = re.compile(r'.*[^0-9.].*')
esif['eu_investment'] = esif['eu_investment'].map(str).str.strip()
eu_investment_bad = esif['eu_investment'].str.match(non_numeric_rx)
esif['eu_investment'][eu_investment_bad]

In [None]:
# Convert the cleaned text values to floating point numbers.
esif['eu_investment'] = esif['eu_investment'].astype(float)

### Overfunding


In [None]:
# Rows where the EU contribution is larger than the whole project cost.
overfunded = esif.eu_investment.gt(esif.project_cost)
esif[overfunded]

### Prop EU Financed

This provides a useful check. The [ESF guidance for 2014-2020](https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/710305/ESF_Guidance_for_2014_2020_v2.pdf) says that contributions over 50% are unlikely in the UK.

**TODO:** Look at the rows that exceed this and work out what is going on.

In [None]:
# Count rows with a missing co-financing rate.
esif.prop_eu_financed.isna().sum()

In [None]:
# Make sure the co-financing rate is stored as a float.
esif['prop_eu_financed'] = esif['prop_eu_financed'].astype(float)

In [None]:
# Summary statistics of the stated co-financing rate.
esif.prop_eu_financed.describe()

In [None]:
# Proportion actually financed by the EU, derived from the two amounts, for
# comparison with the stated co-financing rate.
esif['actual_prop'] = esif['eu_investment'].div(esif['project_cost'])
esif['actual_prop'].describe()

In [None]:
# Rows where the derived proportion and the stated rate differ by more than 0.05.
prop_gap = (esif.actual_prop - esif.prop_eu_financed).abs()
esif[prop_gap > 0.05]

### Postcode


In [None]:
# Frame shape next to the number of rows with no raw postcode.
[esif.shape, esif.raw_postcode.isna().sum()]

In [None]:
# Load the postcode reference table used to validate and locate postcodes.
ukpostcodes_path = '../postcodes/input/ukpostcodes.csv.gz'
ukpostcodes = pd.read_csv(ukpostcodes_path)
ukpostcodes.shape

In [None]:
# How many raw postcodes already appear verbatim in the reference table.
esif.raw_postcode.isin(ukpostcodes.postcode).sum()

In [None]:
# Normalise the raw postcodes so they can be compared with ukpostcodes.postcode:
# upper-case, strip, drop every character that is not A-Z or 0-9, then
# re-insert a single space before the trailing digit + two letters.
# regex=True is passed explicitly: since pandas 2.0, Series.str.replace treats
# the pattern as a literal string by default, which would silently leave the
# values unchanged.
esif['postcode'] = (
    esif.raw_postcode
    .str.upper()
    .str.strip()
    .str.replace(r'[^A-Z0-9]', '', regex=True)
    .str.replace(r'^(\S+)([0-9][A-Z]{2})$', r'\1 \2', regex=True)
)

In [None]:
# How many normalised postcodes appear in the reference table.
esif.postcode.isin(ukpostcodes.postcode).sum()

In [None]:
# Distinct normalised postcodes that are not in the reference table.
esif.postcode[~esif.postcode.isin(ukpostcodes.postcode)].unique()

In [None]:
# Full rows whose normalised postcode is not in the reference table.
esif[~esif.postcode.isin(ukpostcodes.postcode)]

In [None]:
# Keep only rows whose postcode exists in the reference table; copy so that
# later column assignments act on an independent frame.
has_known_postcode = esif.postcode.isin(ukpostcodes.postcode)
esif = esif[has_known_postcode].copy()

### Start and End Dates

In [None]:
# A date is two digits, one or two digits and four digits separated by '/';
# a range is two such dates joined by a hyphen or en dash with optional spaces.
date_rx = r'[0-9]{2}/[0-9]{1,2}/[0-9]{4}'
date_range_rx = re.compile(r'({0})\s*[–-]\s*({0})'.format(date_rx))
# Values that do not start with a well-formed date range.
matches_range = esif.start_and_end_dates.str.match(date_range_rx)
esif.start_and_end_dates[~matches_range]

In [None]:
# Some values have a stray '3' in front of the end date. Not sure how those 3s
# got there, but it looks safe to remove them, so allow an optional '3' there.
date_range_rx = re.compile(r'({0})\s*[–-]\s*3?({0})'.format(date_rx))
# Values that still do not start with a date range.
matches_range = esif.start_and_end_dates.str.match(date_range_rx)
esif.start_and_end_dates[~matches_range]

In [None]:
# Split the combined range into its two dates by substituting the matched
# range with capture group 1 (start) or 2 (end), then parse day-first.
# regex=True is required: with the pandas >= 2.0 default of regex=False,
# passing a compiled pattern to Series.str.replace raises ValueError.
esif['start_date'] = pd.to_datetime(
    esif.start_and_end_dates.str.replace(date_range_rx, r'\1', regex=True),
    dayfirst=True)
esif['end_date'] = pd.to_datetime(
    esif.start_and_end_dates.str.replace(date_range_rx, r'\2', regex=True),
    dayfirst=True)

In [None]:
# Summary of the parsed start and end dates.
esif[['start_date', 'end_date']].describe()

In [None]:
# Count rows whose start date falls after their end date.
(esif.start_date > esif.end_date).sum()

### Category


In [None]:
# Count rows with a missing category.
esif.category.isna().sum()

In [None]:
# Distinct category values present in the data.
esif.category.unique()

## Save Data

In [None]:
# Column labels available before choosing what to keep in the output.
esif.columns

In [None]:
# Build the output frame by dropping the columns that are not carried into
# the cleaned data set.
dropped_columns = [
    'start_and_end_dates',
    'prop_eu_financed',
    'raw_postcode',
    'category',
    'investment_priority',
    'nuts_area',
    'intervention_field_2',
    'intervention_field_3',
    'form_of_finance',
    'territory_type',
    'economic_activity',
    'actual_prop',
]
clean_esif = esif.drop(columns=dropped_columns)
clean_esif.head()

In [None]:
# Build an identifier per row: lower-cased fund name, '_scotland_', then the
# row's index label.
id_prefix = clean_esif.funds.str.lower() + '_scotland_'
clean_esif['my_eu_id'] = id_prefix + clean_esif.index.map(str)
clean_esif['my_eu_id'].head()

In [None]:
# Persist the cleaned frame as a pickle file under output/.
clean_esif.to_pickle('output/esif_scotland.pkl.gz')

## Save Map Data

In [None]:
# Attach the reference-table columns (read below as longitude/latitude) to
# every project row. The join key is named explicitly so that any other
# column name the two frames happen to share cannot silently become part of
# the key. validate='m:1' makes pandas raise if a postcode occurs more than
# once in ukpostcodes.
clean_esif_locations = pd.merge(
    clean_esif, ukpostcodes, on='postcode', validate='m:1')
clean_esif_locations.head()

In [None]:
def make_esif_data_geo_json(data):
    """Convert located project rows into a GeoJSON FeatureCollection dict.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'beneficiary', 'project', 'project_cost',
        'eu_investment', 'longitude' and 'latitude'. Other columns are ignored.

    Returns
    -------
    dict
        ``{'type': 'FeatureCollection', 'features': [...]}`` with one Point
        feature per row of ``data``, in row order.
    """
    # Columns copied verbatim into each feature's 'properties'.
    property_names = ['beneficiary', 'project', 'project_cost', 'eu_investment']

    def make_feature(row):
        """Build the GeoJSON Feature for a single row."""
        # 'name' rather than 'property' so the builtin is not shadowed.
        properties = {name: row[name] for name in property_names}
        return {
            'type': 'Feature',
            'geometry': {
                "type": "Point",
                # Coordinates are emitted longitude first, then latitude.
                "coordinates": [row['longitude'], row['latitude']]
            },
            'properties': properties
        }

    features = list(data.apply(make_feature, axis=1))
    return {'type': 'FeatureCollection', 'features': features}
# Serialise the located projects as GeoJSON; sort_keys gives a stable key order.
with open('output/scotland_data.geo.json', 'w') as geo_json_file:
    geo_json = make_esif_data_geo_json(clean_esif_locations)
    json.dump(geo_json, geo_json_file, sort_keys=True)