In [None]:
import json
import re

import pandas as pd

### Import ESIF data

In [None]:
# Load the approved-projects spreadsheet and preview the first rows.
wales = pd.read_excel('input/wales-approved-projects-2015.xlsx')
wales.head()

In [None]:
# Title lines have no 'EU Programme' value, so keep only rows that do.
wales = wales[wales['EU Programme'].notnull()]

### Manually put in postcodes

Initially I tried to merge with GRID data, but this proved inefficient, so the postcodes below were entered by hand.

In [None]:
# Number of project rows (and columns) that need a postcode.
wales.shape

In [None]:
# Alphabetical listing of lead organisations (last 40) to help fill in the lookup.
(
    wales
    .sort_values('Lead Organisation')
    [['Lead Organisation']]
    .tail(40)
)

In [None]:
# Hand-maintained lookup from lead organisation to its known postcodes.
# Rows are ragged on purpose: an organisation may have one to four postcodes,
# and the columns it does not fill are left missing.
lookup_columns = [
    'Lead Organisation',
    'postcode',
    'postcode2',
    'postcode3',
    'postcode4',
]
lookup_rows = [
    ('Aberystwyth Innovation and Enterprise Campus Ltd', 'SY233EE'),
    ('Aberystwyth University', 'SY23 3FL'),
    ('Bangor University', 'LL572DG'),
    ('Blaenau Gwent County Borough Council', 'NP236XB'),
    ('Bridgend County Borough Council', 'CF314WB'),
    ('Caerphilly County Borough Council', 'CF827PG', 'CF827WF'),
    ('Cardiff University', 'CF103AT'),
    ('Cardiff Metropolitan University', 'CF52YB'),
    ('Carmarthenshire County Council', 'SA183EL'),
    ('Ceredigion County Council', 'SY233UE'),
    ('Chwarae Teg', 'CF245JW'),
    ('Coleg Cambria', 'LL137UH', 'CH5 4BR', 'LL152LB', 'CH76AA'),
    ('Coleg Gwent', 'NP19 4TS'),
    ('Coleg Y Cymoedd', 'CF818RD', 'CF448EN', 'CF157QY', 'CF827XR'),
    ('Conwy County Borough Council', 'LL328DU'),
    ('Denbighshire County Borough Council', 'LL159AZ'),
    ('Engineering Education Scheme Wales', 'CF313WT'),
    ('Finance Wales Plc', 'CF104BZ', 'SA148LQ', 'SY161RB', 'LL170JD'),
    ('Gower College Swansea', 'SA2 9EB', 'SA4 6RD', 'SA1 4QA'),
    ('Grwp Llandrillo-Menai', 'LL284HZ', 'LL572TP', 'LL402SW'),
    ('Higher Education Funding Council for Wales', 'CF83 8WT'),
    ('Isle of Anglesey County Council', 'LL777TW'),
    ('Marine Power Systems Limited', 'SA18AS'),
    ('Menter Mon', 'LL777LR'),
    ('Minesto UK LTD', 'LL651UN', 'BT221NZ'),
    ('Neath Port Talbot County Borough Council', 'SA131PJ'),
    ('Newport City Council', 'NP204UR'),
    ('Pembrokeshire Coastal Forum', 'SA726UL'),
    ('Pembrokeshire County Council', 'SA611TP'),
    ('Powys County Council', 'LD15LG'),
    ('Rhondda Cynon Taff Council', 'CF402XX'),
    ('SOVA', 'CF101FE'),
    ('Swansea Council', 'SA13SN'),
    ('Swansea University', 'SA2 8PP'),
    ('TWI Ltd', 'SA131SB'),
    ('Torfaen County Borough Council', 'NP46YB'),
    ('University of South Wales', 'CF37 1DL'),
    ('University of Wales Trinity Saint David', 'SA487ED'),
    ('WG - Department for Education and Skills', 'CF103NQ'),
    ('WG - Department for Health and Social Services', 'CF103NQ'),
    ('WG - Department of Local Government and Communities', 'CF103NQ'),
    ('WG - Department for Economy, Science and Transport', 'CF103NQ'),
    ('Wales Co operative Centre', 'CF832AX'),
    ('Wales Council for Voluntary Action', 'CF832AX', 'CF105FH', 'SY233AH'),
    ('Wave Hub Limited', 'TR274DD'),
    ('Wave-Tricity Limited', 'SA726YH'),
    ('Welsh Contact Centre Forum Ltd', 'CF642AG'),
]
wales_locations_lookup = pd.DataFrame(lookup_rows, columns=lookup_columns)
wales_locations_lookup.head()

Postcodes were not found for 'WAG The Economic Development Group within DEandT' or 'WAG DEandT WEFO'. Gower College Swansea also has more locations than the ones listed above.


### Merge back together and rename

In [None]:
# Left join so that projects whose organisation is missing from the lookup
# are kept (with empty postcode columns) rather than dropped.
wales_locations = wales.merge(
    wales_locations_lookup,
    on='Lead Organisation',
    how='left',
)
wales_locations.shape

In [None]:
# Column names after the merge, for reference when renaming below.
wales_locations.columns

In [None]:
# Organisations that did not pick up a postcode from the lookup (first 40, alphabetically).
(
    wales_locations
    .loc[wales_locations['postcode'].isnull(), ['Lead Organisation']]
    .sort_values('Lead Organisation')
    .head(40)
)

In [None]:
# Rename the spreadsheet headings to short snake_case column names.
# index=str additionally converts every row label to a string.
# Columns not listed here keep their original names.
wales_locations = wales_locations.rename(index=str, columns={
    'Project Title': 'project',
    'Lead Organisation': 'beneficiary',
    'EU funds awarded': 'eu_investment',
    'Total Project cost': 'project_cost',
    'EU Programme': 'funds',
    'Project Description': 'summary',
    'Project start date': 'start_date',
    'Project end date': 'end_date',
    'Union co-financing rate, as per Priority Axis': 'prop_eu_financed',
    'Case ID': 'case_id',
    # The primary postcode is kept as entered; a cleaned 'postcode' column is derived later.
    'postcode': 'raw_postcode',
})

In [None]:
# How many projects still have no postcode?
wales_locations['raw_postcode'].isna().sum()

In [None]:
# NOTE: this binds a second name to the same DataFrame (no copy is made), so
# column assignments on `esif` below are also visible through `wales_locations`.
esif = wales_locations
esif.shape

In [None]:
# Column names available for the cleaning steps below.
esif.columns

### Beneficiary

In [None]:
# Beneficiary names carrying leading/trailing whitespace.
esif.loc[esif['beneficiary'].str.strip() != esif['beneficiary'], 'beneficiary']

### Case ID

Appears to be unique.

In [None]:
# [row count, distinct case ids, missing case ids]
[
    esif.shape[0],
    esif['case_id'].nunique(),
    esif['case_id'].isna().sum(),
]

In [None]:
# Store case ids as 32-bit integers and re-check the distinct count.
esif['case_id'] = esif['case_id'].astype('int32')
esif['case_id'].nunique()

### Project

Do we have any duplicate project names? Does not look like it.

In [None]:
# Project titles containing an embedded newline.
esif.loc[esif['project'].str.contains('\n'), 'project']

In [None]:
# Flag the titles with stray leading/trailing whitespace *before* stripping
# them. The count is the cell's last expression so that it is displayed:
# a check placed above the assignment is evaluated but never shown.
project_untrimmed = esif['project'].str.strip() != esif['project']
esif['project'] = esif['project'].str.strip()
project_untrimmed.sum()

In [None]:
# Number of distinct project titles (compare with the row count).
esif['project'].unique().shape

In [None]:
# Every row that shares beneficiary, project and fund with another row.
(
    esif
    .loc[esif.duplicated(subset=['beneficiary', 'project', 'funds'], keep=False)]
    .sort_values(by='project')
)

### Summary

In [None]:
# Missing project descriptions.
esif['summary'].isna().sum()

In [None]:
# Descriptions that would change if surrounding whitespace were stripped.
esif['summary'].ne(esif['summary'].str.strip()).sum()

In [None]:
# Strip surrounding whitespace from the descriptions.
esif['summary'] = esif['summary'].str.strip()

### Funds

In [None]:
# Rows with no programme recorded.
esif['funds'].isna().sum()

In [None]:
# Distinct programme names before normalisation.
esif['funds'].unique()

In [None]:
# Collapse the regional programme names down to the bare fund name,
# then list the distinct values that remain.
esif['funds'] = (
    esif['funds']
    .str.strip()
    .str.replace('East Wales ERDF', 'ERDF')
    .str.replace('West Wales and the Valleys ERDF', 'ERDF')
    .str.replace('East Wales ESF', 'ESF')
    .str.replace('West Wales and the Valleys ESF', 'ESF')
)

esif['funds'].unique()

### Project Cost

In [None]:
# Rows with no total project cost.
esif['project_cost'].isna().sum()

In [None]:
# Work on the string form of the costs and show any value that contains a
# character other than a digit or a decimal point.
esif['project_cost'] = esif['project_cost'].map(str).str.strip()
project_cost_bad = esif['project_cost'].str.match(re.compile(r'.*[^0-9.].*'))
esif.loc[project_cost_bad, 'project_cost']

### EU Investment


In [None]:
# Rows with no EU funding amount.
esif['eu_investment'].isna().sum()

In [None]:
# Work on the string form of the amounts and show any value that contains a
# character other than a digit or a decimal point.
esif['eu_investment'] = esif['eu_investment'].map(str).str.strip()
eu_investment_bad = esif['eu_investment'].str.match(re.compile(r'.*[^0-9.].*'))
esif.loc[eu_investment_bad, 'eu_investment']

### Overfunding


In [None]:
# Convert both money columns from their string form to floats.
esif['project_cost'] = esif['project_cost'].astype('float')
esif['eu_investment'] = esif['eu_investment'].astype('float')

In [None]:
# Projects whose EU contribution exceeds the total project cost.
overfunded = esif['eu_investment'].gt(esif['project_cost'])
esif.loc[overfunded]

### Prop EU Financed

This provides a useful check. The [ESF guidance for 2014-2020](https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/710305/ESF_Guidance_for_2014_2020_v2.pdf) says that contributions over 50% are unlikely in the UK.

For most projects the actual EU share (`eu_investment / project_cost`) differs from the stated co-financing rate, but none of the differences look obviously wrong.

In [None]:
# Rows with no stated co-financing rate.
esif['prop_eu_financed'].isna().sum()

In [None]:
# Distribution of the stated co-financing rate.
esif['prop_eu_financed'].describe()

In [None]:
# Share of each project's cost that the EU actually funds.
esif['actual_prop'] = esif['eu_investment'].div(esif['project_cost'])
esif['actual_prop'].describe()

In [None]:
# Projects whose actual share is more than 0.05 away from the stated rate.
esif.loc[esif['actual_prop'].sub(esif['prop_eu_financed']).abs() > 0.05]

### Postcode


In [None]:
# [table shape, number of rows without a postcode]
[
    esif.shape,
    esif['raw_postcode'].isna().sum(),
]

In [None]:
# Keep only the rows that have a postcode; copy so that later column
# assignments act on an independent frame.
esif = esif[esif['raw_postcode'].notna()].copy()
esif.shape

In [None]:
# Reference table of UK postcodes. Later cells read its `postcode` column;
# presumably it also supplies the latitude/longitude used for the map -- TODO confirm.
ukpostcodes = pd.read_csv('../postcodes/input/ukpostcodes.csv.gz')
ukpostcodes.shape

In [None]:
# How many raw postcodes already match the reference table as entered?
esif['raw_postcode'].isin(ukpostcodes['postcode']).sum()

In [None]:
# Normalise the raw postcodes so they can be matched against
# ukpostcodes.postcode: upper-case, drop every non-alphanumeric character,
# then re-insert a single space before the three-character inward code.
#
# regex=True is passed explicitly. Since pandas 2.0, Series.str.replace treats
# the pattern as a literal string by default, which would silently leave the
# postcodes unnormalised (neither pattern occurs literally in a postcode).
esif['postcode'] = (
    esif['raw_postcode']
    .str.upper()
    .str.strip()
    .str.replace(r'[^A-Z0-9]', '', regex=True)
    .str.replace(r'^(\S+)([0-9][A-Z]{2})$', r'\1 \2', regex=True)
)

In [None]:
# How many normalised postcodes match the reference table?
esif['postcode'].isin(ukpostcodes['postcode']).sum()

In [None]:
# Distinct normalised postcodes that are still not in the reference table.
esif.loc[~esif['postcode'].isin(ukpostcodes['postcode']), 'postcode'].unique()

### Start and End Dates


In [None]:
# [missing start dates, column dtype]
[
    esif['start_date'].isna().sum(),
    esif['start_date'].dtype,
]

In [None]:
# [missing end dates, column dtype]
[
    esif['end_date'].isna().sum(),
    esif['end_date'].dtype,
]

In [None]:
# Projects that do not start strictly before they end.
esif['start_date'].ge(esif['end_date']).sum()

In [None]:
# Summary of the start and end date columns.
esif.loc[:, ['start_date', 'end_date']].describe()

## Save Data

In [None]:
# Drop the columns that are not part of the saved output: unused source
# columns, the extra lookup postcodes and the intermediate check columns.
clean_esif = esif.drop(
    columns=[
        'Priority',
        'prop_eu_financed',
        'raw_postcode',
        'postcode2',
        'postcode3',
        'postcode4',
        'Source: WEFO, 30/04/2018',
        'Welsh Government Targeted Match Funding, Yes / No’',
        'Sector ',
        'Joint Sponsors',
        'Regional area(s)',
        'Category of intervention',
        'actual_prop',
    ],
)
clean_esif.head()

In [None]:
# Build an identifier of the form '<fund>_wales_<case id>'.
fund_prefix = clean_esif['funds'].str.lower()
case_suffix = clean_esif['case_id'].astype('str')
clean_esif['my_eu_id'] = fund_prefix + '_wales_' + case_suffix
clean_esif['my_eu_id'].head()

In [None]:
# case_id is now encoded in my_eu_id, so it is no longer needed.
# Rebind instead of mutating in place, and ignore an already-missing column,
# so that re-running this cell is harmless rather than raising KeyError.
clean_esif = clean_esif.drop(columns='case_id', errors='ignore')

In [None]:
# Persist the cleaned table as a pickle at this path.
clean_esif.to_pickle('output/esif_wales.pkl.gz')

## Save Map Data

In [None]:
# Attach the reference data to each project by joining on the columns the two
# tables have in common. validate='m:1' makes the merge fail if a join key
# appears more than once in ukpostcodes.
clean_esif_locations = clean_esif.merge(ukpostcodes, validate='m:1')
clean_esif_locations.head()

In [None]:
def make_esif_data_geo_json(data):
    """Convert a table of projects into a GeoJSON FeatureCollection.

    `data` is a DataFrame with 'longitude' and 'latitude' columns plus the
    'beneficiary', 'project', 'project_cost' and 'eu_investment' columns.
    Each row becomes one Point feature carrying those four values as its
    properties.

    Returns a dict with 'type' set to 'FeatureCollection' and 'features' set
    to the list of per-row feature dicts.
    """
    property_names = ['beneficiary', 'project', 'project_cost', 'eu_investment']

    def row_to_feature(row):
        """Build the GeoJSON Feature dict for a single row."""
        feature_properties = {name: row[name] for name in property_names}
        # GeoJSON positions are ordered longitude first, then latitude.
        point = {
            "type": "Point",
            "coordinates": [row['longitude'], row['latitude']]
        }
        return {
            'type': 'Feature',
            'geometry': point,
            'properties': feature_properties,
        }

    per_row_features = data.apply(row_to_feature, axis=1)
    return {
        'type': 'FeatureCollection',
        'features': list(per_row_features),
    }
# Write the located projects out as GeoJSON; keys are sorted so the output
# is stable between runs.
with open('output/wales_data.geo.json', 'w') as geojson_file:
    json.dump(
        make_esif_data_geo_json(clean_esif_locations),
        geojson_file,
        sort_keys=True,
    )