In [None]:
import json
import re
import urllib.request
from functools import lru_cache
from urllib import parse

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Notebook-wide pandas display settings: show up to 50 columns and let long
# text cells (URLs, descriptions) use up to 180 characters before truncation.
for option_name, option_value in (
    ('display.max_columns', 50),
    ('display.max_colwidth', 180),
):
    pd.set_option(option_name, option_value)

In [None]:
# Load the LIFE projects export and show its (rows, columns) shape.
all_life = pd.read_csv('input/LIFE-projects.csv')
all_life.shape

In [None]:
# Inspect the column names exactly as they appear in the CSV header.
all_life.columns

In [None]:
# Give the first twelve columns short snake_case names. Columns are addressed
# by position, so the exact header text in the CSV does not matter.
new_column_names = [
    "project_websummary",
    "project_title",
    "project_no",
    "project_website",
    "year_of_finance",
    "lead_partner_country",
    "type_of_beneficiary",
    "country",
    "themes",
    "keywords",
    "habitats",
    "species",
]
all_life.rename(
    columns={
        all_life.columns[position]: new_name
        for position, new_name in enumerate(new_column_names)
    },
    inplace=True,
)

In [None]:
# Constant column holding the site prefix; it is later combined with each
# project's relative summary link to form the full page URL.
all_life['base_url'] = 'http://ec.europa.eu/environment/life/project/Projects/'

In [None]:
# Count the distinct project titles.
all_life['project_title'].nunique()

In [None]:
# Preview the first two rows after renaming.
all_life.head(2)

In [None]:
# Preview the last two rows.
all_life.tail(2)

In [None]:
# Full project page URL = site prefix + the per-project summary link.
all_life['project_url'] = all_life['base_url'] + all_life['project_websummary']

In [None]:
def get_param_from_url(url, param_name):
    """Return the value of query parameter ``param_name`` from ``url``.

    Everything after the first ``?`` is treated as the query string (the whole
    string when there is no ``?``), with any trailing ``#fragment`` removed.
    Parameters are separated by ``&``. If the parameter occurs more than once,
    the first occurrence wins. The value is returned as-is (not URL-decoded).

    Args:
        url: The URL, or bare query string, to search.
        param_name: Exact name of the parameter to look up.

    Returns:
        The parameter value as a string (possibly empty).

    Raises:
        IndexError: If ``param_name`` is not present in the query string.
    """
    query = url.split("?", 1)[-1].split("#", 1)[0]
    prefix = param_name + "="
    for pair in query.split("&"):
        if pair.startswith(prefix):
            # Slice off the "name=" prefix instead of splitting on "=" so that
            # values which themselves contain "=" are returned whole.
            return pair[len(prefix):]
    raise IndexError(
        "parameter {!r} not found in URL {!r}".format(param_name, url)
    )

In [None]:
# Pull the n_proj_id query parameter out of every project URL.
all_life['project_id'] = all_life['project_url'].map(
    lambda project_url: get_param_from_url(project_url, 'n_proj_id')
)

In [None]:
# Keep only the projects whose country is the United Kingdom, as an
# independent copy so later column assignments do not touch all_life.
is_uk_project = all_life['country'] == 'United Kingdom'
uk_life = all_life.loc[is_uk_project].copy()
uk_life.head(3)

We now need to examine each project's web page to see whether we can attempt geolocation using the Natura 2000 dataset. Project id 6699 has no Natura sites; 6812 has a few.

In [None]:
# Number of UK projects (rows) and columns.
uk_life.shape

## Scrape Natura sites from Life website

In [None]:
def scrapeNaturaSites(check_url):
    """Scrape the "Natura 2000 sites" table from a LIFE project web page.

    Args:
        check_url: URL of the project page to download.

    Returns:
        A DataFrame with columns ``area_type``, ``area_code`` and
        ``area_name``, one row per table row that has at least one non-empty
        cell. The frame is empty when the page has no "Natura 2000 sites"
        heading, no table after it, or the section reads "Not applicable".

    Network and HTTP errors raised by ``urllib.request.urlopen`` propagate to
    the caller.
    """
    columns = ["area_type", "area_code", "area_name"]

    # Use a context manager so the HTTP response is always closed.
    with urllib.request.urlopen(check_url) as raw_contents:
        # The server may not declare a charset; fall back to UTF-8 instead of
        # passing None to decode().
        charset = raw_contents.info().get_content_charset() or 'utf-8'
        contents = raw_contents.read().decode(charset, errors='replace')
    soup = BeautifulSoup(contents, 'html5lib')

    # Section heading to find:
    #   <span class="txtheadergreen">Natura 2000 sites</span>
    natura_span = soup.find('span', string='Natura 2000 sites')
    # Table that follows the heading:
    #   <table border="0" cellpadding="0" cellspacing="0" width="100%">
    #     <tbody><tr><td valign="top">SPA</td>
    #     <td valign="top">UK9010101</td>
    #     <td valign="top">Dorset Heathlands</td>
    #     </tr>
    natura_table = natura_span.find_next('table') if natura_span is not None else None
    first_cell = natura_table.find_next('td') if natura_table is not None else None

    # A project without sites has a single cell containing 'Not applicable'.
    if first_cell is None or first_cell.get_text(strip=True) == 'Not applicable':
        print('No Natura Sites found')
        return pd.DataFrame([], columns=columns)

    # Convert the html table to a list of rows, dropping empty cells and rows.
    table_rows = natura_table.find_all('tr')
    print(len(table_rows))
    site_rows = []
    for table_row in table_rows:
        cell_texts = [cell.text.strip() for cell in table_row.find_all('td')]
        non_empty = [text for text in cell_texts if text]
        if non_empty:
            site_rows.append(non_empty)
    return pd.DataFrame(site_rows, columns=columns)

# Main Loop

In [None]:
# Scrape the Natura 2000 sites of every UK project and look up a postcode for
# each site, then save the result to disk.
# NOTE: getPostcodeForNaturaSite (and the helpers it calls) are defined in
# later cells of this notebook; those cells must be run before this one.
site_frames = []
for index, row in uk_life.iterrows():
    print('In loop, index is ' + str(index))
    print(row['project_id'], row['project_title'])
    # Returns a DataFrame with "area_type", "area_code", "area_name"
    # (empty when the project has no Natura sites).
    natura_sites = scrapeNaturaSites(row['project_url'])
    if natura_sites.empty:
        # str() keeps this working even after project_id has been made numeric.
        print('No Natura sites for project_id ' + str(row['project_id']))
        continue
    print('Natura sites found for project_id ' + str(row['project_id']))
    print(natura_sites)
    natura_sites['project_id'] = row['project_id']
    natura_sites['postcode'] = natura_sites['area_code'].apply(getPostcodeForNaturaSite)
    site_frames.append(natura_sites)

# DataFrame.append was removed in pandas 2.0 (and copied the whole frame on
# every iteration), so collect the per-project frames and concatenate once.
postcode_columns = ['project_id', 'area_name', 'postcode', 'area_type', 'area_code']
if site_frames:
    project_postcodes = pd.concat(site_frames, ignore_index=True)[postcode_columns]
else:
    project_postcodes = pd.DataFrame(columns=postcode_columns)
project_postcodes.to_csv('output/projectpostcodes.csv', encoding='utf-8')

In [None]:
# Spot-check the scraper on a single project page (n_proj_id=6698) and show
# the first two rows of the result.
projectNaturaSite = scrapeNaturaSites('http://ec.europa.eu/environment/life/project/Projects/index.cfm?fuseaction=search.dspPage&n_proj_id=6698')
projectNaturaSite.head(2)

TODO: We will need to split the budget across this many areas. Do the other data sets do this?

## Find the Long and Lat for a specific Natura area code

Natura site info can be downloaded from https://www.eea.europa.eu/data-and-maps/data/natura-9

"Stretching over 18 % of the EU’s land area and almost 6 % of its marine territory, it is the largest coordinated network of protected areas in the world. It offers a haven to Europe's most valuable and threatened species and habitats."

In [None]:
def getPostcodeForNaturaSite(this_natura_area_code):
    """Look up a postcode for a Natura 2000 site code.

    The code is resolved to a [latitude, longitude] pair with findLatAndLong
    and that pair is handed to findNearestPostcode. When findLatAndLong
    returns the [0, 0] pair the string 'N/A NoArea' is returned instead;
    otherwise the result of findNearestPostcode is printed and returned.
    """
    coordinates = findLatAndLong(this_natura_area_code)
    if coordinates != [0, 0]:
        site_latitude, site_longitude = coordinates[0], coordinates[1]
        nearest = findNearestPostcode(site_latitude, site_longitude)
        print('Postcode for Natura area code ' + this_natura_area_code + ' is ' + nearest)
        return nearest
    return 'N/A NoArea'

Test it out. IE0004009 is a Republic of Ireland (ROI) site code that has slipped in. This should return an 'N/A NoArea' postcode.

In [None]:
# Check a non-UK site code; per the note above this should give 'N/A NoArea'.
# NOTE: findLatAndLong and findNearestPostcode are defined in later cells and
# must have been run before this cell.
my_postcode = getPostcodeForNaturaSite('IE0004009')
print(my_postcode)

UK9005151 is Bowland Forest and should have East Lancs postcode beginning with BB7 

In [None]:
# Check a UK site code.
# NOTE: findLatAndLong and findNearestPostcode are defined in later cells and
# must have been run before this cell.
my_postcode = getPostcodeForNaturaSite('UK9005151')
print(my_postcode)

In [None]:
@lru_cache(maxsize=None)
def _load_uk_natura_sites(csv_path='input/NATURA2000SITES.csv'):
    """Read the Natura 2000 site list and return the UK rows.

    The result is cached, so the CSV is read and filtered only once per kernel
    session rather than on every lookup. Callers must not mutate the returned
    frame.

    Returns:
        DataFrame with columns ``site_code``, ``site_name``, ``longitude`` and
        ``latitude`` for rows whose ``COUNTRY_CODE`` is ``'uk'``.
    """
    all_natura_sites = pd.read_csv(csv_path)
    uk_natura_sites = all_natura_sites[all_natura_sites.COUNTRY_CODE == 'uk']
    # Strip out the columns we do not need and give the rest short names.
    return uk_natura_sites[['SITECODE', 'SITENAME', 'LONGITUDE', 'LATITUDE']].rename(
        columns={
            'SITECODE': 'site_code',
            'SITENAME': 'site_name',
            'LONGITUDE': 'longitude',
            'LATITUDE': 'latitude',
        }
    )


def findLatAndLong(natura_area_code):
    """Return ``[latitude, longitude]`` for a UK Natura 2000 site code.

    Args:
        natura_area_code: Site code to look up, e.g. ``'UK0013027'``.

    Returns:
        ``[latitude, longitude]`` of the first matching UK site, or ``[0, 0]``
        when the code is not in the UK site list or its coordinates are
        missing. Callers treat ``[0, 0]`` as "no area found".
    """
    print('Finding lat/long for area code ' + str(natura_area_code))
    uk_sites = _load_uk_natura_sites()
    matches = uk_sites[uk_sites.site_code == natura_area_code]
    if matches.empty:
        return [0, 0]
    latitude = matches['latitude'].values[0]
    longitude = matches['longitude'].values[0]
    # A site without coordinates cannot be geocoded; report it as not found
    # rather than passing NaN on to the postcode lookup.
    if pd.isna(latitude) or pd.isna(longitude):
        return [0, 0]
    return [latitude, longitude]

In [None]:
# Spot-check the coordinate lookup with a single site code.
findLatAndLong('UK0013027')

Find the nearest postcode for a lat/long using the marvellous postcodes.io

In [None]:
def findNearestPostcode(lat, long):
    """Return the postcode nearest to a latitude/longitude via postcodes.io.

    Sends one bulk reverse-geocoding request containing a single location.

    Args:
        lat: Latitude of the point (a number, e.g. 54.350556).
        long: Longitude of the point (a number, e.g. -3.429722).

    Returns:
        The ``postcode`` field of the first match, or ``'N/A NoPostcode'``
        when the service reports no match for the location.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    url = 'http://api.postcodes.io/postcodes'
    # Build the payload as a dict and let json.dumps serialise it, instead of
    # concatenating a JSON string by hand.
    payload = {
        "geolocations": [{
            "longitude": float(long),
            "latitude": float(lat),
            "radius": 10,
            "limit": 1,
            "wideSearch": True,
        }]
    }
    headers = {'Content-type': 'application/json', 'Accept': 'application/json'}
    # A timeout stops one stalled request from hanging the whole scrape.
    response = requests.post(url, data=json.dumps(payload), headers=headers, timeout=30)
    response.raise_for_status()
    # One location was sent, so its matches are in the first result entry.
    matches = response.json()['result'][0]['result']
    if not matches:
        return 'N/A NoPostcode'
    return matches[0]['postcode']

In [None]:
# Spot-check the postcode lookup with one coordinate pair (an alternative pair
# is left commented out below).
findNearestPostcode(54.350556000000005,-3.429722)
#findNearestPostcode(54.119167000000004,-2.961667)

# Process step 2

Read this back in from disk as the main loop is fairly inefficient currently and takes 60 sec or so. Also not sure what the throttling is on postcodes.io

In [None]:
# Reload the scraped site/postcode table saved by the main loop and preview it.
postcode_to_project = pd.read_csv('output/projectpostcodes.csv', encoding='utf-8')
postcode_to_project.head(10)

In [None]:
# Make project_id numeric on both sides so the join keys have matching types,
# then attach the scraped postcodes to every UK project (projects without any
# postcode are kept by the left join).
for frame in (postcode_to_project, uk_life):
    frame["project_id"] = pd.to_numeric(frame["project_id"])
uk_life_with_postcodes = uk_life.merge(postcode_to_project, how='left', on='project_id')
uk_life_with_postcodes.shape

### Postcodes

In [None]:
# Load the UK postcode reference table (gzip-compressed CSV) and show its shape.
ukpostcodes = pd.read_csv('../postcodes/input/ukpostcodes.csv.gz')
ukpostcodes.shape

In [None]:
# Count the rows whose scraped postcode appears in the reference table.
uk_life_with_postcodes.postcode.isin(ukpostcodes.postcode).sum()

In [None]:
# Normalise postcodes: upper-case, trim, remove everything that is not A-Z or
# 0-9, then re-insert the single space before the final digit + two letters.
# regex=True is passed explicitly because Series.str.replace treats the
# pattern as a literal string by default from pandas 2.0 onwards, which would
# silently leave the postcodes uncleaned.
uk_life_with_postcodes['clean_postcode'] = (
    uk_life_with_postcodes.postcode
    .str.upper()
    .str.strip()
    .str.replace(r'[^A-Z0-9]', '', regex=True)
    .str.replace(r'^(\S+)([0-9][A-Z]{2})$', r'\1 \2', regex=True)
)

In [None]:
# List the distinct raw postcodes whose cleaned form is not in the reference table.
uk_life_with_postcodes.postcode[~uk_life_with_postcodes.clean_postcode.isin(ukpostcodes.postcode)].unique()

In [None]:
# Preview the postcode reference table.
ukpostcodes.head(2)

Let's simply drop all LIFE rows without a valid postcode.

In [None]:
# Inner join on postcode: rows whose postcode is not in the reference table
# are dropped, and the reference table's columns are added to the rest.
clean_life = uk_life_with_postcodes.merge(ukpostcodes, how='inner', on='postcode')
clean_life.shape

### Scrape project info not in xls download from the project details web page

Unfortunately not all the information we need is in the downloadable xls so we'll need to go to the project url and parse the html

In [None]:
def _cell_text_after_heading(soup, heading):
    """Return the text of the first <td> after the <span> titled ``heading``, or None."""
    heading_span = soup.find('span', string=heading)
    cell = heading_span.find_next('td') if heading_span is not None else None
    return cell.get_text() if cell is not None else None


def _amount_digits(raw_amount):
    """Keep only digits and '.' from an amount such as '8,522,712.00 €' ('' if missing)."""
    return re.sub('[^0-9.]', '', raw_amount or '')


def scrapeProjectDetails(project_id, check_url):
    """Scrape coordinator, budgets and background text from a project page.

    The project pages have little useful structure, so each value is found by
    locating a heading ``<span>`` by its exact text and reading the element
    that follows it.

    Args:
        project_id: Identifier copied unchanged into the result.
        check_url: URL of the project page to download.

    Returns:
        dict with keys ``project_id``, ``coordinator``, ``total_budget``,
        ``eu_contribution`` and ``background``. The two amounts are strings
        containing only digits and '.'. A section that is missing from the
        page yields ``None`` (coordinator) or ``''`` (amounts, background)
        instead of raising.

    Network and HTTP errors raised by ``urllib.request.urlopen`` propagate to
    the caller.
    """
    # Use a context manager so the HTTP response is always closed.
    with urllib.request.urlopen(check_url) as raw_contents:
        # Fall back to UTF-8 when the server does not declare a charset.
        charset = raw_contents.info().get_content_charset() or 'utf-8'
        contents = raw_contents.read().decode(charset, errors='replace')
    soup = BeautifulSoup(contents, 'html5lib')

    # <span class="txtheadergreen">Coordinator</span>
    #   ... <td width="60%" align="left">Natural England</td>
    coordinator = _cell_text_after_heading(soup, 'Coordinator')
    # <span class="txtheadergreen">Total budget</span>
    #   ... <td width="60%" align="left">8,522,712.00&nbsp;€</td>
    total_budget = _amount_digits(_cell_text_after_heading(soup, 'Total budget'))
    # <span class="txtheadergreen">EU contribution</span>
    #   ... <td width="60%" align="left">5,113,627.00&nbsp;€</td>
    eu_contribution = _amount_digits(_cell_text_after_heading(soup, 'EU contribution'))

    # <span class="txtheadergreen">Background</span>
    #   ... <p>    A decline in the quality of sand dune habitats ...   </p>
    background_span = soup.find('span', string='Background')
    background_paragraph = background_span.find_next('p') if background_span is not None else None
    if background_paragraph is not None:
        # The paragraph can contain inline tags such as <i>, so keep its markup
        # and only remove the enclosing <p> tags and surrounding whitespace.
        # The tag is checked before str() because str(None) would otherwise
        # produce the literal text 'None'.
        background = str(background_paragraph).replace('<p>', '').replace('</p>', '').strip()
    else:
        background = ''

    print([coordinator, total_budget, eu_contribution, background[:25] + '...'])
    return {
        'project_id': project_id,
        'coordinator': coordinator,
        'total_budget': total_budget,
        'eu_contribution': eu_contribution,
        'background': background,
    }

In [None]:
# Scrape the details page of every UK project. The records are collected in a
# list and the frame is built once: DataFrame.append was removed in pandas 2.0
# and copied the whole frame on every iteration.
detail_records = [
    scrapeProjectDetails(row['project_id'], row['project_url'])
    for _, row in uk_life.iterrows()
]
project_details = pd.DataFrame(
    detail_records,
    columns=['project_id', 'coordinator', 'total_budget', 'eu_contribution', 'background'],
)

In [None]:
# Preview the scraped project details.
project_details.head(2)

In [None]:
# Save the scraped details so the scrape does not have to be repeated.
project_details.to_csv('output/projectdetails.csv', encoding='utf-8')

In [None]:
# Reload the saved details from disk.
project_details = pd.read_csv('output/projectdetails.csv')

In [None]:
# Ensure the join key is numeric, then keep only the rows that have scraped
# details (inner join on project_id).
project_details["project_id"] = pd.to_numeric(project_details["project_id"])
clean_life = clean_life.merge(project_details, how='inner', on='project_id')
clean_life.shape

In [None]:
# Preview the merged result.
clean_life.head(2)

In [None]:
# Disabled ad-hoc check of scrapeProjectDetails for a single project.
# NOTE: scrapeProjectDetails is defined as (project_id, check_url), so the
# one-argument call below needs a project id added before it can be re-enabled.
#life_budgets = scrapeProjectDetails('http://ec.europa.eu/environment/life/project/Projects/index.cfm?fuseaction=search.dspPage&n_proj_id=5344')
#print(life_budgets)

In [None]:
# List the columns available after both merges.
clean_life.columns

In [None]:
# Save the merged table as an intermediate CSV.
clean_life.to_csv('output/cleanlife.csv', encoding='utf-8')

## Convert to GBP

All we have is a year in `clean_life`, so just use the average annual exchange rate. The project date range is specified on the project website page, but we will ignore that for now. This code is taken from the FTS ingest.

In [None]:
# Load the EUR/GBP exchange-rate table from the sibling exchange_rates
# directory and preview its last rows.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source.
eur_gbp = pd.read_pickle('../exchange_rates/output/exchange_rates.pkl.gz')
eur_gbp.tail()

In [None]:
def find_average_eur_gbp_rate(year=2016, rates=None):
    """Return the day-weighted average exchange rate for one calendar year.

    Every day of ``year`` gets the same weight; the weights are summed per
    month and multiplied by that month's rate, so longer months count for
    more.

    Args:
        year: Calendar year to average over. Defaults to 2016.
        rates: DataFrame with a ``month_start`` column (first day of each
            month) and a ``rate`` column. Defaults to the notebook-level
            ``eur_gbp`` table.

    Returns:
        The weighted average rate as a float.

    Raises:
        ValueError: If ``rates`` does not cover all twelve months of ``year``
            (the average would otherwise be silently understated).
        pandas.errors.MergeError: If ``rates`` has duplicate ``month_start``
            values.
    """
    if rates is None:
        rates = eur_gbp
    # Both endpoints are inclusive, so this is every day of the year. This
    # avoids date_range's ``closed`` argument, which was removed in pandas 2.0.
    days = pd.date_range('{}-01-01'.format(year), '{}-12-31'.format(year))
    daily = pd.DataFrame({
        'month_start': days,
        'weight': 1.0 / days.shape[0]
    })
    # Sum the daily weights into one row per month, keyed by the month start.
    monthly = daily.resample('MS', on='month_start').sum().reset_index()
    monthly = pd.merge(monthly, rates, on='month_start', validate='1:1')
    if len(monthly) != 12:
        raise ValueError(
            'exchange rates cover only {} of 12 months of {}'.format(len(monthly), year)
        )
    return (monthly.weight * monthly.rate).sum()
# Store the single average rate returned by find_average_eur_gbp_rate() on
# every row (the same value for all projects), then preview the column.
clean_life['eur_gbp'] = find_average_eur_gbp_rate()
clean_life.eur_gbp.head()

In [None]:
# List the columns before selecting the output subset.
clean_life.columns

In [None]:
# Keep the rows that have a postcode from the reference table and a positive
# EU contribution, as an independent copy.
has_known_postcode = clean_life['postcode'].isin(ukpostcodes['postcode'])
has_eu_funding = clean_life['eu_contribution'] > 0
output_life = clean_life.loc[has_known_postcode & has_eu_funding].copy()

In [None]:
# Remove the columns that are not part of the published output.
unwanted_columns = [
    'project_websummary', 'project_no', 'lead_partner_country', 'country',
    'themes', 'keywords', 'habitats', 'species', 'base_url', 'area_code',
    'latitude', 'longitude',
]
output_life.drop(columns=unwanted_columns, inplace=True)


In [None]:
# Rename the columns to their output names in one pass.
output_life.rename(
    columns={
        'year_of_finance': 'year',
        'total_budget': 'total_budget_eur',
        'eu_contribution': 'eu_contribution_eur',
        'project_website': 'website',
    },
    inplace=True,
)
output_life.shape

In [None]:
# Build a row identifier of the form 'life_<index label>' and preview it.
output_life['my_eu_id'] = ['life_' + str(row_label) for row_label in output_life.index]
output_life.my_eu_id.head()

In [None]:
# Write the final table as a gzip-compressed pickle.
output_life.to_pickle('output/life.pkl.gz')