# CORDIS FP7

In [None]:
import json
import re
import urllib

from titlecase import titlecase
import pandas as pd

pd.set_option('display.max_columns', 50)

## Read in Data

In [None]:
# Load the CORDIS FP7 projects workbook from the local input directory.
# The trailing bare expression displays (rows, columns) as the cell output.
all_projects = pd.read_excel('input/fp7/cordis-fp7projects.xlsx')
all_projects.shape

In [None]:
# Load the CORDIS FP7 organizations workbook (rows carry a projectRcn, so
# organizations can be tied back to projects later in the notebook).
all_organizations = pd.read_excel('input/fp7/cordis-fp7organizations.xlsx')
all_organizations.shape

In [None]:
# Load the CORDIS FP7 result briefs workbook; briefs reference projects
# through their projectRcn column.
all_briefs = pd.read_excel('input/fp7/cordis-fp7briefs.xlsx')
all_briefs.shape

## Count Organisations and Countries

It is useful to know the total number of organisations and the number of countries involved, to deal with cases where the contribution of each organisation is unknown.

In [None]:
all_organizations[['projectRcn', 'id', 'country']].count()

In [None]:
# Country-code hygiene: [number missing, number with surrounding whitespace,
# number that are not fully upper case]. Missing values are excluded from the
# last two counts.
[
    all_organizations.country.isna().sum(),
    all_organizations.country.dropna().pipe(
        lambda codes: (codes != codes.str.strip()).sum()),
    all_organizations.country.dropna().pipe(
        lambda codes: (codes != codes.str.upper()).sum()),
]

In [None]:
# Number of distinct organization ids attached to each project.
project_num_organizations = (
    all_organizations
    .groupby('projectRcn')['id']
    .nunique()
    .reset_index()
    .rename(columns={'id': 'num_organizations'})
)
project_num_organizations.shape

In [None]:
# Number of distinct country codes among each project's organizations.
project_num_countries = (
    all_organizations
    .groupby('projectRcn')['country']
    .nunique()
    .reset_index()
    .rename(columns={'country': 'num_countries'})
)
project_num_countries.shape

In [None]:
# Combine the two per-project counts; validate='1:1' asserts that each
# projectRcn appears at most once on either side.
project_num_organizations_and_countries = project_num_countries.merge(
    project_num_organizations,
    on='projectRcn',
    validate='1:1',
)
project_num_organizations_and_countries.shape

In [None]:
project_num_organizations_and_countries.head()

## Restrict to UK

We are only interested in projects and organizations where the coordinator or at least one participant institution is in the UK.

In [None]:
# Organization rows whose country code is exactly 'UK'.
uk_organizations = all_organizations.loc[all_organizations['country'].eq('UK')]
uk_organizations.shape

In [None]:
uk_organizations.head()

In [None]:
# Projects with at least one UK organization, matched on the project id
# (projects.id <-> organizations.projectID).
uk_projects = all_projects.loc[
    all_projects['id'].isin(uk_organizations['projectID'])
]
uk_projects.shape

In [None]:
uk_projects.head()

In [None]:
# Briefs that belong to one of the UK-related projects, matched on rcn.
uk_briefs = all_briefs.loc[all_briefs['projectRcn'].isin(uk_projects['rcn'])]
uk_briefs.shape

In [None]:
uk_briefs.head()

## Examples

### Coordinator outside UK

The UK has two participant institutions. It appears that `projects.ecMaxContribution` is the sum of all `organizations.ecContribution`s for all coordinator and participant institutions.

In [None]:
uk_projects[uk_projects.rcn == 101244]

In [None]:
uk_organizations[uk_organizations.projectRcn == 101244]

In [None]:
all_organizations[all_organizations.projectRcn == 101244].ecContribution.max()

In [None]:
all_organizations[all_organizations.projectRcn == 101244].ecContribution.sum()

In [None]:
all_briefs[all_briefs.projectRcn == 101244]

### Coordinator in UK

This one is also interesting in that it seems to have a lot of duplicate records that don't have titles, for some reason. We will need to filter those out.

In [None]:
uk_projects[uk_projects.rcn == 99464]

In [None]:
uk_organizations[uk_organizations.projectRcn == 99464]

In [None]:
uk_organizations[uk_organizations.projectRcn == 99464].ecContribution.unique().sum()

In [None]:
all_briefs[all_briefs.projectRcn == 99464]

## Duplicate Projects

It looks like it's safe to just drop projects without titles; those seem to be the only duplicates.

In [None]:
[uk_projects.rcn.nunique(), uk_projects.id.nunique(), uk_projects.shape]

In [None]:
uk_projects[uk_projects.duplicated('rcn', keep=False)]

In [None]:
uk_projects[pd.isnull(uk_projects.title)]

In [None]:
# Untitled rows appear to be the only duplicate records, so keep titled
# projects only and work on an independent copy.
clean_projects = uk_projects[uk_projects.title.notna()].copy()
# Could include coordinator and participants... would need some extra cleaning.
clean_projects = clean_projects.drop(columns=[
    'id', 'programme', 'topics', 'frameworkProgramme', 'call',
    'fundingScheme', 'coordinator', 'participants', 'subjects',
])
# Switch the remaining camelCase columns to snake_case and make the currency
# of the money columns explicit.
clean_projects = clean_projects.rename(columns={
    'startDate': 'start_date',
    'endDate': 'end_date',
    'projectUrl': 'project_url',
    'totalCost': 'total_cost_eur',
    'ecMaxContribution': 'max_contribution_eur',
    'coordinatorCountry': 'coordinator_country',
    'participantCountries': 'participant_countries',
})
clean_projects.shape

In [None]:
clean_projects.describe()

In [None]:
clean_projects.head()

## Check Project Columns

In [None]:
clean_projects.count()

### Acronym

Only one project is missing an acronym.

In [None]:
clean_projects[clean_projects.acronym.isna()]

### Status

Some projects are listed as cancelled. It's not clear what this means exactly. Spot checks reveal that some of them apparently received at least partial funding and delivered some results, so it does not seem appropriate to remove them altogether.

- https://cordis.europa.eu/result/rcn/237795_en.html (TORTELLEX)
- https://cordis.europa.eu/result/rcn/196663_en.html (YSCHILLER)
- https://cordis.europa.eu/project/rcn/188111_en.html (MICARTREGEN) - no results

In [None]:
clean_projects.status.value_counts()

In [None]:
clean_projects[clean_projects.status == 'CAN'].head()

### Title

In [None]:
(clean_projects.title.str.strip() != clean_projects.title).sum()

### Start and End Dates

Some projects are missing a start date, an end date, or both. Discard those projects for now. There is some overlap with the cancelled projects, but it is not exact.

In [None]:
(clean_projects.start_date.isna() | clean_projects.end_date.isna()).sum()

In [None]:
((clean_projects.status == 'CAN') & (clean_projects.start_date.isna() | clean_projects.end_date.isna())).sum()

In [None]:
((clean_projects.status != 'CAN') & (clean_projects.start_date.isna() | clean_projects.end_date.isna())).sum()

In [None]:
# Keep only projects that have BOTH a start date and an end date.
# The previous condition (`~start.isna() | ~end.isna()`) retained rows with
# just one of the two dates, which contradicts the missing-date count above
# and breaks the later date-range computation that needs both endpoints.
clean_projects = clean_projects[
    clean_projects.start_date.notna() & clean_projects.end_date.notna()
]
clean_projects.shape

In [None]:
(clean_projects.start_date > clean_projects.end_date).sum()

### Project URL

Looks pretty clean.

In [None]:
(~clean_projects.project_url.isna()).sum()

In [None]:
def is_valid_url(url):
    """Return True if `url` is an absolute http(s) URL with a network location.

    The value is converted with `str()` first, so non-string inputs (for
    example a float NaN from a pandas column) are accepted and yield False.
    Inputs that `urlparse` rejects outright also yield False instead of
    raising, so the function is safe to use with `Series.apply`.
    """
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` has been loaded.
    from urllib.parse import urlparse

    try:
        result = urlparse(str(url))
    except ValueError:
        # Raised for some malformed inputs, e.g. unbalanced IPv6 brackets.
        return False
    return result.scheme in ('http', 'https') and bool(result.netloc)

# Flag project URLs that are present but fail the http(s) validity check.
project_url_bad = (
    clean_projects.project_url.notna()
    & ~clean_projects.project_url.apply(is_valid_url)
)
project_url_bad.sum()

In [None]:
clean_projects[project_url_bad]

In [None]:
# Prefix a default http:// scheme to the flagged URLs (presumably bare host
# names). The mask only selects non-null values, so every element is a string.
clean_projects.loc[project_url_bad, 'project_url'] = (
    clean_projects.loc[project_url_bad, 'project_url']
    .map(lambda url: 'http://' + url)
)

In [None]:
(~clean_projects.project_url.isna() & ~clean_projects.project_url.apply(is_valid_url)).sum()

### Objective

In [None]:
(clean_projects.objective.str.strip() != clean_projects.objective).sum()

In [None]:
# Trim leading/trailing whitespace from the objective text.
clean_projects['objective'] = clean_projects['objective'].str.strip()

### Total Cost and EC Max Contribution

In [None]:
clean_projects.total_cost_eur.describe()

In [None]:
clean_projects.max_contribution_eur.describe()

In [None]:
(clean_projects.max_contribution_eur > clean_projects.total_cost_eur).sum()

## Clean Up Organizations

I notice several issues:

- Some are missing IDs (but do have postcodes)
- Some are missing postcodes
- Some postcodes are clearly typo'd (digit substitutions, etc.)
- Some postcodes have since been terminated, i.e. they are no longer in use (checked by searching for them on Google)

There are only 2993 unique organization IDs, so this is probably the result of a join.

For now, drop all organizations that don't have both an ID and a valid postcode. (It does look possible to match names to find IDs, and many without postcodes still have addresses, which we could geocode.)

Would be interesting to try this: https://codereview.stackexchange.com/questions/117801/uk-postcode-validation-and-format-correction-tool

In [None]:
[
    uk_organizations.shape,
    uk_organizations.id.notna().sum(),
    uk_organizations.id.isna().sum(),
    uk_organizations.id[uk_organizations.id.notna()].nunique(),
    uk_organizations.postCode.isna().sum(),
    uk_organizations.postCode[uk_organizations.postCode.notna()].nunique()
]

In [None]:
# Keep UK organization rows that have both an id and a postcode, on an
# independent copy so the casts below do not touch uk_organizations.
organizations = uk_organizations.dropna(subset=['id', 'postCode']).copy()
organizations['id'] = organizations['id'].astype('int64')
organizations['postCode'] = organizations['postCode'].astype('str')
# Display: remaining shape, distinct ids, distinct raw postcodes.
[
    organizations.shape,
    organizations.id.nunique(),
    organizations.postCode.nunique(),
]

In [None]:
# Reference table of UK postcodes from the sibling `postcodes` directory.
# It is used below both to validate organization postcodes (via its `postcode`
# column) and as the right-hand side of a merge on `postcode`.
ukpostcodes = pd.read_csv('../postcodes/input/ukpostcodes.csv.gz')
ukpostcodes.shape

In [None]:
organizations.postCode.isin(ukpostcodes.postcode).sum()

In [None]:
# Normalise postcodes to the "OUTWARD INWARD" form used by the reference table:
#   1. upper-case and trim;
#   2. remove every character that is not A-Z or 0-9 (spaces, punctuation);
#   3. re-insert a single space before the final digit + two letters.
# `regex=True` is passed explicitly: both patterns are regular expressions, and
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# silently leave the postcodes unchanged.
organizations['cleanPostcode'] = organizations.postCode.\
    str.upper().\
    str.strip().\
    str.replace(r'[^A-Z0-9]', '', regex=True).\
    str.replace(r'^(\S+)([0-9][A-Z]{2})$', r'\1 \2', regex=True)

In [None]:
organizations.cleanPostcode.isin(ukpostcodes.postcode).sum()

In [None]:
organizations.cleanPostcode[~organizations.cleanPostcode.isin(ukpostcodes.postcode)].unique()

In [None]:
# Discard organizations whose normalised postcode is not in the reference table.
organizations = organizations.loc[
    organizations['cleanPostcode'].isin(ukpostcodes['postcode'])
]
organizations.shape

In [None]:
# Keep only projects that still have at least one usable UK organization.
clean_projects = clean_projects.loc[
    clean_projects['rcn'].isin(organizations['projectRcn'])
]
clean_projects.shape

## Clean Up Duplicate Organizations

I think there is also a join on the contacts, because we get multiple rows for some project-organization pairs. The main thing is that we want the `ecContribution` to be consistent. Otherwise, any row will do.

In [None]:
organizations.sort_values(['projectRcn', 'id']).\
    groupby(['projectRcn', 'id']).\
    filter(lambda x: x.shape[0] > 1)

In [None]:
organizations.groupby(['projectRcn', 'id']).\
    filter(lambda x: x.ecContribution.nunique() > 1).shape

In [None]:
# Collapse to one row per (project, organization) pair, then remove the
# columns that are not needed downstream and rename the rest to snake_case.
# NOTE: per the pandas documentation, groupby().first() takes the first
# non-null value of each column within a group, not necessarily a single row.
clean_organizations = (
    organizations
    .groupby(['projectRcn', 'id'])
    .first()
    .reset_index()
    .drop(columns=[
        'projectID', 'projectAcronym', 'shortName', 'activityType', 'endOfParticipation',
        'country', 'street', 'city', 'postCode',
        'contactType', 'contactTitle', 'contactFirstNames', 'contactLastNames',
        'contactFunction', 'contactTelephoneNumber', 'contactFaxNumber', 'contactEmail',
    ])
    .rename(columns={
        'projectRcn': 'project_rcn',
        'id': 'organization_id',
        'ecContribution': 'contribution_eur',
        'organizationUrl': 'organization_url',
        'cleanPostcode': 'postcode',
    })
)
# Re-case the organization names with the titlecase helper.
clean_organizations['name'] = clean_organizations['name'].apply(titlecase)
clean_organizations.shape

In [None]:
clean_organizations.head()

## Check Organisations

In [None]:
clean_organizations.count()

### Role

In [None]:
clean_organizations.role.value_counts()

### Name

In [None]:
(clean_organizations.name.str.strip() != clean_organizations.name).sum()

### Contribution EUR

Missing for some organisations.

In [None]:
clean_organizations.contribution_eur.describe()

In [None]:
clean_organizations.contribution_eur.isna().sum()

### Organisation URL

Mostly clean. Found a couple with a `;` delimiting two URLs, neither of which resolved, so we can get rid of those.

In [None]:
(~clean_organizations.organization_url.isna()).sum()

In [None]:
# Flag organization URLs that are present but fail the http(s) validity check.
organization_url_bad = (
    clean_organizations.organization_url.notna()
    & ~clean_organizations.organization_url.apply(is_valid_url)
)
organization_url_bad.sum()

In [None]:
# Prefix a default http:// scheme to the flagged URLs. The mask only selects
# non-null values, so every element is a string.
clean_organizations.loc[organization_url_bad, 'organization_url'] = (
    clean_organizations.loc[organization_url_bad, 'organization_url']
    .map(lambda url: 'http://' + url)
)

In [None]:
organization_url_bad = ~clean_organizations.organization_url.isna() & \
    ~clean_organizations.organization_url.apply(is_valid_url)
organization_url_bad.sum()

In [None]:
clean_organizations[
    ~clean_organizations.organization_url.isna() & \
    clean_organizations.organization_url.str.match('http.*http')].organization_url.unique()

In [None]:
# Blank out values that start with 'http' and contain a second 'http' later
# on, i.e. two URLs packed into one field. na=False makes missing URLs count
# as "no match", so they are left untouched.
clean_organizations.loc[
    clean_organizations.organization_url.str.match('http.*http', na=False),
    'organization_url'
] = float('nan')

## Briefs

Might as well merge these into the projects where we have them. We have a few duplicates to take care of.

In [None]:
# Briefs for the cleaned projects that carry at least one piece of content
# (a title, a teaser or an article).
clean_briefs = uk_briefs.loc[
    uk_briefs['projectRcn'].isin(clean_projects['rcn'])
    & uk_briefs[['title', 'teaser', 'article']].notna().any(axis=1)
].copy()
clean_briefs.shape

In [None]:
clean_briefs[clean_briefs.projectRcn.duplicated(keep=False)]

In [None]:
# When a project has several briefs, keep the one that sorts last by
# lastUpdateDate (i.e. the most recently updated).
clean_briefs = (
    clean_briefs
    .sort_values('lastUpdateDate')
    .drop_duplicates(subset='projectRcn', keep='last')
)
clean_briefs.shape

In [None]:
# Remove brief columns that are not carried into the output, then rename the
# rest so that `rcn` is the project key and brief fields cannot be confused
# with project fields.
clean_briefs = clean_briefs.drop(columns=[
    'rcn', 'language', 'lastUpdateDate', 'country', 'projectAcronym',
    'programme', 'topics', 'relatedReportRcn',
])
clean_briefs = clean_briefs.rename(columns={
    'projectRcn': 'rcn',
    'title': 'brief_title',
    'relatedReportTitle': 'related_report_title',
    'imageUri': 'image_path',
})
clean_briefs.head()

In [None]:
# Left join: every project is kept, with brief columns filled in where a
# brief exists. validate='1:1' asserts rcn is unique on both sides.
clean_projects_with_briefs = clean_projects.merge(
    clean_briefs,
    on='rcn',
    how='left',
    validate='1:1',
)
clean_projects_with_briefs.head()

## Checks

In [None]:
clean_organizations[clean_organizations.project_rcn == 101244]

In [None]:
clean_projects_with_briefs[clean_projects_with_briefs.rcn == 101244]

In [None]:
clean_organizations[clean_organizations.project_rcn == 99464]

In [None]:
clean_projects_with_briefs[clean_projects_with_briefs.rcn == 99464]

In [None]:
# One row per (project, UK organization); the redundant right-hand key
# column is dropped after the join.
project_organizations = clean_projects_with_briefs.merge(
    clean_organizations,
    left_on='rcn',
    right_on='project_rcn',
    validate='1:m',
).drop(columns=['project_rcn'])
project_organizations.shape

In [None]:
project_organizations.head()

In [None]:
# Total contribution to the retained UK organizations, per project.
# The aggregation is named by the string 'sum' rather than the builtin `sum`:
# pandas currently maps the builtin onto its own groupby sum, but that
# aliasing is deprecated, so the string keeps today's behaviour (missing
# contributions are skipped) without a FutureWarning.
uk_contributions = project_organizations.groupby('rcn').aggregate({'contribution_eur': 'sum'})
uk_contributions.reset_index(inplace=True)
uk_contributions.head()

In [None]:
# Attach the per-project UK contribution total to the project table.
project_uk_contributions = clean_projects_with_briefs.merge(
    uk_contributions,
    on='rcn',
    validate='1:1',
)
project_uk_contributions.head()

In [None]:
project_uk_contributions[project_uk_contributions.contribution_eur > project_uk_contributions.max_contribution_eur + 0.1].shape

In [None]:
# Projects (with UK totals) joined to their UK organizations, then to the
# postcode reference table on the normalised postcode.
project_organization_uk_contributions = (
    project_uk_contributions
    .merge(
        clean_organizations,
        left_on='rcn', right_on='project_rcn', validate='1:m',
    )
    .merge(ukpostcodes, on='postcode', validate='m:1')
)
project_organization_uk_contributions.shape

In [None]:
project_organization_uk_contributions.head()

In [None]:
(project_uk_contributions.contribution_eur < 1000).value_counts()

### Add Numbers of Organisations and Countries

Add these back on and do a sanity check against the `participant_countries` field. They mostly match up, except for a few relatively small discrepancies. 

In [None]:
clean_projects_with_briefs.shape

In [None]:
# Add num_countries / num_organizations to each project and drop the
# duplicate key column that the join brings along.
clean_projects_with_briefs = clean_projects_with_briefs.merge(
    project_num_organizations_and_countries,
    left_on='rcn',
    right_on='projectRcn',
    validate='1:1',
).drop(columns='projectRcn')
clean_projects_with_briefs.shape

In [None]:
clean_projects_with_briefs.head()

In [None]:
[
    clean_projects_with_briefs.num_countries.isna().sum(),
    clean_projects_with_briefs.coordinator_country.isna().sum(),
    clean_projects_with_briefs.participant_countries.isna().sum()
]

In [None]:
def check_num_countries():
    """Return projects whose num_countries disagrees with the country fields.

    Reads the notebook-level `clean_projects_with_briefs` table. For projects
    without participant_countries the expected count is 1 (coordinator only);
    otherwise it is the number of distinct ';'-separated codes in the
    coordinator and participant country fields combined. The expected count
    is reported in the `check` column of the result.
    """
    projects = clean_projects_with_briefs
    no_participants = projects.participant_countries.isna()

    # Coordinator-only projects should span exactly one country.
    solo = projects[no_participants]
    coordinator_mismatch = solo[solo.num_countries != 1].copy()
    coordinator_mismatch['check'] = 1

    # Projects with participants: count the distinct codes across both fields.
    multi = projects[~no_participants]
    combined_codes = multi.coordinator_country + ';' + multi.participant_countries
    check_ncs = combined_codes.apply(lambda codes: len(set(codes.split(';'))))

    participant_mismatch = multi[multi.num_countries != check_ncs].copy()
    participant_mismatch['check'] = check_ncs

    report_columns = [
        'rcn', 'coordinator_country', 'participant_countries',
        'num_countries', 'check', 'num_organizations',
    ]
    return pd.concat([coordinator_mismatch, participant_mismatch])[report_columns]
check_num_countries()

In [None]:
all_organizations.country[all_organizations.projectRcn == 100467].unique()

In [None]:
all_organizations.country[all_organizations.projectRcn == 203681].unique()

In [None]:
all_organizations.country[all_organizations.projectRcn == 90982].unique()

I suspect a problem with handling of `NA`; that is a valid code (Namibia), but maybe in some cases it is being used for Not Available.

### Convert to GBP

In [None]:
# Exchange-rate table from the sibling `exchange_rates` directory. The code
# below joins it on `month_start` and reads its `rate` column.
# NOTE(review): unpickling executes arbitrary code; only load this file if it
# was generated locally by a trusted step.
eur_gbp = pd.read_pickle('../exchange_rates/output/exchange_rates.pkl.gz')
eur_gbp.tail()

In [None]:
def find_average_eur_gbp_rate(row):
    """Return the day-weighted average EUR/GBP rate over a project's duration.

    Every day in [row.start_date, row.end_date) gets an equal weight; the
    weights are summed per calendar month and multiplied by that month's rate
    from the notebook-level `eur_gbp` table (joined on `month_start`, using
    its `rate` column).

    Raises ZeroDivisionError when the date range contains no days (for
    example when start_date equals end_date).

    NOTE(review): the join is an inner join, so months absent from `eur_gbp`
    would be dropped and the remaining weights would sum to less than 1 --
    confirm that the rate table covers all project dates.
    """
    # create timeseries from start to end
    # The end point is excluded by filtering instead of date_range's `closed=`
    # keyword, which was removed in pandas 2.0 (superseded by `inclusive=`);
    # the filter gives the same left-closed range on every pandas version.
    days = pd.date_range(row.start_date, row.end_date)
    days = days[days < pd.Timestamp(row.end_date)]
    daily = pd.DataFrame({
        'month_start': days,
        'weight': 1.0 / days.shape[0]
    })
    # 'MS' buckets are labelled by the first day of each month.
    monthly = daily.resample('MS', on='month_start').sum()
    monthly = pd.merge(monthly, eur_gbp, on='month_start', validate='1:1')
    return (monthly.weight * monthly.rate).sum()

# Compute the duration-weighted EUR/GBP rate row by row and store it as a
# new column on the project table.
clean_projects_with_briefs['eur_gbp'] = clean_projects_with_briefs.apply(
    find_average_eur_gbp_rate,
    axis=1,
    result_type='reduce',
)

In [None]:
clean_projects_with_briefs.head()

## Save Data

In [None]:
clean_projects_with_briefs.to_pickle('output/fp7_projects.pkl.gz')

In [None]:
clean_organizations.to_pickle('output/fp7_organizations.pkl.gz')