# Homework 3 - Interactive Viz

In [None]:
import folium
import requests as rq
from bs4 import BeautifulSoup as bfs
import numpy as np
import pandas as pd
import collections
import os
%matplotlib inline
import matplotlib.pyplot as plt
from dateutil import relativedelta
import scipy.stats as stats
import math
import json
from pprint import pprint

## Token access

In [None]:
# Load the API credentials used further down (the GeoNames username and the
# Google Maps key are read from this mapping) from a local JSON file.
env_file = r'env.json'

with open(env_file) as file:
    env = json.load(file)

# Show only which credentials are available: printing the values themselves
# would store the secrets in the notebook output.
pprint(sorted(env))

## Test Map

In [None]:
# Path of the TopoJSON file describing the Swiss cantons.
state_geo = r'ch-cantons.topojson.json'

with open(state_geo) as data_file:
    ch_cantons_json = json.load(data_file)

# One identifier per canton geometry; each is paired with a random integer
# in [0, 100) so the choropleth can be tried out before real data exists.
canton_geometries = ch_cantons_json['objects']['cantons']['geometries']
cantons_id = [geometry['id'] for geometry in canton_geometries]
data_test = pd.DataFrame({
    'canton': cantons_id,
    'value': np.random.randint(0, 100, len(cantons_id)),
})
data_test

In [None]:
# Smoke-test choropleth: colour every canton by its random test value, using
# six evenly spaced thresholds between the smallest and the largest value.
# The map is bound to `test_map` rather than `map` so the builtin `map` is
# not shadowed for the rest of the notebook.
test_values = data_test['value']
test_thresholds = list(np.linspace(test_values.min(), test_values.max(), 6))

test_map = folium.Map(location=[46.8, 8.1], zoom_start=8)
test_map.choropleth(
    geo_path=state_geo,
    topojson='objects.cantons',
    data=data_test,
    columns=['canton', 'value'],
    key_on='feature.id',
    fill_color='PuBuGn',
    fill_opacity=0.5,
    line_opacity=0.7,
    threshold_scale=test_thresholds,
    reset=True
)
test_map

## SNSF data

### Grant CSV file

In [None]:
# Load the semicolon-separated grant export, parsing both date columns.
snsf_data = pd.read_csv(
    'Data/P3_GrantExport.csv',
    sep=';',
    parse_dates=['Start Date', 'End Date'],
)
snsf_data.head()

In [None]:
# Inspect the dtype pandas inferred for each column of the grant table.
snsf_data.dtypes

In [None]:
# Give the first column a clean name. `rename` touches only that one label,
# whereas a string replace over all labels would also rewrite any other
# label that happens to contain the old name (or match it as a pattern).
snsf_data = snsf_data.rename(columns={snsf_data.columns[0]: 'Project Number'})
snsf_data.head()

In [None]:
# True when no project number appears more than once in the table.
snsf_data['Project Number'].is_unique

In [None]:
# Use the project number as the row index of the grant table.
snsf_data = snsf_data.set_index('Project Number')
snsf_data.head()

In [None]:
# Convert the approved amounts to numbers; entries that cannot be parsed
# become NaN (errors='coerce') and show up in the NaN row of the counts.
approved_amount = pd.to_numeric(snsf_data['Approved Amount'], errors='coerce')
snsf_data['Approved Amount'] = approved_amount
snsf_data['Approved Amount'].value_counts(dropna=False)

In [None]:
# Number of grants per university; the NaN row counts the grants for which
# no university is filled in.
university_column = snsf_data['University']
university_column.value_counts(dropna=False)

## Test with different slices of years.

From the doc : "To a lesser extent, there may also be a problem with data quality,
for example data on fellowships for prospective researchers is incomplete for the 1990’s. Moreover,
data on SFGBM for advanced researchers is incomplete before 2005."

> J'essaie de voir s'il y a des différences entre les années, car les données devraient être plus pertinentes à partir des années 2000.

In [None]:
# Grants whose start date is on or after the beginning of 2005.
limit_date = np.datetime64('2005')
snsf_data_2005_2018 = snsf_data[snsf_data['Start Date'] >= limit_date]
# Only the last expression of a cell is displayed, so the intermediate
# preview of the slice (whose result was discarded) is left out.
snsf_data_2005_2018['University'].value_counts(dropna=False).head()

In [None]:
# Grants whose start date falls in [2000, 2005): lower bound included,
# upper bound excluded.
limit_date_sup = np.datetime64('2005')
limit_date_inf = np.datetime64('2000')

started_before_sup = snsf_data['Start Date'] < limit_date_sup
started_from_inf = snsf_data['Start Date'] >= limit_date_inf
snsf_data_2000_2005 = snsf_data[started_before_sup & started_from_inf]
# Only the last expression of a cell is displayed, so the intermediate
# preview of the slice (whose result was discarded) is left out.
snsf_data_2000_2005['University'].value_counts(dropna=False).head()

In [None]:
# Grants whose start date is strictly before the beginning of 2000.
limit_date_sup = np.datetime64('2000')

snsf_data_1975_2000 = snsf_data[snsf_data['Start Date'] < limit_date_sup]
# Only the last expression of a cell is displayed, so the intermediate
# preview of the slice (whose result was discarded) is left out.
snsf_data_1975_2000['University'].value_counts(dropna=False).head()

> --> Ça ne change pas grand-chose...

### Test blank attribute.

University field : "This is the institution where the project will largely be carried out according to the application. Pick list. This field is only filled if the research is carried out at a Swiss institution, otherwise the field remains blank. In the case of mobility fellowships, it is generally left empty."

> Je cherche quels sont les attributs qui contiennent la « mobility fellowship », car ce sont peut-être des universités suisses.

In [None]:
# Frequency of each 'Funding Instrument Hierarchy' value, missing values included.
snsf_data['Funding Instrument Hierarchy'].value_counts(dropna=False)

In [None]:
# Funding instruments whose name contains "fellowships", with their counts.
# na=False makes rows with a missing instrument count as "no match", so the
# mask is purely boolean and can always be used for row selection.
is_fellowship = snsf_data['Funding Instrument'].str.contains("fellowships", na=False)
snsf_data.loc[is_fellowship, 'Funding Instrument'].value_counts(dropna=False)

In [None]:
# Funding instruments whose name contains "Mobility", with their counts.
# na=False makes rows with a missing instrument count as "no match", so the
# mask is purely boolean and can always be used for row selection.
is_mobility = snsf_data['Funding Instrument'].str.contains("Mobility", na=False)
snsf_data.loc[is_mobility, 'Funding Instrument'].value_counts(dropna=False)

In [None]:
# Names of the funding instruments that mention "Mobility", ordered by how
# often they occur. na=False makes rows with a missing instrument count as
# "no match", so the mask is purely boolean.
mobility_mask = snsf_data['Funding Instrument'].str.contains("Mobility", na=False)
index_link_mobility = (
    snsf_data.loc[mobility_mask, 'Funding Instrument']
    .value_counts(dropna=False)
    .index
)

In [None]:
# Plain Python list of the mobility-related funding instrument names.
index_array = list(index_link_mobility)
index_array

In [None]:
# Independent copy of the grants funded through one of the mobility
# instruments, followed by their breakdown per university (NaN included).
uses_mobility_instrument = snsf_data['Funding Instrument'].isin(index_array)
data_related_to_mobility_fellowship = snsf_data[uses_mobility_instrument].copy()
data_related_to_mobility_fellowship['University'].value_counts(dropna=False)

In [None]:
# Breakdown of the mobility-related grants per 'Institution', missing values included.
data_related_to_mobility_fellowship['Institution'].value_counts(dropna=False)

> --> À chercher plus en profondeur : il doit sûrement y avoir quelque chose avec la bourse mobility. Pourquoi y a-t-il des universités suisses ?


In [None]:
# Keep only the grants with a university filled in: according to the field
# description quoted above, it is only filled for Swiss institutions.
has_university = snsf_data['University'].notnull()
snsf_data_swiss_Data = snsf_data.loc[has_university].copy()
snsf_data_swiss_Data.head()

### Other CSV files

#### Publication

In [None]:
# Load the semicolon-separated publication export and preview it.
publication_csv = 'Data/P3_PublicationExport.csv'
snsf_publication_data = pd.read_csv(publication_csv, sep=';')
snsf_publication_data.head()

#### Person

In [None]:
# Load the semicolon-separated person export and preview it.
person_csv = 'Data/P3_PersonExport.csv'
snsf_person_data = pd.read_csv(person_csv, sep=';')
snsf_person_data.head()

> On a aussi un champ représentant un lieu, « Institute Place », mais j'ai peur que l'on s'éloigne trop en effectuant un merge (un auteur peut contribuer à différents projets à différents endroits).

## GeoNames

In [None]:
from geopy.geocoders import GeoNames

# Limit of 2000 requests per hour
# Build the GeoNames geocoder with the username read from the env file.
geo_geonames = GeoNames(username=env['GeoNames-Username'])
result = geo_geonames.geocode('Université de Fribourg CH') # No result ... a postal code is needed
result

In [None]:
# Query the GeoNames postal-code search directly and parse the XML answer.
# Optional filters such as 'country' or 'postalcode' can be added to the
# payload to narrow the search.
payload = {
    'placename': 'Université de Fribourg CH',
    'maxRows': 10,
    # Read the account name from the env file, like the geopy client does,
    # instead of hard-coding it in the notebook.
    'username': env['GeoNames-Username'],
}
# A timeout keeps the cell from hanging forever if the service is unreachable.
request_test_page = rq.get(
    'http://api.geonames.org/postalCodeSearch',
    params=payload,
    timeout=10,
)
test_page = bfs(request_test_page.text, 'xml')
test_page

In [None]:
# Distinct non-missing values of the 'University' column.
universities = snsf_data['University'].dropna().unique()

In [None]:
# Geocode every university with GeoNames, one request per name, appending
# ' CH' to each query. The result is stored under the university name.
match_location_geonames = {}

for university in universities:
    geonames_query = university + ' CH'
    match_location_geonames[university] = geo_geonames.geocode(geonames_query)

In [None]:
# Tally the GeoNames results, keeping missing ones as their own category.
geonames_matches = pd.Series(match_location_geonames)
geonames_matches.value_counts(dropna=False)

## OpenStreetMap

In [None]:
from geopy.geocoders import Nominatim

# Nominatim's usage policy asks every client to identify itself with its own
# User-Agent, and recent geopy releases refuse to build the geocoder without one.
geo_nominatim = Nominatim(user_agent='ada-homework3-interactive-viz')
location = geo_nominatim.geocode("EPFL CH")
print(location)

In [None]:
# Geocode every university with Nominatim, one request per name, appending
# ' CH' to each query. The result is stored under the university name.
match_location_nominatim = {}

for university in universities:
    nominatim_query = university + ' CH'
    match_location_nominatim[university] = geo_nominatim.geocode(nominatim_query)

In [None]:
# Tally the Nominatim results, keeping missing ones as their own category.
nominatim_matches = pd.Series(match_location_nominatim)
nominatim_matches.value_counts(dropna=False)

## Google Maps

In [None]:
from geopy.geocoders import GoogleV3

# Build the Google Maps geocoder with the key read from the env file and
# try it on a single query, printing the address of the result.
google_api_key = env['GoogleMaps-Key']
geo_google = GoogleV3(api_key=google_api_key)
result = geo_google.geocode('Geneve - CH')
print(result.address)

In [None]:
# Geocode every university with Google Maps, one request per name, appending
# ' CH' to each query. The result is stored under the university name.
match_location_google = {}

for university in universities:
    google_query = university + ' CH'
    match_location_google[university] = geo_google.geocode(google_query)

In [None]:
# Tally the Google Maps results, keeping missing ones as their own category.
google_matches = pd.Series(match_location_google)
google_matches.value_counts(dropna=False)

## Result of all locations (API or manually)

In [None]:
# Load the university/canton table and count the universities per canton;
# the NaN row counts the universities without a canton.
universities_cantons_csv = 'Data/universities_cantons.csv'
match_location = pd.read_csv(universities_cantons_csv)
match_location['Canton'].value_counts(dropna=False)

Expliquer pourquoi on enlève NPO (NaN)

In [None]:
# Attach the location columns to every Swiss grant through its university.
# The inner join drops grants whose university is absent from the table.
grant_amounts = snsf_data_swiss_Data[['University','Approved Amount']]
amount_by_university = grant_amounts.merge(
    match_location,
    on=['University'],
    how='inner',
)
amount_by_university.head()

In [None]:
# Total approved amount per canton. Only the amount column is summed, so
# non-numeric columns brought in by the merge (e.g. 'University') are
# neither concatenated nor rejected by the aggregation. groupby leaves out
# the rows whose canton is missing.
amount_by_cantons = amount_by_university.groupby('Canton')[['Approved Amount']].sum()
# Log scale for the colour map. Non-positive totals are masked to NaN first,
# because log(0) is -inf and would break thresholds derived from min/max.
canton_totals = amount_by_cantons['Approved Amount']
amount_by_cantons['Approved Amount Log'] = np.log(canton_totals.where(canton_totals > 0))
amount_by_cantons

In [None]:
# Column-less frame indexed by every canton identifier of the map, used as
# the left side of the join so that no canton is missing from the result.
cantons_data = pd.DataFrame({'Canton': cantons_id}).set_index('Canton')

In [None]:
# Left-join the per-canton totals onto the full canton list (index to
# index), give cantons without any amount the value 0, and turn the canton
# index back into a regular column for the choropleth.
amount_by_all_cantons = pd.merge(
    cantons_data,
    amount_by_cantons,
    how='left',
    left_index=True,
    right_index=True,
)
amount_by_all_cantons = amount_by_all_cantons.fillna(0).reset_index()

In [None]:
# Final choropleth: log of the total approved amount per canton, coloured
# with six evenly spaced thresholds spanning the observed range.
log_amounts = amount_by_all_cantons['Approved Amount Log']
final_thresholds = list(np.linspace(log_amounts.min(), log_amounts.max(), 6))

map_final = folium.Map(location=[46.8, 8.1], zoom_start=8)
map_final.choropleth(
    geo_path=state_geo,
    topojson='objects.cantons',
    data=amount_by_all_cantons,
    columns=['Canton', 'Approved Amount Log'],
    key_on='feature.id',
    fill_color='PuBu',
    fill_opacity=0.8,
    line_opacity=0.2,
    threshold_scale=final_thresholds,
)

# NOTE: a GeoJSON variant of this call was tried earlier, with
# geo_path='Data/ch-cantons.geojson.json' and no `topojson` argument.
map_final