# Homework 3 - Interactive Viz

In [None]:
import folium
import requests as rq
from bs4 import BeautifulSoup as bfs
import numpy as np
import pandas as pd
import collections
import os
%matplotlib inline
import matplotlib.pyplot as plt
from dateutil import relativedelta
import scipy.stats as stats
import math
import json
from pprint import pprint
from ipywidgets import FloatProgress
from IPython.display import display
from geopy.geocoders import GeoNames, Nominatim, GoogleV3
import locale

topojson_cantons = r'ch-cantons.topojson.json'

## Test Map

In [None]:
# Load the cantons TopoJSON. The encoding is given explicitly so the result
# does not depend on the platform's default text encoding.
with open(topojson_cantons, encoding='utf-8') as data_file:
    ch_cantons_json = json.load(data_file)

# Identifier of every canton geometry, in file order.
canton_geometries = ch_cantons_json['objects']['cantons']['geometries']
cantons_id = [canton['id'] for canton in canton_geometries]

# Throw-away frame: one random integer in [0, 100) per canton, used only to
# try out the choropleth rendering.
data_test = pd.DataFrame({
    'canton': cantons_id,
    'value': np.random.randint(0, 100, len(cantons_id)),
})
data_test.head()

In [None]:
# Test map: render the random per-canton values as a choropleth keyed on the
# TopoJSON feature ids.
map_test = folium.Map(location=[46.8, 8.1], zoom_start=8)
map_test.choropleth(
    geo_path=topojson_cantons,
    topojson='objects.cantons',
    data=data_test,
    columns=['canton', 'value'],
    key_on='feature.id',
    fill_color='PuBuGn',
    fill_opacity=0.5,
    line_opacity=0.7,
    # Six evenly spaced thresholds spanning the observed value range.
    threshold_scale=list(np.linspace(data_test['value'].min(), data_test['value'].max(), 6)),
    reset=True
)
map_test

## SNSF data

### Grant CSV file

In [None]:
# Load the SNSF grant export: ';'-separated, with the two date columns parsed
# as datetimes.
# NOTE(review): if the dates are written day-first (dd.mm.yyyy), read_csv
# needs dayfirst=True to avoid month/day swaps — confirm against the file.
snsf_data = pd.read_csv('Data/P3_GrantExport.csv', sep=';', parse_dates=['Start Date', 'End Date'])
snsf_data.head()

In [None]:
# Inspect the dtype pandas inferred for each column.
snsf_data.dtypes

In [None]:
# Give the first column a clean name. An exact rename mapping is used instead
# of a string replace over all column names: a replace would treat the old
# name as a substring (or regex) pattern and could also alter other columns.
snsf_data.rename(columns={snsf_data.columns[0]: 'Project Number'}, inplace=True)
# Check that the project number can serve as a unique key.
snsf_data['Project Number'].is_unique

In [None]:
# Use the project number as the row index (modifies snsf_data in place).
snsf_data.set_index('Project Number', inplace=True)
snsf_data.head()

In [None]:
# Convert the amounts to numbers; errors='coerce' turns any entry that cannot
# be parsed into NaN instead of raising.
snsf_data['Approved Amount'] = pd.to_numeric(snsf_data['Approved Amount'], errors='coerce')
# Most frequent amounts, counting NaN as a value.
snsf_data['Approved Amount'].value_counts(dropna=False).head()

In [None]:
# Most frequent 'University' values, counting missing entries (NaN) as a value.
snsf_data['University'].value_counts(dropna=False).head()

### Test with different slices of years

From the doc : "To a lesser extent, there may also be a problem with data quality,
for example data on fellowships for prospective researchers is incomplete for the 1990’s. Moreover,
data on SFGBM for advanced researchers is incomplete before 2005."

> J'essaie de voir s'il y a des différences entre les années, car les données devraient être plus pertinentes à partir des années 2000.

In [None]:
# Slice 1: projects whose start date is in 2005 or later.
limit_date = np.datetime64('2005')
starts_from_2005 = snsf_data['Start Date'] >= limit_date
snsf_data_2005_2018 = snsf_data[starts_from_2005]
snsf_data_2005_2018.head()
# Most frequent 'University' values (NaN included) within this slice.
snsf_data_2005_2018['University'].value_counts(dropna=False).head()

In [None]:
# Slice 2: projects whose start date is in [2000, 2005).
limit_date_sup = np.datetime64('2005')
limit_date_inf = np.datetime64('2000')

start_dates = snsf_data['Start Date']
starts_in_2000_2005 = (start_dates < limit_date_sup) & (start_dates >= limit_date_inf)
snsf_data_2000_2005 = snsf_data[starts_in_2000_2005]
snsf_data_2000_2005.head()
# Most frequent 'University' values (NaN included) within this slice.
snsf_data_2000_2005['University'].value_counts(dropna=False).head()

In [None]:
# Slice 3: projects whose start date is before 2000.
limit_date_sup = np.datetime64('2000')

starts_before_2000 = snsf_data['Start Date'] < limit_date_sup
snsf_data_1975_2000 = snsf_data[starts_before_2000]
snsf_data_1975_2000.head()
# Most frequent 'University' values (NaN included) within this slice.
snsf_data_1975_2000['University'].value_counts(dropna=False).head()

### Test blank attribute

University field : "This is the institution where the project will largely be carried out according to the application. Pick list. This field is only filled if the research is carried out at a Swiss institution, otherwise the field remains blank. In the case of mobility fellowships, it is generally left empty."

> Je cherche quels sont les attributs qui contiennent "mobility fellowship", car ce sont peut-être des universités suisses.

In [None]:
# Frequency of every 'Funding Instrument Hierarchy' value, NaN included.
snsf_data['Funding Instrument Hierarchy'].value_counts(dropna=False)

In [None]:
# Funding instruments whose name contains "fellowships" (case-sensitive
# substring match), with their frequencies.
snsf_data[snsf_data['Funding Instrument'].str.contains("fellowships")]['Funding Instrument'].value_counts(dropna=False)

In [None]:
# Funding instruments whose name contains "Mobility" (case-sensitive
# substring match), with their frequencies.
snsf_data[snsf_data['Funding Instrument'].str.contains("Mobility")]['Funding Instrument'].value_counts(dropna=False)

In [None]:
# Keep only the distinct instrument names containing "Mobility": the index of
# the value_counts result holds one entry per distinct name.
index_link_mobility = snsf_data[snsf_data['Funding Instrument'].str.contains("Mobility")]['Funding Instrument'].value_counts(dropna=False).index

In [None]:
# Materialise the mobility-related instrument names as a plain list; list()
# does the same element-by-element copy as the previous append loop.
index_array = list(index_link_mobility)
index_array

In [None]:
# Rows whose funding instrument is one of the mobility-related names; .copy()
# makes the result an independent frame rather than a view on snsf_data.
data_related_to_mobility_fellowship = snsf_data[snsf_data['Funding Instrument'].isin(index_array)].copy()
# How often 'University' is filled (or NaN) for those rows.
data_related_to_mobility_fellowship['University'].value_counts(dropna=False)

In [None]:
# Most frequent 'Institution' values (NaN included) among the mobility rows.
data_related_to_mobility_fellowship['Institution'].value_counts(dropna=False).head()

### Clean data

In [None]:
# Keep only the rows with a non-null 'University': according to the SNSF field
# description, that field is filled only for Swiss institutions.
snsf_data_swiss = snsf_data[snsf_data['University'].notnull()].copy()
snsf_data_swiss.head()

## Find location with university names

In [None]:
# Distinct university names, with missing values removed first.
universities_list = snsf_data['University'].dropna().unique()

In [None]:
def find_location(geolocator, universities=universities_list):
    """Geocode every university name with the given geopy geocoder.

    Each query is the university name followed by ' CH' (intended to target
    Switzerland). A progress bar is displayed and advanced once per name.

    Parameters
    ----------
    geolocator : geopy geocoder instance
        Any object exposing ``geocode(query)``.
    universities : sequence of str
        Names to look up; defaults to ``universities_list``.

    Returns
    -------
    pandas.Series
        Indexed by university name. Each value is whatever ``geocode``
        returned, or None when the request timed out or the service was
        unavailable.
    """
    # Imported locally: only the transient-failure exception types are needed.
    from geopy.exc import GeocoderTimedOut, GeocoderUnavailable

    match_location = {}
    progress_bar = FloatProgress(min=0, max=len(universities))
    display(progress_bar)

    for university in universities:
        try:
            match_location[university] = geolocator.geocode(university + ' CH')
        except (GeocoderTimedOut, GeocoderUnavailable) as error:
            # A single flaky request must not throw away the results already
            # collected; record the miss and carry on with the next name.
            print('Unable to geocode ' + university + ': ' + str(error))
            match_location[university] = None
        progress_bar.value += 1

    return pd.Series(match_location)

### Token access

In [None]:
# JSON file holding the API credentials (kept out of the notebook itself).
env_file = r'env.json'

# Read the path from env_file rather than repeating the literal, and state the
# encoding so parsing does not depend on the platform default.
with open(env_file, encoding='utf-8') as env_handle:
    env = json.load(env_handle)

### Fetch from GeoNames

In [None]:
# Geocode all universities through GeoNames, using the username from env.json.
geo_geonames = GeoNames(username=env['GeoNames-Username'])
data_location_geonames = find_location(geo_geonames)
# Frequency of each returned value, NaN/None included.
data_location_geonames.value_counts(dropna=False)

### Fetch from OpenStreetMap

In [None]:
# Geocode all universities through Nominatim (OpenStreetMap).
# NOTE(review): Nominatim() is built without arguments; newer geopy releases
# appear to require an explicit user_agent — confirm against the installed
# version.
geo_nominatim = Nominatim()
data_location_nominatim = find_location(geo_nominatim)
# Frequency of each returned value, NaN/None included.
data_location_nominatim.value_counts(dropna=False)

### Fetch from Google Maps

In [None]:
# Geocode all universities through the Google Maps API, using the key from
# env.json.
geo_google = GoogleV3(api_key=env['GoogleMaps-Key'])
data_location_google = find_location(geo_google)
# Frequency of each returned value, NaN/None included.
data_location_google.value_counts(dropna=False)

### Result of all locations (API or manually)

In [None]:
# Load the university -> canton lookup table from disk.
match_location = pd.read_csv('Data/universities_cantons.csv')
# Number of universities per canton, counting rows with no canton (NaN).
match_location['Canton'].value_counts(dropna=False)

Expliquer pourquoi on enlève NPO (NaN)

## Merge

In [None]:
# Attach the lookup-table columns to every Swiss grant. The inner join drops
# grants whose university has no row in match_location.
grant_amounts = snsf_data_swiss[['University', 'Approved Amount']]
data_swiss_universities = grant_amounts.merge(match_location, on='University', how='inner')
data_swiss_universities.head()

In [None]:
# Total approved amount per university. The amount column is selected
# explicitly so the sum never depends on how pandas treats the non-numeric
# columns of the merged frame.
amount_by_university = data_swiss_universities.groupby('University')[['Approved Amount']].sum()
amount_by_university.head()

In [None]:
# Total approved amount per canton. The amount column is selected explicitly
# so the sum never depends on how pandas treats the non-numeric columns of the
# merged frame.
amount_by_cantons = data_swiss_universities.groupby('Canton')[['Approved Amount']].sum()
amount_by_cantons.head()

In [None]:
# Add the natural logarithm of each canton's total as a separate column.
amount_by_cantons['Approved Amount Log'] = np.log(amount_by_cantons['Approved Amount'])
amount_by_cantons.head()

In [None]:
# Column-less frame indexed by every canton id found in the TopoJSON; it
# serves as the left side of the join that follows.
cantons_data = pd.DataFrame({'Canton': cantons_id}).set_index('Canton')

In [None]:
# Left-join the totals onto the full canton list so that every canton gets a
# row, put 0 in every column for cantons without any grant, and turn the
# 'Canton' index back into a regular column.
amount_by_all_cantons = (
    cantons_data
    .merge(amount_by_cantons, left_index=True, right_index=True, how='left')
    .fillna(0)
    .reset_index()
)

## Visualization

In [None]:
value_column = 'Approved Amount'

# Six evenly spaced thresholds spanning the observed per-canton totals.
scale = list(
    np.linspace(
        amount_by_all_cantons[value_column].min(),
        amount_by_all_cantons[value_column].max(),
        6
    )
)

# Choropleth of the approved amount per canton, keyed on the TopoJSON ids.
map_final = folium.Map(location=[46.8, 8.1], zoom_start=8)
map_final.choropleth(
    geo_path=topojson_cantons,
    topojson='objects.cantons',
    data=amount_by_all_cantons,
    columns=['Canton', value_column],
    key_on='feature.id',
    fill_color='PuBu',
    fill_opacity=0.8,
    line_opacity=0.2,
    threshold_scale=scale
)

# You need to have the locale 'fr_CH.utf8' installed
locale_currency = 'fr_CH.utf8'
try:
    locale.setlocale(locale.LC_MONETARY, locale_currency)
except locale.Error:
    # Only a missing/unsupported locale is tolerated here; any other error
    # should still surface.
    print('Unable to set ' + locale_currency + ' locale, the currency will not be correct')


def _format_amount(amount):
    """Return amount as currency text, falling back to plain number grouping."""
    try:
        return locale.currency(amount, grouping=True)
    except ValueError:
        # locale.currency refuses to format under the 'C' locale, which is
        # what stays active when the setlocale call above fails.
        return '{:,.2f}'.format(amount)


# Add a marker for every university located through the Google API
for university, location in data_location_google.items():
    if not location:
        continue
    # A geocoded university may have no total (e.g. dropped by the inner
    # merge with the canton table); skip it instead of raising KeyError.
    if university not in amount_by_university.index:
        continue
    amount = amount_by_university.loc[university, 'Approved Amount']
    message = university + ' (' + _format_amount(amount) + ')'
    # location[1] is the coordinate pair of the geocoding result.
    marker = folium.Marker(location[1], popup=message)
    map_final.add_children(marker)

map_final