# Homework 3 - Interactive Viz

In [None]:
import folium
import requests as rq
from bs4 import BeautifulSoup as bfs
import numpy as np
import pandas as pd
import collections
import os
%matplotlib inline
import matplotlib.pyplot as plt
from dateutil import relativedelta
import scipy.stats as stats
import math
import json
from pprint import pprint
from ipywidgets import FloatProgress
from IPython.display import display
from geopy.geocoders import GeoNames, Nominatim, GoogleV3
import locale

# Path of the TopoJSON file that describes the Swiss cantons.
topojson_cantons = 'ch-cantons.topojson.json'

## Recommendations (Test Map)

"I also recommend you to use an intermediate viz step for debugging purposes, showing all the universties as markers in your map"

First, let's build an intermediate visualization:
    - The approved amount is randomly generated.
    - We get the identifiers of the cantons from the topojson_cantons file.

In [None]:
# Parse the TopoJSON description of the cantons.
with open(topojson_cantons) as topo_file:
    ch_cantons_json = json.load(topo_file)

# Every geometry of the 'cantons' object stores the canton identifier in its 'id' field.
canton_geometries = ch_cantons_json['objects']['cantons']['geometries']
cantons_id = [geometry['id'] for geometry in canton_geometries]

# Throw-away test data: one random integer in [0, 100) per canton.
random_values = np.random.randint(0, 100, len(cantons_id))
data_test = pd.DataFrame({'canton': cantons_id, 'value': random_values})
data_test.head()

In [None]:
# Draw the random test values as a choropleth over the canton shapes.
test_values = data_test['value']
# Six evenly spaced thresholds between the smallest and the largest test value.
test_scale = list(np.linspace(test_values.min(), test_values.max(), 6))

map_test = folium.Map(location=[46.8, 8.1], zoom_start=8)
test_choropleth_options = dict(
    geo_path=topojson_cantons,
    topojson='objects.cantons',
    data=data_test,
    columns=['canton', 'value'],
    key_on='feature.id',
    fill_color='PuBuGn',
    fill_opacity=0.5,
    line_opacity=0.7,
    threshold_scale=test_scale,
    reset=True,
)
map_test.choropleth(**test_choropleth_options)
map_test

## SNSF data

### Grant CSV file

After downloading the P3_GrantExport file, we load it and parse the date columns so that dates are easier to manipulate.

In [None]:
# Load the semicolon-separated grant export, parsing both date columns as datetimes.
date_columns = ['Start Date', 'End Date']
snsf_data = pd.read_csv(
    'Data/P3_GrantExport.csv',
    sep=';',
    parse_dates=date_columns,
)
snsf_data.head()

In [None]:
# Show the dtype pandas inferred for every column of the grant export.
snsf_data.dtypes

### Data processing

Before using the 'Project Number' column as the index of the dataframe, we need to check that its values are unique.

In [None]:
# Rename the first column, and only that one, to 'Project Number'.
# An exact-label rename is used instead of a string replacement applied to
# every header: no other column name can be modified by accident, and the
# original header text is never interpreted as a search pattern.
first_column = snsf_data.columns[0]
snsf_data.rename(columns={first_column: 'Project Number'}, inplace=True)
# The column can only become the index if every project number is distinct.
snsf_data['Project Number'].is_unique

In [None]:
# Use the project number as the row index of the grant table.
snsf_data = snsf_data.set_index('Project Number')
snsf_data.head()

As the dtypes output shows, the values in 'Approved Amount' are objects.
We need to cast these values to numeric in order to compute sums easily. <br/>


Here we can notice that 10910 values cannot be cast to numeric, for two reasons found in the documentation:

    - "This amount is not indicated in the case of mobility fellowships since it depends on administrative factors,
    typically the destination, cost of living, family allowances (if applicable) and exchange rate differences."
    
    - "Also in the case of NCCRs this amount is not available in P3"
     
 
>*After looking into the NCCR funding we can conclude: <br/>
From http://www.snf.ch/SiteCollectionDocuments/nfs/nccr_guide_2016.pdf we can see that the cantons which receive
NCCR financial support are also the cantons which receive the most money according to the P3 file. The result should not change
much if we take the NCCR money into account.*

In [None]:
# Convert the amounts to numbers; values that cannot be parsed become NaN.
approved_amount = pd.to_numeric(snsf_data['Approved Amount'], errors='coerce')
snsf_data['Approved Amount'] = approved_amount
# Most frequent amounts, with NaN counted as a value of its own.
approved_amount.value_counts(dropna=False).head()

*"This is the institution where the project will largely be carried out according to the application. Pick list. This field is only filled if the research is carried out at a Swiss institution, otherwise the field remains blank. In the case of mobility fellowships, it is generally left empty." <br/> *

> As our study focuses on Swiss data, we decided to remove all the rows where this field is null.

In [None]:
# We keep only the university not null, because those university are swiss.
snsf_data_swiss = snsf_data[snsf_data['University'].notnull()].copy()
snsf_data_swiss.head()

In [None]:
# Sanity check: a NaN entry would appear in the counts if a missing university remained.
university_counts = snsf_data_swiss['University'].value_counts(dropna=False)
university_counts.head()

Let's see whether mobility fellowships are really significant in the data set.

In [None]:
# Number of grants per funding instrument whose name contains "fellowships".
funding_instrument = snsf_data_swiss['Funding Instrument']
is_fellowship = funding_instrument.str.contains("fellowships")
snsf_data_swiss[is_fellowship]['Funding Instrument'].value_counts(dropna=False)

In [None]:
# Number of grants per funding instrument whose name contains "Mobility".
funding_instrument = snsf_data_swiss['Funding Instrument']
is_mobility = funding_instrument.str.contains("Mobility")
snsf_data_swiss[is_mobility]['Funding Instrument'].value_counts(dropna=False)

## Find location with university names

In [None]:
# We get all the swiss university.
universities_list = snsf_data['University'].dropna().unique()
universities_list

In [None]:
def find_location(geolocator, universities=universities_list):
    """Geocode every university name with the given geopy geocoder.

    The suffix ' CH' is appended to each name before it is sent to the
    geocoder (presumably to steer the search towards Switzerland).

    Args:
        geolocator: geocoder instance exposing a ``geocode(query)`` method,
            such as GeoNames, Nominatim or GoogleV3.
        universities: iterable of university names to look up. Defaults to
            ``universities_list`` as it was when this function was defined.

    Returns:
        A pandas Series indexed by university name. Each value is whatever
        ``geolocator.geocode`` returned, or None when the lookup raised a
        geopy error.
    """
    # Local import: the notebook only imports the geocoder classes from geopy.
    from geopy.exc import GeopyError

    universities = list(universities)
    match_location = {}
    progress_bar = FloatProgress(min=0, max=len(universities))
    display(progress_bar)

    for university in universities:
        try:
            match_location[university] = geolocator.geocode(university + ' CH')
        except GeopyError as error:
            # A timeout or quota error on one name must not throw away the
            # results already fetched for the other universities.
            print('Unable to locate ' + university + ': ' + str(error))
            match_location[university] = None
        progress_bar.value += 1

    return pd.Series(match_location)

### Token access

In [None]:
# JSON file holding the credentials used by the geocoding services.
env_file = r'env.json'

# Open the file through env_file so that the path is defined in a single place.
with open(env_file) as credentials_file:
    env = json.load(credentials_file)

### Fetch from GeoNames

In [None]:
# Geocode the universities with GeoNames, using the user name stored in the credentials.
geonames_username = env['GeoNames-Username']
geo_geonames = GeoNames(username=geonames_username)
data_location_geonames = find_location(geo_geonames)
# Tally the results, keeping missing ones in the count.
data_location_geonames.value_counts(dropna=False)

### Fetch from OpenStreetMap

In [None]:
# Geocode the universities with OpenStreetMap's Nominatim service.
# An explicit user agent is passed: the Nominatim usage policy asks clients to
# identify themselves, and recent geopy releases refuse the default user agent.
geo_nominatim = Nominatim(user_agent='ada-homework3-interactive-viz')
data_location_nominatim = find_location(geo_nominatim)
# Tally the results, keeping missing ones in the count.
data_location_nominatim.value_counts(dropna=False)

### Fetch from  Google Maps

In [None]:
# Geocode the universities with Google Maps, using the API key stored in the credentials.
google_api_key = env['GoogleMaps-Key']
geo_google = GoogleV3(api_key=google_api_key)
data_location_google = find_location(geo_google)
# Tally the results, keeping missing ones in the count.
data_location_google.value_counts(dropna=False)

We noticed that, even when using three different APIs, the results are insufficient.
<br /> We decided to complete the missing data manually.

### Result of all locations (API or manually)

In [None]:
# Load the university/canton table (found through the APIs or completed manually).
match_location = pd.read_csv('Data/universities_cantons.csv')
# Number of universities per canton, missing cantons included.
canton_counts = match_location['Canton'].value_counts(dropna=False)
canton_counts

In the dataset we found a 'university' called NPO; after some research we found out that this money is distributed all around Switzerland. <br/>
For this reason we decided to remove the approved amount of NPO.

## Merge the different data sets

For each university we compute the sum of 'Approved amount'.

In [None]:
# Attach the canton information to every grant; the inner join keeps only
# the universities present in both tables.
grant_amounts = snsf_data_swiss[['University', 'Approved Amount']]
data_swiss_universities = grant_amounts.merge(
    match_location,
    on=['University'],
    how='inner',
)
data_swiss_universities.head()

In [None]:
# Total approved amount per university.
# The amount column is selected explicitly so that the text column 'Canton'
# is never passed to sum(): recent pandas versions no longer drop
# non-numeric columns silently when aggregating.
amount_by_university = (
    data_swiss_universities
    .groupby('University')[['Approved Amount']]
    .sum()
)
amount_by_university.head()

For each canton we compute the sum of 'Approved Amount'.

In [None]:
# Total approved amount per canton.
# The amount column is selected explicitly so that the text column
# 'University' is never passed to sum(): recent pandas versions no longer
# drop non-numeric columns silently when aggregating.
amount_by_cantons = (
    data_swiss_universities
    .groupby('Canton')[['Approved Amount']]
    .sum()
)
amount_by_cantons.head()

In [None]:
# Summary statistics (count, mean, min, max, ...) of the per-canton totals.
amount_by_cantons.describe()

As we can see, the difference between the max and the min is huge. <br/> The visualization would not be very representative (with few color variations) if we used these raw values. <br/>


>For this reason we decided to apply a logarithm in order to reduce the gap between the amounts of money given to each canton.

In [None]:
# Express the totals in units of 10 million, then take the natural logarithm
# to compress the gap between the cantons.
scaled_amount = amount_by_cantons['Approved Amount'] / 10 ** 7
amount_by_cantons['Approved Amount Log'] = np.log(scaled_amount)
amount_by_cantons.head()

In [None]:
# Column-less frame indexed by every canton identifier of the map; it is the
# left side of the join that brings back the cantons without any grant.
cantons_data = pd.DataFrame({'Canton': cantons_id}).set_index('Canton')

We need a value for every canton, but as we can see some cantons do not appear in our data.
> We need to add them with an Approved Amount equal to 0.

In [None]:
# Left join on the canton index: cantons without any grant come back as NaN.
amount_by_all_cantons = pd.merge(
    cantons_data,
    amount_by_cantons,
    left_index=True,
    right_index=True,
    how='left',
)

# On the log scale, 0 stands for a total of 10 million, so filling the log
# column with 0 would rank the cantons without data above cantons with small
# real amounts. Give them the lowest observed log value instead, so that they
# land in the lowest colour class of the map.
log_column = 'Approved Amount Log'
if log_column in amount_by_all_cantons.columns:
    log_values = amount_by_all_cantons[log_column]
    finite_log_values = log_values[np.isfinite(log_values)]
    if not finite_log_values.empty:
        amount_by_all_cantons[log_column] = log_values.fillna(finite_log_values.min())

# Every remaining gap (the raw approved amount in particular) becomes 0.
amount_by_all_cantons.fillna(0, inplace=True)
amount_by_all_cantons.reset_index(inplace=True)

## Visualization

In [None]:
value_column = 'Approved Amount Log'

# Six evenly spaced thresholds covering the range of the plotted column.
plotted_values = amount_by_all_cantons[value_column]
scale = list(np.linspace(plotted_values.min(), plotted_values.max(), 6))

# Build the final map: one colour class per canton, taken from the plotted column.
map_final = folium.Map(location=[46.8, 8.1], zoom_start=8)
final_choropleth_options = dict(
    geo_path=topojson_cantons,
    topojson='objects.cantons',
    data=amount_by_all_cantons,
    columns=['Canton', value_column],
    key_on='feature.id',
    fill_color='PuBu',
    fill_opacity=0.8,
    line_opacity=0.2,
    threshold_scale=scale,
)
map_final.choropleth(**final_choropleth_options)


# The locale 'fr_CH.utf8' must be installed to get Swiss currency formatting.
locale_currency = 'fr_CH.utf8'
try:
    locale.setlocale(locale.LC_MONETARY, locale_currency)
except locale.Error:
    # Only a locale failure is expected here; any other error should surface.
    print('Unable to set ' + locale_currency + ' locale, the currency will not be correct')


def _format_amount(amount):
    """Return amount as a currency string, with a locale-independent fallback.

    locale.currency raises ValueError when the active monetary locale has no
    currency format (the default 'C' locale), which is the situation left
    behind when the locale above could not be set.
    """
    try:
        return locale.currency(amount, grouping=True)
    except ValueError:
        return '{:,.2f} CHF'.format(amount)


# Add a marker for every university located through the Google API.
for university, location in data_location_google.items():
    # Skip the universities the geocoder did not find, and those without an
    # aggregated amount (not present in the per-university totals).
    if not location or university not in amount_by_university.index:
        continue
    amount = amount_by_university.loc[university]['Approved Amount']
    message = university + ' (' + _format_amount(amount) + ')'
    # location[1] is presumably the (latitude, longitude) pair of the geopy result.
    marker = folium.Marker(location[1], popup=message)
    map_final.add_children(marker)
map_final
