# Homework 3: Visualization

In [None]:
import numpy as np
import pandas as pd
import folium
import math
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import json
import requests
import re
%matplotlib inline

## Import data and obtain canton information

We first find the canton ID (abbreviation) in the topo.json file provided and store it in canton_id array.

In [None]:
# Read the TopoJSON file and collect the id (canton abbreviation) of every
# geometry listed under objects -> cantons -> geometries
with open('ch-cantons.topojson.json') as data_file:
    data_json = json.load(data_file)
canton_id = [geometry['id'] for geometry in data_json["objects"]['cantons']['geometries']]

We then import grant data.

In [None]:
# Fraction of the rows of `data` that ended up in `data_sorted`
# (i.e. rows with an identified canton), rounded to 2 decimals
round(len(data_sorted)/len(data),2)

This process allows us to know the canton location of 94% of the exploitable data, which is a lot higher than the 66% obtained without the above steps.

In [None]:
# Preview the first rows of the rows with an identified canton
data_sorted.head()

## From institutions to cantons

In this section we study the Institution field more in depth to assign a canton abbreviation to each row.
For this we will build a dictionary (insti2canton) that gives the correspondence between each institution and its canton. 

The process is very similar to the one used for universities, except that we do not extract the abbreviation (as there is no '-' symbol) and we do not check whether words are in a list of words to exclude (as there are far too many different words that we would have to exclude). Instead, we check whether each word is a major Swiss city and, if it is, we search for it with geonames.

However, due to the immense diversity of institutions (13 000 unique names), we ran into geonames limitations and therefore did not apply this method to the final tables. This method could easily be applied in the future if this issue is solved.

In [None]:
# Get a list of all institutions
# Unique values of the 'Institution' column of the rows that still lack a canton.
# Going through a set means the order of the resulting list is arbitrary.
# NOTE(review): missing values, if any, would show up here as NaN — confirm they
# are handled before the names are processed as strings.
institutions = list(set(data_unsorted['Institution'].values))
#institutions = institutions[1:]

Here we use a downloaded list of Swiss cities and only keep the ones with more than 20 000 inhabitants, as they might host a university.

In [None]:
# Get a list of major swiss cities and university abbreviations
swiss_cities=pd.read_excel('swiss_cities.xlsx')
# Drop the first and third columns of the sheet (selected by position)
swiss_cities = swiss_cities.drop(swiss_cities.columns[[0,2]], axis=1)
# Keep the names of the cities whose 'Population' is above 20000
cities = list(swiss_cities[swiss_cities['Population']>20000]['City'].values)
# NOTE(review): overwrites the third entry with 'Biel' — presumably the sheet uses a
# different spelling for that city; this relies on the row order of the sheet, confirm.
cities[2] = 'Biel'
# Extra spellings / name fragments added to the list; `cities` is matched against
# single words of institution names in extractInstiCity
cities.extend(['Bienne','Gallen','Yverdon','Neuchâtel','Zürich'])
#cities.extend([HERE LIST OF UNIVERSITY CODES])

In [None]:
def extractInstiCity(instiName):
    """Return the words of an institution name that are major Swiss cities.

    The name is cut at every single space character and each resulting word
    is kept, in order of appearance, when it is listed in the module-level
    `cities` list.

    Parameters
    ----------
    instiName : str
        Institution name as found in the 'Institution' column.

    Returns
    -------
    list of str
        Matching words; empty when nothing matches or when the name
        contains no space at all (single-word names are not inspected).
    """
    # A name without any space is never inspected, even if it is itself a city
    if ' ' not in instiName:
        return []
    return [token for token in instiName.split(' ') if token in cities]

In [None]:
# Geonames search endpoint and account name used for the institution lookups
GEONAMES_URL = 'http://api.geonames.org/searchJSON'
GEONAMES_USER = 'mericervi'

insti2canton = dict() # dictionary giving the relationship between the institution and its canton

# Only the first 20 institutions are queried here
for insti in institutions[:20]:

    # Search terms, tried in order until one yields a canton:
    #   1. the whole institution name
    #   2. every word of the name that is a major Swiss city
    candidates = [insti] + extractInstiCity(insti)

    canton = []
    for term in candidates:
        # Let requests build the query string so that spaces, accents, '&', ...
        # inside the name are percent-encoded. The term is wrapped in double
        # quotes, as in the hand-built URL (%22 ... %22).
        r = requests.get(GEONAMES_URL,
                         params={'name': '"' + term + '"',
                                 'country': 'CH',
                                 'maxRows': 500,
                                 'username': GEONAMES_USER},
                         timeout=30)
        # Explicit parser, consistent with the university lookup: findCanton
        # expects the JSON payload wrapped in a <p> tag
        soup = BeautifulSoup(r.content, "lxml")
        canton = findCanton(soup)
        if canton: # stop at the first term that returns a canton
            break

    if canton == '00': # '00' is treated as "no canton"
        canton = []
    insti2canton[insti] = canton


## Build interactive maps

In this section we build the final table containing canton abbreviation and total grant amount, and then plot it in an interactive map with folium.

In [None]:
data_sorted=pd.read_pickle('data_sorted')

# Sum the approved amounts per canton in one pass
canton_totals = data_sorted.groupby('Canton')['Approved Amount'].sum()

# One row per canton of the map, in the order of canton_id. Cantons without
# any grant in data_sorted get a total of 0. 'Canton' is kept as a regular
# column because the choropleth reads it through columns=['Canton', 'Money'].
data_grouped = pd.DataFrame(
    {'Canton': canton_id,
     'Money': canton_totals.reindex(canton_id, fill_value=0).values},
    columns=['Canton', 'Money'])
data_grouped.head()

In [None]:
# Express the totals in million CHF with 2 decimals for the visualisation.
# data_toplot is the same object as data_grouped (no copy), so data_grouped is
# rescaled too and running this cell twice divides the amounts twice.
data_toplot = data_grouped
data_toplot['Money'] = (data_toplot['Money'] / 10**6).round(2)

In [None]:
#import the topojson map
# Path of the TopoJSON file (same file name the canton ids are read from)
topo_path = r'ch-cantons.topojson.json'
# Base map centred on [46.8, 8.4] with an initial zoom level of 7
swiss_map = folium.Map(location=[46.8, 8.4], zoom_start=7)

In [None]:
# Colour each canton by its total grant amount. Rows of data_toplot are bound
# to the map geometries through their id ('feature.id' <-> 'Canton' column).
# The thresholds are hand-picked values on the scale of the 'Money' column.
swiss_map.choropleth(geo_path=topo_path, 
                     data=data_toplot,
                     columns=['Canton', 'Money'],
                     threshold_scale=[0,100,1000,2000,3000,4000],
                     key_on='feature.id',
                     topojson='objects.cantons',
                     fill_color='BuPu',
                     legend_name = 'Grant money in million CHF',
                     fill_opacity=0.7,
                     line_opacity=0.2,
                     reset=True
                    )

To visualize the map open it from the html file provided in the repository.

Notes: 
    <li> the legend values are given in millions
    <li> the legend was selected manually to highlight differences between cantons (as some have way higher grant sums than others)

In [None]:
#import the grant data
# The export is a semicolon-separated CSV file
data=pd.read_csv('P3_GrantExport.csv',sep=';')

We then clean the data. We create a data frame and then drop the following data:
    <li> unnecessary columns (not relating to the university, institution or grant) 
    <li> rows where university field is empty or 'NA' (data documentation specifies that, in that case, the research has NOT been conducted in a swiss institute)
    <li> rows where we have no grant amount

In [None]:
# Keep only institution, university and approved amount. The columns are
# selected by name so the cell does not depend on the column order of the
# export and can be re-run safely.
data = data[['Institution', 'University', 'Approved Amount']]

# Rows without a university are dropped: the description of the data mentions
# that, if this field is empty, the research is not conducted in a swiss institute
has_university = data['University'].notnull()

# Rows without an amount carry this placeholder text instead of a number
has_amount = data['Approved Amount'] != 'data not included in P3'

data = data[has_university & has_amount].copy()

# Transform the approved amount to float
data['Approved Amount'] = data['Approved Amount'].astype(float)

# Also remove the rows whose university ends with 'NA'
# (the university field doesn't contain information)
is_unassigned = data['University'].str.endswith('NA', na=False)

# Keep the rows with university info and renumber them 0..n-1 without
# keeping the old index as a column
data = data[~is_unassigned].reset_index(drop=True)

In [None]:
# Preview the first rows of the cleaned table
data.head()

## Get the easily accessible cantons

First, we obtain a data frame of sorted data for which it is straightforward to obtain the canton.
This will allow us to process less data in operations that are more time-consuming. The steps involved in this part are:
    <li> for each row extract the abbreviations to the right of the '-' symbol in the university field
    <li> manually replace two abbreviations to cantons that we know (LA --> VD and HEPFR --> FR) 

In [None]:
#add a column for the cantons
# Every row starts with an empty string as canton
data["Canton"] = ""

In [None]:
# Extract the letters after the '-' of the university field into 'Canton'.
# The guard is kept as in the original notebook (the result is normally read
# back from a pickle instead); the extraction itself is now vectorised, so
# switching it to `if True:` is no longer a time-consuming step.
if False:
    # Everything after the first '-' and the character (space) that follows it.
    # Rows without any '-' get an empty string.
    data['Canton'] = (data['University'].str.split('-', n=1)
                                        .str[1]
                                        .str[1:]
                                        .fillna(''))

In [None]:
# save these data so we don't have to run the cleaning part again
# The save is commented out; `data` is replaced by the previously pickled table
#data.to_pickle('data_incomplete_cantons')
data=pd.read_pickle('data_incomplete_cantons')

In [None]:
# Manually replace a couple of abbreviations that are not canton codes but
# whose canton is known. `.loc` writes into `data` itself; the chained form
# `data.Canton[mask] = ...` may write to a temporary copy instead.
data.loc[data['Canton'] == 'LA', 'Canton'] = 'VD'
data.loc[data['Canton'] == 'HEPFR', 'Canton'] = 'FR'

We then sort our data. Data for which a real canton has been identified (it is in canton_id) is moved to data_sorted. Data for which no real canton has been identified yet is moved to data_unsorted for further processing.

This process allows us to know the canton location of 66% of the exploitable data. 

In [None]:
# remove the bad cantons from the unsorted table
# Every row of data_unsorted gets an empty string as canton again
data_unsorted["Canton"] = ""

In [None]:
# Save both tables so the sorting step does not have to be run again
data_sorted.to_pickle('data_sorted')
data_unsorted.to_pickle('data_unsorted')

## From university name to canton

In this section we study the University field more in depth to assign a canton abbreviation to each row.
For this we will build a dictionary (uni2canton) that gives the correspondence between each university and its canton. 

First, we get the list of unique universities (an array without repeated entries).

In [None]:
# Get a list of all universities
# Unique values of the 'University' column of the rows that still lack a canton.
# Going through a set means the order of the resulting list is arbitrary.
universities = list(set(data_unsorted['University'].values))
#universities = universities[1:]

Then we build some functions that will be handy to extract useful information from the university field:
    <li> extractUniName: will extract the whole name of the university
    <li> extractUniID: will extract the abbreviation of the university (located to the right of the '-')
    <li> extractUniCity: will try to find the city of the university by looking at the different words in the name
    <li> findCanton: will do a request on geonames to find the canton of the found city or university

In [None]:
def extractUniName(university):
    """Extract the name of a university from a 'Name - ABBR' table entry.

    The entry is cut at its last ' - ' separator, so hyphens inside the name
    itself (e.g. 'Walser-Stiftung') are left untouched.

    Parameters
    ----------
    university : str
        Value of the 'University' column, expected as 'Name - ABBR'.

    Returns
    -------
    str
        The part before the separator. When the entry contains no ' - '
        separator, the whole entry (stripped of surrounding whitespace) is
        returned instead of a truncated string.
    """
    name, separator, _ = university.rpartition(' - ')
    if not separator:
        # No abbreviation part: the whole entry is the name
        return str(university).strip()
    return str(name)

def extractUniID(university):
    """Extract the abbreviation of a university from a 'Name - ABBR' table entry.

    The entry is cut at its last ' - ' separator, so hyphens inside the name
    itself (e.g. 'Walser-Stiftung') do not leak into the abbreviation.

    Parameters
    ----------
    university : str
        Value of the 'University' column, expected as 'Name - ABBR'.

    Returns
    -------
    str
        The part after the separator. When the entry contains no ' - '
        separator, the whole entry (stripped of surrounding whitespace) is
        returned instead of a string with its first character chopped off.
    """
    _, separator, code = university.rpartition(' - ')
    if not separator:
        # No separator: fall back to the whole entry as identifier
        return str(university).strip()
    return str(code)

# List of terms that are generic and shouldn't be analyzed
# (matching is exact and case-sensitive)
excluded = ['Université','University','Hochschule','Universität','école','Haute','Suisse','au','St.','du','Switzerland',
            'Fachhochschule','Università','Institute','de','von','of','Swiss','für','di','et','for','und','sur','della',
            'Schweiz.','Schweizer','Schweiz','Department','Dept.','School','and','Laboratory','Departement','pédagogique','canton']

def extractUniCity(university):
    """Return the words of a university name that may designate its city.

    The name is the part of the entry before its last ' - ' separator (the
    whole entry when there is no separator). It is cut at every space and
    each word that is not in the module-level `excluded` list is kept, in
    order of appearance.

    Parameters
    ----------
    university : str
        Value of the 'University' column, expected as 'Name - ABBR'.

    Returns
    -------
    list of str
        Candidate words; empty when the name consists of a single word
        (single-word names are not inspected) or when every word is excluded.
    """
    uniName, separator, _ = university.rpartition(' - ')
    if not separator:
        # No abbreviation part: inspect the whole entry
        uniName = university
    if ' ' not in uniName:
        return []
    # Empty tokens (from consecutive spaces) are skipped: they cannot be a city
    return [token for token in uniName.split(' ') if token and token not in excluded]
    
def findCanton(soup):
    """Extract the canton code from a parsed geonames search response.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed response whose first <p> tag holds the JSON text returned by
        the geonames search.

    Returns
    -------
    str or list
        The 'adminCode1' field of the first search result (the canton ID),
        or an empty list when no canton can be read: no <p> tag, unreadable
        JSON, no 'geonames' key (e.g. an error/status payload), no search
        result, or a first result without 'adminCode1'.
    """
    import warnings  # local import: only needed to report unusable responses

    paragraphs = soup.find_all('p')
    if not paragraphs:
        return []

    try:
        geonamesDict = json.loads(str(paragraphs[0].text))
    except ValueError:
        warnings.warn('geonames response is not valid JSON; treated as "no canton found"')
        return []

    results = geonamesDict.get('geonames')
    if not results: # missing key or no search results
        if 'status' in geonamesDict:
            # Make service errors visible instead of silently mapping
            # everything to "not found"
            warnings.warn('geonames returned a status instead of results: %s'
                          % (geonamesDict['status'],))
        return []

    # The canton ID is encoded in the adminCode1 field of the first result
    return results[0].get('adminCode1', [])

The algorithm consists of a hierarchical search: if the search on geonames with a given attribute doesn't give any result, another, deeper search is performed. This improves the performance of the algorithm. The order of the different attribute searches is:
    <li> search by university abbreviation
    <li> search by university (whole) name
    <li> search using the words within the university names. Here some generic words such as "University" or linkwords will be excluded. 

In [None]:
# Geonames search endpoint and account name used for the university lookups
GEONAMES_URL = 'http://api.geonames.org/searchJSON'
GEONAMES_USER = 'mariacervera'

uni2canton = dict() # dictionary giving the relationship between the university and its canton

for uni in universities: # try for each university

    # Search terms, tried in order until one yields a canton:
    #   1. the university abbreviation (e.g. EPFL)
    #   2. the university (whole) name
    #   3. the individual, non-generic words of the university name
    candidates = [extractUniID(uni), extractUniName(uni)] + extractUniCity(uni)

    canton = []
    for term in candidates:
        if not term: # nothing to search for
            continue
        # Let requests build the query string so that spaces, accents, '&', ...
        # inside the term are percent-encoded. The term is wrapped in double
        # quotes, as in the hand-built URL (%22 ... %22).
        r = requests.get(GEONAMES_URL,
                         params={'name': '"' + term + '"',
                                 'country': 'CH',
                                 'maxRows': 500,
                                 'username': GEONAMES_USER},
                         timeout=30)
        soup = BeautifulSoup(r.content,"lxml")
        canton = findCanton(soup)
        if canton: # if we found the canton already, don't look at the next terms
            break

    if canton =='00': # code given for the whole of switzerland in geonames
        canton = []
    uni2canton[uni] = canton

In [None]:
# Remove the NPO entry from the dictionary: it is automatically assigned to Bern
# and we want to treat it separately. The None default avoids a KeyError when
# the entry is absent.
uni2canton.pop('NPO (Biblioth., Museen, Verwalt.) - NPO', None)

Once we have our dictionary uni2canton, we will check the name of the university of our unsorted data and write its found canton in the dataframe.

In [None]:
# Write the cantons of the universities we just identified.
# The whole column is rebuilt in one pass and assigned at once, instead of the
# chained per-row assignment `data_unsorted.Canton[i] = ...`, which may write to
# a temporary copy.
uni_names = data_unsorted['University'].astype(str)
data_unsorted['Canton'] = [
    # Universities present in the dictionary take its value; an empty lookup
    # result ([]) is stored as '' so that the column only holds strings.
    # Universities absent from the dictionary keep their current value.
    (uni2canton[name] or '') if name in uni2canton else current
    for name, current in zip(uni_names, data_unsorted['Canton'])
]


Finally, we will have in our dataframe rows that have empty or non-valid cantons (such as [] or ''), so again, we will put these in another frame called data_unsorted2 for further analysis.

In [None]:
# Separate the good and bad data.
# The guard is kept as in the original notebook (the results are normally read
# back from pickles, and running the cell twice would append the same rows to
# data_sorted twice); the split itself is now done with one boolean mask
# instead of a row-by-row concat, so it is no longer a time-consuming step.
if False:
    # A canton is valid when its string form is one of the ids of the map
    has_valid_canton = data_unsorted['Canton'].astype(str).isin(canton_id)
    data_sorted = pd.concat((data_sorted, data_unsorted[has_valid_canton]))
    data_unsorted2 = data_unsorted[~has_valid_canton].copy()

In [None]:
# Save both tables so the separation step does not have to be run again
data_unsorted2.to_pickle('data_unsorted2')
data_sorted.to_pickle('data_sorted')

In [None]:
# Find the data that already have a good canton and keep it aside.
# One boolean mask replaces the row-by-row concat, whose cost grows
# quadratically with the number of rows.
has_known_canton = data['Canton'].isin(canton_id)
# Copies, so that later column assignments on these tables do not act on
# views of `data`. The original row labels are kept.
data_sorted = data[has_known_canton].copy()
data_unsorted = data[~has_known_canton].copy()

In [None]:
# Reindex everything: renumber the rows 0..n-1 and discard the old labels.
# drop=True replaces the former two-step "reset_index('level_0') then drop the
# 'index' column", which depends on the index having no level called 'level_0'
# being silently accepted.
data_unsorted = data_unsorted.reset_index(drop=True)

data_sorted = data_sorted.reset_index(drop=True)

In [None]:
# compute how much data we already have
# Fraction of the rows of `data` that are in `data_sorted`
len(data_sorted)/len(data)

In [None]:
# Display the map as the output of the cell
swiss_map

In [None]:
# Write the map to map.html
# NOTE(review): create_map looks like the legacy folium export method — newer
# folium releases presumably need Map.save instead; confirm against the installed version.
swiss_map.create_map(path='map.html')

## Personal bonus: Add markers for each canton

In this extra part, we decided to add interactive markers for each canton to access its code and total grant amount, in order to access information more easily.