# Exercise 1

In [1]:
import numpy as np
import pandas as pd
import requests
import re
from enum import Enum
from bs4 import BeautifulSoup

# Exercise 1

The data can be fetched from IS-Academia usign a form. To automate this process we looked at the form  source code to find out what query is sent to the server to obtain the data. We found that the query is sent to http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter with the values of the form fields as request parameters (appended to the URL after a '?').

The first step is to find out the values correspondig to the parameters we're interested in. To do so we parse the form HTML with BeautifulSoup.

In [5]:
r = requests.get('http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?ww_b_list=1&ww_i_reportmodel=133685247&ww_c_langue=&ww_i_reportModelXsl=133685270&zz_x_UNITE_ACAD=Informatique&ww_x_UNITE_ACAD=249847&zz_x_PERIODE_ACAD=2016-2017&ww_x_PERIODE_ACAD=355925344&zz_x_PERIODE_PEDAGO=Bachelor+semestre+1&ww_x_PERIODE_PEDAGO=249108&zz_x_HIVERETE=&ww_x_HIVERETE=null&dummy=ok')
soup = BeautifulSoup(r.text, 'html.parser')

In [6]:
def printFieldValues(field_name):
    for opt in soup.find('select', {'name': field_name}).find_all('option'):
        print(opt.string, "=", opt['value'])

The values of the different fields:

In [7]:
# Academic unit
print("=== Academic unit ===")
printFieldValues('ww_x_UNITE_ACAD')

# Academic period
print("\n\n=== Academic period ===")
printFieldValues('ww_x_PERIODE_ACAD')

# Pedagogic period
print("\n\n=== Pedagogic period ===")
printFieldValues('ww_x_PERIODE_PEDAGO')

=== Academic unit ===
None = null
Architecture = 942293
Chimie et génie chimique = 246696
Cours de mathématiques spéciales = 943282
EME (EPFL Middle East) = 637841336
Génie civil = 942623
Génie mécanique = 944263
Génie électrique et électronique  = 943936
Humanités digitales = 2054839157
Informatique = 249847
Ingénierie financière = 120623110
Management de la technologie = 946882
Mathématiques = 944590
Microtechnique = 945244
Physique = 945571
Science et génie des matériaux = 944917
Sciences et ingénierie de l'environnement = 942953
Sciences et technologies du vivant = 945901
Section FCUE = 1574548993
Systèmes de communication = 946228


=== Academic period ===
None = null
2016-2017 = 355925344
2015-2016 = 213638028
2014-2015 = 213637922
2013-2014 = 213637754
2012-2013 = 123456101
2011-2012 = 123455150
2010-2011 = 39486325
2009-2010 = 978195
2008-2009 = 978187
2007-2008 = 978181


=== Pedagogic period ===
None = null
Bachelor semestre 1 = 249108
Bachelor semestre 2 = 249114
Bachelor se

To make the code more readable we define enums containing these values:

In [8]:
# Define ISA codes

class AcademicUnit(Enum):
    Informatique = 249847
    
class AcademicPeriod(Enum):
    y20162017  =  355925344
    y20152016  =  213638028
    y20142015  =  213637922
    y20132014  =  213637754
    y20122013  =  123456101
    y20112012  =  123455150
    y20102011  =  39486325
    y20092010  =  978195
    y20082009  =  978187
    y20072008  =  978181
    
class PedagogicPeriod(Enum):
    Bachelor1  =  249108
    Bachelor2  =  249114
    Bachelor3  =  942155
    Bachelor4  =  942163
    Bachelor5  =  942120
    Bachelor6  =  942175

We now create a function to get the data matching a filter from ISA. To do so we send a request with the given parameters to obtain the filter and the mysterious "GPS" value and then query the resulting table.

In [10]:
def getDataTable(academicUnit, academicPeriod, pedagogicPeriod):
    """Get HTML table from ISA given filter values."""
    
    # URLs of ISA
    url1 = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter"
    url2 = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html"

    # Send a first request to obtain the "GPS" value
    payload1 = {
        'ww_b_list':'1',
        'ww_i_reportmodel':'133685247',
        'ww_c_langue':'',
        'ww_i_reportModelXsl':'133685270',
        'zz_x_UNITE_ACAD':'',
        'ww_x_UNITE_ACAD': academicUnit.value,
        'zz_x_PERIODE_ACAD':'',
        'ww_x_PERIODE_ACAD': academicPeriod.value,
        'zz_x_PERIODE_PEDAGO':'',
        'ww_x_PERIODE_PEDAGO': pedagogicPeriod.value,
        'zz_x_HIVERETE':'',
        'ww_x_HIVERETE': 'null',
        'dummy':'ok'
    }

    # Get the GPS value in the response (it is in the second link with "ww_x_GPS" class)
    r1 = requests.get(url1, payload1)
    soup = BeautifulSoup(r1.text, 'html.parser')
    gps = re.search("\d+",soup.find_all('a', {'class': 'ww_x_GPS'})[1]['onclick']).group(0)

    # Send the request to obtain the table
    payload2 = {
        'ww_x_GPS':gps,
        'ww_i_reportModel':'133685247',
        'ww_i_reportModelXsl':'133685270',
        'ww_x_UNITE_ACAD': academicUnit.value,
        'ww_x_PERIODE_ACAD': academicPeriod.value,
        'ww_x_PERIODE_PEDAGO': pedagogicPeriod.value,
        'ww_x_HIVERETE': 'null'
    }

    # Return the table
    r2 = requests.get(url2,payload2)
    soup = BeautifulSoup(r2.text, 'html.parser')
    
    return soup.find('table')


And then we create a function to build a DataFrame from the HTML table:

In [11]:
def parseTable(table):
    """Parses an HTML table from ISA and returns a dataframe with it's relevant content."""
    rows = map(lambda x: x.find_all('td'), table.find_all('tr')[2:])
    data = []
    
    for row in rows:
        data.append({'Civility': row[0].string, 'Name': row[1].string, 'Sciper': row[10].string})
        
    return pd.DataFrame(data)
    

Now we can fetch bachelor student's data from ISA:

In [14]:
compiled_data = []

for year in AcademicPeriod:
    for semester in PedagogicPeriod:
        data = parseTable(getDataTable(AcademicUnit.Informatique, year, semester))
        data['Year'] = year.name
        data['Semester'] = semester.name
        compiled_data.append(data)
        
bachelor = pd.concat(compiled_data)
bachelor

Unnamed: 0,Civility,Name,Sciper,Year,Semester
0,Monsieur,Abbey Alexandre,235688,y20162017,Bachelor1
1,Monsieur,Ahn Seongho,274015,y20162017,Bachelor1
2,Madame,Alemanno Sara,268410,y20162017,Bachelor1
3,Monsieur,Althaus Luca,271464,y20162017,Bachelor1
4,Monsieur,Assi Karim,274518,y20162017,Bachelor1
5,Monsieur,Badoux Luc-Antoine,249613,y20162017,Bachelor1
6,Monsieur,Bagnoud Jérôme,262214,y20162017,Bachelor1
7,Monsieur,Barbaras Yann Quentin,262239,y20162017,Bachelor1
8,Monsieur,Barras Luca,257916,y20162017,Bachelor1
9,Madame,Barsi Clémence Marie Sabine,271508,y20162017,Bachelor1


In [105]:
# Find out which students are registered in semester 1 and 6
bachelor1 = bachelor[bachelor['Semester'] == 'Bachelor1']
bachelor6 = bachelor[bachelor['Semester'] == 'Bachelor6']
bachelor_complete = pd.merge(bachelor1, bachelor6, how='inner', on=['Name'])['Name']

# Keep those from the whole list
bachelor_complete = bachelor[bachelor['Name'].isin(bachelor_complete)]

# Let's count how many semesters they took for their bachelor
bc = bachelor_complete[['Name']]
bc['Month count'] = 6
bc = bc.groupby('Name').sum()

# Give them thier civility back
civilities = bachelor[['Name', 'Civility']].drop_duplicates()
res = civilities.join(bc, how='inner', on='Name')

# Compute the average bachelor duration by gender
res.groupby('Civility').mean()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0_level_0,Month count
Civility,Unnamed: 1_level_1
Madame,40.758621
Monsieur,42.63587


# Exercise 2