# Homework 2 - IS-Academia

In [None]:
import requests as rq
from bs4 import BeautifulSoup as bfs
import numpy as np
import pandas as pd
import collections
import os
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')

<b>I) Obtain all the data for the Bachelor students, starting from 2007. Keep only the students for which you have an entry for both Bachelor semestre 1 and Bachelor semestre 6. Compute how many months it took each student to go from the first to the sixth semester. Partition the data between male and female students, and compute the average -- is the difference in average statistically significant?</b>

## Get params

To retrieve all the student data, we first need to know every parameter that the REST API accepts for querying the database. To do so, we start by requesting the page section that displays all the `select` lists. Then, for each option, we extract the human-readable name together with its corresponding value.

In [None]:
# Download the report filter form and parse its HTML.
filters_page = rq.get('http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?ww_i_reportModel=133685247')
filters = bfs(filters_page.text, 'html.parser')
selects = filters.findAll('select')

# Maps the name of each <select> to a {displayed option text: option value} mapping.
available_params = collections.defaultdict(dict)

for select in selects:
    options = collections.defaultdict(list)
    # Options whose value is the literal string 'null' are left out.
    options.update(
        (option.text, option.attrs['value'])
        for option in select.findAll('option')
        if option.attrs['value'] != 'null'
    )
    available_params[select.attrs['name']] = options

available_params

### Making params names readable

Now we have all the available parameters, but they are not easy to read. To improve this, we chose to look directly at the source code of the HTML page and map the names by hand, because there are very few `select` boxes (so it is faster than extracting this information directly in Python).

Every constant represents a parameter name. In addition, we define a dictionary of default parameters needed in all the requests we will make.

In [None]:
# Names of the request parameters, one per <select> of the filter form.
SECTION_PARAM = 'ww_x_UNITE_ACAD'  # e.g. 'Informatique' (Computer Science)
YEAR_PARAM = 'ww_x_PERIODE_ACAD'  # e.g. '2016-2017'
SEMESTER_PARAM = 'ww_x_PERIODE_PEDAGO'  # e.g. 'Bachelor semestre 1' (Bachelor semester 1)
TYPE_PARAM = 'ww_x_HIVERETE'  # e.g. 'Semestre d'automne' (Fall semester)

# Parameters sent unchanged with every request.
DEFAULT_PARAMS = {
    'ww_x_GPS': '-1',
    'ww_i_reportModel': '133685247',
    'ww_i_reportModelXsl': '133685270',
}

# {displayed name: code} mappings for each of the four filters.
section_codes, year_codes, semester_codes, type_codes = (
    available_params[param_name]
    for param_name in (SECTION_PARAM, YEAR_PARAM, SEMESTER_PARAM, TYPE_PARAM)
)

## Get data

First, we write a function that returns a single dataframe built from the HTML table extracted from a requested page.

We get the column names from the table header. In our case, every table has two header rows: the first one is the title of the table and the second one holds the column names.

Then, we convert each row into a single series (the list of the column values in this row).
Finally, we combine the column names and the series into a new pandas DataFrame.

In [None]:
def html_page_to_dataframe(page):
    """Convert the first HTML table of a fetched page into a DataFrame.

    The table is expected to start with two header rows: a title row followed
    by the row holding the column names. Every cell is kept as text.

    Args:
        page: response-like object whose ``text`` attribute holds the HTML.

    Returns:
        A pandas DataFrame with one row per data row of the table. When the
        table has no data rows, an empty DataFrame carrying the column names
        is returned.

    Raises:
        ValueError: if the page contains no ``<table>`` element, or if the
            number of cells per row does not match the number of column names.
    """
    soup = bfs(page.text, 'html.parser')
    table = soup.find('table')
    if table is None:
        # Fail with the exception type the fetching loop already handles,
        # instead of an AttributeError on None further down.
        raise ValueError('No <table> element found in the page')

    # Get columns names, skip the first header cell that shows the section and year
    columns = [column.text for column in table.find_all('th')[1:]]

    # One list of cell texts per row. The first two rows are the header, and
    # the last cell of each row has no matching header (badly structured table).
    series = [
        [cell.text for cell in row.find_all('td')[:-1]]
        for row in table.find_all('tr')[2:]
    ]

    if not series:
        # A header without any data row is "no data", not a malformed page.
        return pd.DataFrame(columns=columns)

    # Create a dataframe from the rows, then name its columns
    df = pd.DataFrame(series)
    df.columns = columns

    return df

Now, we are able to request an IS-Academia page containing the data table of the students in a specific section, year and semester.
We store all the data locally, to avoid repeating the requests and overloading the IS-Academia server. To do so, we use pickle to serialize all the data:

In [None]:
DATA_FOLDER = 'Data/'
DATA_URL = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html'

# Create the cache directory if it does not exist yet. Using DATA_FOLDER here
# (instead of a second hard-coded name) keeps the check and the creation in sync.
os.makedirs(DATA_FOLDER, exist_ok=True)

# We focus only on the data of students in bachelor and master
bachelor_master_semester_codes = {
    semester_name: semester_code
    for semester_name, semester_code in semester_codes.items()
    if semester_name.startswith(('Bachelor semestre', 'Master semestre'))
}

print('Start fetching data from ' + DATA_URL)

for year_name, year_code in year_codes.items():

    for semester_name, semester_code in bachelor_master_semester_codes.items():

        print('\tFetch data for ' + year_name + ' ' + semester_name + '\t: ', end="", flush=True)

        file_path = DATA_FOLDER + year_name + ' ' + semester_name

        # A page that is already cached is never requested again.
        if os.path.exists(file_path):
            print('Already stored in ' + file_path)
            continue

        params = {
            SECTION_PARAM : section_codes['Informatique'],
            YEAR_PARAM : year_code,
            SEMESTER_PARAM : semester_code,
            TYPE_PARAM : 'null',
            **DEFAULT_PARAMS
        }

        # We get a dataframe from a table in the html. Only this call is
        # guarded: a ValueError here means the page could not be converted.
        try:
            data = html_page_to_dataframe(rq.get(DATA_URL, params))
        except ValueError:
            print('Unable to get data from ' + DATA_URL + ' with following params :')
            for key, value in params.items():
                print('\t\t' + key + ' : ' + value)
            # Skip the remaining semesters of this academic year.
            break

        if data.empty:
            print('No data')
        else:
            # We store data in binary file with pickle in the Data folder
            data.to_pickle(file_path)
            print('Stored in ' + file_path)

print('Fetching data done, see ' + DATA_FOLDER + ' folder')

> Problem detected with "2008-2009 Bachelor semestre 3" and "2007-2008 Bachelor semestre 3": multiple tables were found (two entries instead of one).

### Analyse bachelor data

We can notice that the data related to students in Bachelor semester 6b is empty in the computer science section.

Assumptions:
 - 1) A bachelor student starts at semester 1 and finishes at semester 6
 - 2) A semester lasts 6 months
 - 3) Odd semesters are in the fall period and even semesters are in the spring period

In [None]:
# Only the first and the sixth Bachelor semesters are needed for this question.
bachelor_semester_codes = {
    semester_name: semester_code
    for semester_name, semester_code in semester_codes.items()
    if semester_name.startswith('Bachelor semestre') and semester_name.endswith(('6', '1'))
}

# Maps a semester name to a single dataframe holding every stored year of that semester
bachelor_dataframes = collections.defaultdict(pd.DataFrame)

for semester_name in bachelor_semester_codes:

    print('Compute for ' + semester_name)

    # An academic year is named '<start>-<end>': semester 1 is dated with the
    # start year, semester 6 with the end year.
    year_index = 1 if semester_name.endswith('6') else 0

    # Dataframes of this semester, one per academic year found on disk.
    yearly_frames = []

    for year_name in year_codes:

        file_path = DATA_FOLDER + year_name + ' ' + semester_name

        if not os.path.exists(file_path):
            continue

        # Load the dataframe stored by the fetching step.
        # NOTE: pickle files must only be loaded when they come from a trusted source.
        bachelor_per_year = pd.read_pickle(file_path)

        # The year is stored in a column named after the semester
        bachelor_per_year[semester_name] = pd.to_numeric(year_name.split('-')[year_index])

        yearly_frames.append(bachelor_per_year)

    # Concatenate once per semester instead of re-copying the growing result
    # on every iteration.
    if yearly_frames:
        bachelor_dataframes[semester_name] = pd.concat(yearly_frames)

## Visualizing and cleaning data

In [None]:
# Copy the semester 1 data so that the frame stored in bachelor_dataframes is
# left as it is, then preview the first rows.
bachelor_data_semester1 = bachelor_dataframes['Bachelor semestre 1'].copy()
bachelor_data_semester1.head()

In [None]:
# Copy the semester 6 data so that the frame stored in bachelor_dataframes is
# left as it is, then preview the first rows.
bachelor_data_semester6 = bachelor_dataframes['Bachelor semestre 6'].copy()
bachelor_data_semester6.head()

### About the student status

We see a status column that shows if the student is present or not in the semester. This parameter can have an impact on the result we want to compute:

In [None]:
# Number of semester 1 entries for each value of the 'Statut' (status) column.
bachelor_data_semester1['Statut'].value_counts()

In [None]:
# Number of semester 6 entries for each value of the 'Statut' (status) column.
bachelor_data_semester6['Statut'].value_counts()

We have some students who didn't attend the first or the last semester, and only one (in semester 1 and in semester 6) in a waiting state.
With this data, we cannot know whether a student registered in bachelor semester 6 has successfully obtained their bachelor degree or not (i.e. a student can be marked as "Présent" or "Congé" and, at the same time, either fail or succeed). For this reason, we have to make new assumptions:
 - 4) The earliest year of bachelor semester 1 stored in the database corresponds to the beginning of the student's bachelor plan, regardless of the student's status ('Présent', 'Congé' or 'Attente')
 - 5) We consider that all students obtain their bachelor degree regardless of their status in bachelor semester 1 and bachelor semester 6 (obviously, they still have to be present in bachelor semester 6).
 
 > Note: The fifth assumption is not mandatory here as we just want to know the average duration of a bachelor study plan.

### Handle duplicate entries and making index

We want a unique index, in our case this will be the Sciper number :

In [None]:
# True only if no Sciper number appears more than once in the semester 1 data.
bachelor_data_semester1['No Sciper'].is_unique

In [None]:
# True only if no Sciper number appears more than once in the semester 6 data.
bachelor_data_semester6['No Sciper'].is_unique

We see that it isn't unique, so some students repeated semester 1 or semester 6. We need to keep the earliest year in the case of semester 1 and the latest year in the case of semester 6. To do so, we sort the dataframe by year, then we keep only the first entry for each student (assumptions 4 and 5).

In [None]:
# Sort by year in ascending order, then keep the first row of each Sciper
# number: this retains the earliest semester 1 entry of every student.
bachelor_data_semester1 = (
    bachelor_data_semester1
    .sort_values('Bachelor semestre 1')
    .drop_duplicates(subset='No Sciper', keep='first')
)

In [None]:
# Sort by year in descending order, then keep the first row of each Sciper
# number: this retains the latest semester 6 entry of every student.
bachelor_data_semester6 = (
    bachelor_data_semester6
    .sort_values('Bachelor semestre 6', ascending=False)
    .drop_duplicates(subset='No Sciper', keep='first')
)

In [None]:
# Check again that each Sciper number now appears at most once in the semester 1 data.
bachelor_data_semester1['No Sciper'].is_unique

In [None]:
# Check again that each Sciper number now appears at most once in the semester 6 data.
bachelor_data_semester6['No Sciper'].is_unique

We see now that the Sciper number is unique, so we set this column as the index.

In [None]:
# Use the Sciper number as the index of both frames, then preview semester 1.
bachelor_data_semester1 = bachelor_data_semester1.set_index('No Sciper')
bachelor_data_semester6 = bachelor_data_semester6.set_index('No Sciper')
bachelor_data_semester1.head()

### Merging data

Now, we want to merge the two obtained (and cleaned) tables. For this, we use the 'merge' function provided by Pandas, specifying some parameters like the columns we want to preserve. Note that we use indexes to perform the merge, and we use inner merge as we want to consider only students who started their bachelor at EPFL.

In [None]:
# Columns kept from each side of the merge.
semester1_columns = bachelor_data_semester1[['Civilité', 'Nom Prénom', 'Bachelor semestre 1']]
semester6_columns = bachelor_data_semester6[['Bachelor semestre 6']]

# Inner merge on the index of both frames: only the students present in both
# the semester 1 and the semester 6 data remain.
student_bachelor_done = semester1_columns.merge(
    semester6_columns,
    how='inner',
    left_index=True,
    right_index=True,
)

student_bachelor_done.head()

In [None]:
def compute_nb_months_bachelor(row):
    """Return the number of months between the two bachelor years of a row.

    Args:
        row: mapping-like object with the keys 'Bachelor semestre 1' and
            'Bachelor semestre 6', each holding a year.

    Returns:
        The difference between the two years, multiplied by 12.
    """
    months_per_year = 12
    last_year = row['Bachelor semestre 6']
    first_year = row['Bachelor semestre 1']
    elapsed_years = last_year - first_year
    return elapsed_years * months_per_year

# Duration in months for every student at once: difference between the two
# year columns, times 12. This is the same formula as
# compute_nb_months_bachelor, applied column-wise instead of row by row.
student_bachelor_done['Durée bachelor (mois)'] = (
    student_bachelor_done['Bachelor semestre 6'] - student_bachelor_done['Bachelor semestre 1']
) * 12

student_bachelor_done.head()

In [None]:
# Students whose computed bachelor duration is exactly 36 months.
student_bachelor_done[student_bachelor_done['Durée bachelor (mois)'] == 36]

### Now Civilité TODO

In [None]:
# Number of students for each value of the 'Civilité' column.
student_bachelor_done['Civilité'].value_counts()

In [None]:
# Keep only the grouping column and the duration, group the rows by
# 'Civilité' and display the mean duration of each group.
civility_and_duration = student_bachelor_done[['Civilité', 'Durée bachelor (mois)']]
student_bachelor_by_sex = civility_and_duration.groupby(['Civilité'])
student_bachelor_by_sex.mean()

In [None]:
# Number of students left after the merge.
len(student_bachelor_done)

As we said before, we only consider the students who started their studies at EPFL. Thus, we only obtain 397 students at the end, whereas for bachelor semester 6 we had 500+ entries.