In [1]:
import numpy as np
import pandas as pd
import sys
from bs4 import BeautifulSoup as BSoup
import requests

In [2]:
# ISA public-report endpoints: `formLink` serves the filter form whose <select>
# fields are read below; `showLink` is the base URL that showLinkGen extends.
formLink = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?ww_i_reportModel=133685247"
showLink = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?ww_x_GPS=-1&ww_i_reportModel=133685247"
r = requests.get(formLink)
# Fail loudly on an HTTP error instead of parsing an error page and hitting an
# obscure IndexError/TypeError in the positional lookups below.
r.raise_for_status()
soup = BSoup(r.text, "lxml")
selectFields = soup.find_all("select")
# First <select>: the option labelled "Informatique" and the field's form name.
infoField=selectFields[0].find("option", text="Informatique")["value"]
infoFieldParam=selectFields[0]["name"]
# Third <select>: option values for Bachelor semesters 1 and 6, plus the field's form name.
bsSem1=selectFields[2].find("option", text="Bachelor semestre 1")["value"]
bsSem6=selectFields[2].find("option", text="Bachelor semestre 6")["value"]
semParam=selectFields[2]["name"]
# Second <select>: every option value except the first one
# (presumably a blank placeholder entry -- TODO confirm against the form).
allYears = [y["value"] for y in selectFields[1].find_all("option")[1:]]
yearParam = selectFields[1]["name"]
# First radio <input> on the form; its name/value pair is sent with every report request.
htmlradiobutton=soup.find("input", type="radio")

In [3]:
def showLinkGen(htmlP, htmlV, fieldP, fieldV, yearP, yearV, semP, semV):
    """Return `showLink` extended with four `&name=value` query pairs.

    The pairs are appended in argument order: (htmlP, htmlV), (fieldP, fieldV),
    (yearP, yearV), (semP, semV). Values are inserted as-is, without URL-encoding.
    """
    pairs = ((htmlP, htmlV), (fieldP, fieldV), (yearP, yearV), (semP, semV))
    query = "".join("&{}={}".format(name, value) for name, value in pairs)
    return "{}{}".format(showLink, query)
def get_bs_html(bsSem, year):
    """Request the report page for one semester form value and one year form value.

    Returns the `requests` response object itself (not parsed HTML).
    """
    query_args = (
        htmlradiobutton["name"], htmlradiobutton["value"],
        infoFieldParam, infoField,
        yearParam, year,
        semParam, bsSem,
    )
    return requests.get(showLinkGen(*query_args))
def get_bs_dataframe(request_link):
    """Parse one report response into a DataFrame with one row per table row.

    `request_link` must expose the page HTML as `.text`.

    Columns produced:
      - "Scipper":  int parsed from the 11th <td> of the row
      - "Sex":      "M" when the first <td> reads "Monsieur", otherwise "F"
      - "Year":     second ', '-separated field of the first <font> element's text
      - "Semester": int parsed from the last two characters of the third field
    """
    page = BSoup(request_link.text, "lxml")
    # The first two <tr> elements are skipped (presumably title/header rows -- TODO confirm).
    rows = page.find_all("tr")[2:]
    title_parts = page.find("font").text.split(', ')
    semester = int(title_parts[2][-2:])
    year = title_parts[1]

    def row_record(row):
        # Turn one <tr> into the dict that becomes a DataFrame row.
        cells = row.find_all("td")
        gender = "F" if cells[0].text != "Monsieur" else "M"
        sciper = int(cells[10].text)
        return {"Scipper": sciper, "Sex": gender, "Year": year, "Semester": semester}

    return pd.DataFrame([row_record(row) for row in rows])

In [4]:
def get_bs_alldata():
    """Fetch and parse every (semester, year) report page and stack the frames.

    Semesters are visited in the order (bsSem1, bsSem6); for each one, every
    value of `allYears` is requested. The per-page frames are concatenated
    without renumbering the index.
    """
    frames = [
        get_bs_dataframe(get_bs_html(bsSem, year))
        for bsSem in (bsSem1, bsSem6)
        for year in allYears
    ]
    return pd.concat(frames)
bachelor_data = get_bs_alldata()

In [5]:
# Inspect the combined frame returned by get_bs_alldata().
bachelor_data

Unnamed: 0,Scipper,Semester,Sex,Year
0,235688,1,M,2016-2017
1,274015,1,M,2016-2017
2,268410,1,F,2016-2017
3,271464,1,M,2016-2017
4,274518,1,M,2016-2017
5,249613,1,M,2016-2017
6,262214,1,M,2016-2017
7,262239,1,M,2016-2017
8,257916,1,M,2016-2017
9,271508,1,F,2016-2017


In [6]:
# Work on a copy so the scraped `bachelor_data` frame is left untouched.
data = bachelor_data.copy()

In [7]:
# Swap the index for a fresh 0..n-1 range and discard the old labels
# (the frames were concatenated without renumbering).
data = data.reset_index(drop=True)
data

Unnamed: 0,Scipper,Semester,Sex,Year
0,235688,1,M,2016-2017
1,274015,1,M,2016-2017
2,268410,1,F,2016-2017
3,271464,1,M,2016-2017
4,274518,1,M,2016-2017
5,249613,1,M,2016-2017
6,262214,1,M,2016-2017
7,262239,1,M,2016-2017
8,257916,1,M,2016-2017
9,271508,1,F,2016-2017


In [8]:
# Collapse the "YYYY-YYYY" academic-year string to a single calendar year:
# the first year for semester-1 rows, the second year for every other row.
# Done column-wise so it does not rely on the index being exactly 0..n-1 and
# avoids one Python-level .loc write per row.
year_text = data['Year']
first_year = year_text.str[0:4]
second_year = year_text.str[5:9]
# Series.where keeps `first_year` where the condition holds, else takes `second_year`.
data['Year'] = first_year.where(data['Semester'] == 1, second_year).astype('int')
data.head()

Unnamed: 0,Scipper,Semester,Sex,Year
0,235688,1,M,2016
1,274015,1,M,2016
2,268410,1,F,2016
3,271464,1,M,2016
4,274518,1,M,2016


## Keep only students who appear in both semester 1 and semester 6

In [9]:
# Keep the rows of students whose id is listed under semester 1 AND under semester 6.
sem1_ids = data.loc[data.Semester == 1, 'Scipper']
sem6_ids = data.loc[data.Semester == 6, 'Scipper']
idx_sem16 = data.Scipper.isin(sem6_ids) & data.Scipper.isin(sem1_ids)
data_sem16 = data[idx_sem16]
data_sem16.head()

Unnamed: 0,Scipper,Semester,Sex,Year
452,246671,1,M,2014
456,251759,1,M,2014
462,249498,1,M,2014
463,234551,1,M,2014
473,247328,1,M,2014


# Compute stay time

In [10]:
# Distinct student ids among the retained rows, then how many there are.
scippers = data_sem16['Scipper'].unique()
len(scippers)

397

In [11]:
# One row per student id, indexed by "Scipper", with an empty (NaN) "Staytime"
# column that a later cell fills in.
tmp = pd.DataFrame(
    {'Staytime': np.nan},
    index=pd.Index(scippers, name='Scipper'),
)
tmp.head()

Unnamed: 0_level_0,Staytime
Scipper,Unnamed: 1_level_1
246671,
251759,
249498,
234551,
247328,


In [12]:
# Index the retained rows by student id so they can be selected per student.
data_sem16 = data_sem16.set_index('Scipper')

In [13]:
# Stay time in months for students who graduated:
#   (latest Year - earliest Year) * 12 - 6
# NOTE(review): the -6 offset is kept from the original formula; presumably it
# accounts for semester 6 ending mid-year -- confirm.
# Computed with a single groupby on the "Scipper" index level instead of a
# per-student Python loop doing one label lookup and one row write each.
years_by_student = data_sem16.groupby(level='Scipper')['Year']
stay_months = (years_by_student.max() - years_by_student.min()) * 12 - 6
# Assignment aligns on tmp's "Scipper" index; float matches the NaN-initialised column.
tmp['Staytime'] = stay_months.astype(float)
# Turn the "Scipper" index back into a regular column.
tmp = tmp.reset_index()

In [14]:
# Preview the per-student stay times (Scipper is a regular column again here).
tmp.head()

Unnamed: 0,Scipper,Staytime
0,246671,30.0
1,251759,30.0
2,249498,30.0
3,234551,30.0
4,247328,30.0


In [15]:
# Restore "Scipper" as a column, drop the per-row Year/Semester fields, and
# collapse identical rows.
data_sem16 = (
    data_sem16
    .reset_index()
    .drop(['Year', 'Semester'], axis=1)
    .drop_duplicates()
)
data_sem16

Unnamed: 0,Scipper,Sex
0,246671,M
1,251759,M
2,249498,M
3,234551,M
4,247328,M
5,251758,M
6,250300,M
7,249954,M
8,249996,M
9,239489,F


In [16]:
# Attach each student's stay time; with no explicit key the join uses the
# columns the two frames have in common.
data_sem16 = data_sem16.merge(tmp, how='outer')
data_sem16

Unnamed: 0,Scipper,Sex,Staytime
0,246671,M,30.0
1,251759,M,30.0
2,249498,M,30.0
3,234551,M,30.0
4,247328,M,30.0
5,251758,M,30.0
6,250300,M,30.0
7,249954,M,30.0
8,249996,M,30.0
9,239489,F,30.0


# Partitioning the data and comparing averages

In [17]:
# Mean stay time for each value of "Sex".
data_grouped = data_sem16.groupby('Sex')
data_grouped.Staytime.mean()

Sex
F    33.724138
M    35.771739
Name: Staytime, dtype: float64

# Statistical tests

In [19]:
import scipy.stats as stats

## Divide Populations

In [20]:
# Split the students into the two sub-populations compared by the tests below.
is_female = data_sem16['Sex'] == 'F'
is_male = data_sem16['Sex'] == 'M'
data_F = data_sem16.loc[is_female]
data_M = data_sem16.loc[is_male]

## 1-sample T-Test

In a 1-sample T-Test, the null hypothesis assumes nothing interesting is going on between the variables we are testing. In this case, it means that there is no difference between each of the sub-populations and the whole population.

In [21]:
# One-sample t-test: male stay times against the mean stay time of all students.
stats.ttest_1samp(a=data_M.Staytime, popmean=data_sem16.Staytime.mean())

Ttest_1sampResult(statistic=0.32081543725818445, pvalue=0.74853286145726683)

A p-value of 0.7485 means we'd expect to see data as extreme as our sample due to chance about 74.85% of the time if the null hypothesis were true. In this case, the p-value is higher than our significance level α (equal to 1-conf.level, i.e. 0.05), so we should not reject the null hypothesis.
#### => The stay-time average of males is not statistically significantly different from the overall average.

In [22]:
# One-sample t-test: female stay times against the mean stay time of all students.
stats.ttest_1samp(a=data_F.Staytime, popmean=data_sem16.Staytime.mean())

Ttest_1sampResult(statistic=-1.5732943251612512, pvalue=0.12688368345278089)

A p-value of 0.1268 means we'd expect to see data as extreme as our sample due to chance about 12.68% of the time if the null hypothesis were true.
In this case, the p-value is lower than for males but still higher than our significance level (0.05), so we should not reject the null hypothesis.
#### => The stay-time average of females is not statistically significantly different from the overall average.

## 2-sample T-Test

In a 2-sample T-Test, the null hypothesis states that the groups are the same.

In [23]:
# Two-sample t-test between male and female stay times, without assuming equal variances.
stats.ttest_ind(data_M.Staytime, data_F.Staytime, equal_var=False)

Ttest_indResult(statistic=1.5831651359439409, pvalue=0.12191236829650401)

The test yields a p-value of 0.1219, which means there is a 12.19% chance we would see sample data this far apart if the two groups tested are actually identical. This is above our 0.05 significance level, so we do not reject the null hypothesis.
#### => We conclude that the difference in the average stay time between males and females is not statistically significant.