In [1]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
%matplotlib inline
import matplotlib.pyplot as plt
import scipy.stats as stats

In [2]:
# Study programme(s) whose student lists are scraped.
programs = ["Informatique"]

# Academic years "2007-2008" through "2016-2017", oldest first.
years = ["{}-{}".format(start, start + 1) for start in range(2007, 2017)]

# Semester labels exactly as they are looked up in the IS-Academia filter form.
bachelor_semesters = ["Bachelor semestre {}".format(number) for number in range(1, 7)]
master_semesters = (
    ["Master semestre {}".format(number) for number in range(1, 4)]
    + ["Projet Master automne", "Projet Master printemps"]
)

In [3]:
# Identifier of the public report model and the endpoint prefix shared by
# the ".filter" (form) and ".html" (result) pages.
report_model_param = "133685247"
base_url = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS"

# Download the filter form once; every later cell reads its option values from `form`.
r = requests.get(base_url + ".filter", params={"ww_i_reportModel": report_model_param})
form = BeautifulSoup(r.text, 'html.parser')

# Two <input> values from the form have to be echoed back in later requests.
form_inputs = {
    input_name: form.find_all("input", attrs={"name": input_name})[0]['value']
    for input_name in ("ww_b_list", "ww_i_reportModelXsl")
}
b_list_param = form_inputs["ww_b_list"]
report_model_xsl_param = form_inputs["ww_i_reportModelXsl"]

In [4]:
# Programme name -> value of the matching <option> in the filter form.
program_values = {
    name: form.find_all("option", string=name)[0]['value']
    for name in programs
}

In [5]:
# Academic year label -> value of the matching <option> in the filter form.
year_values = {
    label: form.find_all("option", string=label)[0]['value']
    for label in years
}

In [6]:
# Bachelor semester label -> value of the matching <option> in the filter form.
bachelor_semester_values = {
    label: form.find_all("option", string=label)[0]['value']
    for label in bachelor_semesters
}

In [7]:
# Master semester / project label -> value of the matching <option> in the filter form.
master_semester_values = {
    label: form.find_all("option", string=label)[0]['value']
    for label in master_semesters
}

In [8]:
def get_dataset(prog_val, y_val, sem_val):
    """Download one student list from IS-Academia as a DataFrame.

    Parameters
    ----------
    prog_val : str
        Form value sent as ``ww_x_UNITE_ACAD`` (see ``program_values``).
    y_val : str
        Form value sent as ``ww_x_PERIODE_ACAD`` (see ``year_values``).
    sem_val : str
        Form value sent as ``ww_x_PERIODE_PEDAGO`` (see the semester value dicts).

    Returns
    -------
    pandas.DataFrame
        The first table of the report, indexed by "Nom Prénom", without the
        "Unnamed: 11" column when it is present. When no report link or no
        table is found, an empty frame with the expected columns is returned.

    Raises
    ------
    requests.HTTPError
        If either HTTP request returns an error status.
    """
    # Local import: newer pandas deprecates passing literal HTML to read_html,
    # a file-like object works on every version.
    from io import StringIO

    def empty_dataset():
        # Same column layout as a real report, so downstream cells can still
        # read e.g. 'No Sciper' on combinations that have no students.
        return pd.DataFrame(columns = ['Civilité', 'Orientation Bachelor', 'Orientation Master',
                                       'Spécialisation', 'Filière opt.', 'Mineur', 'Statut', 'Type Echange',
                                       'Ecole Echange', 'No Sciper'])

    # Step 1: submit the filter form to obtain the link of the matching report.
    params = {"ww_i_reportModel": report_model_param,
              "ww_b_list": b_list_param,
              "ww_i_reportModelXsl": report_model_xsl_param,
              "ww_x_UNITE_ACAD": prog_val,
              "ww_x_PERIODE_ACAD": y_val,
              "ww_x_PERIODE_PEDAGO": sem_val}
    r = requests.get(base_url+".filter", params = params)
    r.raise_for_status()
    set_page = BeautifulSoup(r.text, 'html.parser')

    # The report id is read from the SECOND link of class ww_x_GPS
    # (presumably the first one is a generic "all" entry -- TODO confirm).
    gps_links = set_page.find_all("a", attrs={"class": "ww_x_GPS"})
    if len(gps_links) < 2:
        # No report link for this combination: treat it as "no students"
        # instead of failing with an IndexError.
        return empty_dataset()
    gps_string = gps_links[1]["onclick"]
    # Keep only the digits of the onclick handler; they form the report id.
    gps_value = ''.join(filter(str.isdigit, gps_string))

    # Step 2: download the report itself and parse its table.
    params = {"ww_i_reportModel": report_model_param,
              "ww_i_reportModelXsl": report_model_xsl_param,
              "ww_x_GPS": gps_value}
    r = requests.get(base_url+".html", params = params)
    r.raise_for_status()
    try:
        res = pd.read_html(StringIO(r.text), skiprows = [0], header = 0, index_col = "Nom Prénom")
    except ValueError:
        # read_html raises ValueError (rather than returning []) when the page
        # holds no parsable table, e.g. an empty report.
        return empty_dataset()
    if not res:
        return empty_dataset()
    # "Unnamed: 11" is dropped when present; errors="ignore" keeps reports
    # without it from raising a KeyError.
    return res[0].drop("Unnamed: 11", axis=1, errors="ignore")

In [None]:
# bachelor_datasets[year][semester] -> DataFrame of the students registered
# in that Bachelor semester of that academic year.
bachelor_datasets = {year: {} for year in years}
for year in years:
    datasets_of_year = bachelor_datasets[year]
    for semester in bachelor_semesters:
        # Progress trace: one line per download.
        print(year, semester)
        datasets_of_year[semester] = get_dataset(
            program_values[programs[0]],
            year_values[year],
            bachelor_semester_values[semester],
        )


2007-2008 Bachelor semestre 1


In [None]:
# Sanity check: print the (rows, columns) shape of every downloaded Bachelor list.
for year in years:
    datasets_of_year = bachelor_datasets[year]
    for semester in bachelor_semesters:
        dataset = datasets_of_year[semester]
        print(year, semester, dataset.shape)

In [None]:
# master_datasets[year][semester] -> DataFrame of the students registered
# in that Master semester / project of that academic year.
master_datasets = {year: {} for year in years}
for year in years:
    datasets_of_year = master_datasets[year]
    for semester in master_semesters:
        # Progress trace: one line per download.
        print(year, semester)
        datasets_of_year[semester] = get_dataset(
            program_values[programs[0]],
            year_values[year],
            master_semester_values[semester],
        )

In [None]:
# Sanity check: print the (rows, columns) shape of every downloaded Master list.
for year in years:
    datasets_of_year = master_datasets[year]
    for semester in master_semesters:
        dataset = datasets_of_year[semester]
        print(year, semester, dataset.shape)

In [None]:
sem_1 = 'Bachelor semestre 1'
sem_5 = 'Bachelor semestre 5'
sem_6 = 'Bachelor semestre 6'

# Stack the yearly lists of each semester into one frame and tag every row
# with its academic year. pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0 and copied the whole frame on every call.
bachelor_sem1 = pd.concat([bachelor_datasets[y][sem_1].assign(Years=y) for y in years])
bachelor_sem5 = pd.concat([bachelor_datasets[y][sem_5].assign(Years=y) for y in years])
bachelor_sem6 = pd.concat([bachelor_datasets[y][sem_6].assign(Years=y) for y in years])

# A student counts as graduated when the Sciper number seen in semester 1
# also appears in both semester 5 and semester 6.
# .values turns the masks into plain arrays so that the non-unique name
# index of the frames plays no role in the selection.
seen_in_sem5 = bachelor_sem1['No Sciper'].isin(bachelor_sem5['No Sciper']).values
seen_in_sem6 = bachelor_sem1['No Sciper'].isin(bachelor_sem6['No Sciper']).values

# The 'Years' column is dropped here; the next cell recomputes the study
# duration from the stacked per-semester frames.
graduated_students = bachelor_sem1[seen_in_sem5 & seen_in_sem6].drop('Years', axis=1)
print (graduated_students.head())

In [None]:
# One row per student: a student who repeated semester 1 appears once per year.
# .copy() gives an independent frame so adding the 'Months' column below is
# unambiguous.
graduated_students = graduated_students.drop_duplicates('No Sciper').copy()

months = []
for sciper in graduated_students['No Sciper']:
    # Positions (not labels) of this student's rows in each stacked frame.
    # The frames are stacked in chronological order, so the first position is
    # the earliest registration and the last position the latest one.
    positions_sem6 = np.where(bachelor_sem6['No Sciper'] == sciper)[0]
    positions_sem5 = np.where(bachelor_sem5['No Sciper'] == sciper)[0]
    positions_sem1 = np.where(bachelor_sem1['No Sciper'] == sciper)[0]

    # .iloc replaces the removed .ix accessor; these lookups are positional.
    finish_year_6 = bachelor_sem6['Years'].iloc[positions_sem6[-1]]
    finish_year_5 = bachelor_sem5['Years'].iloc[positions_sem5[-1]]
    beginning_year = bachelor_sem1['Years'].iloc[positions_sem1[0]]

    # Whole academic years between the first semester 1 and the last semester 6.
    duration = 12*(int(finish_year_6[0:4]) - int(beginning_year[0:4])+1)
    # Year labels such as "2012-2013" sort chronologically as strings. A last
    # semester 5 later than the last semester 6 adds one more half year.
    if finish_year_5 > finish_year_6:
        duration += 6
    months.append(duration)

graduated_students['Months'] = months
print(graduated_students.head())

In [20]:
graduated_grouped = graduated_students.groupby(graduated_students.Civilité)
print(graduated_grouped['Months'].mean())
print(graduated_grouped['Months'].describe())
# Authors' reading of the figures above (from their own run, not re-checked here):
# the mean number of months is higher for men than for women, while the robust
# statistics (quartiles) are equal for both groups. The higher male mean would
# then be driven by larger outliers (a maximum of 84 months was reported),
# which suggests that the difference is not statistically significant.
# The t-test at the end of this cell checks that.

bp = graduated_students.boxplot(column='Months', by='Civilité', grid=False)

# Overlay the individual observations on the boxes as a jittered strip plot.
# Boxes are assumed to sit at x = 1, 2 in the order 'Madame', 'Monsieur'
# (alphabetical group order) -- adjust if other titles occur in the data.
months_by_civility = {}
for position, civility in enumerate(['Madame', 'Monsieur'], start=1):
    y = graduated_students.Months[graduated_students.Civilité==civility]
    months_by_civility[civility] = y
    # Jitter is centred on the box position; '.' draws unconnected points
    # instead of a line joining the observations.
    x = np.random.normal(position, 0.04, size=len(y))
    plt.plot(x, y, '.', alpha=0.2)

# Two-sample t-test on the study duration of both groups, chosen from
# http://sites.stat.psu.edu/~ajw13/stat500_su_res/notes/lesson14/images/summary_table.pdf
# equal_var=False selects Welch's variant, which does not assume equal variances.
# The result looks like Ttest_indResult(statistic=..., pvalue=...). A p-value
# below 0.05 means that, if both groups had the same mean, a difference at least
# this large would be observed in fewer than 5% of samples, i.e. the difference
# is statistically significant.
stats.ttest_ind(a=months_by_civility['Monsieur'], b=months_by_civility['Madame'], equal_var=False)

NameError: name 'graduated_students' is not defined

In [None]:
sem_1 = 'Master semestre 1'
sem_2 = 'Master semestre 2'
sem_3 = 'Master semestre 3'
proj_a = 'Projet Master automne'
proj_p = 'Projet Master printemps'

# Stack the yearly lists of each semester into one frame and tag every row
# with its academic year. pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0 and copied the whole frame on every call.
master_sem1 = pd.concat([master_datasets[y][sem_1].assign(Years=y) for y in years])
master_sem2 = pd.concat([master_datasets[y][sem_2].assign(Years=y) for y in years])
master_sem3 = pd.concat([master_datasets[y][sem_3].assign(Years=y) for y in years])
master_proj_a = pd.concat([master_datasets[y][proj_a].assign(Years=y) for y in years])
master_proj_p = pd.concat([master_datasets[y][proj_p].assign(Years=y) for y in years])

# Membership of every semester-1 student in semesters 2 and 3, computed once
# instead of rescanning both frames for each row.
seen_in_sem2 = master_sem1['No Sciper'].isin(master_sem2['No Sciper']).values
seen_in_sem3 = master_sem1['No Sciper'].isin(master_sem3['No Sciper']).values

# Rows are collected in a list and turned into a frame once at the end.
graduated_rows = []
for (index, row), has_sem2, has_sem3 in zip(master_sem1.iterrows(), seen_in_sem2, seen_in_sem3):
    if not has_sem2:
        # Never reached semester 2: not counted as graduated.
        continue
    if not has_sem3:
        # Two-semester track: keep the semester-1 row unless the student's
        # first semester 1 is in 2016-2017, the last academic year scraped.
        positions_sem1 = np.where(master_sem1['No Sciper'] == row['No Sciper'])[0]
        # .iloc replaces the removed .ix accessor; the lookup is positional.
        if master_sem1['Years'].iloc[positions_sem1[0]] != '2016-2017':
            graduated_rows.append(row)
    else:
        # Three-semester track: keep the student's latest semester-3 row
        # unless it is in 2016-2017.
        positions_sem3 = np.where(master_sem3['No Sciper'] == row['No Sciper'])[0]
        last_sem3_row = master_sem3.iloc[positions_sem3[-1]]
        if last_sem3_row['Years'] != '2016-2017':
            graduated_rows.append(last_sem3_row)

# Keep the column layout even when nobody qualifies, so that later cells can
# still select 'No Sciper'. The 'Years' column is intentionally kept.
if graduated_rows:
    graduated_master = pd.DataFrame(graduated_rows)
else:
    graduated_master = pd.DataFrame(columns=master_sem1.columns)
print (graduated_master.shape)

In [None]:
# TODO: a dropna() probably needs to be added here.
# One row per student; .copy() gives an independent frame so adding the
# 'Months' column below is unambiguous.
graduated_master = graduated_master.drop_duplicates('No Sciper').copy()

months = []
for sciper in graduated_master['No Sciper']:

    # Positions (not labels) of this student's rows in each stacked frame.
    # The frames are stacked chronologically: first position = earliest
    # registration, last position = latest one.
    positions_proj_a = np.where(master_proj_a['No Sciper'] == sciper)[0]
    positions_proj_p = np.where(master_proj_p['No Sciper'] == sciper)[0]
    positions_sem3 = np.where(master_sem3['No Sciper'] == sciper)[0]
    positions_sem2 = np.where(master_sem2['No Sciper'] == sciper)[0]
    positions_sem1 = np.where(master_sem1['No Sciper'] == sciper)[0]

    # Half-year corrections accumulated for this student, in months.
    adjustment = 0

    # Start of the Master: first year of semester 1 or 2, whichever is earlier.
    # .iloc replaces the removed .ix accessor; all lookups are positional.
    begin_sem1 = int(master_sem1['Years'].iloc[positions_sem1[0]][0:4])
    begin_sem2 = int(master_sem2['Years'].iloc[positions_sem2[0]][0:4])
    beginning_master = min(begin_sem1, begin_sem2)
    if begin_sem2 < begin_sem1:
        adjustment -= 6

    if len(positions_proj_a) != 0 or len(positions_proj_p) != 0:
        # A Master project is registered: the latest project year ends the Master.
        # 0 stands for "no project of this kind".
        finish_year_proj_a = 0
        finish_year_proj_p = 0
        if len(positions_proj_a) != 0:
            finish_year_proj_a = int(master_proj_a['Years'].iloc[positions_proj_a[-1]][0:4])
        if len(positions_proj_p) != 0:
            finish_year_proj_p = int(master_proj_p['Years'].iloc[positions_proj_p[-1]][0:4])
        # Autumn project later than any spring project: half a year less.
        if finish_year_proj_a > finish_year_proj_p:
            adjustment -= 6
        finish_master = max(finish_year_proj_a, finish_year_proj_p)
    else:
        # No project registered: fall back to the last course semesters.
        finish_year_sem1 = int(master_sem1['Years'].iloc[positions_sem1[-1]][0:4])
        finish_year_sem2 = int(master_sem2['Years'].iloc[positions_sem2[-1]][0:4])
        if len(positions_sem3) != 0:
            finish_year_sem3 = int(master_sem3['Years'].iloc[positions_sem3[-1]][0:4])
            finish_master = max(finish_year_sem1, finish_year_sem2, finish_year_sem3)
            if finish_year_sem1 > finish_year_sem2 or finish_year_sem3 > finish_year_sem2:
                adjustment -= 6
        else:
            finish_master = max(finish_year_sem1, finish_year_sem2)
            # Fixed: the original compared finish_year_sem1 with itself, so this
            # correction could never apply. It now mirrors the branch above.
            if finish_year_sem1 > finish_year_sem2:
                adjustment -= 6

    months.append(adjustment + 12*(finish_master - beginning_master+1))

graduated_master['Months'] = months
print(graduated_master.head(10))

In [None]:
# Mean Master duration over all graduated students; used below as the
# reference value for the per-specialisation tests.
average_months = graduated_master['Months'].mean()
print ('average: ', average_months)

# Only students with a specialisation take part in the per-group comparison.
students_spec = graduated_master.dropna(subset=['Spécialisation'])
student_spec_grouped = students_spec.groupby(students_spec.Spécialisation)
print(student_spec_grouped['Months'].mean())
print (graduated_master[graduated_master.Months==12].shape)

# One-sample t-test per specialisation: compares the durations of that group
# with the overall average. A p-value below 0.05 is read as a statistically
# significant difference. Groups with a single student yield nan.
# Iterating a groupby yields (group key, sub-frame) pairs.
for specialisation, group in student_spec_grouped:
    a1 = group['Months'].values
    print(specialisation, stats.ttest_1samp(a1, average_months))