In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import sqlite3
from scipy import stats
from scipy.stats import *

# next command ensures that plots appear inside the notebook
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns  # also improves the look of plots
sns.set()
plt.rcParams['figure.figsize'] = 10, 5  # default hor./vert. size of plots, in inches
plt.rcParams['lines.markeredgewidth'] = 1  # to fix issue with seaborn box plots; needed after import seaborn

In [2]:

#create the math database as a pands dataframe
df_student_mat = pd.read_csv('student-mat.csv')
df_student_mat.tail()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10
394,MS,M,19,U,LE3,T,1,1,other,at_home,...,3,2,3,3,3,5,5,8,9,9


In [3]:

#create the portuguese database as a pands dataframe
df_student_por = pd.read_csv('student-por.csv')
df_student_por.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0
mean,16.744222,2.514638,2.306626,1.568567,1.930663,0.22188,3.930663,3.180277,3.1849,1.502311,2.280431,3.53621,3.659476,11.399076,11.570108,11.906009
std,1.218138,1.134552,1.099931,0.74866,0.82951,0.593235,0.955717,1.051093,1.175766,0.924834,1.28438,1.446259,4.640759,2.745265,2.913639,3.230656
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,16.0,2.0,1.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,2.0,0.0,10.0,10.0,10.0
50%,17.0,2.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,2.0,11.0,11.0,12.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,6.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,32.0,19.0,19.0,19.0


In [4]:

# Create a database only showing the students who dont drink in the week
df_clean_0 = df_student_por["Dalc"] == 1
df_clean = df_student_por[df_clean_0]
df_clean.describe() 


Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,451.0,451.0,451.0,451.0,451.0,451.0,451.0,451.0,451.0,451.0,451.0,451.0,451.0,451.0,451.0,451.0
mean,16.689579,2.512195,2.305987,1.536585,2.011086,0.197339,4.006652,3.104213,3.015521,1.0,1.764967,3.461197,3.195122,11.698448,11.895787,12.299335
std,1.191209,1.126137,1.097243,0.72441,0.820494,0.580109,0.910409,1.037199,1.165134,0.0,0.985547,1.43919,3.869203,2.774883,2.847261,3.102898
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,16.0,2.0,1.0,1.0,1.0,0.0,4.0,2.0,2.0,1.0,1.0,2.0,0.0,10.0,10.0,10.0
50%,17.0,2.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,1.0,4.0,2.0,12.0,12.0,12.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,1.0,2.0,5.0,5.0,14.0,14.0,14.0
max,21.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,1.0,5.0,5.0,22.0,19.0,19.0,19.0


In [5]:
# Create a database only show the students who never drink
df_cleanst_0 = df_clean["Walc"] == 1
df_cleanst = df_clean[df_cleanst_0]
df_cleanst.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0
mean,16.60166,2.53527,2.294606,1.560166,2.120332,0.211618,4.058091,3.107884,2.763485,1.0,1.0,3.369295,2.804979,11.692946,11.954357,12.369295
std,1.271088,1.106703,1.110711,0.74548,0.860015,0.592338,0.906428,1.055136,1.142947,0.0,0.0,1.455067,3.524813,2.832312,2.814026,3.045579
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,16.0,2.0,1.0,1.0,2.0,0.0,4.0,3.0,2.0,1.0,1.0,2.0,0.0,10.0,10.0,10.0
50%,16.0,2.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,1.0,3.0,2.0,12.0,12.0,13.0
75%,18.0,4.0,3.0,2.0,3.0,0.0,5.0,4.0,4.0,1.0,1.0,5.0,4.0,14.0,14.0,14.0
max,21.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,1.0,1.0,5.0,18.0,19.0,19.0,19.0


In [6]:
# Create a database showing the people who drink a lot during the weekdays
df_drunk_0 = df_student_por["Dalc"] == 5
df_drunk = df_student_por[df_drunk_0]
df_drunk.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0
mean,17.529412,2.411765,2.294118,1.705882,1.705882,0.529412,4.058824,3.705882,4.235294,5.0,4.588235,3.529412,7.058824,9.529412,10.058824,10.235294
std,1.736291,1.175735,1.104802,1.046704,1.046704,1.06757,1.144038,1.311712,1.032558,0.0,1.175735,1.419403,5.606588,2.600905,2.164214,2.94808
min,15.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,2.0,5.0,1.0,1.0,0.0,5.0,6.0,5.0
25%,17.0,1.0,2.0,1.0,1.0,0.0,4.0,3.0,3.0,5.0,5.0,3.0,2.0,8.0,9.0,9.0
50%,17.0,2.0,2.0,1.0,1.0,0.0,4.0,4.0,5.0,5.0,5.0,4.0,8.0,10.0,10.0,10.0
75%,18.0,3.0,3.0,2.0,2.0,0.0,5.0,5.0,5.0,5.0,5.0,5.0,10.0,11.0,11.0,11.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,21.0,16.0,16.0,16.0


In [7]:
#to only check the same quality of people 
df_smart_0 = df_student_por["failures"] == 0
df_smart = df_student_por[df_smart_0]
df_smart.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


In [8]:
# Create a database only showing the smart students who dont drink in the week
df_clean_0 = df_smart["Dalc"] == 1
df_clean = df_smart[df_clean_0]
df_clean.describe() 

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,16.563776,2.602041,2.369898,1.497449,2.061224,0.0,4.028061,3.066327,3.028061,1.0,1.760204,3.459184,3.017857,12.137755,12.352041,12.803571
std,1.115349,1.11937,1.093301,0.704384,0.828721,0.0,0.891407,1.019347,1.151771,0.0,0.969429,1.440506,3.701412,2.606583,2.604059,2.807496
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,16.0,2.0,1.75,1.0,2.0,0.0,4.0,2.0,2.0,1.0,1.0,2.0,0.0,10.0,11.0,11.0
50%,16.0,2.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,1.0,4.0,2.0,12.0,12.0,13.0
75%,17.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,1.0,2.0,5.0,4.0,14.0,14.0,15.0
max,20.0,4.0,4.0,4.0,4.0,0.0,5.0,5.0,5.0,1.0,5.0,5.0,22.0,19.0,19.0,19.0


In [9]:
# Create a database only show the smart students who never drink
df_cleanst_0 = df_smart["Walc"] == 1
df_cleanst = df_smart[df_cleanst_0]
df_cleanst.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,212.0,212.0,212.0,212.0,212.0,212.0,212.0,212.0,212.0,212.0,212.0,212.0,212.0,212.0,212.0,212.0
mean,16.419811,2.641509,2.367925,1.518868,2.198113,0.0,4.080189,3.080189,2.778302,1.042453,1.0,3.367925,2.490566,12.165094,12.419811,12.858491
std,1.130442,1.103108,1.099983,0.731565,0.869927,0.0,0.901912,1.020261,1.115552,0.327405,0.0,1.465725,3.161514,2.670908,2.586546,2.782609
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,15.75,2.0,2.0,1.0,2.0,0.0,4.0,3.0,2.0,1.0,1.0,2.0,0.0,10.0,11.0,11.0
50%,16.0,3.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,1.0,4.0,2.0,12.0,12.0,13.0
75%,17.0,4.0,3.0,2.0,3.0,0.0,5.0,4.0,3.25,1.0,1.0,5.0,4.0,14.0,14.0,15.0
max,19.0,4.0,4.0,4.0,4.0,0.0,5.0,5.0,5.0,5.0,1.0,5.0,16.0,19.0,19.0,19.0


In [10]:
# Create a database showing the smart people who drink a lot during the weekdays
df_drunk_0 = df_smart["Dalc"] == 5
df_drunk = df_smart[df_drunk_0]
df_drunk.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0
mean,16.923077,2.615385,2.538462,1.769231,1.769231,0.0,3.923077,3.846154,4.307692,5.0,4.692308,3.846154,5.076923,10.0,10.538462,10.923077
std,0.954074,1.192928,1.126601,1.165751,1.165751,0.0,1.187542,1.463224,1.031553,0.0,1.1094,1.214232,4.132423,2.677063,2.183857,2.812518
min,15.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,2.0,5.0,1.0,1.0,0.0,5.0,6.0,6.0
25%,16.0,2.0,2.0,1.0,1.0,0.0,4.0,3.0,4.0,5.0,5.0,3.0,2.0,9.0,10.0,9.0
50%,17.0,3.0,2.0,1.0,1.0,0.0,4.0,4.0,5.0,5.0,5.0,4.0,4.0,10.0,10.0,11.0
75%,18.0,4.0,4.0,2.0,2.0,0.0,5.0,5.0,5.0,5.0,5.0,5.0,8.0,11.0,11.0,12.0
max,18.0,4.0,4.0,4.0,4.0,0.0,5.0,5.0,5.0,5.0,5.0,5.0,12.0,16.0,16.0,16.0


In [11]:

# Create 2 pandas datframes one with only the males and one with only females
df_male_0 = df_student_por["sex"] == "M"
df_female_0 = df_student_por["sex"] == "F"
df_male = df_student_por[df_male_0]
df_female = df_student_por[df_female_0]
df_male.iloc[:,26:28].describe()

Unnamed: 0,Dalc,Walc
count,266.0,266.0
mean,1.815789,2.774436
std,1.139304,1.414835
min,1.0,1.0
25%,1.0,1.0
50%,1.0,3.0
75%,2.0,4.0
max,5.0,5.0


In [12]:
df_female.iloc[:,26:28].describe()

Unnamed: 0,Dalc,Walc
count,383.0,383.0
mean,1.284595,1.937337
std,0.659166,1.059112
min,1.0,1.0
25%,1.0,1.0
50%,1.0,2.0
75%,1.0,3.0
max,5.0,5.0
