In [58]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn import linear_model
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import LeaveOneOut
sns.set(style="whitegrid")
sns.set_color_codes("pastel")
%matplotlib inline

### Preprocessing

In [2]:
math = pd.read_csv('math.csv')
port = pd.read_csv('port.csv')

In [3]:
# Dropping the first column of ID numbers
del[math['Unnamed: 0']]
del[port['Unnamed: 0']]

In [4]:
# This will be our transformed dataframes
mathTransformed = math.copy(deep=True)
portTransformed = port.copy(deep=True)

In [5]:
'''
1 school - student's school (binary: 'GP' - Gabriel Pereira or 'MS' - Mousinho da Silveira) 
2 sex - student's sex (binary: 'F' - female or 'M' - male) 
3 age - student's age (numeric: from 15 to 22) 
4 address - student's home address type (binary: 'U' - urban or 'R' - rural) 
5 famsize - family size (binary: 'LE3' - less or equal to 3 or 'GT3' - greater than 3) 
6 Pstatus - parent's cohabitation status (binary: 'T' - living together or 'A' - apart) 
7 Medu - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 â€“ 5th to 9th grade, 3 â€“ secondary education or 4 â€“ higher education) 
8 Fedu - father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 â€“ 5th to 9th grade, 3 â€“ secondary education or 4 â€“ higher education) 
9 Mjob - mother's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other') 
10 Fjob - father's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other') 
11 reason - reason to choose this school (nominal: close to 'home', school 'reputation', 'course' preference or 'other') 
12 guardian - student's guardian (nominal: 'mother', 'father' or 'other') 
13 traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour) 
14 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours) 
15 failures - number of past class failures (numeric: n if 1<=n<3, else 4) 
16 schoolsup - extra educational support (binary: yes or no) 
17 famsup - family educational support (binary: yes or no) 
18 paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no) 
19 activities - extra-curricular activities (binary: yes or no) 
20 nursery - attended nursery school (binary: yes or no) 
21 higher - wants to take higher education (binary: yes or no) 
22 internet - Internet access at home (binary: yes or no) 
23 romantic - with a romantic relationship (binary: yes or no) 
24 famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent) 
25 freetime - free time after school (numeric: from 1 - very low to 5 - very high) 
26 goout - going out with friends (numeric: from 1 - very low to 5 - very high) 
27 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high) 
28 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high) 
29 health - current health status (numeric: from 1 - very bad to 5 - very good) 
30 absences - number of school absences (numeric: from 0 to 93) 

# these grades are related with the course subject, Math or Portuguese: 
31 G1 - first period grade (numeric: from 0 to 20) 
31 G2 - second period grade (numeric: from 0 to 20) 
32 G3 - final grade (numeric: from 0 to 20, output target
'''
pass

In [6]:
def returnCategoricalColumns(df):
    '''
    This function takes in a dataframe and returns the 
    column names which are categorical / must be labeled
    numerically for training.
    '''
    columns = []
    for i in list(df):
        if not str(math[i][0]).isdigit():
            columns.append(i)
    return columns    

In [7]:
def getBinaryAndNominalColumns(df, categoricalColumns):
    '''
    This function takes a list of categorical columns
    and separates into a list of binary columns, which
    we can easily one label encode into 0/1 and 
    nominal columns which we will need to one hot vector
    encode.
    '''
    binary = []
    nominal = []
    for i in categoricalColumns:
        if len(list(set(df[i]))) == 2:
            binary.append(i)
        else:
            nominal.append(i)
    return binary, nominal

In [8]:
# categorical columns are the same for both port and math
assert(returnCategoricalColumns(math) == returnCategoricalColumns(port))
categoricalColumns = returnCategoricalColumns(math)
print(categoricalColumns)

['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']


In [9]:
binary, nominal = getBinaryAndNominalColumns(math, categoricalColumns)
print(binary)
print(nominal)

['school', 'sex', 'address', 'famsize', 'Pstatus', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']
['Mjob', 'Fjob', 'reason', 'guardian']


In [10]:
def convertBinaryColumns(df, binary, transformed_df):
    '''
    This function converts the binary columns in our dataframe
    into numerical (0/1) labels and adds a corresponding
    column to transformed_df
    '''
    for i in binary:
        lb_style = LabelBinarizer()
        lb_results = lb_style.fit_transform(df[i])
        transformed_df[i] = lb_results

In [11]:
convertBinaryColumns(math, binary, mathTransformed)
convertBinaryColumns(port, binary, portTransformed)

In [12]:
assert(list(portTransformed) == list(port))
assert(list(mathTransformed) == list(math))

In [13]:
# One hot vector encode nominal variables
mathTransformed = pd.get_dummies(mathTransformed, columns=nominal)
portTransformed = pd.get_dummies(portTransformed, columns=nominal)

In [14]:
assert(list(portTransformed) == list(mathTransformed))

In [15]:
for i in list(mathTransformed):
    if not str(mathTransformed[i][0]).isdigit() or not str(portTransformed[i][0]).isdigit():
        assert(False)

### We have finally one labeled / one hot vector encoded our input data and can now start learning.

In [16]:
targetColumn = ['G3']
featureColumns = [i for i in list(portTransformed) if i not in targetColumn]

In [17]:
portX = portTransformed[featureColumns]
mathX = mathTransformed[featureColumns]
portY = portTransformed[targetColumn]
mathY = mathTransformed[targetColumn]

In [54]:
def linearRegression(originalDf, targetColumn, featureColumns):
    '''
    Runs unregularized linear regression and returns the 
    R^2 and mean squared error (averaged over all folds 
    via cross validation)
    '''
    lm = linear_model.LinearRegression(normalize=True)
    X = originalDf[featureColumns]
    y = originalDf[targetColumn]
    scores = cross_validate(lm, X, y, scoring=['r2','neg_mean_squared_error'], cv=10, return_train_score=False)
    return scores

In [96]:
def linearRegressionRidge(originalDf, targetColumn, featureColumns):
    '''
    Runs ridge regularized linear regression and returns the 
    mean and standard deviation of test scores
    '''
    parameters = {'alpha' : np.arange(0.0001, 0.01, 0.01)}
    scoringMethods = ['r2','neg_mean_squared_error']
    test_scores = []
    for score in scoringMethods:
        lm = linear_model.Ridge(normalize=True)
        X = originalDf[featureColumns]
        y = originalDf[targetColumn]
        clf = GridSearchCV(lm, parameters, cv=10, scoring=score)
        clf.fit(X,y)
        test_scores.append(clf.cv_results_['mean_test_score'])
    return test_scores

In [108]:
def linearRegressionLasso(originalDf, targetColumn, featureColumns):
    '''
    Runs lasso regularized linear regression and returns the 
    mean and standard deviation of test scores
    '''
    parameters = {'alpha' : np.arange(0.0001, 0.01, 0.01)}
    scoringMethods = ['r2','neg_mean_squared_error']
    test_scores = []
    for score in scoringMethods:
        lm = linear_model.Lasso(normalize=True)
        X = originalDf[featureColumns]
        y = originalDf[targetColumn]
        clf = GridSearchCV(lm, parameters, cv=10, scoring=score)
        clf.fit(X,y)
        test_scores.append(clf.cv_results_['mean_test_score'])
    return test_scores

#### Let's first start with linear regression

In [18]:
######################## WITH GRADES ###############################

In [87]:
# Unregularized linear regression
scores = linearRegression(portTransformed, targetColumn, featureColumns)
portR2, port_mean_sq = scores['test_r2'], scores['test_neg_mean_squared_error']
print("Accuracy: %0.2f (+/- %0.2f)" % (portR2.mean(), portR2.std() * 2))
print("Accuracy: %0.2f (+/- %0.2f)" % (port_mean_sq.mean(), port_mean_sq.std() * 2))

Accuracy: 0.80 (+/- 0.18)
Accuracy: -1.80 (+/- 2.95)


In [99]:
# With ridge regression
scoresRidge = linearRegressionRidge(portTransformed, targetColumn, featureColumns)
portR2, port_mean_sq = scoresRidge[0], scoresRidge[1]
print("Accuracy: %0.2f (+/- %0.2f)" % (portR2.mean(), portR2.std() * 2))
print("Accuracy: %0.2f (+/- %0.2f)" % (port_mean_sq.mean(), port_mean_sq.std() * 2))

Accuracy: 0.80 (+/- 0.00)
Accuracy: -1.79 (+/- 0.00)


In [110]:
# With lasso regression
scoresLasso = linearRegressionLasso(portTransformed, targetColumn, featureColumns)
portR2, port_mean_sq = scoresLasso[0], scoresLasso[1]
print("Accuracy: %0.2f (+/- %0.2f)" % (portR2.mean(), portR2.std() * 2))
print("Accuracy: %0.2f (+/- %0.2f)" % (port_mean_sq.mean(), port_mean_sq.std() * 2))

Accuracy: 0.81 (+/- 0.00)
Accuracy: -1.78 (+/- 0.00)
