We try to determine which features (besides the grades, especially the Mathematics ones) explain the academic success. We consider all grades higher or equal to 10 to be passing grades and we mark them with 1 and all the grades bellow that as failing grades and we mark them with -1.

The goal is to find out if we are able to find some features (other that previous grades) that are able on their own to predict the academic outcome of the students: failed or passed.

In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

students_combined = pd.read_csv('../data/students-combined.csv', sep = ';')
students_combined = students_combined.drop(columns = 'Unnamed: 0')

students_combined

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,health,absences_x,G1_x,G2_x,G3_x,paid_y,absences_y,G1_y,G2_y,G3_y
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,3,4,0,11,11,no,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,3,2,9,11,11,no,4,5,5,6
2,GP,F,15,U,GT3,T,4,2,health,services,...,5,0,14,14,14,yes,2,15,14,15
3,GP,F,16,U,GT3,T,3,3,other,other,...,5,0,11,13,13,yes,4,6,10,10
4,GP,M,16,U,LE3,T,4,3,services,other,...,5,6,12,12,13,yes,10,15,15,15
5,GP,M,16,U,LE3,T,2,2,other,other,...,3,0,13,12,13,no,0,12,12,11
6,GP,F,17,U,GT3,A,4,4,other,teacher,...,1,2,10,13,13,no,6,6,5,6
7,GP,M,15,U,LE3,A,3,2,services,other,...,1,0,15,16,17,yes,0,16,18,19
8,GP,M,15,U,GT3,T,3,4,other,other,...,5,0,12,12,13,yes,0,14,15,15
9,GP,F,15,U,GT3,T,4,4,teacher,health,...,2,2,14,14,14,yes,0,10,8,9


In [2]:
students_combined['paid_y']     = students_combined['paid_y'].apply(lambda x: -1 if x == "no" else 1)
students_combined['paid_x']     = students_combined['paid_x'].apply(lambda x: -1 if x == "no" else 1)
students_combined['address']    = students_combined['address'].apply(lambda x: -1 if x == "R" else 1)
students_combined['famsize']    = students_combined['famsize'].apply(lambda x: -1 if x == "GT3" else 1)
students_combined['Pstatus']    = students_combined['Pstatus'].apply(lambda x: -1 if x == "A" else 1)
students_combined['Mjob']       = students_combined['Mjob'].apply(lambda x: 1 if x == "teacher" else 2 \
                                                                  if x == "health" else 3 if x == "services" else 4 \
                                                                  if x == "at_home" else 5)
students_combined['Fjob']       = students_combined['Fjob'].apply(lambda x: 1 if x == "teacher" else 2 \
                                                                  if x == "health" else 3 if x == "services" else 4 \
                                                                  if x == "at_home" else 5)
students_combined['reason']     = students_combined['reason'].apply(lambda x: 1 if x == "home" else 2 \
                                                                    if x == "reputation" else 3 if "course" else 4)
students_combined['guardian']   = students_combined['guardian'].apply(lambda x: 1 if x == "mother" else 2 \
                                                                      if x == "father" else 3)
students_combined['schoolsup']  = students_combined['schoolsup'].apply(lambda x: -1 if x == "no" else 1)
students_combined['famsup']     = students_combined['famsup'].apply(lambda x: -1 if x == "no" else 1)
students_combined['activities'] = students_combined['activities'].apply(lambda x: -1 if x == "no" else 1)
students_combined['nursery']    = students_combined['nursery'].apply(lambda x: -1 if x == "no" else 1)
students_combined['higher']     = students_combined['higher'].apply(lambda x: -1 if x == "no" else 1)
students_combined['internet']   = students_combined['internet'].apply(lambda x: -1 if x == "no" else 1)
students_combined['romantic']   = students_combined['romantic'].apply(lambda x: -1 if x == "no" else 1)

students_combined

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,health,absences_x,G1_x,G2_x,G3_x,paid_y,absences_y,G1_y,G2_y,G3_y
0,GP,F,18,1,-1,-1,4,4,4,1,...,3,4,0,11,11,-1,6,5,6,6
1,GP,F,17,1,-1,1,1,1,4,5,...,3,2,9,11,11,-1,4,5,5,6
2,GP,F,15,1,-1,1,4,2,2,3,...,5,0,14,14,14,1,2,15,14,15
3,GP,F,16,1,-1,1,3,3,5,5,...,5,0,11,13,13,1,4,6,10,10
4,GP,M,16,1,1,1,4,3,3,5,...,5,6,12,12,13,1,10,15,15,15
5,GP,M,16,1,1,1,2,2,5,5,...,3,0,13,12,13,-1,0,12,12,11
6,GP,F,17,1,-1,-1,4,4,5,1,...,1,2,10,13,13,-1,6,6,5,6
7,GP,M,15,1,1,-1,3,2,3,5,...,1,0,15,16,17,1,0,16,18,19
8,GP,M,15,1,-1,1,3,4,5,5,...,5,0,12,12,13,1,0,14,15,15
9,GP,F,15,1,-1,1,4,4,1,2,...,2,2,14,14,14,1,0,10,8,9


Converting the grades in passed or failed:

In [3]:
students_combined['G1_x'] = students_combined['G1_x'].apply(lambda x: -1 if x < 10 else 1)
students_combined['G2_x'] = students_combined['G2_x'].apply(lambda x: -1 if x < 10 else 1)
students_combined['G3_x'] = students_combined['G3_x'].apply(lambda x: -1 if x < 10 else 1)
students_combined['G1_y'] = students_combined['G1_y'].apply(lambda x: -1 if x < 10 else 1)
students_combined['G2_y'] = students_combined['G2_y'].apply(lambda x: -1 if x < 10 else 1)
students_combined['G3_y'] = students_combined['G3_y'].apply(lambda x: -1 if x < 10 else 1)

students_combined

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,health,absences_x,G1_x,G2_x,G3_x,paid_y,absences_y,G1_y,G2_y,G3_y
0,GP,F,18,1,-1,-1,4,4,4,1,...,3,4,-1,1,1,-1,6,-1,-1,-1
1,GP,F,17,1,-1,1,1,1,4,5,...,3,2,-1,1,1,-1,4,-1,-1,-1
2,GP,F,15,1,-1,1,4,2,2,3,...,5,0,1,1,1,1,2,1,1,1
3,GP,F,16,1,-1,1,3,3,5,5,...,5,0,1,1,1,1,4,-1,1,1
4,GP,M,16,1,1,1,4,3,3,5,...,5,6,1,1,1,1,10,1,1,1
5,GP,M,16,1,1,1,2,2,5,5,...,3,0,1,1,1,-1,0,1,1,1
6,GP,F,17,1,-1,-1,4,4,5,1,...,1,2,1,1,1,-1,6,-1,-1,-1
7,GP,M,15,1,1,-1,3,2,3,5,...,1,0,1,1,1,1,0,1,1,1
8,GP,M,15,1,-1,1,3,4,5,5,...,5,0,1,1,1,1,0,1,1,1
9,GP,F,15,1,-1,1,4,4,1,2,...,2,2,1,1,1,1,0,1,-1,-1


In [9]:
features = students_combined.loc[:, ['address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 
                                    'famsup', 'paid_x', 'paid_y', 'activities', 'nursery', 'higher', 'internet', 'romantic',
                                    'Medu', 'Fedu', 'studytime', 'failures', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc',
                                    'health', 'G1_x', 'G2_x', 'G1_y', 'G2_y', 'absences_x', 'absences_y']]

target   = students_combined.loc[:, ['G3_x', 'G3_y']]

for feature in features.columns: 
    features[feature] = (features[feature] - np.mean(features[feature])) / np.std(features[feature])

for feature in target.columns:
    target[feature] = (target[feature] - np.mean(target[feature])) / np.std(target[feature])
    
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.2)

In [10]:
print('Default number of features: {}'.format(2 * features.shape[1]))
print('{} features for the first predicted value and {} for the second one'.format(features.shape[1], features.shape[1]))
print('=' * 100)

lasso_normal = Lasso(max_iter = 10e6)
lasso_normal = lasso_normal.fit(X_train, y_train)

train_score = lasso_normal.score(X_train, y_train)
test_score  = lasso_normal.score(X_test, y_test)
coefficients_used = np.sum(lasso_normal.coef_ != 0)

print("Training score default lasso: {}".format(train_score))
print("Test score default lasso: {}".format(test_score))
print("Number of features used: {}".format(coefficients_used))

coef1, coef2 = np.array(lasso_normal.coef_)
most_relevant1 = coef1.argsort()[-10:]
most_relevant2 = coef2.argsort()[-10:]
print("Most relevant columns for Portuguese: {}".format(features.columns[most_relevant1[::-1]]))
print("Most relevant columns for Mathematics: {}".format(features.columns[most_relevant2[::-1]]))

alpha_values = [0.1, 0.01, 0.001, 0.0001, 2, 5]
for alpha_iter in alpha_values:
    
    lasso = Lasso(alpha = alpha_iter, max_iter = 10e6)
    lasso = lasso.fit(X_train, y_train)

    train_score = lasso.score(X_train, y_train)
    test_score  = lasso.score(X_test, y_test)
    coefficients_used = np.sum(lasso.coef_ != 0)

    print("-" * 100)
    print("Training score alpha = {} lasso: {}".format(alpha_iter, train_score))
    print("Test score alpha = {} lasso: {}".format(alpha_iter, test_score))
    print("Number of features used: {}".format(coefficients_used))

    coef1, coef2 = np.array(lasso.coef_)
    most_relevant1 = coef1.argsort()[-10:]
    most_relevant2 = coef2.argsort()[-10:]
    print("Most relevant columns for Portuguese: {}".format(features.columns[most_relevant1[::-1]]))
    print("Most relevant columns for Mathematics: {}".format(features.columns[most_relevant2[::-1]]))

Default number of features: 64
32 features for the first predicted value and 32 for the second one
Training score default lasso: -4.940501124865857e-15
Test score default lasso: -0.009593111682307028
Number of features used: 0
Most relevant columns for Portuguese: Index(['absences_y', 'absences_x', 'famsize', 'Pstatus', 'Mjob', 'Fjob',
       'reason', 'guardian', 'schoolsup', 'famsup'],
      dtype='object')
Most relevant columns for Mathematics: Index(['absences_y', 'absences_x', 'famsize', 'Pstatus', 'Mjob', 'Fjob',
       'reason', 'guardian', 'schoolsup', 'famsup'],
      dtype='object')
----------------------------------------------------------------------------------------------------
Training score alpha = 0.1 lasso: 0.6067132872214965
Test score alpha = 0.1 lasso: 0.4110482180866936
Number of features used: 7
Most relevant columns for Portuguese: Index(['G2_x', 'G1_x', 'absences_y', 'internet', 'famsize', 'Pstatus', 'Mjob',
       'Fjob', 'reason', 'guardian'],
      dtype='ob

This can be viewed as an overall failure for the lasso technique in the case of determining pass or failed caracteristics then the previous grades classified as passed or failed are used. The failure is even more staggering when the absences or the previous grades are removed. Overall this does not represent a sound approach.