# Looking into the Lasso technique for regression, trying to predict the 3rd grade based on a series of features

## First looking for the Portuguese students

In [33]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

students_portuguese = pd.read_csv('../data/student-por.csv', sep = ';')

students_portuguese

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13
5,GP,M,16,U,LE3,T,4,3,services,other,...,5,4,2,1,2,5,6,12,12,13
6,GP,M,16,U,LE3,T,2,2,other,other,...,4,4,4,1,1,3,0,13,12,13
7,GP,F,17,U,GT3,A,4,4,other,teacher,...,4,1,4,1,1,1,2,10,13,13
8,GP,M,15,U,LE3,A,3,2,services,other,...,4,2,2,1,1,1,0,15,16,17
9,GP,M,15,U,GT3,T,3,4,other,other,...,5,5,1,1,1,5,0,12,12,13


The quantitative variables (with the exception of age) and the ordinal variables (with the exception of traveltime) shall be used for the lasso model. The factor variables are explainable via the ordinal variables.

In [51]:
features = students_portuguese.loc[:, ['absences', 'G1', 'G2', 'Medu', 'Fedu', 'studytime', 
                                                'failures', 'famrel', 'freetime',
                                                'goout', 'Dalc', 'Walc', 'health']]
target   = students_portuguese.loc[:, 'G3']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.2)


In [71]:
print('Default number of features: {}'.format(features.shape[1]))
print('=' * 100)

lasso_normal = Lasso(max_iter = 10e6)
lasso_normal = lasso_normal.fit(X_train, y_train)

train_score = lasso_normal.score(X_train, y_train)
test_score  = lasso_normal.score(X_test, y_test)
coefficients_used = np.sum(lasso_normal.coef_ != 0)

print("Training score default lasso: {}".format(train_score))
print("Test score default lasso: {}".format(test_score))
print("Number of features used: {}".format(coefficients_used))

coef = np.array(lasso_normal.coef_)
most_relevant = coef.argsort()[-10:]
print("Most relevant columns: {}".format(features.columns[most_relevant[::-1]]))

alpha_values = [0.1, 0.01, 0.001, 2, 5, 10]
for alpha_iter in alpha_values:
    
    lasso = Lasso(alpha = alpha_iter, max_iter = 10e6)
    lasso = lasso.fit(X_train, y_train)

    train_score = lasso.score(X_train, y_train)
    test_score  = lasso.score(X_test, y_test)
    coefficients_used = np.sum(lasso.coef_ != 0)

    print("-" * 100)
    print("Training score alpha = {} lasso: {}".format(alpha_iter, train_score))
    print("Test score alpha = {} lasso: {}".format(alpha_iter, test_score))
    print("Number of features used: {}".format(coefficients_used))

    coef = np.array(lasso.coef_)
    most_relevant = coef.argsort()[-10:]
    print("Most relevant columns: {}".format(features.columns[most_relevant[::-1]]))


Default number of features: 13
Training score default lasso: 0.8199823565487045
Test score default lasso: 0.9120749579010489
Number of features used: 2
Most relevant columns: Index(['G2', 'G1', 'health', 'Walc', 'Dalc', 'goout', 'freetime', 'famrel',
       'failures', 'studytime'],
      dtype='object')
----------------------------------------------------------------------------------------------------
Training score alpha = 0.1 lasso: 0.8347042355255891
Test score alpha = 0.1 lasso: 0.9180607524592728
Number of features used: 5
Most relevant columns: Index(['G2', 'G1', 'absences', 'Dalc', 'goout', 'freetime', 'famrel',
       'failures', 'studytime', 'Fedu'],
      dtype='object')
----------------------------------------------------------------------------------------------------
Training score alpha = 0.01 lasso: 0.8379648740165422
Test score alpha = 0.01 lasso: 0.9195270870541393
Number of features used: 11
Most relevant columns: Index(['G2', 'G1', 'studytime', 'absences', 'goout',

## Now looking at the Mathematics students

In [72]:
students_mathematics = pd.read_csv('../data/student-mat.csv', sep = ';')

students_mathematics

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
5,GP,M,16,U,LE3,T,4,3,services,other,...,5,4,2,1,2,5,10,15,15,15
6,GP,M,16,U,LE3,T,2,2,other,other,...,4,4,4,1,1,3,0,12,12,11
7,GP,F,17,U,GT3,A,4,4,other,teacher,...,4,1,4,1,1,1,6,6,5,6
8,GP,M,15,U,LE3,A,3,2,services,other,...,4,2,2,1,1,1,0,16,18,19
9,GP,M,15,U,GT3,T,3,4,other,other,...,5,5,1,1,1,5,0,14,15,15


In [73]:
features = students_mathematics.loc[:, ['absences', 'G1', 'G2', 'Medu', 'Fedu', 'studytime', 
                                                'failures', 'famrel', 'freetime',
                                                'goout', 'Dalc', 'Walc', 'health']]
target   = students_mathematics.loc[:, 'G3']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.2)

In [76]:
print('Default number of features: {}'.format(features.shape[1]))
print('=' * 100)

lasso_normal = Lasso(max_iter = 10e6)
lasso_normal = lasso_normal.fit(X_train, y_train)

train_score = lasso_normal.score(X_train, y_train)
test_score  = lasso_normal.score(X_test, y_test)
coefficients_used = np.sum(lasso_normal.coef_ != 0)

print("Training score default lasso: {}".format(train_score))
print("Test score default lasso: {}".format(test_score))
print("Number of features used: {}".format(coefficients_used))

coef = np.array(lasso_normal.coef_)
most_relevant = coef.argsort()[-10:]
print("Most relevant columns: {}".format(features.columns[most_relevant[::-1]]))

alpha_values = [0.1, 0.01, 0.001, 2, 5, 10, 15]
for alpha_iter in alpha_values:
    
    lasso = Lasso(alpha = alpha_iter, max_iter = 10e6)
    lasso = lasso.fit(X_train, y_train)

    train_score = lasso.score(X_train, y_train)
    test_score  = lasso.score(X_test, y_test)
    coefficients_used = np.sum(lasso.coef_ != 0)

    print("-" * 100)
    print("Training score alpha = {} lasso: {}".format(alpha_iter, train_score))
    print("Test score alpha = {} lasso: {}".format(alpha_iter, test_score))
    print("Number of features used: {}".format(coefficients_used))

    coef = np.array(lasso.coef_)
    most_relevant = coef.argsort()[-10:]
    print("Most relevant columns: {}".format(features.columns[most_relevant[::-1]]))

Default number of features: 13
Training score default lasso: 0.8053304457107777
Test score default lasso: 0.8907590606146218
Number of features used: 3
Most relevant columns: Index(['G2', 'G1', 'absences', 'health', 'Walc', 'Dalc', 'goout', 'freetime',
       'famrel', 'failures'],
      dtype='object')
----------------------------------------------------------------------------------------------------
Training score alpha = 0.1 lasso: 0.8166607601686023
Test score alpha = 0.1 lasso: 0.8957010845754266
Number of features used: 9
Most relevant columns: Index(['G2', 'famrel', 'G1', 'health', 'Walc', 'absences', 'freetime', 'Dalc',
       'goout', 'Fedu'],
      dtype='object')
----------------------------------------------------------------------------------------------------
Training score alpha = 0.01 lasso: 0.820375398381229
Test score alpha = 0.01 lasso: 0.8974758262008494
Number of features used: 12
Most relevant columns: Index(['G2', 'famrel', 'G1', 'Walc', 'freetime', 'Medu', 'hea

## Looking at the inner joined data set, the one containing the students that took both the Mathematics and the Portuguese classes

In [78]:
students_combined = pd.read_csv('../data/students-combined.csv', sep = ';')
students_combined = students_combined.drop(columns = 'Unnamed: 0')

students_combined

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,health,absences_x,G1_x,G2_x,G3_x,paid_y,absences_y,G1_y,G2_y,G3_y
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,3,4,0,11,11,no,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,3,2,9,11,11,no,4,5,5,6
2,GP,F,15,U,GT3,T,4,2,health,services,...,5,0,14,14,14,yes,2,15,14,15
3,GP,F,16,U,GT3,T,3,3,other,other,...,5,0,11,13,13,yes,4,6,10,10
4,GP,M,16,U,LE3,T,4,3,services,other,...,5,6,12,12,13,yes,10,15,15,15
5,GP,M,16,U,LE3,T,2,2,other,other,...,3,0,13,12,13,no,0,12,12,11
6,GP,F,17,U,GT3,A,4,4,other,teacher,...,1,2,10,13,13,no,6,6,5,6
7,GP,M,15,U,LE3,A,3,2,services,other,...,1,0,15,16,17,yes,0,16,18,19
8,GP,M,15,U,GT3,T,3,4,other,other,...,5,0,12,12,13,yes,0,14,15,15
9,GP,F,15,U,GT3,T,4,4,teacher,health,...,2,2,14,14,14,yes,0,10,8,9


Trying to predict the final grade for both Portuguese and Mathematics using the prior information

In [80]:
features = students_combined.loc[:, ['absences_x', 'G1_x', 'G2_x', 'Medu', 'Fedu', 'studytime', 
                                                'failures', 'famrel', 'freetime',
                                                'goout', 'Dalc', 'Walc', 'health', 'absences_y', 'G1_y', 'G2_y']]
target   = students_combined.loc[:, ['G3_x', 'G3_y']]

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.2)

In [118]:
print('Default number of features: {}'.format(2 * features.shape[1]))
print('{} features for the first predicted value and {} for the second one'.format(features.shape[1], features.shape[1]))
print('=' * 100)

lasso_normal = Lasso(max_iter = 10e6)
lasso_normal = lasso_normal.fit(X_train, y_train)

train_score = lasso_normal.score(X_train, y_train)
test_score  = lasso_normal.score(X_test, y_test)
coefficients_used = np.sum(lasso_normal.coef_ != 0)

print("Training score default lasso: {}".format(train_score))
print("Test score default lasso: {}".format(test_score))
print("Number of features used: {}".format(coefficients_used))

coef1, coef2 = np.array(lasso.coef_)
most_relevant1 = coef1.argsort()[-10:]
most_relevant2 = coef2.argsort()[-10:]
print("Most relevant columns for Portuguese: {}".format(features.columns[most_relevant1[::-1]]))
print("Most relevant columns for Mathematics: {}".format(features.columns[most_relevant2[::-1]]))

alpha_values = [0.1, 0.01, 0.001, 2, 5, 10, 15]
for alpha_iter in alpha_values:
    
    lasso = Lasso(alpha = alpha_iter, max_iter = 10e6)
    lasso = lasso.fit(X_train, y_train)

    train_score = lasso.score(X_train, y_train)
    test_score  = lasso.score(X_test, y_test)
    coefficients_used = np.sum(lasso.coef_ != 0)

    print("-" * 100)
    print("Training score alpha = {} lasso: {}".format(alpha_iter, train_score))
    print("Test score alpha = {} lasso: {}".format(alpha_iter, test_score))
    print("Number of features used: {}".format(coefficients_used))

    coef1, coef2 = np.array(lasso.coef_)
    most_relevant1 = coef1.argsort()[-10:]
    most_relevant2 = coef2.argsort()[-10:]
    print("Most relevant columns for Portuguese: {}".format(features.columns[most_relevant1[::-1]]))
    print("Most relevant columns for Mathematics: {}".format(features.columns[most_relevant2[::-1]]))

Default number of features: 32
16 features for the first predicted value and 16 for the second one
Training score default lasso: 0.813310410643022
Test score default lasso: 0.8262897376123167
Number of features used: 5
Most relevant columns for Portuguese: Index(['G2_y', 'G1_y', 'absences_y', 'health', 'Walc', 'Dalc', 'goout',
       'freetime', 'famrel', 'failures'],
      dtype='object')
Most relevant columns for Mathematics: Index(['G2_y', 'G1_y', 'absences_y', 'health', 'Walc', 'Dalc', 'goout',
       'freetime', 'famrel', 'failures'],
      dtype='object')
----------------------------------------------------------------------------------------------------
Training score alpha = 0.1 lasso: 0.8341200426864936
Test score alpha = 0.1 lasso: 0.8284345222971394
Number of features used: 18
Most relevant columns for Portuguese: Index(['G2_x', 'G1_x', 'G1_y', 'absences_x', 'health', 'Walc', 'goout',
       'famrel', 'studytime', 'Fedu'],
      dtype='object')
Most relevant columns for Math

Removing the second grade from the analysis

In [127]:
features = students_combined.loc[:, ['absences_x', 'G1_x', 'Medu', 'Fedu', 'studytime', 
                                                'failures', 'famrel', 'freetime',
                                                'goout', 'Dalc', 'Walc', 'health', 'absences_y', 'G1_y']]
target   = students_combined.loc[:, ['G3_x', 'G3_y']]

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.2)

In [138]:
print('Default number of features: {}'.format(2 * features.shape[1]))
print('{} features for the first predicted value and {} for the second one'.format(features.shape[1], features.shape[1]))
print('=' * 100)

lasso_normal = Lasso(max_iter = 10e6)
lasso_normal = lasso_normal.fit(X_train, y_train)

train_score = lasso_normal.score(X_train, y_train)
test_score  = lasso_normal.score(X_test, y_test)
coefficients_used = np.sum(lasso_normal.coef_ != 0)

print("Training score default lasso: {}".format(train_score))
print("Test score default lasso: {}".format(test_score))
print("Number of features used: {}".format(coefficients_used))

coef1, coef2 = np.array(lasso.coef_)
print(coef1)
print(coef2)

alpha_values = [0.1, 0.01, 0.001, 2, 5, 10, 15]
for alpha_iter in alpha_values:
    
    lasso = Lasso(alpha = alpha_iter, max_iter = 10e6)
    lasso = lasso.fit(X_train, y_train)

    train_score = lasso.score(X_train, y_train)
    test_score  = lasso.score(X_test, y_test)
    coefficients_used = np.sum(lasso.coef_ != 0)

    print("-" * 100)
    print("Training score alpha = {} lasso: {}".format(alpha_iter, train_score))
    print("Test score alpha = {} lasso: {}".format(alpha_iter, test_score))
    print("Number of features used: {}".format(coefficients_used))

    coef1, coef2 = np.array(lasso.coef_)
    most_relevant1 = coef1.argsort()[-10:]
    most_relevant2 = coef2.argsort()[-10:]
    print("Most relevant columns for Portuguese: {}".format(features.columns[most_relevant1[::-1]]))
    print("Most relevant columns for Mathematics: {}".format(features.columns[most_relevant2[::-1]]))

Default number of features: 28
14 features for the first predicted value and 14 for the second one
Training score default lasso: 0.6814685263609779
Test score default lasso: 0.5949513841327817
Number of features used: 4
[-0.  0.  0.  0.  0. -0. -0. -0. -0. -0. -0. -0. -0.  0.]
[-0.  0.  0.  0.  0. -0. -0. -0. -0. -0. -0. -0. -0.  0.]
----------------------------------------------------------------------------------------------------
Training score alpha = 0.1 lasso: 0.7058495540513209
Test score alpha = 0.1 lasso: 0.578753108475099
Number of features used: 18
Most relevant columns for Portuguese: Index(['G1_x', 'G1_y', 'absences_x', 'studytime', 'Walc', 'goout', 'famrel',
       'Fedu', 'Medu', 'absences_y'],
      dtype='object')
Most relevant columns for Mathematics: Index(['G1_y', 'G1_x', 'Walc', 'health', 'absences_y', 'famrel', 'Medu',
       'Dalc', 'goout', 'freetime'],
      dtype='object')
----------------------------------------------------------------------------------------

Removing also the first grade

In [140]:
features = students_combined.loc[:, ['absences_x', 'Medu', 'Fedu', 'studytime', 
                                                'failures', 'famrel', 'freetime',
                                                'goout', 'Dalc', 'Walc', 'health', 'absences_y']]
target   = students_combined.loc[:, ['G3_x', 'G3_y']]

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.2)

In [141]:
print('Default number of features: {}'.format(2 * features.shape[1]))
print('{} features for the first predicted value and {} for the second one'.format(features.shape[1], features.shape[1]))
print('=' * 100)

lasso_normal = Lasso(max_iter = 10e6)
lasso_normal = lasso_normal.fit(X_train, y_train)

train_score = lasso_normal.score(X_train, y_train)
test_score  = lasso_normal.score(X_test, y_test)
coefficients_used = np.sum(lasso_normal.coef_ != 0)

print("Training score default lasso: {}".format(train_score))
print("Test score default lasso: {}".format(test_score))
print("Number of features used: {}".format(coefficients_used))

coef1, coef2 = np.array(lasso.coef_)
print(coef1)
print(coef2)

alpha_values = [0.1, 0.01, 0.001, 2, 5, 10, 15]
for alpha_iter in alpha_values:
    
    lasso = Lasso(alpha = alpha_iter, max_iter = 10e6)
    lasso = lasso.fit(X_train, y_train)

    train_score = lasso.score(X_train, y_train)
    test_score  = lasso.score(X_test, y_test)
    coefficients_used = np.sum(lasso.coef_ != 0)

    print("-" * 100)
    print("Training score alpha = {} lasso: {}".format(alpha_iter, train_score))
    print("Test score alpha = {} lasso: {}".format(alpha_iter, test_score))
    print("Number of features used: {}".format(coefficients_used))

    coef1, coef2 = np.array(lasso.coef_)
    most_relevant1 = coef1.argsort()[-10:]
    most_relevant2 = coef2.argsort()[-10:]
    print("Most relevant columns for Portuguese: {}".format(features.columns[most_relevant1[::-1]]))
    print("Most relevant columns for Mathematics: {}".format(features.columns[most_relevant2[::-1]]))

Default number of features: 24
12 features for the first predicted value and 12 for the second one
Training score default lasso: 0.008023327094650855
Test score default lasso: 0.021220558230881016
Number of features used: 3
[-0.  0.  0.  0.  0. -0. -0. -0. -0. -0. -0. -0. -0.  0.]
[-0.  0.  0.  0.  0. -0. -0. -0. -0. -0. -0. -0. -0.  0.]
----------------------------------------------------------------------------------------------------
Training score alpha = 0.1 lasso: 0.11740658578552697
Test score alpha = 0.1 lasso: 0.14112330945274862
Number of features used: 20
Most relevant columns for Portuguese: Index(['studytime', 'Medu', 'Walc', 'absences_x', 'goout', 'famrel', 'Fedu',
       'absences_y', 'freetime', 'health'],
      dtype='object')
Most relevant columns for Mathematics: Index(['Medu', 'Walc', 'Fedu', 'studytime', 'freetime', 'absences_y', 'famrel',
       'health', 'absences_x', 'failures'],
      dtype='object')
-------------------------------------------------------------

So, we have no chance of predicting the final grade without the help of any of the previous grades. 
Let's look at the situation in with we don't have the first grade but we have the second grade.

In [143]:
features = students_combined.loc[:, ['absences_x', 'Medu', 'Fedu', 'studytime', 
                                                'failures', 'famrel', 'freetime',
                                                'goout', 'Dalc', 'Walc', 'health', 'absences_y', 'G2_x', 'G2_y']]
target   = students_combined.loc[:, ['G3_x', 'G3_y']]

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.2)

In [144]:
print('Default number of features: {}'.format(2 * features.shape[1]))
print('{} features for the first predicted value and {} for the second one'.format(features.shape[1], features.shape[1]))
print('=' * 100)

lasso_normal = Lasso(max_iter = 10e6)
lasso_normal = lasso_normal.fit(X_train, y_train)

train_score = lasso_normal.score(X_train, y_train)
test_score  = lasso_normal.score(X_test, y_test)
coefficients_used = np.sum(lasso_normal.coef_ != 0)

print("Training score default lasso: {}".format(train_score))
print("Test score default lasso: {}".format(test_score))
print("Number of features used: {}".format(coefficients_used))

coef1, coef2 = np.array(lasso.coef_)
print(coef1)
print(coef2)

alpha_values = [0.1, 0.01, 0.001, 2, 5, 10, 15]
for alpha_iter in alpha_values:
    
    lasso = Lasso(alpha = alpha_iter, max_iter = 10e6)
    lasso = lasso.fit(X_train, y_train)

    train_score = lasso.score(X_train, y_train)
    test_score  = lasso.score(X_test, y_test)
    coefficients_used = np.sum(lasso.coef_ != 0)

    print("-" * 100)
    print("Training score alpha = {} lasso: {}".format(alpha_iter, train_score))
    print("Test score alpha = {} lasso: {}".format(alpha_iter, test_score))
    print("Number of features used: {}".format(coefficients_used))

    coef1, coef2 = np.array(lasso.coef_)
    most_relevant1 = coef1.argsort()[-10:]
    most_relevant2 = coef2.argsort()[-10:]
    print("Most relevant columns for Portuguese: {}".format(features.columns[most_relevant1[::-1]]))
    print("Most relevant columns for Mathematics: {}".format(features.columns[most_relevant2[::-1]]))

Default number of features: 28
14 features for the first predicted value and 14 for the second one
Training score default lasso: 0.8092113607493295
Test score default lasso: 0.8321159532635143
Number of features used: 4
[-0.  0.  0.  0. -0. -0. -0. -0. -0. -0. -0. -0.]
[-0.  0.  0.  0. -0. -0. -0. -0. -0.  0. -0.  0.]
----------------------------------------------------------------------------------------------------
Training score alpha = 0.1 lasso: 0.8272017497966664
Test score alpha = 0.1 lasso: 0.8395628524613928
Number of features used: 12
Most relevant columns for Portuguese: Index(['G2_x', 'absences_x', 'G2_y', 'health', 'Walc', 'Dalc', 'goout',
       'freetime', 'famrel', 'failures'],
      dtype='object')
Most relevant columns for Mathematics: Index(['G2_y', 'famrel', 'G2_x', 'Walc', 'absences_y', 'health', 'Dalc',
       'goout', 'freetime', 'failures'],
      dtype='object')
----------------------------------------------------------------------------------------------------

It seems that the second grade is more relevant that the first one. So now we try removing one of the second grades but reintroducing a first grade but from the other subject

In [145]:
features = students_combined.loc[:, ['absences_x', 'Medu', 'Fedu', 'studytime', 
                                                'failures', 'famrel', 'freetime',
                                                'goout', 'Dalc', 'Walc', 'health', 'absences_y', 'G2_x', 'G1_y']]
target   = students_combined.loc[:, ['G3_x', 'G3_y']]

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.2)

In [146]:
print('Default number of features: {}'.format(2 * features.shape[1]))
print('{} features for the first predicted value and {} for the second one'.format(features.shape[1], features.shape[1]))
print('=' * 100)

lasso_normal = Lasso(max_iter = 10e6)
lasso_normal = lasso_normal.fit(X_train, y_train)

train_score = lasso_normal.score(X_train, y_train)
test_score  = lasso_normal.score(X_test, y_test)
coefficients_used = np.sum(lasso_normal.coef_ != 0)

print("Training score default lasso: {}".format(train_score))
print("Test score default lasso: {}".format(test_score))
print("Number of features used: {}".format(coefficients_used))

coef1, coef2 = np.array(lasso.coef_)
print(coef1)
print(coef2)

alpha_values = [0.1, 0.01, 0.001, 2, 5, 10, 15]
for alpha_iter in alpha_values:
    
    lasso = Lasso(alpha = alpha_iter, max_iter = 10e6)
    lasso = lasso.fit(X_train, y_train)

    train_score = lasso.score(X_train, y_train)
    test_score  = lasso.score(X_test, y_test)
    coefficients_used = np.sum(lasso.coef_ != 0)

    print("-" * 100)
    print("Training score alpha = {} lasso: {}".format(alpha_iter, train_score))
    print("Test score alpha = {} lasso: {}".format(alpha_iter, test_score))
    print("Number of features used: {}".format(coefficients_used))

    coef1, coef2 = np.array(lasso.coef_)
    most_relevant1 = coef1.argsort()[-10:]
    most_relevant2 = coef2.argsort()[-10:]
    print("Most relevant columns for Portuguese: {}".format(features.columns[most_relevant1[::-1]]))
    print("Most relevant columns for Mathematics: {}".format(features.columns[most_relevant2[::-1]]))

Default number of features: 28
14 features for the first predicted value and 14 for the second one
Training score default lasso: 0.6868753600708458
Test score default lasso: 0.7390742490916306
Number of features used: 5
[-0.  0.  0.  0. -0. -0. -0. -0. -0. -0. -0. -0.  0.  0.]
[-0.  0.  0.  0. -0.  0.  0. -0. -0. -0. -0.  0.  0.  0.]
----------------------------------------------------------------------------------------------------
Training score alpha = 0.1 lasso: 0.7115550268387356
Test score alpha = 0.1 lasso: 0.7641366197662409
Number of features used: 15
Most relevant columns for Portuguese: Index(['G2_x', 'absences_x', 'G1_y', 'health', 'Walc', 'Dalc', 'goout',
       'freetime', 'famrel', 'studytime'],
      dtype='object')
Most relevant columns for Mathematics: Index(['G1_y', 'G2_x', 'Walc', 'Medu', 'health', 'Dalc', 'absences_y',
       'freetime', 'famrel', 'failures'],
      dtype='object')
------------------------------------------------------------------------------------

In [147]:
features = students_combined.loc[:, ['absences_x', 'Medu', 'Fedu', 'studytime', 
                                                'failures', 'famrel', 'freetime',
                                                'goout', 'Dalc', 'Walc', 'health', 'absences_y', 'G1_x', 'G2_y']]
target   = students_combined.loc[:, ['G3_x', 'G3_y']]

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.2)

In [148]:
print('Default number of features: {}'.format(2 * features.shape[1]))
print('{} features for the first predicted value and {} for the second one'.format(features.shape[1], features.shape[1]))
print('=' * 100)

lasso_normal = Lasso(max_iter = 10e6)
lasso_normal = lasso_normal.fit(X_train, y_train)

train_score = lasso_normal.score(X_train, y_train)
test_score  = lasso_normal.score(X_test, y_test)
coefficients_used = np.sum(lasso_normal.coef_ != 0)

print("Training score default lasso: {}".format(train_score))
print("Test score default lasso: {}".format(test_score))
print("Number of features used: {}".format(coefficients_used))

coef1, coef2 = np.array(lasso.coef_)
print(coef1)
print(coef2)

alpha_values = [0.1, 0.01, 0.001, 2, 5, 10, 15]
for alpha_iter in alpha_values:
    
    lasso = Lasso(alpha = alpha_iter, max_iter = 10e6)
    lasso = lasso.fit(X_train, y_train)

    train_score = lasso.score(X_train, y_train)
    test_score  = lasso.score(X_test, y_test)
    coefficients_used = np.sum(lasso.coef_ != 0)

    print("-" * 100)
    print("Training score alpha = {} lasso: {}".format(alpha_iter, train_score))
    print("Test score alpha = {} lasso: {}".format(alpha_iter, test_score))
    print("Number of features used: {}".format(coefficients_used))

    coef1, coef2 = np.array(lasso.coef_)
    most_relevant1 = coef1.argsort()[-10:]
    most_relevant2 = coef2.argsort()[-10:]
    print("Most relevant columns for Portuguese: {}".format(features.columns[most_relevant1[::-1]]))
    print("Most relevant columns for Mathematics: {}".format(features.columns[most_relevant2[::-1]]))

Default number of features: 28
14 features for the first predicted value and 14 for the second one
Training score default lasso: 0.7572833590385984
Test score default lasso: 0.8836093330565066
Number of features used: 5
[-0.  0.  0.  0. -0. -0. -0. -0. -0. -0. -0. -0.  0.  0.]
[-0.  0.  0.  0. -0. -0. -0. -0. -0. -0. -0. -0.  0.  0.]
----------------------------------------------------------------------------------------------------
Training score alpha = 0.1 lasso: 0.7798264226076181
Test score alpha = 0.1 lasso: 0.8937002222691526
Number of features used: 18
Most relevant columns for Portuguese: Index(['G1_x', 'famrel', 'G2_y', 'absences_x', 'Walc', 'goout', 'studytime',
       'Fedu', 'Medu', 'freetime'],
      dtype='object')
Most relevant columns for Mathematics: Index(['G2_y', 'famrel', 'G1_x', 'Walc', 'absences_y', 'health', 'Medu',
       'Dalc', 'goout', 'freetime'],
      dtype='object')
-----------------------------------------------------------------------------------------

By looking at the two situations we find that we are able to predict the final grades for Portuguese and Mathematics when we only have the second grade for Mathematics and the first grade for Portuguese than when the first grade in Mathematics and the second grade in Portuguese is known.

In [149]:
features = students_combined.loc[:, ['absences_x', 'Medu', 'Fedu', 'studytime', 
                                                'failures', 'famrel', 'freetime',
                                                'goout', 'Dalc', 'Walc', 'health', 'absences_y', 'G1_x', 'G2_y', 'G2_x']]
target   = students_combined.loc[:, ['G3_x', 'G3_y']]

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.2)

In [150]:
print('Default number of features: {}'.format(2 * features.shape[1]))
print('{} features for the first predicted value and {} for the second one'.format(features.shape[1], features.shape[1]))
print('=' * 100)

lasso_normal = Lasso(max_iter = 10e6)
lasso_normal = lasso_normal.fit(X_train, y_train)

train_score = lasso_normal.score(X_train, y_train)
test_score  = lasso_normal.score(X_test, y_test)
coefficients_used = np.sum(lasso_normal.coef_ != 0)

print("Training score default lasso: {}".format(train_score))
print("Test score default lasso: {}".format(test_score))
print("Number of features used: {}".format(coefficients_used))

coef1, coef2 = np.array(lasso.coef_)
print(coef1)
print(coef2)

alpha_values = [0.1, 0.01, 0.001, 2, 5, 10, 15]
for alpha_iter in alpha_values:
    
    lasso = Lasso(alpha = alpha_iter, max_iter = 10e6)
    lasso = lasso.fit(X_train, y_train)

    train_score = lasso.score(X_train, y_train)
    test_score  = lasso.score(X_test, y_test)
    coefficients_used = np.sum(lasso.coef_ != 0)

    print("-" * 100)
    print("Training score alpha = {} lasso: {}".format(alpha_iter, train_score))
    print("Test score alpha = {} lasso: {}".format(alpha_iter, test_score))
    print("Number of features used: {}".format(coefficients_used))

    coef1, coef2 = np.array(lasso.coef_)
    most_relevant1 = coef1.argsort()[-10:]
    most_relevant2 = coef2.argsort()[-10:]
    print("Most relevant columns for Portuguese: {}".format(features.columns[most_relevant1[::-1]]))
    print("Most relevant columns for Mathematics: {}".format(features.columns[most_relevant2[::-1]]))

Default number of features: 30
15 features for the first predicted value and 15 for the second one
Training score default lasso: 0.8080145297147785
Test score default lasso: 0.8781713077288474
Number of features used: 4
[-0.  0.  0.  0. -0. -0. -0. -0. -0. -0. -0. -0.  0.  0.]
[-0.  0.  0.  0. -0. -0.  0. -0. -0.  0. -0. -0.  0.  0.]
----------------------------------------------------------------------------------------------------
Training score alpha = 0.1 lasso: 0.8266676624543167
Test score alpha = 0.1 lasso: 0.8802520249183082
Number of features used: 15
Most relevant columns for Portuguese: Index(['G2_x', 'G1_x', 'absences_x', 'G2_y', 'health', 'Walc', 'goout',
       'freetime', 'famrel', 'studytime'],
      dtype='object')
Most relevant columns for Mathematics: Index(['G2_y', 'famrel', 'G2_x', 'freetime', 'Walc', 'absences_y', 'health',
       'G1_x', 'Dalc', 'goout'],
      dtype='object')
---------------------------------------------------------------------------------------

In [152]:
features = students_combined.loc[:, ['absences_x', 'Medu', 'Fedu', 'studytime', 
                                                'failures', 'famrel', 'freetime',
                                                'goout', 'Dalc', 'Walc', 'health', 'absences_y', 'G1_y', 'G2_y']]
target   = students_combined.loc[:, ['G3_x', 'G3_y']]

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.2)

In [154]:
print('Default number of features: {}'.format(2 * features.shape[1]))
print('{} features for the first predicted value and {} for the second one'.format(features.shape[1], features.shape[1]))
print('=' * 100)

lasso_normal = Lasso(max_iter = 10e6)
lasso_normal = lasso_normal.fit(X_train, y_train)

train_score = lasso_normal.score(X_train, y_train)
test_score  = lasso_normal.score(X_test, y_test)
coefficients_used = np.sum(lasso_normal.coef_ != 0)

print("Training score default lasso: {}".format(train_score))
print("Test score default lasso: {}".format(test_score))
print("Number of features used: {}".format(coefficients_used))

coef1, coef2 = np.array(lasso.coef_)
coef1, coef2 = np.array(lasso.coef_)
most_relevant1 = coef1.argsort()[-10:]
most_relevant2 = coef2.argsort()[-10:]
print("Most relevant columns for Portuguese: {}".format(features.columns[most_relevant1[::-1]]))
print("Most relevant columns for Mathematics: {}".format(features.columns[most_relevant2[::-1]]))

alpha_values = [0.1, 0.01, 0.001, 2, 5, 10, 15]
for alpha_iter in alpha_values:
    
    lasso = Lasso(alpha = alpha_iter, max_iter = 10e6)
    lasso = lasso.fit(X_train, y_train)

    train_score = lasso.score(X_train, y_train)
    test_score  = lasso.score(X_test, y_test)
    coefficients_used = np.sum(lasso.coef_ != 0)

    print("-" * 100)
    print("Training score alpha = {} lasso: {}".format(alpha_iter, train_score))
    print("Test score alpha = {} lasso: {}".format(alpha_iter, test_score))
    print("Number of features used: {}".format(coefficients_used))

    coef1, coef2 = np.array(lasso.coef_)
    most_relevant1 = coef1.argsort()[-10:]
    most_relevant2 = coef2.argsort()[-10:]
    print("Most relevant columns for Portuguese: {}".format(features.columns[most_relevant1[::-1]]))
    print("Most relevant columns for Mathematics: {}".format(features.columns[most_relevant2[::-1]]))

Default number of features: 28
14 features for the first predicted value and 14 for the second one
Training score default lasso: 0.6560563782514762
Test score default lasso: 0.6912569021419093
Number of features used: 6
Most relevant columns for Portuguese: Index(['G2_y', 'G1_y', 'absences_y', 'health', 'Walc', 'Dalc', 'goout',
       'freetime', 'famrel', 'failures'],
      dtype='object')
Most relevant columns for Mathematics: Index(['G2_y', 'G1_y', 'absences_y', 'health', 'Walc', 'Dalc', 'goout',
       'freetime', 'famrel', 'failures'],
      dtype='object')
----------------------------------------------------------------------------------------------------
Training score alpha = 0.1 lasso: 0.7014263372692329
Test score alpha = 0.1 lasso: 0.7225640392774101
Number of features used: 21
Most relevant columns for Portuguese: Index(['studytime', 'G1_y', 'G2_y', 'goout', 'Medu', 'famrel', 'absences_x',
       'Walc', 'freetime', 'Fedu'],
      dtype='object')
Most relevant columns for M

In [155]:
features = students_combined.loc[:, ['absences_x', 'Medu', 'Fedu', 'studytime', 
                                                'failures', 'famrel', 'freetime',
                                                'goout', 'Dalc', 'Walc', 'health', 'absences_y', 'G1_x', 'G2_x']]
target   = students_combined.loc[:, ['G3_x', 'G3_y']]

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.2)

In [156]:
print('Default number of features: {}'.format(2 * features.shape[1]))
print('{} features for the first predicted value and {} for the second one'.format(features.shape[1], features.shape[1]))
print('=' * 100)

lasso_normal = Lasso(max_iter = 10e6)
lasso_normal = lasso_normal.fit(X_train, y_train)

train_score = lasso_normal.score(X_train, y_train)
test_score  = lasso_normal.score(X_test, y_test)
coefficients_used = np.sum(lasso_normal.coef_ != 0)

print("Training score default lasso: {}".format(train_score))
print("Test score default lasso: {}".format(test_score))
print("Number of features used: {}".format(coefficients_used))

coef1, coef2 = np.array(lasso.coef_)
coef1, coef2 = np.array(lasso.coef_)
most_relevant1 = coef1.argsort()[-10:]
most_relevant2 = coef2.argsort()[-10:]
print("Most relevant columns for Portuguese: {}".format(features.columns[most_relevant1[::-1]]))
print("Most relevant columns for Mathematics: {}".format(features.columns[most_relevant2[::-1]]))

alpha_values = [0.1, 0.01, 0.001, 2, 5, 10, 15]
for alpha_iter in alpha_values:
    
    lasso = Lasso(alpha = alpha_iter, max_iter = 10e6)
    lasso = lasso.fit(X_train, y_train)

    train_score = lasso.score(X_train, y_train)
    test_score  = lasso.score(X_test, y_test)
    coefficients_used = np.sum(lasso.coef_ != 0)

    print("-" * 100)
    print("Training score alpha = {} lasso: {}".format(alpha_iter, train_score))
    print("Test score alpha = {} lasso: {}".format(alpha_iter, test_score))
    print("Number of features used: {}".format(coefficients_used))

    coef1, coef2 = np.array(lasso.coef_)
    most_relevant1 = coef1.argsort()[-10:]
    most_relevant2 = coef2.argsort()[-10:]
    print("Most relevant columns for Portuguese: {}".format(features.columns[most_relevant1[::-1]]))
    print("Most relevant columns for Mathematics: {}".format(features.columns[most_relevant2[::-1]]))

Default number of features: 28
14 features for the first predicted value and 14 for the second one
Training score default lasso: 0.44922693972223343
Test score default lasso: 0.42051903367900656
Number of features used: 4
Most relevant columns for Portuguese: Index(['G2_x', 'G1_x', 'absences_y', 'health', 'Walc', 'Dalc', 'goout',
       'freetime', 'famrel', 'failures'],
      dtype='object')
Most relevant columns for Mathematics: Index(['G2_x', 'G1_x', 'absences_y', 'health', 'Walc', 'Dalc', 'goout',
       'freetime', 'famrel', 'failures'],
      dtype='object')
----------------------------------------------------------------------------------------------------
Training score alpha = 0.1 lasso: 0.488136324699626
Test score alpha = 0.1 lasso: 0.4643763157689699
Number of features used: 18
Most relevant columns for Portuguese: Index(['G2_x', 'G1_x', 'absences_x', 'health', 'Walc', 'goout', 'famrel',
       'failures', 'studytime', 'Fedu'],
      dtype='object')
Most relevant columns fo

Looking at the previous statistics we can say the following:
- having the second Mathematics grade is more relevant than having the second Portuguese grade.
- if we have both Portuguese grades and the second Mathematics grade we don't have any significant improvement compared to the case in which we only have the second Mathematics grade and a Portuguese grade.
- if we have only the Mathematics grades the overall prediction score declines even though not drastically compared to the situation in which we only have only the Portuguese grades case in which the prediction score is almost a half (of that from the Mathematics only scenario).