# Linear Models - ML CUP (NO LBE)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures  
from sklearn.base import clone

from sklearn.metrics import (
    make_scorer,
    mean_squared_error,
    mean_absolute_error     )

In [None]:
# Return the element of "array" whose value is closest to "value".
def find_nearest(array, value):
    """Return the entry of *array* with the smallest absolute distance to *value*.

    The input is converted with ``np.asarray``. When several entries are
    equally close, the first one (lowest position) is returned, because
    ``argmin`` reports the first minimum.
    """
    candidates = np.asarray(array)
    distances = np.abs(candidates - value)
    return candidates[distances.argmin()]



# Given a model already trained, this function computes the loss value.
# Depending on the regularization type (none, L1, L2) a different loss is computed.
def loss_f(model_fitted, X, y):
    """Return the (possibly regularized) loss of a fitted model on (X, y).

    Parameters
    ----------
    model_fitted : fitted estimator exposing ``get_params``, ``predict`` and
        ``coef_``; its ``penalty`` parameter must be None, "l1" or "l2".
    X, y : data on which the loss is evaluated.

    Returns
    -------
    MSE for an unregularized model, MSE + alpha * ||w||_1 for "l1",
    MSE + alpha * (w . w) for "l2".

    Raises
    ------
    ValueError
        If the penalty is not one of None, "l1", "l2". Previously this case
        fell through every branch and failed with an UnboundLocalError.
    """
    params = model_fitted.get_params(deep=False)
    penalty = params["penalty"]

    # Reject unsupported penalties before doing any work.
    if penalty not in (None, "l1", "l2"):
        raise ValueError(
            f"loss_f supports penalty None, 'l1' or 'l2'; got {penalty!r}"
        )

    mse = mean_squared_error(y, model_fitted.predict(X))

    # Not regularized model
    if penalty is None:
        return mse

    alpha = params["alpha"]
    coeff = model_fitted.coef_

    # L1 regularization (Lasso)
    if penalty == "l1":
        return mse + alpha * (np.linalg.norm(coeff, ord=1))

    # L2 regularization (Ridge)
    return mse + alpha * (np.dot(coeff, coeff))


# This function creates the plots loss_vs_epochs (over TR) and MSE_vs_epochs (over TR and VL)
def curves(epochs, model, X_train, y_train, X_val, y_val, loss_title, val_title):
    """Train *model* one epoch at a time and plot its learning curves.

    Each iteration calls ``model.partial_fit(X_train, y_train)`` once, then
    records the training loss (via ``loss_f``), the training MSE and the
    validation MSE. Two figures are produced and saved as
    ``loss_title + ".pdf"`` and ``val_title + ".pdf"``.

    Parameters
    ----------
    epochs : number of ``partial_fit`` passes to run.
    model : estimator supporting ``partial_fit``; it is trained in place.
    X_train, y_train : training data.
    X_val, y_val : validation data (only used for evaluation).
    loss_title, val_title : figure titles, also used as output file names.

    Returns
    -------
    (vl_MSE_s, partial_model_list) : the validation MSE per epoch, and one
        independent snapshot of the model per epoch, so that
        ``partial_model_list[k]`` is the model after ``k + 1`` epochs.
    """
    import copy  # local import: only needed for the per-epoch snapshots

    tol = model.get_params([])["tol"]
    epochs_array = np.arange(1, epochs + 1, step=1)
    train_losses = []
    train_MSE_s = []
    vl_MSE_s = []
    partial_model_list = []

    for _ in range(epochs):
        model.partial_fit(X_train, y_train)
        # partial_fit returns the estimator itself, so appending its return
        # value would store the same (final) object for every epoch.
        # Snapshot the current state instead.
        partial_model_list.append(copy.deepcopy(model))

        train_losses.append(loss_f(model, X_train, y_train))  # loss value for this epoch

        train_MSE_s.append(mean_squared_error(y_train, model.predict(X_train)))  # TR MSE for this epoch
        vl_MSE_s.append(mean_squared_error(y_val, model.predict(X_val)))  # VL MSE for this epoch

    # 1-based epoch whose validation MSE is closest to (min validation MSE + tol)
    marker_epoch = vl_MSE_s.index(find_nearest(vl_MSE_s, min(vl_MSE_s) + tol)) + 1

    # Loss curve
    plt.figure()
    plt.title(loss_title)
    plt.plot(epochs_array, train_losses, label="Training")
    plt.ylabel("Loss")
    plt.xlabel("Epochs")
    plt.legend()
    plt.grid()
    plt.savefig(loss_title + ".pdf")

    # MSE curves (over the TR and VL)
    plt.figure()
    plt.title(val_title)
    plt.plot(epochs_array, train_MSE_s, color='r', linestyle="-", label="Training MSE")
    plt.plot(epochs_array, vl_MSE_s, color='b', linestyle="--", label="Validation MSE")
    plt.ylabel("MSE")
    plt.xlabel("Epochs")
    str_label = "#epochs at (min(VL MSE) + tol) = " + str(marker_epoch)
    plt.axvline(x=marker_epoch, color='green', label=str_label)
    plt.legend()
    plt.grid()
    plt.savefig(val_title + ".pdf")

    return vl_MSE_s, partial_model_list



def plots_f(exp_alpha_min, exp_alpha_max, final_model, n_iteration,
            X_inner_tr, y_inner_tr, X_val, y_val,
            weights_vs_alpha_title_str, mae_vs_alpha_title_str,
            weights_vs_alpha_file_title_name, mae_vs_alpha_file_title_name):
    """Plot weights and MAE as a function of the regularization strength alpha.

    For each of 100 log-spaced alphas in
    [10**exp_alpha_min, 10**exp_alpha_max] a fresh SGDRegressor is built with
    that alpha and with eta0, learning_rate, loss, max_iter,
    n_iter_no_change, penalty, power_t and tol copied from *final_model*.
    It is trained with ``n_iteration`` calls to ``partial_fit`` on the inner
    training set. Its coefficients and its MAE on the inner training and
    validation sets are collected, then two figures are drawn, saved as
    ``<file_title_name>.pdf`` and shown. Returns None.
    """
    # Hyperparameters inherited from the selected model; only alpha varies.
    inherited_names = ("eta0", "learning_rate", "loss", "max_iter",
                       "n_iter_no_change", "penalty", "power_t", "tol")
    reference_params = final_model.get_params([])
    fixed_kwargs = {key: reference_params[key] for key in inherited_names}

    alphas = np.logspace(exp_alpha_min, exp_alpha_max, 100)
    coefs, train_MAEs, val_MAEs = [], [], []

    for alpha_value in alphas:
        candidate = SGDRegressor(alpha=alpha_value, **fixed_kwargs)
        for _ in range(n_iteration):
            candidate.partial_fit(X_inner_tr, y_inner_tr)

        coefs.append(candidate.coef_)
        train_MAEs.append(
            mean_absolute_error(y_inner_tr, candidate.predict(X_inner_tr)))
        val_MAEs.append(
            mean_absolute_error(y_val, candidate.predict(X_val)))

    # Figure 1: model coefficients versus alpha (log-scaled x axis).
    weights_ax = plt.gca()
    weights_ax.plot(alphas, coefs)
    weights_ax.set_xscale("log")
    plt.xlabel("alpha")
    plt.ylabel("weights")
    plt.title(weights_vs_alpha_title_str)
    plt.grid()
    plt.axis("tight")
    plt.savefig(weights_vs_alpha_file_title_name + ".pdf")
    plt.show()

    # Figure 2: MAE on the inner training and validation sets versus alpha.
    mae_ax = plt.gca()
    mae_ax.plot(alphas, train_MAEs, label="TR MAE")
    mae_ax.plot(alphas, val_MAEs, label="VL MAE")
    mae_ax.legend()
    mae_ax.set_xscale("log")
    plt.xlabel("alpha")
    plt.ylabel("MAE")
    plt.title(mae_vs_alpha_title_str)
    plt.grid()
    plt.axis("tight")
    plt.savefig(mae_vs_alpha_file_title_name + ".pdf")
    plt.show()

# Importing datasets "ML-CUP22-TR" and "ML-CUP22-TS" (Blind Test)

In [None]:
# Load the training file, skipping its first 7 lines (no header row is parsed).
tr = pd.read_csv('ML-CUP22-TR.csv', skiprows=7, header=None)
# Drop column 0 (presumably a row id - TODO confirm) and shift the remaining
# column labels down by one. The rename mapping is built from the columns
# as they are *before* the drop.
tr = tr.drop([0], axis=1).rename(columns={i:i-1 for i in tr.columns})
# Same treatment for the blind test file.
blind_ts = pd.read_csv('ML-CUP22-TS.csv', skiprows=7, header=None)
blind_ts = blind_ts.drop([0], axis=1).rename(columns={i:i-1 for i in blind_ts.columns})
 
    
# Blind-test inputs; the first 9 columns of TR are the inputs, the remaining ones the targets.
X_blind = blind_ts.values
X = tr.iloc[:,:9].values
Y = tr.iloc[:,9:].values

# HOLD-OUT TR_&_TS CUP: 70% design set / 30% internal test set, shuffled with a fixed seed
X_tr, X_tt, Y_tr, Y_tt = train_test_split(
    X, Y, test_size=0.3, random_state=0, shuffle=True)
 
# Split the two target columns into separate 1-D arrays.
y1_tr, y2_tr = Y_tr[:,0], Y_tr[:,1]
y1_tt, y2_tt = Y_tt[:,0], Y_tt[:,1]

# Data Preprocessing
- Linear Basis Expansion: OFF
- Data Normalization: ON

In [None]:
# Standardize every feature to mean = 0 and std = 1 with StandardScaler().
# The statistics are estimated on the design set only and then reused for the
# internal test set and the blind test set.
scaler = StandardScaler().fit(X_tr)
X_tr, X_tt = scaler.transform(X_tr), scaler.transform(X_tt)
blind_ts = scaler.transform(X_blind)

X_tr.shape

# Linear Model- No regularization -  Target 1


## Model Selection (Phase 1)

- Goal: find the best hyperparameters.
- Approach: grid search with RepeatedKFold cross-validation (5 folds, repeated 10 times).
- The RepeatedKFold is applied to the design set (X_tr, y1_tr).

In [None]:
%%time

# Hyperparameter grid for the non-regularized linear model (target 1).
param_grid = {
    "loss": ["squared_error"],
    "penalty": [None],
    "max_iter": [3000],
    "tol": [1e-3, 1e-2, 1e-1],
    "learning_rate": ["invscaling", "adaptive"],
    "eta0": [1e-4, 5e-4],
    "power_t": [1 / 4],
    "n_iter_no_change": [50],
}

# 5-fold CV repeated 10 times; candidates are ranked by negative MAE.
# NOTE(review): SGDRegressor() is created without a random_state, so results
# may differ between runs - confirm whether that is intended.
_scoring_name = "neg_mean_absolute_error"
_cv_scheme = RepeatedKFold(n_splits=5, n_repeats=10, random_state=0)

grid_not_reg1 = GridSearchCV(
    estimator=SGDRegressor(),
    param_grid=param_grid,
    scoring=[_scoring_name],
    cv=_cv_scheme,
    n_jobs=-1,
    refit=_scoring_name,
    return_train_score=False,
)
grid_not_reg1.fit(X_tr, y1_tr)

# Grid search results as a DataFrame
cv_results_ = pd.DataFrame(grid_not_reg1.cv_results_)

# Show the top-ranked configuration(s) (highest negative MAE)
cv_results_[cv_results_["rank_test_neg_mean_absolute_error"] == 1]

In [None]:
# Persist the grid search results of the non-regularized LM (target 1) to CSV.
_results_not_reg1 = pd.DataFrame(grid_not_reg1.cv_results_)
_results_not_reg1.to_csv("grid_not_reg1.csv")

## Model Selection (Phase 2)

- Goal: find the number of epochs for which the best model found in phase 1 has to be trained.
- Approach: hold-out on the design set (X_tr, y1_tr).
- The design set is split into two subsets: an inner training set of 80% (X_inner_tr, y1_inner_tr) and a validation set of 20% (X_val, y1_val).


In [None]:
# Hold-out of the design set for target 1: 80% inner training, 20% validation,
# shuffled with a fixed seed.
X_inner_tr, X_val, y1_inner_tr, y1_val = train_test_split(
    X_tr,
    y1_tr,
    test_size=0.2,
    random_state=0,
    shuffle=True,
)

In [None]:
# Upper bound on the number of training epochs
n_epochs = 200

# Train an unfitted clone of the best phase-1 model epoch by epoch and draw
# its learning curves on the inner training / validation split.
val_MSE_s_nr1, partial_model_list_nr1 = curves(
    epochs=n_epochs,
    model=clone(grid_not_reg1.best_estimator_),
    X_train=X_inner_tr,
    y_train=y1_inner_tr,
    X_val=X_val,
    y_val=y1_val,
    loss_title="Loss - MLCUP - target1 - (LM not regularized)",
    val_title="MSE - MLCUP - target1 - (LM not regularized)",
)

### Number of epochs for which the best non-regularized LM (target 1) will be trained

In [None]:
# Select the 1-based epoch whose validation MSE is closest to
# (minimum validation MSE + tol of the best model).
tol = grid_not_reg1.best_estimator_.get_params([])["tol"]
_target_mse = min(val_MSE_s_nr1) + tol
_closest_mse = find_nearest(val_MSE_s_nr1, _target_mse)
n_iteration_not_reg_1 = val_MSE_s_nr1.index(_closest_mse) + 1
print(f"Epochs di LM not reg finale per target 1: {n_iteration_not_reg_1}")

### Best model trained for a number of epochs equal to the n_iteration found in the cell above

In [None]:
# Take the entry that curves() stored for the selected epoch
# (the list is 0-based, the epoch count is 1-based).
_selected_idx = n_iteration_not_reg_1 - 1
final_fitted_model_not_reg_1 = partial_model_list_nr1[_selected_idx]
final_fitted_model_not_reg_1

# Model Assessment - Linear Model - No regularization - Target 1

- Mean Absolute Error of the model trained on the inner training set (X_inner_tr, y1_inner_tr)

In [None]:
# MAE of the selected non-regularized model (target 1) on the inner training,
# validation and internal test sets.
_mae = mean_absolute_error(y1_inner_tr, final_fitted_model_not_reg_1.predict(X_inner_tr))
print(f"Linear model not reg (target 1): TRAINING_MAE = {_mae}")
_mae = mean_absolute_error(y1_val, final_fitted_model_not_reg_1.predict(X_val))
print(f"Linear model not reg (target 1): VALIDATION_MAE = {_mae}")
_mae = mean_absolute_error(y1_tt, final_fitted_model_not_reg_1.predict(X_tt))
print(f"Linear model not reg (target 1): TEST_MAE = {_mae}")

# Linear Model- No regularization -  Target 2

## Model Selection (Phase 1)

- Goal: find the best hyperparameters.
- Approach: grid search with RepeatedKFold cross-validation (5 folds, repeated 10 times).
- The RepeatedKFold is applied to the design set (X_tr, y2_tr).

In [None]:
%%time

# Hyperparameter grid for the non-regularized linear model (target 2).
param_grid = {
    "loss": ["squared_error"],
    "penalty": [None],
    "max_iter": [3000],
    "tol": [1e-4, 1e-3, 1e-2, 1e-1],
    "learning_rate": ["invscaling", "adaptive"],
    "eta0": [1e-4, 5e-4],
    "power_t": [1 / 5],
    "n_iter_no_change": [50],
}

# 5-fold CV repeated 10 times; candidates are ranked by negative MAE.
# NOTE(review): SGDRegressor() is created without a random_state, so results
# may differ between runs - confirm whether that is intended.
_scoring_name = "neg_mean_absolute_error"
_cv_scheme = RepeatedKFold(n_splits=5, n_repeats=10, random_state=0)

grid_not_reg2 = GridSearchCV(
    estimator=SGDRegressor(),
    param_grid=param_grid,
    scoring=[_scoring_name],
    cv=_cv_scheme,
    n_jobs=-1,
    refit=_scoring_name,
    return_train_score=False,
)
grid_not_reg2.fit(X_tr, y2_tr)

# Grid search results as a DataFrame
cv_results_ = pd.DataFrame(grid_not_reg2.cv_results_)

# Show the top-ranked configuration(s) (highest negative MAE)
cv_results_[cv_results_["rank_test_neg_mean_absolute_error"] == 1]

In [None]:
# Persist the grid search results of the non-regularized LM (target 2) to CSV.
_results_not_reg2 = pd.DataFrame(grid_not_reg2.cv_results_)
_results_not_reg2.to_csv("grid_not_reg2.csv")

## Model Selection (Phase 2)

- Goal: find the number of epochs for which the best model found in phase 1 has to be trained.
- Approach: hold-out on the design set (X_tr, y2_tr).
- The design set is split into two subsets: an inner training set of 80% (X_inner_tr, y2_inner_tr) and a validation set of 20% (X_val, y2_val).


In [None]:
# Hold-out of the design set for target 2: 80% inner training, 20% validation,
# shuffled with a fixed seed.
X_inner_tr, X_val, y2_inner_tr, y2_val = train_test_split(
    X_tr,
    y2_tr,
    test_size=0.2,
    random_state=0,
    shuffle=True,
)

In [None]:
# Upper bound on the number of training epochs
n_epochs = 200

# Train an unfitted clone of the best phase-1 model epoch by epoch and draw
# its learning curves on the inner training / validation split.
val_MSE_s_nr2, partial_model_list_nr2 = curves(
    epochs=n_epochs,
    model=clone(grid_not_reg2.best_estimator_),
    X_train=X_inner_tr,
    y_train=y2_inner_tr,
    X_val=X_val,
    y_val=y2_val,
    loss_title="Loss - MLCUP - target2 - (LM not regularized)",
    val_title="MSE - MLCUP - target2 - (LM not regularized)",
)

### Number of epochs for which the best non-regularized LM (target 2) will be trained

In [None]:
# Select the 1-based epoch whose validation MSE is closest to
# (minimum validation MSE + tol of the best model).
tol = grid_not_reg2.best_estimator_.get_params([])["tol"]
n_iteration_not_reg_2 = val_MSE_s_nr2.index(find_nearest(val_MSE_s_nr2, min(val_MSE_s_nr2)+tol)) + 1
# The message previously said "target 1" although this cell reports target 2.
print(f"Epochs di LM not reg finale per target 2: {n_iteration_not_reg_2}")

### Best model trained for a #epochs equals to n_iteration found in the line of code above


In [None]:
# Take the entry that curves() stored for the selected epoch
# (the list is 0-based, the epoch count is 1-based).
_selected_idx = n_iteration_not_reg_2 - 1
final_fitted_model_not_reg_2 = partial_model_list_nr2[_selected_idx]
final_fitted_model_not_reg_2

## Model Assessment - Linear Model - No regularization - Target 2
- Mean Absolute Error of the model trained on the inner training set (X_inner_tr, y2_inner_tr)

In [None]:
# MAE of the selected non-regularized model (target 2) on the inner training,
# validation and internal test sets.
_mae = mean_absolute_error(y2_inner_tr, final_fitted_model_not_reg_2.predict(X_inner_tr))
print(f"Linear model not reg (target 2): TRAINING_MAE = {_mae}")
_mae = mean_absolute_error(y2_val, final_fitted_model_not_reg_2.predict(X_val))
print(f"Linear model not reg (target 2): VALIDATION_MAE = {_mae}")
_mae = mean_absolute_error(y2_tt, final_fitted_model_not_reg_2.predict(X_tt))
print(f"Linear model not reg (target 2): TEST_MAE = {_mae}")

## Model Assessment - Linear Models - No regularization - Target 1 and 2
- Assessing the quality of the models found (non-regularized LM) for targets 1 and 2 by computing the Mean Euclidean Error (MEE)

In [None]:
def _mean_euclidean_error(n_samples, err_1, err_2):
    # (1/N) * sum over samples of sqrt(err_1^2 + err_2^2)
    return (1 / n_samples) * np.sqrt(np.square(err_1) + np.square(err_2)).sum()


# Mean Euclidean Error of the pair of non-regularized models on each split.
TRAINING_MEE_not_reg = _mean_euclidean_error(
    X_inner_tr.shape[0],
    y1_inner_tr - final_fitted_model_not_reg_1.predict(X_inner_tr),
    y2_inner_tr - final_fitted_model_not_reg_2.predict(X_inner_tr),
)
VALIDATION_MEE_not_reg = _mean_euclidean_error(
    X_val.shape[0],
    y1_val - final_fitted_model_not_reg_1.predict(X_val),
    y2_val - final_fitted_model_not_reg_2.predict(X_val),
)
TEST_MEE_not_reg = _mean_euclidean_error(
    X_tt.shape[0],
    y1_tt - final_fitted_model_not_reg_1.predict(X_tt),
    y2_tt - final_fitted_model_not_reg_2.predict(X_tt),
)

print(f"Linear model not reg: TRAINING_MEE = {TRAINING_MEE_not_reg}")
print(f"Linear model not reg: VALIDATION_MEE = {VALIDATION_MEE_not_reg}")
print(f"Linear model not reg: TEST_MEE = {TEST_MEE_not_reg}")


## Predicting target 1 and 2 of blind test using respectively "final_fitted_model_not_reg_1" and "final_fitted_model_not_reg_2" 

In [None]:
# Predict both targets of the blind test set with the two selected models.
y1_blind_not_reg, y2_blind_not_reg = (
    fitted.predict(blind_ts)
    for fitted in (final_fitted_model_not_reg_1, final_fitted_model_not_reg_2)
)

# 


# Linear Model - L1 regularization (Lasso) - Target 1 

## Model Selection (Phase 1)

- Goal: find the best hyperparameters.
- Approach: grid search with RepeatedKFold cross-validation (5 folds, repeated 10 times).
- The RepeatedKFold is applied to the design set (X_tr, y1_tr).

In [None]:
%%time
# 100 log-spaced regularization strengths between 1e-3 and 1
alphas = np.logspace(-3, 0, 100)

# Hyperparameter grid for the L1-regularized (Lasso) linear model, target 1.
param_grid = {
    "alpha": alphas,
    "loss": ["squared_error"],
    "penalty": ["l1"],
    "max_iter": [3000],
    "tol": [1e-3],
    "learning_rate": ["constant", "invscaling", "adaptive"],
    "eta0": [5e-4],
    "power_t": [1 / 4],
    "n_iter_no_change": [50],
}

# 5-fold CV repeated 10 times; candidates are ranked by negative MAE.
# NOTE(review): SGDRegressor() is created without a random_state, so results
# may differ between runs - confirm whether that is intended.
_scoring_name = "neg_mean_absolute_error"
_cv_scheme = RepeatedKFold(n_splits=5, n_repeats=10, random_state=0)

grid_lasso1 = GridSearchCV(
    estimator=SGDRegressor(),
    param_grid=param_grid,
    scoring=[_scoring_name],
    cv=_cv_scheme,
    n_jobs=-1,
    refit=_scoring_name,
    return_train_score=False,
)
grid_lasso1.fit(X_tr, y1_tr)

# Grid search results as a DataFrame
cv_results_ = pd.DataFrame(grid_lasso1.cv_results_)

# Show the top-ranked configuration(s) (highest negative MAE)
cv_results_[cv_results_["rank_test_neg_mean_absolute_error"] == 1]

In [None]:
# Persist the grid search results of the LASSO LM (target 1) to CSV.
_results_lasso1 = pd.DataFrame(grid_lasso1.cv_results_)
_results_lasso1.to_csv("grid_lasso1.csv")


## Model Selection (Phase 2)

- Goal: find the number of epochs for which the best model found in phase 1 has to be trained.
- Approach: hold-out on the design set (X_tr, y1_tr).
- The design set is split into two subsets: an inner training set of 80% (X_inner_tr, y1_inner_tr) and a validation set of 20% (X_val, y1_val).


In [None]:
# Upper bound on the number of training epochs
n_epochs = 200

# Train an unfitted clone of the best phase-1 model epoch by epoch and draw
# its learning curves on the inner training / validation split.
val_MSE_s_lasso1, partial_model_list_lasso1 = curves(
    epochs=n_epochs,
    model=clone(grid_lasso1.best_estimator_),
    X_train=X_inner_tr,
    y_train=y1_inner_tr,
    X_val=X_val,
    y_val=y1_val,
    loss_title="Loss - MLCUP - target1 - (LASSO)",
    val_title="MSE - MLCUP - target1 - (LASSO)",
)

### Number of epochs for which the best LASSO LM (target 1) will be trained


In [None]:
# Select the 1-based epoch whose validation MSE is closest to
# (minimum validation MSE + tol of the best model).
tol = grid_lasso1.best_estimator_.get_params([])["tol"]
_target_mse = min(val_MSE_s_lasso1) + tol
_closest_mse = find_nearest(val_MSE_s_lasso1, _target_mse)
n_iteration_lasso_1 = val_MSE_s_lasso1.index(_closest_mse) + 1
print(f"Epochs di LM LASSO finale per target 1: {n_iteration_lasso_1}")

### Best LM LASSO (for target 1) trained for a #epochs equals to n_iteration_lasso_1


In [None]:
# Take the entry that curves() stored for the selected epoch
# (the list is 0-based, the epoch count is 1-based).
_selected_idx = n_iteration_lasso_1 - 1
final_fitted_model_lasso_1 = partial_model_list_lasso1[_selected_idx]
final_fitted_model_lasso_1

### Plots to study how the regularization hyperparameter affects the final values of the weights and the mean absolute error (MAE) over the inner training set (X_inner_tr, y1_inner_tr) and the validation set (X_val, y1_val)
- The other hyperparameters used for building these plots are set equal to the ones found for the best model "final_fitted_model_lasso_1".
- For every value of alpha, a clone of "final_fitted_model_lasso_1" (clone: same model but still untrained) is trained up to #epochs = n_iteration_lasso_1.
- After the training:
- 1) The final weights are extracted --> (1st plot)
- 2) The mean absolute error over the inner training and validation sets is computed --> (2nd plot)

In [None]:
# Weights and MAE versus alpha (1e-3 .. 1e2) for the LASSO model of target 1.
# NOTE(review): the weights file name has no "1" suffix, unlike
# "mae_vs_alpha_lasso1" - confirm whether that is intended.
plots_f(
    exp_alpha_min=-3,
    exp_alpha_max=2,
    final_model=final_fitted_model_lasso_1,
    n_iteration=n_iteration_lasso_1,
    X_inner_tr=X_inner_tr,
    y_inner_tr=y1_inner_tr,
    X_val=X_val,
    y_val=y1_val,
    weights_vs_alpha_title_str="LASSO coefficients as a function of alpha (Target 1)",
    mae_vs_alpha_title_str="MLCUP - Target1 - (LASSO)",
    weights_vs_alpha_file_title_name="coeff_vs_alpha_lasso",
    mae_vs_alpha_file_title_name="mae_vs_alpha_lasso1",
)

# Model Assessment - Linear Model - L1 regularization (Lasso) - Target 1
- Mean Absolute Error of the model trained on the inner training set (X_inner_tr, y1_inner_tr)

In [None]:
# MAE of the selected LASSO model (target 1) on the inner training,
# validation and internal test sets.
_mae = mean_absolute_error(y1_inner_tr, final_fitted_model_lasso_1.predict(X_inner_tr))
print(f"Linear model LASSO (target 1): TRAINING_MAE = {_mae}")
_mae = mean_absolute_error(y1_val, final_fitted_model_lasso_1.predict(X_val))
print(f"Linear model LASSO (target 1): VALIDATION_MAE = {_mae}")
_mae = mean_absolute_error(y1_tt, final_fitted_model_lasso_1.predict(X_tt))
print(f"Linear model LASSO (target 1): TEST_MAE = {_mae}")


# Linear Model - L1 regularization (Lasso) - Target 2 

## Model Selection (Phase 1)

- Goal: find the best hyperparameters.
- Approach: grid search with RepeatedKFold cross-validation (5 folds, repeated 10 times).
- The RepeatedKFold is applied to the design set (X_tr, y2_tr).

In [None]:
%%time

# 100 log-spaced regularization strengths between 1e-3 and 1
alphas = np.logspace(-3, 0, 100)

# Hyperparameter grid for the L1-regularized (Lasso) linear model, target 2.
param_grid = {
    "alpha": alphas,
    "loss": ["squared_error"],
    "penalty": ["l1"],
    "max_iter": [3000],
    "tol": [1e-3],
    "learning_rate": ["constant", "invscaling", "adaptive"],
    "eta0": [5e-4],
    "power_t": [1 / 4],
    "n_iter_no_change": [100],
}

# 5-fold CV repeated 10 times; candidates are ranked by negative MAE.
# NOTE(review): SGDRegressor() is created without a random_state, so results
# may differ between runs - confirm whether that is intended.
_scoring_name = "neg_mean_absolute_error"
_cv_scheme = RepeatedKFold(n_splits=5, n_repeats=10, random_state=0)

grid_lasso2 = GridSearchCV(
    estimator=SGDRegressor(),
    param_grid=param_grid,
    scoring=[_scoring_name],
    cv=_cv_scheme,
    n_jobs=-1,
    refit=_scoring_name,
    return_train_score=False,
)
grid_lasso2.fit(X_tr, y2_tr)

# Grid search results as a DataFrame
cv_results_ = pd.DataFrame(grid_lasso2.cv_results_)

# Show the top-ranked configuration(s) (highest negative MAE)
cv_results_[cv_results_["rank_test_neg_mean_absolute_error"] == 1]

In [None]:
# Persist the grid search results of the LASSO LM (target 2) to CSV.
_results_lasso2 = pd.DataFrame(grid_lasso2.cv_results_)
_results_lasso2.to_csv("grid_lasso2.csv")

## Model Selection (Phase 2)

- Goal: find the number of epochs for which the best model found in phase 1 has to be trained.
- Approach: hold-out on the design set (X_tr, y2_tr).
- The design set is split into two subsets: an inner training set of 80% (X_inner_tr, y2_inner_tr) and a validation set of 20% (X_val, y2_val).

In [None]:
# Upper bound on the number of training epochs
n_epochs = 200

# Train an unfitted clone of the best phase-1 model epoch by epoch and draw
# its learning curves on the inner training / validation split.
val_MSE_s_lasso2, partial_model_list_lasso2 = curves(
    epochs=n_epochs,
    model=clone(grid_lasso2.best_estimator_),
    X_train=X_inner_tr,
    y_train=y2_inner_tr,
    X_val=X_val,
    y_val=y2_val,
    loss_title="Loss - MLCUP - target2 - (LASSO)",
    val_title="MSE - MLCUP - target2 - (LASSO)",
)

### Number of epochs for which the best LASSO LM (target 2) will be trained


In [None]:
# Select the 1-based epoch whose validation MSE is closest to
# (minimum validation MSE + tol of the best model).
tol = grid_lasso2.best_estimator_.get_params([])["tol"]
_target_mse = min(val_MSE_s_lasso2) + tol
_closest_mse = find_nearest(val_MSE_s_lasso2, _target_mse)
n_iteration_lasso_2 = val_MSE_s_lasso2.index(_closest_mse) + 1
print(f"Epochs di LM LASSO finale per target 2: {n_iteration_lasso_2}")

### Training the best LM LASSO (for target 2) for a #epochs equals to n_iteration_lasso_2
 

In [None]:
# Take the entry that curves() stored for the selected epoch
# (the list is 0-based, the epoch count is 1-based).
_selected_idx = n_iteration_lasso_2 - 1
final_fitted_model_lasso_2 = partial_model_list_lasso2[_selected_idx]
final_fitted_model_lasso_2

### Plots to study how the regularization hyperparameter affects the final values of the weights and the mean absolute error (MAE) over the inner training set (X_inner_tr, y2_inner_tr) and the validation set (X_val, y2_val)
- The other hyperparameters used for building these plots are set equal to the ones found for the best model "final_fitted_model_lasso_2".
- For every value of alpha, a clone of "final_fitted_model_lasso_2" (clone: same model but still untrained) is trained up to #epochs = n_iteration_lasso_2.
- After the training:
- 1) The final weights are extracted --> (1st plot)
- 2) The mean absolute error over the inner training and validation sets is computed --> (2nd plot)

In [None]:
# Weights and MAE versus alpha (1e-5 .. 1e2) for the LASSO model of target 2.
plots_f(
    exp_alpha_min=-5,
    exp_alpha_max=2,
    final_model=final_fitted_model_lasso_2,
    n_iteration=n_iteration_lasso_2,
    X_inner_tr=X_inner_tr,
    y_inner_tr=y2_inner_tr,
    X_val=X_val,
    y_val=y2_val,
    weights_vs_alpha_title_str="Lasso coefficients as a function of alpha (Target 2)",
    mae_vs_alpha_title_str="MLCUP - Target2 - (LASSO)",
    weights_vs_alpha_file_title_name="coeff_vs_alpha_lasso2",
    mae_vs_alpha_file_title_name="mae_vs_alpha_lasso2",
)

# Model Assessment - Linear Model - L1 regularization (Lasso) - Target 2
- Mean Absolute Error of the model trained on the inner training set (X_inner_tr, y2_inner_tr)

In [None]:
# MAE of the selected LASSO model (target 2) on the inner training,
# validation and internal test sets.
_mae = mean_absolute_error(y2_inner_tr, final_fitted_model_lasso_2.predict(X_inner_tr))
print(f"Linear model LASSO (target 2): TRAINING_MAE = {_mae}")
_mae = mean_absolute_error(y2_val, final_fitted_model_lasso_2.predict(X_val))
print(f"Linear model LASSO (target 2): VALIDATION_MAE = {_mae}")
_mae = mean_absolute_error(y2_tt, final_fitted_model_lasso_2.predict(X_tt))
print(f"Linear model LASSO (target 2): TEST_MAE = {_mae}")


# Model Assessment - Linear Models - LASSO - Target 1 and 2
- Assessing the quality of the models found (LASSO LM) for targets 1 and 2 by computing the Mean Euclidean Error (MEE)

In [None]:
def _mean_euclidean_error(n_samples, err_1, err_2):
    # (1/N) * sum over samples of sqrt(err_1^2 + err_2^2)
    return (1 / n_samples) * np.sqrt(np.square(err_1) + np.square(err_2)).sum()


# Mean Euclidean Error of the pair of LASSO models on each split.
TRAINING_MEE_lasso = _mean_euclidean_error(
    X_inner_tr.shape[0],
    y1_inner_tr - final_fitted_model_lasso_1.predict(X_inner_tr),
    y2_inner_tr - final_fitted_model_lasso_2.predict(X_inner_tr),
)
VALIDATION_MEE_lasso = _mean_euclidean_error(
    X_val.shape[0],
    y1_val - final_fitted_model_lasso_1.predict(X_val),
    y2_val - final_fitted_model_lasso_2.predict(X_val),
)
TEST_MEE_lasso = _mean_euclidean_error(
    X_tt.shape[0],
    y1_tt - final_fitted_model_lasso_1.predict(X_tt),
    y2_tt - final_fitted_model_lasso_2.predict(X_tt),
)

print(f"Linear model LASSO: TRAINING_MEE = {TRAINING_MEE_lasso}")
print(f"Linear model LASSO: VALIDATION_MEE = {VALIDATION_MEE_lasso}")
print(f"Linear model LASSO: TEST_MEE = {TEST_MEE_lasso}")


##  Predicting target 1 and 2 of blind test using respectively "final_fitted_model_lasso_1" and "final_fitted_model_lasso_2" 

In [None]:
# Predict both targets of the blind test set with the two selected LASSO models.
y1_blind_lasso, y2_blind_lasso = (
    fitted.predict(blind_ts)
    for fitted in (final_fitted_model_lasso_1, final_fitted_model_lasso_2)
)

# 

# Linear Model - L2 regularization (Ridge) - Target 1 

## Model Selection (Phase 1)

- Goal: find the best hyperparameters.
- Approach: grid search with RepeatedKFold cross-validation (5 folds, repeated 10 times).
- The RepeatedKFold is applied to the design set (X_tr, y1_tr).

In [None]:
%%time

# 100 log-spaced regularization strengths between 1e-3 and 1
alphas = np.logspace(-3, 0, 100)

# Hyperparameter grid for the L2-regularized (Ridge) linear model, target 1.
param_grid = {
    "alpha": alphas,
    "loss": ["squared_error"],
    "penalty": ["l2"],
    "max_iter": [3000],
    "tol": [1e-3, 1e-2, 1e-1],
    "learning_rate": ["constant", "invscaling", "adaptive"],
    "eta0": [1e-4, 5e-4],
    "power_t": [1 / 4],
    "n_iter_no_change": [100],
}

# 5-fold CV repeated 10 times; candidates are ranked by negative MAE.
# NOTE(review): SGDRegressor() is created without a random_state, so results
# may differ between runs - confirm whether that is intended.
_scoring_name = "neg_mean_absolute_error"
_cv_scheme = RepeatedKFold(n_splits=5, n_repeats=10, random_state=0)

grid_ridge1 = GridSearchCV(
    estimator=SGDRegressor(),
    param_grid=param_grid,
    scoring=[_scoring_name],
    cv=_cv_scheme,
    n_jobs=-1,
    refit=_scoring_name,
    return_train_score=False,
)
grid_ridge1.fit(X_tr, y1_tr)

# Grid search results as a DataFrame
cv_results_ = pd.DataFrame(grid_ridge1.cv_results_)

# Show the top-ranked configuration(s) (highest negative MAE)
cv_results_[cv_results_["rank_test_neg_mean_absolute_error"] == 1]

In [None]:
# Persist the grid search results of the RIDGE LM (target 1) to CSV.
_results_ridge1 = pd.DataFrame(grid_ridge1.cv_results_)
_results_ridge1.to_csv("grid_ridge1.csv")

## Model Selection (Phase 2)

- Goal: find the number of epochs for which the best model found in phase 1 has to be trained.
- Approach: hold-out on the design set (X_tr, y1_tr).
- The design set is split into two subsets: an inner training set of 80% (X_inner_tr, y1_inner_tr) and a validation set of 20% (X_val, y1_val).

In [None]:
# Upper bound on the number of training epochs
n_epochs = 200

# Train an unfitted clone of the best phase-1 model epoch by epoch and draw
# its learning curves on the inner training / validation split.
val_MSE_s_ridge1, partial_model_list_ridge1 = curves(
    epochs=n_epochs,
    model=clone(grid_ridge1.best_estimator_),
    X_train=X_inner_tr,
    y_train=y1_inner_tr,
    X_val=X_val,
    y_val=y1_val,
    loss_title="Loss - MLCUP - target1 - (RIDGE)",
    val_title="MSE - MLCUP - target1 - (RIDGE)",
)

### Number of epochs for which the best RIDGE LM (target 1) will be trained


In [None]:
# Select the 1-based epoch whose validation MSE is closest to
# (minimum validation MSE + tol of the best model).
tol = grid_ridge1.best_estimator_.get_params([])["tol"]
_target_mse = min(val_MSE_s_ridge1) + tol
_closest_mse = find_nearest(val_MSE_s_ridge1, _target_mse)
n_iteration_ridge_1 = val_MSE_s_ridge1.index(_closest_mse) + 1
print(f"Epochs di LM RIDGE finale per target 1: {n_iteration_ridge_1}")

### Best LM RIDGE (for target 1) trained for a #epochs equals to n_iteration found in the line of code above


In [None]:
# Take the entry that curves() stored for the selected epoch
# (the list is 0-based, the epoch count is 1-based).
_selected_idx = n_iteration_ridge_1 - 1
final_fitted_model_ridge_1 = partial_model_list_ridge1[_selected_idx]
final_fitted_model_ridge_1

### Plots to study how the regularization hyperparameter affects the final values of the weights and the mean absolute error (MAE) over the inner training set (X_inner_tr, y1_inner_tr) and the validation set (X_val, y1_val)
- The other hyperparameters used for building these plots are set equal to the ones found for the best model "final_fitted_model_ridge_1".
- For every value of alpha, a clone of "final_fitted_model_ridge_1" (clone: same model but still untrained) is trained up to #epochs = n_iteration_ridge_1.
- After the training:
- 1) The final weights are extracted --> (1st plot)
- 2) The mean absolute error over the inner training and validation sets is computed --> (2nd plot)

In [None]:
# Weights and MAE versus alpha (1e-3 .. 1e3) for the RIDGE model of target 1.
plots_f(
    exp_alpha_min=-3,
    exp_alpha_max=3,
    final_model=final_fitted_model_ridge_1,
    n_iteration=n_iteration_ridge_1,
    X_inner_tr=X_inner_tr,
    y_inner_tr=y1_inner_tr,
    X_val=X_val,
    y_val=y1_val,
    weights_vs_alpha_title_str="RIDGE coefficients as a function of alpha (Target 1)",
    mae_vs_alpha_title_str="MLCUP - Target1 - (RIDGE)",
    weights_vs_alpha_file_title_name="coeff_vs_alpha_ridge1",
    mae_vs_alpha_file_title_name="mae_vs_alpha_ridge1",
)

# Model Assessment - Linear Model - L2 regularization (Ridge) - Target 1
- Mean Absolute Error of the model trained on the inner training set (X_inner_tr, y1_inner_tr)


In [None]:
# MAE of the selected RIDGE model (target 1) on the inner training,
# validation and internal test sets.
_mae = mean_absolute_error(y1_inner_tr, final_fitted_model_ridge_1.predict(X_inner_tr))
print(f"Linear model RIDGE (target 1): TRAINING_MAE = {_mae}")
_mae = mean_absolute_error(y1_val, final_fitted_model_ridge_1.predict(X_val))
print(f"Linear model RIDGE (target 1): VALIDATION_MAE = {_mae}")
_mae = mean_absolute_error(y1_tt, final_fitted_model_ridge_1.predict(X_tt))
print(f"Linear model RIDGE (target 1): TEST_MAE = {_mae}")



# Linear Model - L2 regularization (Ridge) - Target 2 


## Model Selection (Phase 1)

- Goal: find the best hyperparameters.
- Approach: grid search with RepeatedKFold cross-validation (5 folds, repeated 10 times).
- The RepeatedKFold is applied to the design set (X_tr, y2_tr).

In [None]:
%%time

# 100 log-spaced regularization strengths between 1e-3 and 1
alphas = np.logspace(-3, 0, 100)

# Hyperparameter grid for the L2-regularized (Ridge) linear model, target 2.
param_grid = {
    "alpha": alphas,
    "loss": ["squared_error"],
    "penalty": ["l2"],
    "max_iter": [3000],
    "tol": [1e-3, 1e-2, 1e-1],
    "learning_rate": ["constant", "invscaling", "adaptive"],
    "eta0": [1e-4, 5e-4],
    "power_t": [1 / 4],
    "n_iter_no_change": [100],
}

# 5-fold CV repeated 10 times; candidates are ranked by negative MAE.
# NOTE(review): SGDRegressor() is created without a random_state, so results
# may differ between runs - confirm whether that is intended.
_scoring_name = "neg_mean_absolute_error"
_cv_scheme = RepeatedKFold(n_splits=5, n_repeats=10, random_state=0)

grid_ridge2 = GridSearchCV(
    estimator=SGDRegressor(),
    param_grid=param_grid,
    scoring=[_scoring_name],
    cv=_cv_scheme,
    n_jobs=-1,
    refit=_scoring_name,
    return_train_score=False,
)
grid_ridge2.fit(X_tr, y2_tr)

# Grid search results as a DataFrame
cv_results_ = pd.DataFrame(grid_ridge2.cv_results_)

# Show the top-ranked configuration(s) (highest negative MAE)
cv_results_[cv_results_["rank_test_neg_mean_absolute_error"] == 1]

In [None]:
# Persist the grid search results of the RIDGE LM (target 2) to CSV.
_results_ridge2 = pd.DataFrame(grid_ridge2.cv_results_)
_results_ridge2.to_csv("grid_ridge2.csv")

## Model Selection (Phase 2)

- Goal: find the number of epochs for which the best model found in phase 1 has to be trained.
- Approach: hold-out on the design set (X_tr, y2_tr).
- The design set is split into two subsets: an inner training set of 80% (X_inner_tr, y2_inner_tr) and a validation set of 20% (X_val, y2_val).

In [None]:
# Upper bound on the number of epochs passed to curves()
n_epochs = 200

# Run curves() on an unfitted copy of the grid-search winner, training on the
# inner training split and validating on the hold-out split for target 2.
val_MSE_s_ridge2, partial_model_list_ridge2 = curves(
    epochs=n_epochs,
    model=clone(grid_ridge2.best_estimator_),
    X_train=X_inner_tr,
    y_train=y2_inner_tr,
    X_val=X_val,
    y_val=y2_val,
    loss_title="Loss - MLCUP - target2 - (RIDGE)",
    val_title="MSE - MLCUP - target2 - (RIDGE)",
)

### Number of epochs for which the best LM RIDGE (for target 2) will be trained


In [None]:
# Choose the number of epochs for the final Ridge model of target 2: take the
# epoch whose validation MSE is closest to (minimum validation MSE + tol),
# where tol is the tolerance of the model selected by the grid search.
tol = grid_ridge2.best_estimator_.get_params([])["tol"]
target_val_mse = min(val_MSE_s_ridge2) + tol
# +1 turns the 0-based position in the list into an epoch count
n_iteration_ridge_2 = val_MSE_s_ridge2.index(find_nearest(val_MSE_s_ridge2, target_val_mse)) + 1
# The label names target 2, the target this cell works on
print(f"Epochs di LM RIDGE finale per target 2: {n_iteration_ridge_2}")

### Best LM RIDGE (for target 2), trained for a number of epochs equal to the n_iteration_ridge_2 found in the cell above


In [None]:
# n_iteration_ridge_2 is a 1-based epoch count, the list is 0-based
selected_position = n_iteration_ridge_2 - 1
final_fitted_model_ridge_2 = partial_model_list_ridge2[selected_position]

# Show the selected model as the cell output
final_fitted_model_ridge_2

### Plots to study how the regularization hyperparameter affects the final values of the weights and the mean absolute error (MAE) over the inner training set (X_inner_tr, y2_inner_tr) and the validation set (X_val, y2_val)
- The other hyperparameters used for building these plots are set equal to the ones found for the best model "final_fitted_model_ridge_2".
- For every value of alpha, a clone of "final_fitted_model_ridge_2" (clone: same model but still untrained) has been trained up to #epochs = n_iteration_ridge_2.
- After the training:
- 1) The final weights have been extracted --> (1st plot)
- 2) The mean absolute error over the inner training and validation sets has been computed --> (2nd plot)

In [None]:
# Call the plots_f helper (defined elsewhere in the notebook) for the target-2
# Ridge model, using the inner training and validation splits.
# NOTE(review): -5 and 3 look like the log10 bounds of the alpha range, and the
# last two strings look like output file names -- confirm against plots_f.
plots_f(-5, 3, final_fitted_model_ridge_2, n_iteration_ridge_2,
            X_inner_tr, y2_inner_tr, X_val, y2_val,
            "Ridge coefficients as a function of alpha (Target 2)", "MLCUP - Target2 - (RIDGE)",
            "coeff_vs_alpha_ridge2", "mae_vs_alpha_ridge2")
                                         

# Model Assessment - Linear Model - L2 regularization (Ridge) - Target 2
- Mean Absolute Error of the model trained over the inner training set (X_inner_tr, y2_inner_tr), evaluated on the inner training, validation and test sets


In [None]:
# Mean absolute error of the final Ridge model for target 2 on each split
ridge2_training_mae = mean_absolute_error(y2_inner_tr, final_fitted_model_ridge_2.predict(X_inner_tr))
ridge2_validation_mae = mean_absolute_error(y2_val, final_fitted_model_ridge_2.predict(X_val))
ridge2_test_mae = mean_absolute_error(y2_tt, final_fitted_model_ridge_2.predict(X_tt))

print(f"Linear model RIDGE (target 2): TRAINING_MAE = {ridge2_training_mae}")
print(f"Linear model RIDGE (target 2): VALIDATION_MAE = {ridge2_validation_mae}")
print(f"Linear model RIDGE (target 2): TEST_MAE = {ridge2_test_mae}")


## Assessing the goodness of the models (LM RIDGE) found by calculating the Mean Euclidean Error (MEE)


In [None]:
def _mee_ridge(X, y1, y2):
    """Mean Euclidean Error of the two final Ridge models on one split.

    Target 1 is predicted by final_fitted_model_ridge_1 and target 2 by
    final_fitted_model_ridge_2; the per-sample Euclidean norm of the two
    residuals is summed and scaled by 1 / (number of rows of X).
    """
    residual_1 = y1 - final_fitted_model_ridge_1.predict(X)
    residual_2 = y2 - final_fitted_model_ridge_2.predict(X)
    return (1/X.shape[0])*np.sqrt(np.square(residual_1) + np.square(residual_2)).sum()


TRAINING_MEE_ridge = _mee_ridge(X_inner_tr, y1_inner_tr, y2_inner_tr)
VALIDATION_MEE_ridge = _mee_ridge(X_val, y1_val, y2_val)
TEST_MEE_ridge = _mee_ridge(X_tt, y1_tt, y2_tt)

print(f"Linear model RIDGE: TRAINING_MEE = {TRAINING_MEE_ridge}")
print(f"Linear model RIDGE: VALIDATION_MEE = {VALIDATION_MEE_ridge}")
print(f"Linear model RIDGE: TEST_MEE = {TEST_MEE_ridge}")


##  Predicting target 1 and 2 of blind test using respectively "final_fitted_model_ridge_1" and "final_fitted_model_ridge_2" 

In [None]:
# Blind-set predictions: target 1 from the first Ridge model, target 2 from the second
y1_blind_ridge, y2_blind_ridge = (
    ridge_model.predict(blind_ts)
    for ridge_model in (final_fitted_model_ridge_1, final_fitted_model_ridge_2)
)

## Saving the predictions over the blind test for each model in a csv file

In [None]:
# Column labels, in the same order as the prediction arrays stacked below
blind_column_names = [
    "y1_blind_not_reg",
    "y2_blind_not_reg",
    "y1_blind_lasso",
    "y2_blind_lasso",
    "y1_blind_ridge",
    "y2_blind_ridge",
]
blind_prediction_rows = [
    y1_blind_not_reg,
    y2_blind_not_reg,
    y1_blind_lasso,
    y2_blind_lasso,
    y1_blind_ridge,
    y2_blind_ridge,
]

# One row per model/target is built first, then transposed so that each
# model/target becomes a column; positional labels 0..5 are mapped to names.
df_blind_predictions_LMs = (
    pd.DataFrame(blind_prediction_rows)
    .T
    .rename(columns=dict(enumerate(blind_column_names)))
)

df_blind_predictions_LMs.to_csv('blind_predictions_LMs.csv')

# Best model combination (Lasso for target 1 and ridge for target 2)
- Computing the MEE of the model combination over TR, VL and TS

In [None]:
def _mee_lasso1_ridge2(X, y1, y2):
    """Mean Euclidean Error of the combined model on one split.

    Target 1 is predicted by final_fitted_model_lasso_1 and target 2 by
    final_fitted_model_ridge_2; the per-sample Euclidean norm of the two
    residuals is summed and scaled by 1 / (number of rows of X).
    """
    residual_1 = y1 - final_fitted_model_lasso_1.predict(X)
    residual_2 = y2 - final_fitted_model_ridge_2.predict(X)
    return (1/X.shape[0])*np.sqrt(np.square(residual_1) + np.square(residual_2)).sum()


# Every split goes through the same helper, so the test MEE uses the Lasso
# model for target 1 exactly like the training and validation MEE.
combined_train_mee = _mee_lasso1_ridge2(X_inner_tr, y1_inner_tr, y2_inner_tr)
combined_validation_mee = _mee_lasso1_ridge2(X_val, y1_val, y2_val)
combined_test_mee = _mee_lasso1_ridge2(X_tt, y1_tt, y2_tt)


print(f"Linear model combined (lasso1 + ridge2): TRAINING_MEE = {combined_train_mee}")
print(f"Linear model combined (lasso1 + ridge2): VALIDATION_MEE = {combined_validation_mee}")
print(f"Linear model combined (lasso1 + ridge2): TEST_MEE = {combined_test_mee}")
