## Kundyz Muktar - Lab 1 - Applied Machine Learning - February 16, 2024

In [24]:
def train_test_split(X, y, test_size=0.2):

    if test_size <= 0 or test_size >= 1:
        raise ValueError("test_size must be between 0 and 1")

    n = len(X)
    i = int(n * (1 - test_size))
    
    X_train, y_train = X[:i], y[:i]
    X_test, y_test = X[i:], y[i:]
    
    return X_train, X_test, y_train, y_test

# Task 1

In [25]:
from sklearn.datasets import load_boston
import numpy as np
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import KFold
from sklearn.preprocessing import PolynomialFeatures

# Task 2

In [26]:
## Avoid printing out warnings## Avoid printing out warnings
with warnings.catch_warnings():
     warnings.filterwarnings("ignore")
     X, y = load_boston(return_X_y=True)
with warnings.catch_warnings():
     warnings.filterwarnings("ignore")
     X, y = load_boston(return_X_y=True)

X_with_bias = np.concatenate([np.ones((X.shape[0], 1)), X], axis=1)
X_b, x_test, y, y_test = train_test_split(X_with_bias, y, test_size=0.1)

feature_means = X_b[:, 1:].mean(axis=0)
feature_stds = X_b[:, 1:].std(axis=0)

X_b[:, 1:] = (X_b[:, 1:] - feature_means) / feature_stds
x_test[:, 1:] = (x_test[:, 1:] - feature_means) / feature_stds


kf = KFold(n_splits=5, shuffle=True, random_state=2)


# Task 3

In [27]:
mse_train3 = []
mse_validation3 = []
for train, validation in kf.split(X_b):
    X_training, Y_training = X_b[train], y[train]
    X_validation, Y_validation  = X_b[validation], y[validation]

    weight_vector = np.linalg.inv(X_training.T @ X_training) @ X_training.T @ Y_training
    
    mse_train3.append(np.mean((Y_training - (X_training @ weight_vector)) ** 2))
    mse_validation3.append(np.mean((Y_validation - (X_validation @ weight_vector)) ** 2))

weight_vector_final = np.linalg.inv(X_b.T @ X_b) @ X_b.T @ y_train_val
mse_test3 = np.mean((y_test - (x_test @ weight_vector_final))**2)

print("Mean MSE training:", np.mean(mse_train3))
print("Mean MSE validation:", np.mean(mse_validation3))
print("MSE test:", mse_test3) 

Mean MSE training: 22.93840526238035
Mean MSE validation: 25.978551123247335
MSE test: 10.80620138107843


# Task 4

In [28]:
mse_train4 = []
mse_validation4 = []
lambdas = np.logspace(1, 7, num=13)
I = np.eye(X_b.shape[1])
I[0,0]=0

for lmbda in lambdas: 
    mse_train_per_lambda1 = []
    mse_validation_per_lambda1 = []

    for train, validation in kf.split(X_b):
        X_training, Y_training = X_b[train], y[train]
        X_validation, Y_validation  = X_b[validation], y[validation]
    
        weight_vector = np.linalg.inv(X_training.T @ X_training + lmbda * I) @ X_training.T @ Y_training
        
        Yh_training = X_training @ weight_vector
        R1 = Y_training - Yh_training
        loss1 = (R1.T @ R1) / len(Y_training)
        mse_train_per_lambda1.append(loss1)
            
        Yh_validation = X_validation @ weight_vector
        R2 = Y_validation - Yh_validation
        loss2 = (R2.T @ R2) / len(Y_validation)
        mse_validation_per_lambda1.append(loss2)
        
    mse_train4.append(np.mean(mse_train_per_lambda1)) 
    mse_validation4.append(np.mean(mse_validation_per_lambda1)) 

best_lambda_index = np.argmin(mse_validation4)

# print("mse for training set:", mse_train4)
# print("mse for validation set:", mse_validation4)
print("the best lambda is ", lambdas[best_lambda_index])

the best lambda is  10.0


**Task 4** The best choice of $\lambda$ is $10^1$, for which we obtain minimum MSE across MSE varying depending on $\lambda_i$. 

**Task 5** When $\lambda$ = $10^1$, we have average (across K-folds) MSE for training set of score 21.87 and for validation set 23.47. 

# Task 5

In [29]:
mse_train5 = []
mse_validation5 = []

I = np.eye(X_b.shape[1])
I[0,0]=0

for train, validation in kf.split(X_b):
    X_training, Y_training = X_b[train], y[train]
    X_validation, Y_validation  = X_b[validation], y[validation]
    
    weight_vector = np.linalg.inv(X_training.T @ X_training + lambdas[best_lambda_index] * I) @ X_training.T @ Y_training
    
    mse_train5.append(np.mean((Y_training - (X_training @ weight_vector)) ** 2))
    mse_validation5.append(np.mean((Y_validation - (X_validation @ weight_vector)) ** 2))

weight_vector_final5 = np.linalg.inv(X_b.T @ X_b + lambdas[best_lambda_index] * I) @ X_b.T @ y_train_val
mse_test5 = np.mean((y_test - (x_test @ weight_vector_final5)) ** 2)

print("Mean MSE training:", np.mean(mse_train5))
print("Mean MSE validation:", np.mean(mse_validation5))
print("MSE test:", mse_test5)

Mean MSE training: 23.091344595493844
Mean MSE validation: 25.961663903882958
MSE test: 9.728459652108022


# Task 6

In [30]:
p = PolynomialFeatures(degree=2)
X_p = p.fit_transform(X_b)
X_p = np.concatenate([np.ones((X_p.shape[0], 1)), X_p], axis=1)

# same as before, before this step i need to make sure that X_deg2 is of a type numpy array
mse_train6 = []
mse_validation6 = []
lambdas = np.logspace(1, 7, num=13)
I = np.eye(X_p.shape[1])
I[0,0]=0

for lmbda in lambdas: 
    mse_train_per_lambda2 = []
    mse_validation_per_lambda2 = []

    for train, validation in kf.split(X_p):
        X_training, Y_training = X_p[train], y[train]
        X_validation, Y_validation  = X_p[validation], y[validation]
    
        weight_vector = np.linalg.inv(X_training.T @ X_training + lmbda * I) @ X_training.T @ Y_training

        Yh_training = X_training @ weight_vector
        R1 = Y_training - Yh_training
        loss1 = (R1.T @ R1) / len(Y_training)
        mse_train_per_lambda2.append(loss1)
          
        Yh_validation = X_validation @ weight_vector
        R2 = Y_validation - Yh_validation
        loss2 = (R2.T @ R2) / len(Y_validation)
        mse_validation_per_lambda2.append(loss2)
        
    mse_train6.append(np.mean(mse_train_per_lambda2)) 
    mse_validation6.append(np.mean(mse_validation_per_lambda2)) 

best_lambda_index = np.argmin(mse_validation6)

# print("mse for training set:", mse_train)
# print("mse for validation set:", mse_validation)
print("the best lambda is ", lambdas[best_lambda_index])

the best lambda is  10.0


# Task 7

In [31]:
mse = []
mse_train7 =[]
mse_validation7 =[]
for train, validation in kf.split(X_b):
    X_training, Y_training = X_b[train], y[train]
    X_validation, Y_validation = X_b[validation], y[validation]
    
    weight_vector = np.zeros(X_training.shape[1])
    
    for iteration in range(1000):

        Yh_training = X_training @ weight_vector
        n = X_training.shape[0]
        
        gradients = -2*(X_training.T @ (Y_training - Yh_training)/n)
        
        weight_vector = weight_vector - 0.001 * gradients
        
    mse_train7.append(np.mean((Y_training - (X_training @ weight_vector)) ** 2))
    mse_validation7.append(np.mean((Y_validation - (X_validation @ weight_vector)) ** 2))

mse_test7 = np.mean((y_test - (x_test @ weight_vector)) ** 2)

print("Mean MSE training:", np.mean(mse_train7))
print("Mean MSE validation:", np.mean(mse_validation7))
print("MSE test:", mse_test7)

Mean MSE training: 35.090146352768315
Mean MSE validation: 39.23650317320514
MSE test: 19.90529355507329


# Task 8

In [32]:
mse_train8 =[]
mse_validation8 =[]
# Parameters
lmbda = 0.001
learning_rate = 0.001

for train, validation in kf.split(X_b):
    X_training, Y_training = X_b[train], y[train]
    X_validation, Y_validation = X_b[validation], y[validation]
    
    weight_vector = np.zeros(X_training.shape[1])

    for iteration in range(1000):  # You can change the iterations if needed

        Yh_training = X_training @ weight_vector
        n = X_training.shape[0]
        
        # Partial derivatives with respect to parameters of Lasso 
        pd_wrt_parameters = -2/n * (X_training.T @ (Y_training - Yh_training)) + lmbda * np.sign(weight_vector)
        # regularize all weigths (except from the intercept) 
        weight_vector[1:] = weight_vector[1:] - learning_rate * pd_wrt_parameters[1:]
        # gradient descent for the intercept
        weight_vector[0] = weight_vector[0] - learning_rate * (-2/n) * np.sum(Y_training - Yh_training)
        
    mse_train8.append(np.mean((Y_training - (X_training @ weight_vector)) ** 2))
    mse_validation8.append(np.mean((Y_validation - (X_validation @ weight_vector)) ** 2))

mse_test8 = np.mean((y_test - (x_test @ weight_vector)) ** 2)

print("Mean MSE training:", np.mean(mse_train8))
print("Mean MSE validation:", np.mean(mse_validation8))
print("MSE test:", mse_test8)

Mean MSE training: 35.09171127507219
Mean MSE validation: 39.23747384724328
MSE test: 19.902658689182118


# Task 9 

In [33]:
lmbda = 0.001
learning_rate = 0.001

def elastic_gradient(X, y, y_pred, theta):
    r1 = 0.2  # L1 
    r2 = 1-r1  # L2 
    return (
        -(2 / y.size) * X.T.dot(y - y_pred)  # Gradient of MSE
        + lmbda * r2 * theta  # Gradient of L2 penalty
        + lmbda * r1 * np.sign(theta)  # Subgradient of L1 penalty
    )

mse_train9 =[]
mse_validation9 =[]

for train, validation in kf.split(X_b):
    X_training, Y_training = X_b[train], y[train]
    X_validation, Y_validation = X_b[validation], y[validation]
    
    theta = np.zeros(X_train.shape[1])
    
    for i in range(1000):
        Yh_train = X_train @ theta
        gradients = elastic_gradient(X_train, y_train, Yh_train, theta)
        theta -= learning_rate * gradients
        
    mse_train9.append(np.mean((Y_training - (X_training @ weight_vector)) ** 2))
    mse_validation9.append(np.mean((Y_validation - (X_validation @ weight_vector)) ** 2))

mse_test9 = np.mean((y_test - (x_test @ theta)) ** 2)

print("Mean MSE training:", np.mean(mse_train9))
print("Mean MSE validation:", np.mean(mse_validation9))
print("MSE test:", mse_test9)

Mean MSE training: 34.25833010084479
Mean MSE validation: 34.25833010084479
MSE test: 19.93877685674913


# Task 10

From the given three models, I would choose Elastic Net.

Firstly, while the test MSEs for three models are almost the same, in terms of training and MSEs

In terms of the use cases of these three models, multivariate regression can overfit the data as it doesn't have regularization, and lasso regression eliminates features with the least impact on the prediction. Elastic Net has both L1 and L2 regularizations, which implies that it eliminates irrelevant features while also handling multicollinearity. Considering the features of the Boston dataset, it has only 13 features, so eliminating features might not be the best decision. However, looking at these features, one may say that some of them can be correlated. Elastic Net is very rebust, as it has both L1 and L2 regularizations, which implies that it eliminates irrelevant features while also handling multicollinearity.

Elastic Net has two main parameters:

α: the overall regularization strength.
r: the ratio that balances the amount of L1 and L2 regularization.
Additionally, since we are optimizing the Elastic Net model with Gradient Descent, the learning rate is also a parameter here, which determines the magnitude of the steps we are taking when updating the weights.