In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

In [3]:
data = pd.read_csv("train.csv")
y = data["y"].to_numpy()
data = data.drop(columns="y")
# print a few data samples
# print(data.head())

X = data.to_numpy()
# The function calculating the average RMSE
lambdas = [0.1, 1, 10, 100, 200]
n_folds = 10


In [106]:
w = fit(X, y, 1)

calculate_RMSE(w,X,y)

Iteration 0 completed. Cost: 283.7728
Iteration 100 completed. Cost: 178.73893380251437
Iteration 200 completed. Cost: 122.85412550451042
Iteration 300 completed. Cost: 92.924278834803
Iteration 400 completed. Cost: 76.71052696579
Iteration 500 completed. Cost: 67.75429619179619
Iteration 600 completed. Cost: 62.64688396941769
Iteration 700 completed. Cost: 59.588820996949885
Iteration 800 completed. Cost: 57.629754125763924
Iteration 900 completed. Cost: 56.2673883353225
Iteration 1000 completed. Cost: 55.23587689414152
Iteration 1100 completed. Cost: 54.394168972802504
Iteration 1200 completed. Cost: 53.6670972689887
Iteration 1300 completed. Cost: 53.01429259948222
Iteration 1400 completed. Cost: 52.41378074279994
Iteration 1500 completed. Cost: 51.85332606576842
Iteration 1600 completed. Cost: 51.325862614693406
Iteration 1700 completed. Cost: 50.82708192473309
Iteration 1800 completed. Cost: 50.35415883707715
Iteration 1900 completed. Cost: 49.905077825296


121.83281504267724

In [69]:
def ridge_cost_gradient(X, y, w, alpha):
    n = len(y)
    y_pred = X @ w
    cost = (1 / (2 * n)) * np.sum((y_pred - y) ** 2) + (alpha / 2) * np.sum(w[1:] ** 2)
    # To calculate the gradient, one needs to take the derivative of the cost with respect to the weight vector
    gradient = (1 / n) * (X.T @ (y_pred - y)) + alpha * np.hstack(([0], w[1:])) * (1 / n)
    return cost, gradient    

In [92]:
def fit(X, y, lam):
    """
    This function receives training data points, then fits the ridge regression on this data
    with regularization hyperparameter lambda. The weights w of the fitted ridge regression
    are returned. 

    Parameters
    ----------
    X: matrix of floats, dim = (135,13), inputs with 13 features
    y: array of floats, dim = (135,), input labels)
    lam: float. lambda parameter, used in regularization term

    Returns
    ----------
    w: array of floats: dim = (13,), optimal parameters of ridge regression
    """
    # To fit Ridge regression: gradient descent will be applied
    learning_rate = 1e-8
    max_iter = 2000
    
    n, p = X.shape
    w = np.zeros((p,))

    for ii in range(max_iter):
        cost, gradient = ridge_cost_gradient(X, y, w, alpha = lam)
        w -= learning_rate*gradient
        # if (ii % 100 == 0):
        #    print(f"Iteration {ii} completed. Cost: {cost}")


    assert w.shape == (13,)
    return w

In [105]:
def calculate_RMSE(w, X, y):
    """This function takes test data points (X and y), and computes the empirical RMSE of 
    predicting y from X using a linear model with weights w. 

    Parameters
    ----------
    w: array of floats: dim = (13,), optimal parameters of ridge regression 
    X: matrix of floats, dim = (15,13), inputs with 13 features
    y: array of floats, dim = (15,), input labels

    Returns
    ----------
    RMSE: float: dim = 1, RMSE value
    """
    RMSE = 0
    
    y_pred = X @ w
    RMSE = np.sqrt( np.sum((y - y_pred)**2) )

    assert np.isscalar(RMSE)
    return RMSE

In [108]:
def average_LR_RMSE(X, y, lambdas, n_folds):
    """
    Main cross-validation loop, implementing 10-fold CV. In every iteration (for every train-test split), the RMSE for every lambda is calculated, 
    and then averaged over iterations.
    
    Parameters
    ---------- 
    X: matrix of floats, dim = (150, 13), inputs with 13 features
    y: array of floats, dim = (150, ), input labels
    lambdas: list of floats, len = 5, values of lambda for which ridge regression is fitted and RMSE estimated
    n_folds: int, number of folds (pieces in which we split the dataset), parameter K in KFold CV
    
    Returns
    ----------
    avg_RMSE: array of floats: dim = (5,), average RMSE value for every lambda
    """
    RMSE_mat = np.zeros((n_folds, len(lambdas)))

    # TODO: Enter your code here. Hint: Use functions 'fit' and 'calculate_RMSE' with training and test data
    # and fill all entries in the matrix 'RMSE_mat'

    kf = KFold(n_splits = n_folds, shuffle=True, random_state=42)

    # Evaluate the obtained RMSE per every value of the proposed regularization parameters
    for ii in range(len(lambdas)):
        lam = lambdas[ii]

        lam_RMSE = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            w = fit(X_train, y_train, lam)
            lam_RMSE.append(calculate_RMSE(w, X_test, y_test))
            
        RMSE_mat[:, ii] = lam_RMSE


    assert RMSE_mat.shape == (n_folds, len(lambdas))

    avg_RMSE = np.mean(RMSE_mat, axis=0)
    assert avg_RMSE.shape == (5,)
    return avg_RMSE


In [109]:
if __name__ == "__main__":
    # Data loading
    data = pd.read_csv("train.csv")
    y = data["y"].to_numpy()
    data = data.drop(columns="y")
    # print a few data samples
    # print(data.head())

    X = data.to_numpy()
    # The function calculating the average RMSE
    lambdas = [0.1, 1, 10, 100, 200]
    n_folds = 10
    avg_RMSE = average_LR_RMSE(X, y, lambdas, n_folds)
    # Save results in the required format
    np.savetxt("./results.csv", avg_RMSE, fmt="%.12f")


Iteration 0 completed. Cost: 276.56059259259257
Iteration 100 completed. Cost: 174.65676275096214
Iteration 200 completed. Cost: 119.61713502585461
Iteration 300 completed. Cost: 89.70082164697544
Iteration 400 completed. Cost: 73.26238502989983
Iteration 500 completed. Cost: 64.06349931381096
Iteration 600 completed. Cost: 58.76190599888575
Iteration 700 completed. Cost: 55.56651327505046
Iteration 800 completed. Cost: 53.517073923084936
Iteration 900 completed. Cost: 52.09838327810031
Iteration 1000 completed. Cost: 51.03364393284028
Iteration 1100 completed. Cost: 50.173791018124795
Iteration 1200 completed. Cost: 49.43821341398531
Iteration 1300 completed. Cost: 48.78300237202763
Iteration 1400 completed. Cost: 48.1839437311295
Iteration 1500 completed. Cost: 47.6274067976855
Iteration 1600 completed. Cost: 47.10546271824034
Iteration 1700 completed. Cost: 46.613268228044795
Iteration 1800 completed. Cost: 46.147662804027426
Iteration 1900 completed. Cost: 45.706415789358985
Iterat