## ML ZoomCamp 2024
### Homework 2

#### 1. Import

##### 1.1 Import libraries

In [1]:
# import packages
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

##### 1.2 Import data

In [2]:
# import dataframe
df = pd.read_csv('data/laptops.csv')

In [3]:
# display first 5 rows of dataframe
df.head()

Unnamed: 0,Laptop,Status,Brand,Model,CPU,RAM,Storage,Storage type,GPU,Screen,Touch,Final Price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.0
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.0
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.0
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.0
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01


#### 2. Preparing the data

##### 2.1 Normalizing the columns names

In [4]:
# normalize column names
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [5]:
# display columns names
df.columns

Index(['laptop', 'status', 'brand', 'model', 'cpu', 'ram', 'storage',
       'storage_type', 'gpu', 'screen', 'touch', 'final_price'],
      dtype='object')

#### 2.2 Reducing the number of columns

In [6]:
# defining a list of columns to be used in the analysis
col = [
    'ram',
    'storage',
    'screen',
    'final_price'
]

In [7]:
# creating a new dataframe with the selected columns
df = df[col]

#### 3. Questions

##### 3.1 Question 1 
There's one column with missing values. What is it?

In [8]:
# displaying the number of null values for each column
df.isnull().sum()

ram            0
storage        0
screen         4
final_price    0
dtype: int64

The ```ram``` column is the only columns with missing values.

##### 3.2 Question 2
What's the median for variable ```ram```?

In [9]:
# calculating the median for the ram column
median_ram = df['ram'].median()
print(median_ram)

16.0


The median of the ```ram``` column is $16$.

##### 3.3 Question 3

We need to deal with missing values for the column from Q1.
- We have two options: fill it with 0 or with the mean of this variable.
- Try both options. For each, train a linear regression model without regularization using the code from the lessons.
- For computing the mean, use the training only!
- Use the validation dataset to evaluate the models and compare the RMSE of each option.
- Round the RMSE scores to 2 decimal digits using round(score, 2)

Which option gives better RMSE?

##### 3.3.1 Preparing and splitting the data

In [10]:
# checking the size of each dataset
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - (n_val + n_test)

In [11]:
# shuffling the indices
idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)

In [12]:
# creating the datasets
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[:n_val]]
df_test = df.iloc[idx[n_val:n_val+n_test]]

In [13]:
# resetting the indices
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [14]:
# transorming target variable into log
y_train = df_train.final_price.values 
y_test = df_test.final_price.values
y_val = df_val.final_price.values

In [15]:
# dropping the target variable from the datasets
df_train = df_train.drop(['final_price'], axis=1)
df_val = df_val.drop(['final_price'], axis=1)
df_test = df_test.drop(['final_price'], axis=1)

##### 3.3.2 Dealing with missing values

In [16]:
def prepare_X(df):
    """
    Prepare the feature matrix from the given DataFrame.

    This function copies the input DataFrame, fills any missing values with 0,
    and converts the DataFrame to a NumPy array.

    Parameters:
    ----------
    df : pandas.DataFrame
        The input DataFrame containing the features.

    Returns:
    -------
    numpy.ndarray
        The feature matrix as a NumPy array.
    """
    df = df.copy()
    df = df.fillna(0)
    X = df.values
    return X

In [17]:
def prepare_X_mean(df):
    """
    Prepare the feature matrix from the given DataFrame, filling missing values with the column mean.

    This function copies the input DataFrame, fills any missing values with the mean of the respective columns,
    and converts the DataFrame to a NumPy array.

    Parameters:
    ----------
    df : pandas.DataFrame
        The input DataFrame containing the features.

    Returns:
    -------
    numpy.ndarray
        The feature matrix as a NumPy array.
    """
    df = df.copy()
    df = df.fillna(df.mean())
    X = df.values
    return X

In [18]:
def train_linear_regression(X, y):
    """
    Train a linear regression model using the normal equation.

    This function adds a bias term to the feature matrix, computes the weights
    using the normal equation, and returns the bias term and the weights separately.

    Parameters:
    ----------
    X : numpy.ndarray
        Feature matrix of shape (n_samples, n_features).
    y : numpy.ndarray
        Target vector of shape (n_samples,).

    Returns:
    -------
    float
        The bias term (intercept).
    numpy.ndarray
        The weights (coefficients) of the linear model.
    """
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)

    return w_full[0], w_full[1:]

In [19]:
def rmse(y, y_pred):
    """
    Calculate the Root Mean Squared Error (RMSE) between actual and predicted values.

    This function computes the squared error between the actual and predicted values,
    calculates the mean of these squared errors (MSE), and then returns the square root of the MSE.

    Parameters:
    ----------
    y : numpy.ndarray
        Actual values.
    y_pred : numpy.ndarray
        Predicted values.

    Returns:
    -------
    float
        The Root Mean Squared Error (RMSE).
    """
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [20]:
# calculating the RMSE for the training set with fillna(0)
X_train = prepare_X(df_train)
w0, w = train_linear_regression(X_train, y_train)
X_val = prepare_X(df_val)
y_pred = w0 + X_val.dot(w)
score = rmse(y_val, y_pred)
print('rmse with 0 :', round(score, 2))

rmse with 0 : 623.58


In [21]:
# calculating the RMSE for the training set with mean
X_train = prepare_X(df_train)
w0, w = train_linear_regression(X_train, y_train)
X_val = prepare_X_mean(df_val)
y_pred = w0 + X_val.dot(w)
score = rmse(y_val, y_pred)
print('rmse with mean :', round(score, 2))

rmse with mean : 624.22


##### 3.4 Question 4

Now let's train a regularized linear regression.

- For this question, fill the NAs with 0.
- Try different values of r from this list: [0, 0.01, 0.1, 1, 5, 10, 100].
- Use RMSE to evaluate the model on the validation dataset.
- Round the RMSE scores to 2 decimal digits.

Which r gives the best RMSE?

In [33]:
def train_linear_regression_reg(X, y, r):
    """
    Train a linear regression model with L2 regularization (ridge regression).

    Parameters:
    X (numpy.ndarray): Feature matrix of shape (n_samples, n_features).
    y (numpy.ndarray): Target vector of shape (n_samples,).
    r (float): Regularization strength; must be a non-negative float.

    Returns:
    float: The bias term (intercept).
    numpy.ndarray: The weights (coefficients) of the linear model.
    """
    # Add bias term to the feature matrix
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    # Compute X^T * X
    XTX = X.T.dot(X)
    # Add regularization term to the diagonal
    XTX = XTX + r * np.eye(XTX.shape[0])
    # Compute the inverse of X^T * X + r * I
    XTX_inv = np.linalg.inv(XTX)
    # Compute the weights
    w_full = XTX_inv.dot(X.T).dot(y)
    # Return the bias term and the weights separately
    return w_full[0], w_full[1:]

In [34]:
# create an empty list to store the results
results = []

# Iterate over different regularization parameters
for r in [0, 0.01, 0.1, 1, 5, 10, 10]:
    X_train = prepare_X(df_train)
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)
    X_val = prepare_X(df_val)
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)
    
    # Append the results (regularization parameter, bias term, RMSE score)
    results.append((r, w0, score))

# Sort results by RMSE score
results.sort(key=lambda x: x[2])

# Print results in ranked order
print("Ranked results (r, w0, RMSE):")
for rank, (r, w0, score) in enumerate(results, start=1):
    print(f"{rank}. r: {r}, w0: {w0}, RMSE: {score}")

Ranked results (r, w0, RMSE):
1. r: 0, w0: 6.054179380043934e-10, RMSE: 6.953494687826305e-11
2. r: 0.01, w0: 1.6129309443613238e-08, RMSE: 1.2720344812620746e-08
3. r: 0.1, w0: 1.4556150773614718e-07, RMSE: 1.2709670215123333e-07
4. r: 1, w0: 1.3021886893227475e-06, RMSE: 1.2703912542736236e-06
5. r: 5, w0: 4.516747388549902e-06, RMSE: 6.348387204422951e-06
6. r: 10, w0: 6.5349150624172125e-06, RMSE: 1.2698958428552832e-05
7. r: 10, w0: 6.5349150624172125e-06, RMSE: 1.2698958428552832e-05


##### 3.5 Question 5

We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
- Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
- For each seed, do the train/validation/test split with 60%/20%/20% distribution.
- Fill the missing values with 0 and train a model without regularization.
- For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
- What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
- Round the result to 3 decimal digits (round(std, 3))

In [24]:
# Function to split data and prepare datasets
def split_data(df, seed, n_train, n_val, n_test):
    np.random.seed(seed)
    idx = np.random.permutation(len(df))
    df_train = df.iloc[idx[:n_train]].reset_index(drop=True)
    df_val = df.iloc[idx[n_train:n_train+n_val]].reset_index(drop=True)
    df_test = df.iloc[idx[n_train+n_val:n_train+n_val+n_test]].reset_index(drop=True)
    return df_train, df_val, df_test

In [25]:
# List to store RMSE scores
rmse_scores = []

# Loop over different seed values
for seed in range(10):
    # Split the data
    df_train, df_val, df_test = split_data(df, seed, n_train, n_val, n_test)
    
    # Extract target variable
    y_train = df_train.pop('final_price').values
    y_val = df_val.pop('final_price').values
    y_test = df_test.pop('final_price').values

    # Prepare feature matrices
    X_train = prepare_X(df_train)
    X_val = prepare_X(df_val)

    # Train the model
    w0, w = train_linear_regression(X_train, y_train)

    # Make predictions
    y_pred = w0 + X_val.dot(w)

    # Calculate RMSE
    score = rmse(y_val, y_pred)
    rmse_scores.append(score)

# Compute the standard deviation of the RMSE scores
std = np.std(rmse_scores)
print("Standard deviation of RMSE scores:", round(std, 3))

Standard deviation of RMSE scores: 29.176


##### 3.6 Question 6

- Split the dataset like previously, use seed 9.
- Combine train and validation datasets.
- Fill the missing values with 0 and train a model with r=0.001.

What's the RMSE on the test dataset?


In [26]:
# importing packages
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [27]:
# Split the data using seed 9
df_train, df_val, df_test = split_data(df, 9, n_train, n_val, n_test)

In [28]:
# Prepare the feature matrices and target vectors
X_train = prepare_X(df_train.drop(['final_price'], axis=1))
y_train = df_train['final_price'].values

X_val = prepare_X(df_val.drop(['final_price'], axis=1))
y_val = df_val['final_price'].values

X_test = prepare_X(df_test.drop(['final_price'], axis=1))
y_test = df_test['final_price'].values

In [29]:
# Combine train and validation datasets
X_train_combined = np.vstack((X_train, X_val))
y_train_combined = np.hstack((y_train, y_val))

In [30]:
# Train the model using Ridge regression
ridge_reg = Ridge(alpha=0.001)
ridge_reg.fit(X_train_combined, y_train_combined)

In [31]:
# Make predictions on the test set
y_pred = ridge_reg.predict(X_test)

In [32]:
# Calculate RMSE
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE on the test dataset:", round(test_rmse, 3))

RMSE on the test dataset: 608.61
