In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
def split_dataset(filepath="./Boston-filtered.csv", testsize=1/3, random_state=None):
    """
    Load a CSV dataset, shuffle its rows and split it into training and test sets.

    The rows are ALWAYS shuffled; `random_state` only controls whether the shuffle
    is reproducible. After shuffling, every column except the last is treated as an
    input feature (`X`) and the last column as the target (`y`). The first
    `floor(n_rows * (1 - testsize))` shuffled rows form the training set and the
    remaining rows form the test set.

    Parameters:
    ----------
    filepath : str, optional
        The path to the CSV file containing the dataset. The default is "./Boston-filtered.csv".

    testsize : float, optional
        The proportion of the dataset to include in the test split, in the closed
        interval [0, 1]. Default is 1/3.

    random_state : int, optional
        Seed passed to `DataFrame.sample` for the shuffle. If `None`, the rows are
        still shuffled, but differently on every call. Default is `None`.

    Returns:
    -------
    X_train : ndarray
        A numpy array containing the training input features.

    y_train : ndarray
        A numpy array containing the training target values.

    X_test : ndarray
        A numpy array containing the test input features.

    y_test : ndarray
        A numpy array containing the test target values.

    Raises:
    ------
    ValueError
        If `testsize` is not within [0, 1] (this also rejects NaN).

    Example:
    --------
    X_train, y_train, X_test, y_test = split_dataset(filepath="data.csv", testsize=0.2, random_state=42)
    """
    # Reject fractions that would otherwise silently produce a nonsensical split
    # (a negative split index slices from the END of the array).
    if not 0 <= testsize <= 1:
        raise ValueError(f"testsize must be between 0 and 1, got {testsize!r}")

    # Load the dataset from the CSV file
    data = pd.read_csv(filepath)

    # Shuffle all rows; random_state makes the permutation reproducible
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Separate the features (X) and target values (y)
    X = data.iloc[:, :-1].values  # All columns except the last one are features
    y = data.iloc[:, -1].values   # The last column is the target value

    # Number of training rows. The product is rounded to 8 decimals before being
    # truncated so that binary round-off (e.g. 10 * (1 - 0.9) == 0.9999999999999998)
    # cannot drop a row that belongs in the training set.
    split_index = int(round(len(data) * (1 - testsize), 8))

    # Split the data into training and test sets
    X_train, X_test = X[:split_index], X[split_index:]
    y_train, y_test = y[:split_index], y[split_index:]

    # Return the training and test sets
    return X_train, y_train, X_test, y_test

In [3]:
# (a)
# Number of independent random train/test splits to average the results over
num_runs = 20

# Per-split MSE of the constant (mean) predictor; the evaluation loop later in
# this cell appends one value per split to each list
training_errors = []
test_errors = []

def constant_attribute(X_train, y_train, X_test, y_test):
    """
    Evaluate the naive baseline that always predicts the training-set mean.

    The only quantity learned is the average of `y_train`. That single value is
    broadcast, via a vector of ones with one entry per row, into a prediction for
    every training and test sample, and the Mean Squared Error (MSE) of those
    predictions is reported for both sets.

    Parameters:
    ----------
    X_train : ndarray
        Training input features. Only the number of rows (`len(X_train)`) is used.

    y_train : ndarray
        Training target values.

    X_test : ndarray
        Test input features. Only the number of rows (`len(X_test)`) is used.

    y_test : ndarray
        Test target values.

    Returns:
    -------
    mse_train : float
        The Mean Squared Error (MSE) on the training set.

    mse_test : float
        The Mean Squared Error (MSE) on the test set.

    """
    # The whole "model" is one number: the mean training target
    baseline = np.mean(y_train)

    def _constant_mse(targets, n_rows):
        # Ones vector scaled by the baseline == the same prediction for every row
        constant_pred = baseline * np.ones(n_rows)
        return np.mean((targets - constant_pred) ** 2)

    return _constant_mse(y_train, len(X_train)), _constant_mse(y_test, len(X_test))

# Evaluate the constant model on `num_runs` different random splits.
# The run index doubles as the shuffle seed, so split k is reproducible.
for run in range(num_runs):
    # Split the dataset into training and test sets with different random seeds
    X_train, y_train, X_test, y_test = split_dataset(random_state=run)

    # Perform constant model regression and compute the MSE for training and test sets
    constant_mse_train, constant_mse_test = constant_attribute(X_train, y_train, X_test, y_test)
    
    # Append the MSE values for each run to the respective lists
    training_errors.append(constant_mse_train)
    test_errors.append(constant_mse_test)

# Summarise across runs. np.std is called with its default ddof=0, i.e. the
# reported spread is the population standard deviation of the per-run MSEs.
mean_training_error = np.mean(training_errors)
std_training_error = np.std(training_errors)
mean_test_error = np.mean(test_errors)
std_test_error = np.std(test_errors)

# Print the average MSE and standard deviation for training and test sets
print(f"(Naive Regression) MSE training: {mean_training_error:.2f} ± {std_training_error:.2f}, MSE test: {mean_test_error:.2f} ± {std_test_error:.2f}")

(Naive Regression) MSE training: 84.54 ± 5.39, MSE test: 84.47 ± 10.76


(b) Although the constant prediction model provides a basic, computationally simple solution, its inability to account for feature-target relationships leads to high MSE values. This highlights the limitations of using a naive approach in predictive modeling, as it fails to capture the complexity of the data.

In [4]:
# (c)
def single_attribute(X_train, y_train, X_test, y_test):
    """
    Fit one least-squares linear regression per attribute (feature), each using
    that single attribute plus a bias term, and report the resulting errors.

    For every column `i`, the design matrix is `[x_i, 1]`. The weights are obtained
    with `np.linalg.lstsq` rather than by explicitly inverting `X^T X`: the
    least-squares solver is numerically more stable and still returns a (minimum
    norm) solution when the column is constant, a case in which `X^T X` is
    singular and `np.linalg.inv` raises `LinAlgError`.

    Parameters:
    X_train (numpy.ndarray): The training feature matrix (n_samples x n_features).
    y_train (numpy.ndarray): The training target vector (n_samples,).
    X_test (numpy.ndarray): The test feature matrix (n_samples x n_features).
    y_test (numpy.ndarray): The test target vector (n_samples,).

    Returns:
    tuple: A tuple containing two lists:
        - training_errors (list): A list of Mean Squared Errors (MSE) for each attribute in the training set.
        - test_errors (list): A list of MSEs for each attribute in the test set.
    """
    training_errors = []  # To store MSE values for each attribute on the training set
    test_errors = []      # To store MSE values for each attribute on the test set

    # The bias columns are identical for every attribute, so build them once
    train_bias = np.ones((X_train.shape[0], 1))
    test_bias = np.ones((X_test.shape[0], 1))

    # Loop over each feature (attribute) in the dataset
    for i in range(X_train.shape[1]):
        # Indexing with [i] keeps the column two-dimensional (n_samples x 1)
        X_train_augmented = np.hstack([X_train[:, [i]], train_bias])
        X_test_augmented = np.hstack([X_test[:, [i]], test_bias])

        # Least-squares weights [slope, intercept]; rcond=None selects NumPy's
        # machine-precision based cutoff for rank detection
        w, *_ = np.linalg.lstsq(X_train_augmented, y_train, rcond=None)

        # Make predictions on the training and test sets
        y_train_pred = X_train_augmented @ w
        y_test_pred = X_test_augmented @ w

        # Calculate Mean Squared Error for both training and test predictions
        training_errors.append(np.mean((y_train - y_train_pred)**2))
        test_errors.append(np.mean((y_test - y_test_pred)**2))

    return training_errors, test_errors


# Evaluate the single-attribute regressions on `num_runs` random splits.
# Per-run results are collected in lists and stacked afterwards so that the
# number of attributes is taken from the data instead of being hard-coded.
num_runs = 20
run_mse_train = []  # one list of per-attribute training MSEs per run
run_mse_test = []   # one list of per-attribute test MSEs per run

# Run the experiment 'num_runs' times
for run in range(num_runs):
    # The run index doubles as the shuffle seed, so split k is reproducible
    X_train, y_train, X_test, y_test = split_dataset(random_state=run)

    # Perform linear regression on each feature and get MSE values
    mse_train, mse_test = single_attribute(X_train, y_train, X_test, y_test)

    # Store the MSE values for each run
    run_mse_train.append(mse_train)
    run_mse_test.append(mse_test)

# Shape (num_runs, n_features): row = run, column = attribute
all_mse_train = np.array(run_mse_train)
all_mse_test = np.array(run_mse_test)

# Mean and (population, ddof=0) standard deviation over runs, per attribute
mean_mse_single_train = np.mean(all_mse_train, axis=0)
std_mse_single_train = np.std(all_mse_train, axis=0)
mean_mse_single_test = np.mean(all_mse_test, axis=0)
std_mse_single_test = np.std(all_mse_test, axis=0)

# Print the results for each feature
for i in range(len(mean_mse_single_train)):
    print(f'Linear Regression (attribute {i+1}) - MSE train: {mean_mse_single_train[i]:.2f} ± {std_mse_single_train[i]:.2f}, MSE test: {mean_mse_single_test[i]:.2f} ± {std_mse_single_test[i]:.2f}')

Linear Regression (attribute 1) - MSE train: 71.25 ± 4.89, MSE test: 73.91 ± 10.40
Linear Regression (attribute 2) - MSE train: 74.19 ± 4.34, MSE test: 72.38 ± 8.77
Linear Regression (attribute 3) - MSE train: 64.92 ± 4.35, MSE test: 64.56 ± 8.85
Linear Regression (attribute 4) - MSE train: 82.02 ± 5.12, MSE test: 82.19 ± 10.62
Linear Regression (attribute 5) - MSE train: 69.48 ± 4.45, MSE test: 68.48 ± 8.98
Linear Regression (attribute 6) - MSE train: 43.36 ± 3.02, MSE test: 44.46 ± 5.91
Linear Regression (attribute 7) - MSE train: 72.69 ± 4.81, MSE test: 72.26 ± 9.68
Linear Regression (attribute 8) - MSE train: 79.44 ± 5.26, MSE test: 78.98 ± 10.54
Linear Regression (attribute 9) - MSE train: 71.93 ± 4.88, MSE test: 73.08 ± 9.85
Linear Regression (attribute 10) - MSE train: 65.59 ± 4.48, MSE test: 66.97 ± 9.02
Linear Regression (attribute 11) - MSE train: 62.35 ± 3.99, MSE test: 63.83 ± 8.09
Linear Regression (attribute 12) - MSE train: 38.70 ± 2.33, MSE test: 38.50 ± 4.63


In [5]:
# (d)
def all_attributes(X_train, y_train, X_test, y_test):
    """
    Perform linear regression using all features in the training and test datasets.

    The feature matrices are augmented with a column of ones (bias term) and the
    weights are obtained with `np.linalg.lstsq`. Solving the least-squares problem
    directly, instead of explicitly inverting `X^T X`, is numerically more stable
    and still yields a (minimum norm) solution when features are collinear or
    constant, a case in which `X^T X` is singular and `np.linalg.inv` raises
    `LinAlgError`.

    Parameters:
    X_train (numpy.ndarray): The training feature matrix (n_samples x n_features).
    y_train (numpy.ndarray): The training target vector (n_samples,).
    X_test (numpy.ndarray): The test feature matrix (n_samples x n_features).
    y_test (numpy.ndarray): The test target vector (n_samples,).

    Returns:
    tuple: A tuple containing two values:
        - mse_train (float): The Mean Squared Error for the training set.
        - mse_test (float): The Mean Squared Error for the test set.
    """

    # Augment the feature matrices with a column of ones for the bias term (intercept)
    X_train_augmented = np.hstack([X_train, np.ones((X_train.shape[0], 1))])
    X_test_augmented = np.hstack([X_test, np.ones((X_test.shape[0], 1))])

    # Least-squares weights (one per feature, then the intercept); rcond=None
    # selects NumPy's machine-precision based cutoff for rank detection
    w, *_ = np.linalg.lstsq(X_train_augmented, y_train, rcond=None)

    # Predictions of the fitted model on both sets
    y_train_pred = X_train_augmented @ w
    y_test_pred = X_test_augmented @ w

    # Mean Squared Error on both sets
    mse_train = np.mean((y_train - y_train_pred)**2)
    mse_test = np.mean((y_test - y_test_pred)**2)

    return mse_train, mse_test


# Number of independent random train/test splits to average the results over
num_runs = 20
# One training / test MSE per run for the all-attributes model
all_mse_train = np.zeros(num_runs)
all_mse_test = np.zeros(num_runs)

# Run the experiment for 'num_runs' times
for run in range(num_runs):
    # The run index doubles as the shuffle seed, so split k is reproducible
    X_train, y_train, X_test, y_test = split_dataset(random_state=run)

    # Perform linear regression using all features and calculate MSE values
    mse_train, mse_test = all_attributes(X_train, y_train, X_test, y_test)
    
    # Record this run's errors at position `run`
    all_mse_train[run] = mse_train
    all_mse_test[run] = mse_test

# Summarise across runs. np.std is called with its default ddof=0, i.e. the
# reported spread is the population standard deviation of the per-run MSEs.
mean_mse_train = np.mean(all_mse_train)
std_mse_train = np.std(all_mse_train)
mean_mse_test = np.mean(all_mse_test)
std_mse_test = np.std(all_mse_test)

# Report mean ± standard deviation for the training and test MSE
print(f"Linear Regression (all attributes) MSE training: {mean_mse_train:.2f} ± {std_mse_train:.2f}, MSE test: {mean_mse_test:.2f} ± {std_mse_test:.2f}")

Linear Regression (all attributes) MSE training: 22.28 ± 1.65, MSE test: 23.98 ± 3.59
