## Lab 5: Linear Regression and Ridge Regression

Austin Nguyen 

Friday 2:15 - 5:00 PM 

October 25, 2024


In [1]:
import numpy as np

# Function to load data
def load_data(train_file, test_file):
    """Load the training and test sets and prepend a bias column to each.

    Both files are read as tab-delimited numeric text with one header row
    that is skipped. In every remaining row the first column is the response
    variable and the other columns are the features.

    Args:
        train_file: Path (or file-like object) of the training data.
        test_file: Path (or file-like object) of the test data.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)``. Each ``X_*`` is a 2-D
        array whose first column is all ones (the bias term) followed by the
        feature columns; each ``y_*`` is the 1-D array of responses.
    """

    def _read_split(path):
        # Read one file and return (design matrix with bias column, responses).
        # ndmin=2 keeps the result two-dimensional even when the file holds a
        # single data row; without it np.loadtxt returns a 1-D array and the
        # column slicing below raises IndexError.
        table = np.loadtxt(path, delimiter='\t', skiprows=1, ndmin=2)

        # response variable (first column), features (remaining columns)
        targets = table[:, 0]
        features = table[:, 1:]

        # one extra leading column of ones for the bias term
        bias = np.ones((features.shape[0], 1))
        return np.hstack([bias, features]), targets

    X_train, y_train = _read_split(train_file)
    X_test, y_test = _read_split(test_file)

    return X_train, y_train, X_test, y_test

# Linear Regression Implementation
def linear_regression(X_train, y_train):
    """Fit ordinary least-squares weights for ``X_train @ w ~= y_train``.

    For a full-rank design matrix this is the normal-equation solution
    w = (X^T * X)^-1 * X^T * y, but it is computed with a least-squares
    solver instead of an explicit matrix inverse. The solver is more accurate
    when X^T * X is ill-conditioned and still returns a (minimum-norm)
    solution when the columns of ``X_train`` are linearly dependent, a case
    in which the explicit inverse fails or produces meaningless values.

    Args:
        X_train: 2-D design matrix, one row per sample.
        y_train: Response values, one per row of ``X_train``.

    Returns:
        The weight array ``w`` with one entry per column of ``X_train``.
    """
    # rcond=None selects NumPy's machine-precision-based cutoff for small
    # singular values (and avoids the legacy-default FutureWarning).
    w, _residuals, _rank, _singular_values = np.linalg.lstsq(
        X_train, y_train, rcond=None
    )
    return w

# Ridge Regression Implementation
def ridge_regression(X_train, y_train, lambd):
    """Fit ridge-regression weights in closed form.

    Computes w = (X^T * X + lambda * I)^-1 * X^T * y by solving the linear
    system (X^T * X + lambda * I) w = X^T * y directly, which avoids forming
    the explicit inverse (cheaper and numerically more accurate).

    The identity matrix spans every column of ``X_train``, so all weights are
    penalized, including the weight of any constant/bias column the caller
    has placed in ``X_train``.

    Args:
        X_train: 2-D design matrix, one row per sample.
        y_train: Response values, one per row of ``X_train``.
        lambd: Regularization strength multiplying the identity matrix.

    Returns:
        The weight array ``w`` with one entry per column of ``X_train``.

    Raises:
        numpy.linalg.LinAlgError: If the regularized matrix is singular.
    """
    n_features = X_train.shape[1]
    regularized_gram = X_train.T @ X_train + lambd * np.eye(n_features)
    w = np.linalg.solve(regularized_gram, X_train.T @ y_train)
    return w

# Function to make predictions using linear regression weights
def problem1(samples):
    """Return linear-regression predictions for ``samples``.

    The weights are read from the module-level ``w_linear``, so that name
    must be assigned before this function is called. The result is the dot
    product of ``samples`` with those weights.
    """
    predictions = np.dot(samples, w_linear)
    return predictions

# Function to make predictions using ridge regression weights
def problem2(samples):
    """Return ridge-regression predictions for ``samples``.

    The weights are read from the module-level ``w_ridge``, so that name
    must be assigned before this function is called. The result is the dot
    product of ``samples`` with those weights.
    """
    predictions = np.dot(samples, w_ridge)
    return predictions

# RMSE Calculation
def rmse(y_true, y_pred):
    """Return the root-mean-square error between two sets of values.

    Both inputs are converted to floating-point arrays first, so plain Python
    sequences are accepted and integer inputs (in particular unsigned ones)
    cannot wrap around when subtracted or squared.

    Args:
        y_true: Observed values (array-like).
        y_pred: Predicted values (array-like), broadcastable against
            ``y_true``.

    Returns:
        ``sqrt(mean((y_true - y_pred) ** 2))`` as a NumPy float.
    """
    residuals = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean(np.square(residuals)))

# Dataset locations
train_file = 'crime-train.txt'
test_file = 'crime-test.txt'

# Read both splits into design matrices and response vectors
X_train, y_train, X_test, y_test = load_data(train_file, test_file)

# --- Linear regression: fit, predict on both splits, then score ---
w_linear = linear_regression(X_train, y_train)
y_train_pred_linear, y_test_pred_linear = problem1(X_train), problem1(X_test)
rmse_train_linear = rmse(y_train, y_train_pred_linear)
rmse_test_linear = rmse(y_test, y_test_pred_linear)

# --- Ridge regression (lambda fixed at 100): fit, predict, then score ---
lambd = 100
w_ridge = ridge_regression(X_train, y_train, lambd)
y_train_pred_ridge, y_test_pred_ridge = problem2(X_train), problem2(X_test)
rmse_train_ridge = rmse(y_train, y_train_pred_ridge)
rmse_test_ridge = rmse(y_test, y_test_pred_ridge)

# Print the four RMSE values, linear model first
print(f'Linear Regression RMSE (Train): {rmse_train_linear}')
print(f'Linear Regression RMSE (Test): {rmse_test_linear}')
print(f'Ridge Regression RMSE (Train): {rmse_train_ridge}')
print(f'Ridge Regression RMSE (Test): {rmse_test_ridge}')


Linear Regression RMSE (Train): 0.12768967421762195
Linear Regression RMSE (Test): 0.1458346449094935
Ridge Regression RMSE (Train): 0.13134320424615797
Ridge Regression RMSE (Test): 0.14765698468526112


## Training And Testing Results:

Linear Regression RMSE (Train): 0.12768967421762195

Linear Regression RMSE (Test): 0.1458346449094935

Ridge Regression RMSE (Train): 0.13134320424615797

Ridge Regression RMSE (Test): 0.14765698468526112