# Lab 6 Austin Nguyen

November 8, 2024



In [1]:

import numpy as np
import csv 

def load_data(name):
    data = []
    with open(name, mode = 'r') as file:
        csvFile = csv.reader(file, delimiter='\t')
        next(csvFile) #skip the first line
        for lines in csvFile:
            converted_lines = []
            for x in lines:
                converted_lines.append(float(x))
            data.append(converted_lines)
    return data

k_fold = 5

train_data = np.array(load_data('crime-train.csv'))
y_train = train_data[:, 0]
x_train = train_data[:, 1:]


# adding the dummy feature
x = np.ones((x_train.shape[0], 1))
x_train = np.append(x_train, x, 1)



test_data = np.array(load_data('crime-test.csv'))
y_test = test_data[:, 0]
x_test = test_data[:, 1:]

x = np.ones((x_test.shape[0], 1))
x_test = np.append(x_test, x, 1)

# Splitting data

split = int(x_train.shape[0]/k_fold)
x_train1 = x_train[:split]
x_train2 = x_train[split:split*2]
x_train3 = x_train[split*2:split*3]
x_train4 = x_train[split*3:split*4]
x_train5 = x_train[split*4:]

y_train1 = y_train[:split]
y_train2 = y_train[split:split*2]
y_train3 = y_train[split*2:split*3]
y_train4 = y_train[split*3:split*4]
y_train5 = y_train[split*4:]

x_trainers = np.array([x_train1, x_train2, x_train3, x_train4, x_train5])
y_trainers = np.array([y_train1, y_train2, y_train3, y_train4, y_train5])


lambdas = np.zeros(10)
lambdas[0] = 400
for i in range(9):
    lambdas[i+1] = lambdas[i]/2
rmses = np.zeros(10)

def rmse(x_estimated, x_actual):
    n = x_estimated.shape[0]
    sum = 0
    for i in range(len(x_estimated)):
        sum += np.power((x_actual[i]-x_estimated[i]), 2)
    sum /= n
    sum = np.power(sum, 1/2)
    
    return sum

def ridge_fit(x_train, y_train, lam):

    weights = np.dot(np.dot(np.linalg.inv(np.dot(x_train.T, x_train) + np.dot(lam, np.identity(x_train.shape[1]))), x_train.T), y_train)
    return weights

def prediction(x_test, weights):
    
    pred = []
    pred = np.dot(x_test, weights)

    return pred

def create_nonval(trainers, i):
    actual_train = []
    for j in range(1, k_fold):
        actual_train = np.append(actual_train, trainers[(i+j)%k_fold])
    if (actual_train.shape[0] > 1276):
        actual_train = np.reshape(actual_train, (1276, 96))
    return actual_train

def ridge_gradient_descent(x_train, y_train, lam):

    tol = 0.0000001
    diff = 1

    w_t = np.random.rand(x_train.shape[1])
    alpha = .00005

    while (diff > tol):
        temp_gradient = np.dot(lam, np.identity(x_train.shape[1]))
        temp_gradient = np.dot(temp_gradient, w_t)

        w_t1 = w_t - alpha * np.add(np.dot(x_train.T, np.subtract(np.dot(x_train, w_t), y_train)), temp_gradient)
        diff = np.abs(np.average(w_t1-w_t))
        w_t = w_t1
        
    return w_t


def cross_validation():
    for i in range(10):
        avg_rmse = 0
        # Make sure each fold is tested 
        for j in range(k_fold):

            weight = ridge_gradient_descent(create_nonval(x_trainers, j), create_nonval(y_trainers, j), lambdas[i])

            y_val = np.array(prediction(x_trainers[j], weight))
            avg_rmse += rmse(y_val, y_trainers[j])

        # average rmse for the current lambda
        rmses[i]= avg_rmse/k_fold 
    
    # chooses and returns the lambda with the lowest rmse
    actual_lam = lambdas[np.argmin(rmses)]
    return actual_lam

# Step 1: uses cross validation to obtain the best lambda and then uses that to train
actual_lam = cross_validation()

print(f"Best lambda: {actual_lam}")

weight = ridge_fit(x_train, y_train, actual_lam)
train_pred = prediction(x_train, weight)
rmse_train = rmse(train_pred, y_train)


test_pred = prediction(x_test, weight)
rmse_test = rmse(test_pred, y_test)

print(f"Problem 1 Training RMSE: {rmse_train}")
print(f"Problem 1 Testing RMSE: {rmse_test}")


# Step 2: Linear Regression using Gradient Descent
def problem2(samples):

    tol = 0.0000001
    diff = 1

    w_t = np.random.rand(x_train.shape[1])

    alpha = .00005

    while (diff > tol):
        w_t1 = w_t - alpha * np.dot(x_train.T,np.subtract(np.dot(x_train, w_t),y_train))

        diff = np.abs(np.average(w_t1-w_t))
        w_t = w_t1
    
    pred = np.dot(samples, w_t)
    
    return pred
    

train_pred = problem2(x_train)
test_pred = problem2(x_test)

print(f"Problem 2 Training RMSE: {rmse(train_pred, y_train)}")
print(f"Problem 2 Testing RMSE: {rmse(test_pred, y_test)}")



# Part 3: Ridge Regression with 5 Fold Cross Validation using Gradient Descent
def problem3(samples):
    actual_lam = cross_validation()
    print(actual_lam)
    weight = ridge_gradient_descent(x_train, y_train, actual_lam)
    return prediction(samples,weight)


train_pred = problem3(x_train)
rmse_train = rmse(train_pred, y_train)

test_pred = problem3(x_test)
rmse_test = rmse(test_pred, y_test)

print(f"Problem 3 Training RMSE: {rmse_train}")
print(f"Problem 3 Testing RMSE: {rmse_test}")

    

Best lambda: 25.0
Problem 1 Training RMSE: 0.1287970145987978
Problem 1 Testing RMSE: 0.1457465070705802
Problem 2 Training RMSE: 0.12828429481592424
Problem 2 Testing RMSE: 0.14568341221467915
25.0
25.0
Problem 3 Training RMSE: 0.12877146243488172
Problem 3 Testing RMSE: 0.14562337951330204
