In [354]:
%reset -f
import numpy as np
import pandas as pd

In [355]:
def mean_square_error(y, y_pred):
    m = len(y)
    return np.sum(np.square(y - y_pred))/m

In [356]:
df = pd.read_csv("Boston-filtered.csv")

In [357]:
# df = df.sample(frac=1).reset_index(drop=True)

# # Create train and test set
# train = df[:m_train].reset_index(drop=True)
# test = df[m_train:].reset_index(drop=True)

# train_y = train.iloc[:, -1]
# train_X = train.iloc[:, :-1]

# test_y = test.iloc[:, -1]
# test_X = test.iloc[:, :-1]

# train_ones_x = np.ones(m_train)
# test_ones_x = np.ones(m - m_train)

In [358]:
def train_lr(X_train, y_train):
    w = np.linalg.inv(X_train.T@X_train)@X_train.T@y_train
    return w

In [359]:
def evaluate_lr(X_test, y_test, w):
    y_pred = X_test@w
    error = mean_square_error(y_test, y_pred)
    return error

In [360]:
def create_train_test_set(D, train_ratio):
    m = len(D)
    m_train = int(train_ratio * m // 1)
    
    np.random.shuffle(D)

    train = D[:m_train]
    test = D[m_train:]

    X_train = train[:, :-1]
    y_train = train[:, -1]
    
    X_test = test[:, :-1]
    y_test = test[:, -1]
    
    return X_train, y_train, X_test, y_test

In [361]:
def lr(df, mode='single', variable=1, iterations=20):
    train_errors = []
    test_errors = []
    
    D = df.to_numpy()
    
    for i in range(iterations):
        X_train, y_train, X_test, y_test = create_train_test_set(D, 2/3)

        if mode == 'ones':
            X_train = np.ones(len(X_train)).reshape(-1, 1)
            X_test = np.ones(len(X_test)).reshape(-1, 1)

        elif mode == 'single':
            X_train = X_train[:, variable].reshape(-1, 1)
            X_train = np.hstack( (X_train, np.ones(len(X_train)).reshape(-1, 1)) )
            X_test = X_test[:, variable].reshape(-1, 1)
            X_test = np.hstack( (X_test, np.ones(len(X_test)).reshape(-1, 1)) )
        
        w = train_lr(X_train, y_train)
        
        train_error = evaluate_lr(X_train, y_train, w)
        test_error = evaluate_lr(X_test, y_test, w)

        train_errors.append(train_error)
        test_errors.append(test_error)
            
    return train_errors, test_errors

In [363]:
train_errors, test_errors = lr(df, mode='ones')
print(f"Train and Test MSE over 20 runs for Naive Regression: {np.mean(np.array(train_errors))}, {np.mean(np.array(test_errors))}")
print('\n')

for i in range(df.shape[-1] - 1):
    train_errors, test_errors = lr(df, mode='single', variable=i, iterations=20)
    
    print(f"MSE over 20 runs for {df.columns[i]}: {np.mean(np.array(train_errors))}, {np.mean(np.array(test_errors))}")
print('\n')

train_errors, test_errors = lr(df, mode='full')
print(f"Train and Test MSE over 20 runs with full dataset: {np.mean(np.array(train_errors))}, {np.mean(np.array(test_errors))}")
print('\n')

Train and Test MSE over 20 runs for Naive Regression: 83.27688808565718, 86.93278432843866


MSE over 20 runs for CRIM: 71.63669066529039, 72.91303074913745
MSE over 20 runs for  ZN : 73.41032642630876, 73.90910649738568
MSE over 20 runs for INDUS : 65.0759115867134, 64.16154855914704
MSE over 20 runs for CHAS: 82.92113993679496, 80.20030631804573
MSE over 20 runs for NOX: 68.28866993269887, 70.7400701931123
MSE over 20 runs for RM: 42.14518968637113, 46.9065753047473
MSE over 20 runs for AGE: 71.79776217759456, 74.0694667842541
MSE over 20 runs for DIS: 77.19623145226164, 83.39547865763275
MSE over 20 runs for RAD: 70.13903436595885, 76.35678544051646
MSE over 20 runs for TAX: 66.08852671807836, 65.86105380202233
MSE over 20 runs for PTRATIO: 62.5168412367232, 63.420476142255936
MSE over 20 runs for LSTAT: 39.099727683074185, 37.54066296098701


Train and Test MSE over 20 runs with full dataset: 24.91610247869065, 28.513295410872907


