In [None]:
%reset -f
import numpy as np
import pandas as pd

In [None]:
def mean_square_error(y, y_pred):
    """Return the mean squared error between targets and predictions.

    The squared differences ``(y - y_pred) ** 2`` are summed over every
    element and the total is divided by ``len(y)`` (the number of entries
    along the first axis of ``y``).

    Parameters
    ----------
    y : array-like supporting element-wise subtraction
        Observed target values.
    y_pred : array-like broadcastable against ``y``
        Predicted values.

    Returns
    -------
    float
        Sum of squared residuals divided by ``len(y)``.
    """
    residuals = y - y_pred
    squared_total = np.sum(np.square(residuals))
    return squared_total / len(y)

In [None]:
# Load the dataset from a CSV path relative to the current working directory.
# Later cells pass `df` to lr(), whose splitting step treats the last column
# as the regression target and every other column as a feature.
df = pd.read_csv("Boston-filtered.csv")

In [None]:
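# NOTE: legacy DataFrame-based train/test split, kept for reference only.
# It relies on `m` and `m_train`, which no earlier cell defines, so it cannot
# be un-commented as-is; create_train_test_set() performs the split instead.
# df = df.sample(frac=1).reset_index(drop=True)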

# # Create train and test set
# train = df[:m_train].reset_index(drop=True)
# test = df[m_train:].reset_index(drop=True)

# train_y = train.iloc[:, -1]
# train_X = train.iloc[:, :-1]

# test_y = test.iloc[:, -1]
# test_X = test.iloc[:, :-1]

# train_ones_x = np.ones(m_train)
# test_ones_x = np.ones(m - m_train)

In [None]:
def train_lr(X_train, y_train):
    """Fit ordinary least-squares weights for a linear model.

    Solves ``min_w ||X_train @ w - y_train||^2`` with ``np.linalg.lstsq``
    instead of explicitly forming ``inv(X_train.T @ X_train)``. For a
    full-rank design matrix the result is the same normal-equation solution;
    for a rank-deficient one (e.g. duplicated or collinear columns) the
    minimum-norm least-squares solution is returned rather than raising
    ``numpy.linalg.LinAlgError``.

    Parameters
    ----------
    X_train : np.ndarray of shape (m, d)
        Design matrix, one row per sample.
    y_train : np.ndarray of shape (m,) or (m, k)
        Target values.

    Returns
    -------
    np.ndarray of shape (d,) or (d, k)
        Weight vector(s) such that ``X_train @ w`` approximates ``y_train``.
    """
    # rcond=None selects NumPy's machine-precision-based cutoff for small
    # singular values and avoids the legacy-default FutureWarning.
    w, _residuals, _rank, _singular_values = np.linalg.lstsq(
        X_train, y_train, rcond=None
    )
    return w

In [None]:
def evaluate_lr(X_test, y_test, w):
    """Score a fitted linear model on a data set.

    Predictions are computed as ``X_test @ w`` and compared with ``y_test``
    using ``mean_square_error``.

    Parameters
    ----------
    X_test : design matrix with one row per sample.
    y_test : target values for those rows.
    w : weights, e.g. as returned by ``train_lr``.

    Returns
    -------
    The mean squared error of the predictions.
    """
    predictions = X_test @ w
    return mean_square_error(y_test, predictions)

In [None]:
def create_train_test_set(D, train_ratio):
    """Randomly split a data matrix into train/test features and targets.

    The rows of ``D`` are randomly permuted; the first
    ``floor(train_ratio * len(D))`` permuted rows become the training set and
    the remaining rows the test set. In each part the last column is returned
    as the target and all other columns as the features.

    The input array is NOT modified: the permutation is applied to a copy, so
    callers that pass a view of other data (for example the result of
    ``DataFrame.to_numpy()``) or a read-only array are safe.

    Parameters
    ----------
    D : np.ndarray of shape (m, d + 1)
        Data matrix whose last column is the target.
    train_ratio : float
        Fraction of rows assigned to the training set.

    Returns
    -------
    X_train, y_train, X_test, y_test : np.ndarray
        Feature matrices (2-D) and target vectors (1-D) of the two parts.
    """
    m = len(D)
    # Floor of the fractional row count, so the training set never rounds up.
    m_train = int(train_ratio * m // 1)

    # np.random.permutation shuffles a copy along the first axis, unlike
    # np.random.shuffle, which would reorder the caller's array in place.
    shuffled = np.random.permutation(D)

    train = shuffled[:m_train]
    test = shuffled[m_train:]

    return train[:, :-1], train[:, -1], test[:, :-1], test[:, -1]

In [None]:
def _design_matrix(X, mode, variable):
    """Build the design matrix for one split according to ``mode``."""
    bias = np.ones((len(X), 1))
    if mode == 'ones':
        return bias
    if mode == 'single':
        return np.hstack((X[:, variable].reshape(-1, 1), bias))
    # mode == 'full': all features plus the same intercept column that
    # 'single' uses, so the full model is not forced through the origin.
    return np.hstack((X, bias))


def lr(df, mode='single', variable=1, iterations=20, precision=3):
    """Repeatedly fit and score a least-squares model on random splits.

    For each of ``iterations`` runs the data is split 2/3 train, 1/3 test
    with ``create_train_test_set``, a model is fitted with ``train_lr`` and
    its mean squared error on both parts is measured with ``evaluate_lr``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set; the last column is used as the target, the others as
        features.
    mode : {'ones', 'single', 'full'}
        * ``'ones'``   - constant predictor (a single column of ones).
        * ``'single'`` - feature column ``variable`` plus an intercept.
        * ``'full'``   - every feature column plus an intercept.
    variable : int
        Index of the feature column used when ``mode == 'single'``;
        ignored otherwise.
    iterations : int
        Number of random splits to evaluate.
    precision : int
        Number of decimals the summary statistics are rounded to.

    Returns
    -------
    train_errors, test_errors : list
        Per-run mean squared errors (unrounded).
    train_mean, train_std, test_mean, test_std
        Mean and population standard deviation (``np.std`` default) of the
        per-run errors, rounded to ``precision`` decimals.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported values. Previously an
        unknown or misspelled mode silently ran the all-features model.
    """
    if mode not in ('ones', 'single', 'full'):
        raise ValueError(
            f"unknown mode {mode!r}; expected 'ones', 'single' or 'full'"
        )

    train_errors = []
    test_errors = []

    # Work on a private, writable copy: to_numpy() may otherwise hand back a
    # view of the DataFrame's data (or a read-only array), and the splitting
    # step shuffles rows.
    D = df.to_numpy(copy=True)

    for _ in range(iterations):
        X_train, y_train, X_test, y_test = create_train_test_set(D, 2/3)

        X_train = _design_matrix(X_train, mode, variable)
        X_test = _design_matrix(X_test, mode, variable)

        w = train_lr(X_train, y_train)

        train_errors.append(evaluate_lr(X_train, y_train, w))
        test_errors.append(evaluate_lr(X_test, y_test, w))

    train_mean = round(np.mean(train_errors), precision)
    train_std = round(np.std(train_errors), precision)

    test_mean = round(np.mean(test_errors), precision)
    test_std = round(np.std(test_errors), precision)

    return train_errors, test_errors, train_mean, train_std, test_mean, test_std

In [None]:
# Baseline: lr() in 'ones' mode fits a constant predictor.
(train_errors, test_errors,
 train_means, train_stds,
 test_means, test_stds) = lr(df, mode='ones', precision=3)
print(
    f"Train and Test MSE over 20 runs for Naive Regression: {train_means}, {test_means}",
    f"std over 20 runs for Naive Regression: {train_stds}, {test_stds}",
    sep="\n",
)
print('\n')

# One regression per feature column; the last column of df is skipped
# because it is the target.
for i in range(len(df.columns) - 1):
    (train_errors, test_errors,
     train_means, train_stds,
     test_means, test_stds) = lr(df, mode='single', variable=i, iterations=20)

    print(
        f"MSE over 20 runs for {df.columns[i]}: {train_means}, {test_means}",
        f"std over 20 runs for {df.columns[i]}: {train_stds}, {test_stds}",
        sep="\n",
    )
    print('\n')

# Regression on all feature columns at once.
(train_errors, test_errors,
 train_means, train_stds,
 test_means, test_stds) = lr(df, mode='full')
print(
    f"Train and Test MSE over 20 runs with full dataset: {train_means}, {test_means}",
    f"Train and Test std over 20 runs with full dataset: {train_stds}, {test_stds}",
    sep="\n",
)

print('\n')