In [1]:
# Render plots inline
%matplotlib inline

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
def make_df(csv_path):
    return pd.read_csv(csv_path, header=None, names=['x', 'y'], index_col=False)

In [4]:
def make_data_matrix(x_values, degree):
    X_data = []
    for x_val in x_values:
        col = []
        for power in range(degree+1):
            col.append(x_val ** power)
        X_data.append(col)
    X_matrix = np.array(X_data)
    return X_matrix

In [5]:
def find_params_vector(data_matrix, y_values):
    data_product = (data_matrix.T).dot(data_matrix)
    inv_product = np.linalg.inv(data_product)
    params_vector = (inv_product.dot(data_matrix.T)).dot(y_values)
    return params_vector

In [6]:
def find_mse(data_matrix, params_vector, y_values):
    y_predictions = np.array([row.dot(params_vector) for row in data_matrix])
    error = y_values - y_predictions
    error_square = error.dot(error)
    mse = error_square/len(y_values)
    return mse

In [None]:
def fit_model_find_mse(csv_path, degree):
    df = make_df(csv_path)
    x_values = df['x']
    y_values = df['y']
    X_matrix = make_data_matrix(x_values, degree)
    params_vector = find_params_vector(X_matrix, y_values)
    mse = find_mse(X_matrix, params_vector, y_values)
    return mse

In [7]:
train_df = make_df('./Datasets/Dataset_1_train.csv')

In [8]:
train_x = train_df['x']
train_y = train_df['y']

In [9]:
train_matrix = make_data_matrix(train_x, 20)

In [10]:
params_vector = find_params_vector(train_matrix, train_y)

In [11]:
train_mse = find_mse(train_matrix, params_vector, train_y)

In [12]:
valid_df = make_df('./Datasets/Dataset_1_valid.csv')

In [13]:
valid_x = valid_df['x']
valid_y = valid_df['y']

In [14]:
valid_matrix = make_data_matrix(valid_x, 20)

In [15]:
valid_mse = find_mse(valid_matrix, params_vector, valid_y)

In [16]:
print(train_mse)

6.474766080931443


In [17]:
print(valid_mse)

1419.5725161143287
