In [6]:
import numpy as np
import random

dataSet = np.genfromtxt('x06simple.csv', delimiter=',')

requiredData = dataSet[1:, 1:]

# randomizing data prior to division into training and testing sets / standardization
random.seed(0)

np.random.shuffle(requiredData)

trainingSamples = int(len(requiredData) * 2/3)

# isolating the x and y values in two separate matrices prior to standardization to make the whole thing easier
x_values_training = requiredData[:trainingSamples, :-1]
y_values_training = requiredData[:trainingSamples, -1]

x_values_testing = requiredData[trainingSamples:, :-1]
y_values_testing = requiredData[trainingSamples:, -1]


# standardization process

training_mean = np.mean(x_values_training, axis=0)
testing_mean = np.mean(x_values_testing, axis=0)

training_sd = np.std(x_values_training, axis=0)
testing_sd = np.std(x_values_testing, axis=0)

standardized_training = (x_values_training - training_mean) / training_sd
standardized_testing = (x_values_testing - testing_mean) / testing_sd

# padding a column of 1s to the standardized matrices
ones_training = np.ones((len(standardized_training), 1))
ones_testing = np.ones((len(standardized_testing), 1))

x_training = np.hstack((ones_training, standardized_training))
x_testing = np.hstack((ones_testing, standardized_testing))

# applying the closed form LSE rule
# the rule is theta = (X'X)^(-1) * X' * Y

x_training_transpose = np.transpose(x_training)

In [7]:
x_training_matrix = np.matrix(x_training)
x_training_transpose_matrix = np.matrix(x_training_transpose)

In [8]:
x_xt =np.matmul(x_training_transpose_matrix, x_training_matrix)

In [9]:
x_xt_inverse = x_xt.I

In [10]:
x_xt_inverse_xt = np.matmul(x_xt_inverse, x_training_transpose)

In [11]:
weights = np.matmul(x_xt_inverse_xt, y_values_training)

In [12]:
print(weights)

[[3069.72413793 1204.58199143 -261.56030329]]


In [13]:
np.matmul (np.matrix(x_testing), np.matrix(weights.T))

matrix([[2241.57201291],
        [5708.02521984],
        [2740.59381346],
        [1043.01162432],
        [3475.77681583],
        [1751.49522981],
        [3711.93801766],
        [3974.79861637],
        [2005.41081108],
        [3212.91621712],
        [ 780.15102561],
        [3738.63741454],
        [4237.65921509],
        [2713.89441658],
        [4709.98161875]])

In [14]:
print(x_training)

[[ 1.          1.06120421 -0.60023282]
 [ 1.         -0.40926988  1.23205685]
 [ 1.          0.76710939  1.23205685]
 [ 1.          0.17891975 -1.51637766]
 [ 1.          1.35529903 -1.51637766]
 [ 1.         -1.56464238 -0.60023282]
 [ 1.          0.76710939 -1.51637766]
 [ 1.          1.35529903  1.23205685]
 [ 1.         -0.99745952 -0.60023282]
 [ 1.         -0.99745952 -1.51637766]
 [ 1.          0.47301457  1.23205685]
 [ 1.         -1.27054756 -0.60023282]
 [ 1.         -0.99745952  0.31591201]
 [ 1.         -1.56464238  1.23205685]
 [ 1.          1.35529903  0.31591201]
 [ 1.          0.76710939 -0.60023282]
 [ 1.          1.06120421  0.31591201]
 [ 1.         -1.56464238 -1.51637766]
 [ 1.          1.06120421  1.23205685]
 [ 1.         -0.11517506  1.23205685]
 [ 1.         -1.27054756  0.31591201]
 [ 1.          0.47301457 -0.60023282]
 [ 1.         -0.99745952  1.23205685]
 [ 1.         -0.11517506  0.31591201]
 [ 1.          0.76710939  0.31591201]
 [ 1.          1.35529903

In [15]:
print(weights)


[[3069.72413793 1204.58199143 -261.56030329]]
