In [1]:
import numpy as np
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Read data

In [3]:
df = pd.read_csv('/content/drive/MyDrive/Data/houses.csv')

Look at the data

In [4]:
df.head()

Unnamed: 0,Square feet,Bedrooms,Price
0,2104,3,399900
1,1600,3,329900
2,2400,3,369000
3,1416,2,232000
4,3000,4,539900


Transfer the data into numpy arrays. The matrix of features $X$ and the vector of labels $y$

In [26]:
X = df[['Square feet','Bedrooms']].values
y = df['Price'].values

In [27]:
def shuffle_examples(X,y):
  p = np.random.permutation(len(X))
  return X[p],y[p]

In [28]:
def split_train_test(X,y,prop_train):
  n = int(prop_train*len(X))
  return X[:n],y[:n],X[n:],y[n:]

In [29]:
def scale(x):
  mu = np.mean(x,axis=0)
  st = np.std(x,axis=0)
  return (x-mu)/st,mu,st

In [30]:
def add_one_end_rows(X):
  return np.column_stack((X, np.ones(X.shape[0])))

def best_lin(X,y):
  C = add_one_end_rows(X)
  A = np.matmul(C.T,C)
  r = np.dot(C.T,y)
  z = np.linalg.solve(A,r)
  w = z[:-1]
  b = z[-1]
  return w, b

Finds the linear function that best fits the data splitting into training and test sets

In [31]:
def best_linear_train_test(X,y,prop_train):
  X,y = shuffle_examples(X,y)
  X_train,y_train,X_test,y_test = split_train_test(X,y,prop_train)
  X_train_scaled, X_train_mean, X_train_std = scale(X)
  y_train_scaled, y_train_mean, y_train_std = scale(y)
  w, b = best_lin(X_train_scaled,y_train_scaled)
  return w, b, X_train_mean, X_train_std, y_train_mean, y_train_std, X_test, y_test

Function to make predictions

In [32]:
def scaled_predictions(x, w, b, X_train_mean, X_train_std, y_train_mean, y_train_std):
  x_scaled = (x-X_train_mean)/X_train_std
  y_hat_scaled = np.matmul(x_scaled,w) + b
  return y_hat_scaled  
  
def predictions(x, w, b, X_train_mean, X_train_std, y_train_mean, y_train_std):
  y_hat_scaled = scaled_predictions(x, w, b, X_train_mean, X_train_std, y_train_mean, y_train_std)
  y_hat = y_train_std*y_hat_scaled + y_train_mean
  return y_hat

Computes error

In [33]:
def error(X, y, w, b, X_train_mean, X_train_std, y_train_mean, y_train_std):
  y_hat_scaled = scaled_predictions(X, w, b, X_train_mean, X_train_std, y_train_mean, y_train_std)
  y_scaled = (y - y_train_mean)/y_train_std
  return np.sum((y_hat_scaled - y_scaled)**2)/len(y) 

Test our codes:

In [46]:
p = 0.7
w, b, X_train_mean, X_train_std, y_train_mean, y_train_std, X_test, y_test = best_linear_train_test(X,y,p)

In [47]:
x = np.array([[3000,4],[2400,3]])
predictions(x, w, b,X_train_mean, X_train_std, y_train_mean, y_train_std)

array([472277.85514636, 397489.46984812])

In [48]:
error(X_test, y_test, w, b, X_train_mean, X_train_std, y_train_mean, y_train_std)

0.40497028907491556

In [49]:
error(X_train, y_train, w, b, X_train_mean, X_train_std, y_train_mean, y_train_std)

0.2024071817661654