# Housing prices for King County

Training a simple polynomial model on Hosing Sale data from king country [data source](https://www.kaggle.com/datasets/harlfoxem/housesalesprediction)
As a learning exercise

In [1]:
from utils import load_data
from train import compute_cost, compute_gradient, gradient_descent
import numpy as np
import pandas as pd
# Data is a pandas DataFrame
data = load_data.load_data("./data/kc_house_data.csv")


# Feature Engineering
Housing prices do not tend to be linear but curve like, thus we will be engineering some polynomial features

In [2]:
labels = data['price']
data.drop('price', axis=1, inplace=True)
data.drop('date', axis=1, inplace=True)
dataSquared = data.pow(2)
dataSquared.rename(lambda c: f"{c} squared", inplace=True, axis='columns')

dataCubed = data.pow(3)
dataCubed.rename(lambda c: f"{c} cubed", inplace=True, axis='columns')

engineeredData = data.join(dataSquared).join(dataCubed)

engineeredData[0:5]

Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,grade cubed,sqft_above cubed,sqft_basement cubed,yr_built cubed,yr_renovated cubed,zipcode cubed,lat cubed,long cubed,sqft_living15 cubed,sqft_lot15 cubed
0,7129300520,3,1.0,1180,5650,1.0,0,0,3,7,...,343,1643032000,0,7472058875,0,946329856735752,107247.702877,-1827348.0,2406104000,180362125000
1,6414100192,3,2.25,2570,7242,2.0,0,0,3,7,...,343,10218313000,64000000,7426288351,7892485271,944798095703125,108674.739386,-1830129.0,4826809000,445768658119
2,5631500400,2,1.0,770,10000,1.0,0,0,3,6,...,216,456533000,0,7222633237,0,941998966517952,108790.239078,-1826272.0,20123648000,523996494328
3,2487200875,4,3.0,1960,5000,1.0,0,0,5,7,...,343,1157625000,753571000,7587307125,0,945115872339456,107312.72666,-1833453.0,2515456000,125000000000
4,1954400510,3,2.0,1680,8080,1.0,0,0,3,8,...,512,4741632000,0,7845011803,0,943325698349224,107964.410613,-1817858.0,5832000000,422381452527


# Training Data
We have split the data into two halves, the first half will be the training data. Matrix trainingSamples contains our examples. Each row is an example and each column is a feature. Vector trainingLabels contains the labels for our training set.

In [3]:
total_rows = len(engineeredData)
trainingSetRows = total_rows // 2
trainingSamples = engineeredData[0:trainingSetRows].copy()
trainingLabels = labels[0:trainingSetRows].copy()

print(f"trainingSamples shape: {trainingSamples.shape}")
print(f"trainingLabels shape: {trainingLabels.shape}")

trainingSamples shape: (10806, 57)
trainingLabels shape: (10806,)


# Parameters
Parameter vector w and param b


In [4]:
b = 10
w = np.random.random_sample(trainingSamples.shape[1])
print(f"w shape: {w.shape}")
print(f"Initial cost: {compute_cost(trainingSamples.to_numpy(), trainingLabels.to_numpy(), w, b)}")
dj_dw, dj_db = compute_gradient(trainingSamples.to_numpy(), trainingLabels.to_numpy(), w, b)
print(f"Initial gradient dw:  {dj_dw})")
print(f"Initial gradient db: {dj_db})")

w shape: (57,)
Initial cost: 3.6204407586940944e+37
Initial gradient dw:  [-1.01350940e+26  1.10453257e+18  5.57476968e+17  6.06787941e+20
  1.32407967e+22  4.55234863e+17  2.31946185e+15  1.05669589e+17
  1.10693908e+18  2.33130982e+18  5.24204542e+20  8.25833990e+19
  6.32418302e+20  9.04579185e+18  3.16168827e+22  1.53049125e+19
 -3.93966338e+19  5.29867740e+20  6.83084467e+21  1.98915098e+37
  4.09059882e+18  1.03191003e+18  1.45224162e+24  7.52040189e+27
  7.29536733e+17  2.31946185e+15  2.74814726e+17  3.95883626e+18
  1.68835989e+19  1.09366486e+24  7.17053729e+22  1.24135693e+24
  1.77482098e+22  3.10192429e+27  7.26873223e+20  4.81628843e+21
  9.35270224e+23  1.02933791e+27  2.36759342e+37  1.64405390e+19
  1.97299721e+18  5.61376891e+27  8.30618717e+33  1.32109054e+18
  2.31946185e+15  6.94498057e+17  1.47677505e+19  1.22190739e+20
  3.23091846e+27  1.28798719e+26  2.43715795e+27  3.48208600e+25
  3.04329062e+32  3.45215630e+22 -5.88799049e+23  1.78763048e+27
  2.80784347e+32

In [5]:
# Run gradient descent
alpha = 5.0e-7
iterations = 1000
w_final, b_final, j_history = gradient_descent(trainingSamples.to_numpy(), trainingLabels.to_numpy(), w, b, compute_cost, compute_gradient, alpha, iterations)
print(f"Final cost: {compute_cost(trainingSamples.to_numpy(), trainingLabels.to_numpy(), w_final, b_final)}")


Iteration    0: Cost 6336264941478872257916155644190177379718793109231761261082638185976575753493525172646234552337956864.00   


  cost += (f_wb - y[i]) ** 2
  dj_dw[j] = dj_dw[j] + error * X[i, j]
  dj_dw[j] = dj_dw[j] + error * X[i, j]


Iteration   10: Cost      nan   
Iteration   20: Cost      nan   
Iteration   30: Cost      nan   
Iteration   40: Cost      nan   
Iteration   50: Cost      nan   
Iteration   60: Cost      nan   
Iteration   70: Cost      nan   
Iteration   80: Cost      nan   
Iteration   90: Cost      nan   
Iteration  100: Cost      nan   
Iteration  110: Cost      nan   
Iteration  120: Cost      nan   


KeyboardInterrupt: 