In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

In [2]:
data = pd.read_csv('kc_house_data.csv')

In [3]:
data.head()
list(data.columns)[3:]

['bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'zipcode',
 'lat',
 'long',
 'sqft_living15',
 'sqft_lot15']

In [4]:
class LinearRegressionS:
    def predict(self, X):
        return np.dot(X, self.W)
    def cost(self, Y_P, Y_T):
        return np.sum(np.square(Y_P - Y_T))/(2*Y_T.shape[0])
    def fit(self, X, Y, print_cost = False):
        self.W = np.linalg.inv(X.transpose().dot(X)).dot(X.transpose()).dot(Y)

In [5]:
Y = pd.DataFrame(data['price']).to_numpy(copy = True)

In [6]:
temp = pd.DataFrame(data[list(data.columns)[3:]]).to_numpy(copy = True)

In [7]:
X = np.ones((temp.shape[0], temp.shape[1] + 1))

In [8]:
X[:, 1:] = temp

In [9]:
X_train = X[:15000]
Y_train = Y[:15000]
X_test = X[15000:]
Y_test = Y[15000:]

In [10]:
# Copied just to check

linreg = LinearRegression()
linreg.fit(temp[:15000], Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [11]:
mse = mean_squared_error(Y_test, linreg.predict(temp[15000:]))

In [12]:
np.sqrt(mse)

203639.42220699383

In [13]:
linreg.coef_

array([[-3.75161333e+04,  3.96146160e+04,  1.13616150e+02,
         1.58664424e-01, -1.83870782e+03,  6.39890324e+05,
         4.91046529e+04,  2.56784236e+04,  8.96888763e+04,
         7.59909601e+01,  3.76251901e+01, -2.93882602e+03,
         2.11547051e+01, -5.72233981e+02,  5.76461175e+05,
        -1.82812408e+05,  3.19022425e+01, -4.56266875e-01]])

In [14]:
linreg.get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': False}

In [15]:
ls = LinearRegressionS()
ls.fit(X_train, Y_train, print_cost = False)

In [16]:
X.shape

(21613, 19)

In [17]:
np.sqrt(mean_squared_error(Y_test, ls.predict(X_test)))

2918730.1090985616

In [18]:
r2_score(Y_test, ls.predict(X_test))

-61.913180841927826

In [19]:
ls.W

array([[ 5.92392481e+06],
       [ 3.01892510e+06],
       [-3.61208438e+06],
       [-3.07212907e+04],
       [ 4.16504983e-01],
       [ 7.95419957e+05],
       [ 3.03267590e+06],
       [-4.18168715e+04],
       [ 4.04502438e+04],
       [ 1.46903749e+05],
       [ 3.17577874e+04],
       [ 3.17836286e+04],
       [-2.93882602e+03],
       [ 2.11547051e+01],
       [-5.72233980e+02],
       [ 5.76461175e+05],
       [-1.82812408e+05],
       [ 3.19022425e+01],
       [-4.56266875e-01]])

In [20]:
class LinearRegressionSGD:
    def __init__(self, learning_rate = 1e-4, num_iterations = 1500):
        self.alpha = learning_rate
        self.n = num_iterations
    def init_parameters(self):
        self.W = np.zeros((self.n_x, 1))
    def predict(self, X):
        return np.dot(X, self.W)
    def cost(self, Y_P, Y_T):
        return np.sum(np.square(Y_P - Y_T))/(2*self.m)
    def gradient_step(self, X, Y):
        Y_P = self.predict(X)
        self.dW = np.dot(X.T,(Y_P - Y))/self.m
        self.W = self.W - self.alpha*self.dW
    def fit(self, X, Y, print_cost = False):
        self.n_x = X.shape[1]
        self.m = X.shape[0]
        self.init_parameters()
        for i in range(self.n):
            self.gradient_step(X, Y)
            if print_cost and i%100 == 0:
                print(self.cost(self.predict(X), Y))

In [21]:
lsgd = LinearRegressionSGD(learning_rate = 1e-10, num_iterations = 10000)
lsgd.fit(X_train, Y_train, print_cost = True)

66823083144.139824
65435262495.40102
64434672488.52414
63465609583.07572
62527067059.28024
61618077438.03944
60737703959.09513
59885039587.66988
59059206070.99302
58259353024.8183
57484657048.93453
56734320870.74906
56007572516.053055
55303664506.10586
54621873080.20303
53961497442.919334
53321859035.24333
52702300828.84559
52102186642.7459
51520900481.66884
50957845895.39921
50412445358.47083
49884139669.54336
49372387369.84223
48876664180.05663
48396462455.10982
47931290656.23425
47480672839.80263
47044148162.38276
46621270401.50142
46211607491.61869
45814741074.82983
45430266065.82737
45057790230.67088
44696933778.92596
44347328968.74829
44008619724.5019
43680461266.5136
43362519752.57878
43054471930.845116
42756004803.71351
42466815302.40623
42186609971.863914
41915104665.64361
41652024250.5001
41397102320.3437
41150080919.27652
40910710273.419075
40678748531.24859
40453961512.17837
40236122463.11702
40025011822.75414
39820416993.32723
39622132119.632286
39429957875.048416
39243701

In [22]:
r2_score(Y_test, lsgd.predict(X_test))

0.4723717589195736