# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Reading the Dataset

In [2]:
# Explicit column dtypes handed to pd.read_csv for the kc_house CSV files.
# Identifier-like columns (id, zipcode, date) and 'floors' are read as strings.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': str,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [3]:
# Read the full dataset using the explicit column dtypes, then preview the first rows.
house_data = pd.read_csv('kc_house_data.csv',dtype=dtype_dict)
house_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


# Loading the Train and Test Sets

In [4]:
# The train and test parts are read from their own CSV files (no split is
# computed here); the same dtype mapping is applied to both.
train_data = pd.read_csv('kc_house_train_data.csv',dtype=dtype_dict)
test_data = pd.read_csv('kc_house_test_data.csv',dtype=dtype_dict)
print(train_data.shape,test_data.shape)

(17384, 21) (4229, 21)


# Getting data as a numpy matrix

In [5]:
def get_numpy_data(dataset,features,output):
    """Build the design matrix and target vector for linear regression.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Source table. It is read only; no column is added to it.
    features : sequence of str
        Names of the feature columns. The sequence is not modified.
    output : str or list of str
        Name(s) of the target column(s).

    Returns
    -------
    X : numpy.ndarray
        One row per record: a leading column of ones (the intercept term)
        followed by the requested feature columns, in the given order.
    Y : numpy.ndarray
        Target values reshaped to a column, i.e. shape (n_rows, 1) when a
        single output column is requested.
    """
    # Build a fresh list of column names instead of inserting into
    # `features`: an in-place insert would make a list that is reused across
    # calls accumulate one extra 'constant' entry every time.
    columns = ['constant'] + list(features)
    # assign() returns a new frame, so the caller's DataFrame is left untouched
    # while the integer intercept column keeps the same dtype behaviour.
    design = dataset.assign(constant=1)[columns]
    X = np.array(design)
    Y = np.array(dataset[output]).reshape(-1,1)
    return X,Y
    

# Predicting Output

In [6]:
def pred_output(X,W):
    """Return the linear-model predictions X . W^T.

    X is a feature matrix (or a single feature row) and W is a row vector of
    weights; e.g. X of shape (3, 2) with W of shape (1, 2) gives predictions
    of shape (3, 1).
    """
    weights_as_column = W.T  # (1, k) -> (k, 1)
    return np.dot(X, weights_as_column)

# Computing Derivative

In [7]:
def compute_grad(X,W,Y):
    """Gradient of the residual sum of squares with respect to the weights.

    Evaluates -2 * (Y - X W^T)^T X. For example, residuals of shape (3, 1)
    are transposed to (1, 3) and multiplied by X of shape (3, 2), giving a
    (1, 2) gradient that lines up with the row vector W.
    """
    residuals = Y - pred_output(X,W)
    return -2 * np.dot(residuals.T, X)
    

# Gradient Descent

In [8]:

def grad(X,Y,W,learning_rate,tolerance,max_iterations=None,verbose=True):
    """Fit linear-regression weights by batch gradient descent.

    Each step computes the gradient at the current weights, moves the weights
    by ``learning_rate`` against it, and stops once the Euclidean norm of
    that gradient is below ``tolerance``.

    Parameters
    ----------
    X : numpy.ndarray
        Design matrix, one row per record.
    Y : numpy.ndarray
        Target column vector.
    W : array-like
        Initial weights as a row vector. It is copied, never modified.
    learning_rate : float
        Step size applied to the gradient.
    tolerance : float
        Convergence threshold on the gradient norm.
    max_iterations : int or None, optional
        Upper bound on the number of update steps; None (default) means
        iterate until the tolerance is met.
    verbose : bool, optional
        When True (default) print the squared gradient norm of every step.

    Returns
    -------
    numpy.ndarray
        The learned weights, same shape as ``W``.

    Raises
    ------
    FloatingPointError
        If the gradient becomes NaN or infinite (the step size is too large
        and the iteration diverged), which could otherwise never satisfy the
        stopping test and would loop forever.
    """
    # Work on a float copy: the in-place update below must not overwrite the
    # caller's initial weights, and must also work for integer-typed input.
    W = np.array(W, dtype=float)

    iteration = 0
    while max_iterations is None or iteration < max_iterations:
        dW = compute_grad(X,W,Y)
        if not np.all(np.isfinite(dW)):
            raise FloatingPointError(
                'gradient is not finite; gradient descent diverged '
                '(try a smaller learning_rate)'
            )
        grad_sum_sq = np.sum(np.square(dW))
        W -= learning_rate * dW
        if verbose:
            print(grad_sum_sq,' ')
        # The test uses the gradient computed before this step's update.
        if np.sqrt(grad_sum_sq) < tolerance:
            break
        iteration += 1
    return W
        

# Let's check whether this works on a simple example (Model 1: `sqft_living` only)

In [9]:
# Model 1: regress price on sqft_living alone (plus the intercept column).
X,Y = get_numpy_data(train_data, features=['sqft_living'], output=['price'])
print(X.shape,Y.shape)
# Initial weights as a (1, 2) row vector: [intercept, sqft_living].
W = np.array([[-47000., 1.]])
print(W.shape)
learning_rate = 7e-12
tolerance = 2.5e7
W_learned = grad(X, Y, W, learning_rate, tolerance)

(17384, 2) (17384, 1)
(1, 2)
2.555457264704116e+27  
1.7232997044781152e+26  
1.1621254295552887e+25  
7.836916065790902e+23  
5.284907451530232e+22  
3.5639333814166125e+21  
2.4033762717437198e+20  
1.620742315195644e+19  
1.0929651764117583e+18  
7.370567547611414e+16  
4970830209874756.0  
335623032724346.2  


In [10]:
# Learned sqft_living weight (second entry of the weight row), rounded to 1 decimal place.
np.round(W_learned[0][1],1)

281.9

# Test Set

In [11]:
# Build the test-set design matrix and targets for model 1 (sqft_living only).
X_test,Y_test = get_numpy_data(test_data,features=['sqft_living'],output=['price'])

In [12]:
# Model 1 prediction for the first row of the test set.
pred_output(X_test[0],W_learned)

array([356134.443255])

# RSS on the Test Set (Model 1)

In [13]:
# Residual sum of squares of model 1 over the whole test set.
np.sum(np.square(pred_output(X_test,W_learned) - Y_test))

275400044902128.3

# Now working with more than one feature (Model 2)

In [14]:
# Model 2: regress price on sqft_living and sqft_living15 (plus the intercept column).
X,Y = get_numpy_data(train_data, features=['sqft_living', 'sqft_living15'], output=['price'])
print(X.shape,Y.shape)
# Initial weights as a (1, 3) row vector: [intercept, sqft_living, sqft_living15].
W = np.array([[-100000., 1., 1.]])
print(W.shape)
learning_rate = 4e-12
tolerance = 1e9
W_learned_model_2 = grad(X, Y, W, learning_rate, tolerance)

(17384, 3) (17384, 1)
(1, 3)
5.339520188136633e+27  
5.14074949042657e+26  
4.985482015024819e+25  
5.178730370138792e+24  
8.630114778144385e+23  
4.307394451183367e+23  
3.728511162356119e+23  
3.517424298211411e+23  
3.34900549581351e+23  
3.191636242493768e+23  
3.041949240022408e+23  
2.8993101827573225e+23  
2.7633622281771433e+23  
2.6337890963363443e+23  
2.5102916299902257e+23  
2.3925849198037308e+23  
2.280397436956995e+23  
2.1734703865637986e+23  
2.071557108737322e+23  
1.974422509406782e+23  
1.881842517983211e+23  
1.7936035704705534e+23  
1.709502116814989e+23  
1.629344151354628e+23  
1.5529447652862312e+23  
1.480127720116658e+23  
1.4107250411150176e+23  
1.344576629827688e+23  
1.2815298947624057e+23  
1.2214393993894088e+23  
1.1641665266477397e+23  
1.109579159182666e+23  
1.0575513745767346e+23  
1.007963154871262e+23  
9.60700109708314e+22  
9.156532124544497e+22  
8.727185486975023e+22  
8.317970765363131e+22  
7.927943981103455e+22  
7.556205418422456e+22  
7

In [15]:
# Rebuild the test-set matrices with both model 2 features (this overwrites the
# model 1 X_test / Y_test), then predict the price of the first test row.
X_test,Y_test = get_numpy_data(test_data,features=['sqft_living','sqft_living15'],output=['price'])
pred_output(X_test[0],W_learned_model_2)

array([366651.41162949])

In [21]:
# Actual price of the first test row, for comparison with the two predictions.
Y_test[0]

array([310000.])

In [22]:
# Residual sum of squares of model 2 over the whole test set.
np.sum(np.square(pred_output(X_test,W_learned_model_2) - Y_test))

270263443629803.56

# Questions

9. Quiz Question: What is the value of the weight for sqft_living -- the second element of ‘simple_weights’ (rounded to 1 decimal place)?
#### ANS: 281.9
11. Quiz Question: What is the predicted price for the 1st house in the Test data set for model 1 (round to nearest dollar)?
#### ANS: 356134 (unrounded: 356134.443255)
15. Quiz Question: What is the predicted price for the 1st house in the TEST data set for model 2 (round to nearest dollar)?
#### ANS: 366651 (unrounded: 366651.41162949)
17. Quiz Question: Which estimate was closer to the true price for the 1st house on the TEST data set, model 1 or model 2?
#### ANS: model 1
19. Quiz Question: Which model (1 or 2) has lowest RSS on all of the TEST data?
#### ANS: model 2
