# Importing Libraries

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [16]:
# Read the complete dataset from disk and preview its first rows.
house_data = pd.read_csv('kc_house_data.csv')
house_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [17]:
# Report the (rows, columns) shape of the full dataset.
print(house_data.shape)

(21613, 21)


# Train-Test Split

In [22]:
# Column dtypes to enforce when parsing the pre-split CSV files.
# The mapping is defined in this cell because read_csv below needs it and, in
# top-to-bottom order, no earlier cell creates it; without this the cell fails
# with NameError on a fresh kernel. A later cell assigns the same mapping to
# the same name, which is harmless.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': str,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

# Load the ready-made train and test splits with the dtypes above, then
# print both shapes as a quick sanity check.
train_data = pd.read_csv('kc_house_train_data.csv', dtype=dtype_dict)
test_data = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)
print(train_data.shape, test_data.shape)

(17384, 21) (4229, 21)


In [18]:
# Show each column's dtype and non-null count as parsed by read_csv.
house_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
id               21613 non-null int64
date             21613 non-null object
price            21613 non-null float64
bedrooms         21613 non-null int64
bathrooms        21613 non-null float64
sqft_living      21613 non-null int64
sqft_lot         21613 non-null int64
floors           21613 non-null float64
waterfront       21613 non-null int64
view             21613 non-null int64
condition        21613 non-null int64
grade            21613 non-null int64
sqft_above       21613 non-null int64
sqft_basement    21613 non-null int64
yr_built         21613 non-null int64
yr_renovated     21613 non-null int64
zipcode          21613 non-null int64
lat              21613 non-null float64
long             21613 non-null float64
sqft_living15    21613 non-null int64
sqft_lot15       21613 non-null int64
dtypes: float64(5), int64(15), object(1)
memory usage: 3.5+ MB


In [19]:
# Target dtype for every column of the dataset, one entry per line.
# Identifier-like fields (id, zipcode, date) and floors are kept as strings.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': str,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [20]:
# Cast the columns to the dtypes listed in dtype_dict; house_data is rebound
# to the converted frame.
house_data = house_data.astype(dtype_dict)

In [21]:
# Inspect the dtypes again to confirm the cast took effect.
house_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
id               21613 non-null object
date             21613 non-null object
price            21613 non-null float64
bedrooms         21613 non-null float64
bathrooms        21613 non-null float64
sqft_living      21613 non-null float64
sqft_lot         21613 non-null int64
floors           21613 non-null object
waterfront       21613 non-null int64
view             21613 non-null int64
condition        21613 non-null int64
grade            21613 non-null int64
sqft_above       21613 non-null int64
sqft_basement    21613 non-null int64
yr_built         21613 non-null int64
yr_renovated     21613 non-null int64
zipcode          21613 non-null object
lat              21613 non-null float64
long             21613 non-null float64
sqft_living15    21613 non-null float64
sqft_lot15       21613 non-null float64
dtypes: float64(8), int64(9), object(4)
memory usage: 3.5+ MB


# Feature Engineering

In [23]:
# Add the four derived columns to the training frame:
#   bedrooms_squared = bedrooms * bedrooms
#   bed_bath_rooms   = bedrooms * bathrooms   (interaction term)
#   log_sqft_living  = natural log of sqft_living
#   lat_plus_long    = lat + long
train_data['bedrooms_squared'] = train_data['bedrooms'].mul(train_data['bedrooms'])
train_data['bed_bath_rooms'] = train_data['bedrooms'].mul(train_data['bathrooms'])
train_data['log_sqft_living'] = np.log(train_data['sqft_living'])
train_data['lat_plus_long'] = train_data['lat'].add(train_data['long'])

In [24]:
# Preview the training frame, now including the derived columns.
train_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bedrooms_squared,bed_bath_rooms,log_sqft_living,lat_plus_long
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,0,98178,47.5112,-122.257,1340.0,5650.0,9.0,3.0,7.07327,-74.7458
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2,0,0,...,1991,98125,47.721,-122.319,1690.0,7639.0,9.0,6.75,7.851661,-74.598
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1,0,0,...,0,98028,47.7379,-122.233,2720.0,8062.0,4.0,2.0,6.646391,-74.4951
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1,0,0,...,0,98136,47.5208,-122.393,1360.0,5000.0,16.0,12.0,7.5807,-74.8722
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1,0,0,...,0,98074,47.6168,-122.045,1800.0,7503.0,9.0,6.0,7.426549,-74.4282


In [25]:
# Add the same four derived columns to the test frame, so it has every
# feature column the models are later evaluated on:
#   bedrooms_squared = bedrooms * bedrooms
#   bed_bath_rooms   = bedrooms * bathrooms   (interaction term)
#   log_sqft_living  = natural log of sqft_living
#   lat_plus_long    = lat + long
test_data['bedrooms_squared'] = test_data['bedrooms'].mul(test_data['bedrooms'])
test_data['bed_bath_rooms'] = test_data['bedrooms'].mul(test_data['bathrooms'])
test_data['log_sqft_living'] = np.log(test_data['sqft_living'])
test_data['lat_plus_long'] = test_data['lat'].add(test_data['long'])

In [26]:
# Preview the test frame, now including the derived columns.
test_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bedrooms_squared,bed_bath_rooms,log_sqft_living,lat_plus_long
0,114101516,20140528T000000,310000.0,3.0,1.0,1430.0,19901,1.5,0,0,...,0,98028,47.7558,-122.229,1780.0,12697.0,9.0,3.0,7.26543,-74.4732
1,9297300055,20150124T000000,650000.0,4.0,3.0,2950.0,5000,2.0,0,3,...,0,98126,47.5714,-122.375,2140.0,4000.0,16.0,12.0,7.98956,-74.8036
2,1202000200,20141103T000000,233000.0,3.0,2.0,1710.0,4697,1.5,0,0,...,0,98002,47.3048,-122.218,1030.0,4705.0,9.0,6.0,7.444249,-74.9132
3,8562750320,20141110T000000,580500.0,3.0,2.5,2320.0,3980,2.0,0,0,...,0,98027,47.5391,-122.07,2580.0,3980.0,9.0,7.5,7.749322,-74.5309
4,7589200193,20141110T000000,535000.0,3.0,1.0,1090.0,3000,1.5,0,0,...,0,98117,47.6889,-122.375,1570.0,5080.0,9.0,3.0,6.993933,-74.6861


In [27]:
# Print the mean of each derived column on the test split, rounded to
# two decimals, in the same order the columns were created.
for derived_col in ('bedrooms_squared', 'bed_bath_rooms', 'log_sqft_living', 'lat_plus_long'):
    print(np.round(np.mean(test_data[derived_col]), 2))

12.45
7.5
7.55
-74.65


# Comparing performance for 3 different models

### Model 1 of features: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’, and ‘long’

In [32]:
# Training design matrix and target for Model 1.
model_1_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
X1 = np.array(train_data[model_1_features])
# Target as an (n, 1) column vector.
Y1 = np.array(train_data['price'])[:, np.newaxis]
print(X1.shape, Y1.shape)

(17384, 5) (17384, 1)


### Model 2 of features: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’, ‘long’, and ‘bed_bath_rooms’

In [34]:
# Training design matrix and target for Model 2 (Model 1 features plus the
# bedrooms x bathrooms interaction).
model_2_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms']
X2 = np.array(train_data[model_2_features])
# Target as an (n, 1) column vector.
Y2 = np.array(train_data['price'])[:, np.newaxis]
print(X2.shape, Y2.shape)

(17384, 6) (17384, 1)


### Model 3 of features: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’, ‘long’, ‘bed_bath_rooms’, ‘bedrooms_squared’, ‘log_sqft_living’, and ‘lat_plus_long’

In [35]:
# Training design matrix and target for Model 3 (Model 2 features plus the
# remaining three derived columns).
model_3_features = [
    'sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long',
    'bed_bath_rooms', 'bedrooms_squared', 'log_sqft_living', 'lat_plus_long',
]
X3 = np.array(train_data[model_3_features])
# Target as an (n, 1) column vector.
Y3 = np.array(train_data['price'])[:, np.newaxis]
print(X3.shape, Y3.shape)

(17384, 9) (17384, 1)


## Fitting all models

In [36]:
# Fit one LinearRegression per feature set. fit() returns the fitted
# estimator, so the three results unpack straight into the model names.
model_1, model_2, model_3 = (
    LinearRegression().fit(features, target)
    for features, target in ((X1, Y1), (X2, Y2), (X3, Y3))
)

In [38]:
# Print the learned coefficient vector of each model; entries follow the
# column order of the matrix that model was fitted on.
for fitted_model in (model_1, model_2, model_3):
    print(fitted_model.coef_)

[[ 3.12258646e+02 -5.95865332e+04  1.57067421e+04  6.58619264e+05
  -3.09374351e+05]]
[[ 3.06610053e+02 -1.13446368e+05 -7.14613083e+04  6.54844630e+05
  -2.94298969e+05  2.55796520e+04]]
[[ 5.29422820e+02  3.45142296e+04  6.70607813e+04  5.34085611e+05
  -4.06750711e+05 -8.57050439e+03 -6.78858667e+03 -5.61831484e+05
   1.27334900e+05]]


### Computing RSS for 3 models on train data

In [41]:
# Residual sum of squares of each model on the training split
# (X1..X3 / Y1..Y3 hold the training arrays at this point).
rss_1, rss_2, rss_3 = (
    np.sum(np.square(target - model.predict(features)))
    for model, target, features in (
        (model_1, Y1, X1),
        (model_2, Y2, X2),
        (model_3, Y3, X3),
    )
)
print(f'The RSS for model 1: {rss_1}\nThe RSS for model 2: {rss_2}\nThe RSS for model 3: {rss_3}\n')

The RSS for model 1: 967879963049545.8
The RSS for model 2: 958419635074070.0
The RSS for model 3: 903436455050479.2



### Computing RSS for 3 models on test data

In [42]:
# Test design matrix and target for Model 1.
# NOTE: this rebinds X1/Y1 from the training arrays to the test arrays; the
# RSS cell further down relies on that.
model_1_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
X1 = np.array(test_data[model_1_features])
# Target as an (n, 1) column vector.
Y1 = np.array(test_data['price'])[:, np.newaxis]
print(X1.shape, Y1.shape)

(4229, 5) (4229, 1)


In [43]:
# Test design matrix and target for Model 2.
# NOTE: this rebinds X2/Y2 from the training arrays to the test arrays.
model_2_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms']
X2 = np.array(test_data[model_2_features])
# Target as an (n, 1) column vector.
Y2 = np.array(test_data['price'])[:, np.newaxis]
print(X2.shape, Y2.shape)

(4229, 6) (4229, 1)


In [44]:
# Test design matrix and target for Model 3.
# NOTE: this rebinds X3/Y3 from the training arrays to the test arrays.
model_3_features = [
    'sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long',
    'bed_bath_rooms', 'bedrooms_squared', 'log_sqft_living', 'lat_plus_long',
]
X3 = np.array(test_data[model_3_features])
# Target as an (n, 1) column vector.
Y3 = np.array(test_data['price'])[:, np.newaxis]
print(X3.shape, Y3.shape)

(4229, 9) (4229, 1)


In [45]:
# Residual sum of squares of each model on the test split. X1..X3 / Y1..Y3
# were rebound to the test arrays in the cells just above, and rss_1..rss_3
# overwrite the training values computed earlier.
rss_1, rss_2, rss_3 = (
    np.sum(np.square(target - model.predict(features)))
    for model, target, features in (
        (model_1, Y1, X1),
        (model_2, Y2, X2),
        (model_3, Y3, X3),
    )
)
print(f'The RSS for model 1: {rss_1}\nThe RSS for model 2: {rss_2}\nThe RSS for model 3: {rss_3}\n')

The RSS for model 1: 225500469795490.34
The RSS for model 2: 223377462976467.5
The RSS for model 3: 259236319207179.94



# Questions 

4. Quiz Question: What are the mean (arithmetic average) values of your 4 new variables on TEST data? (round to 2 digits)
##### ANS: 12.45, 7.5, 7.55, -74.65

6. Quiz Question: What is the sign (positive or negative) for the coefficient/weight for ‘bathrooms’ in Model 1?
##### ANS: positive

7. Quiz Question: What is the sign (positive or negative) for the coefficient/weight for ‘bathrooms’ in Model 2?
##### ANS: negative

8. Is the sign for the coefficient the same in both models? Think about why this might be the case.
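##### ANS: No. Model 2 adds the interaction term bed_bath_rooms (coefficient ≈ +2.56e4), so the ‘bathrooms’ coefficient alone (≈ −7.15e4) is only the effect of a bathroom when bedrooms = 0. For a house with b bedrooms the net effect of one more bathroom is about −7.15e4 + 2.56e4·b, which is positive for b ≥ 3.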

9. Now using your three estimated models compute the RSS (Residual Sum of Squares) on the Training data.

10. Quiz Question: Which model (1, 2 or 3) had the lowest RSS on TRAINING data?
##### ANS: Model 3
11. Now using your three estimated models compute the RSS on the Testing data

12. Quiz Question: Which model (1, 2, or 3) had the lowest RSS on TESTING data?
##### ANS: Model 2

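13. Did you get the same answer for 10 and 12? Think about why this might be the case.
##### ANS: No. Model 3 has the lowest RSS on the training data, but Model 2 has the lowest RSS on the test data. Adding features to a least-squares model can never increase the training RSS, so the largest model always looks best on the data it was fitted to. On unseen data the extra features in Model 3 overfit and it generalizes worse than Model 2. One of them, lat_plus_long, is an exact linear combination of lat and long, so it adds no new information for predicting price.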