In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model as lm
import matplotlib.pyplot as plt


In [2]:
# Explicit column dtypes, shared by every CSV read below so all three frames
# are parsed with the same schema.
dtype_dict = {
    'bathrooms': float, 'waterfront': int, 'sqft_above': int,
    'sqft_living15': float, 'grade': int, 'yr_renovated': int,
    'price': float, 'bedrooms': float, 'zipcode': str, 'long': float,
    'sqft_lot15': float, 'sqft_living': float, 'floors': str,
    'condition': int, 'lat': float, 'date': str, 'sqft_basement': int,
    'yr_built': int, 'id': str, 'sqft_lot': int, 'view': int,
}

# Read the full data set, then the separate train and test files, in that order.
sales, train_data, test_data = (
    pd.read_csv(csv_name, dtype=dtype_dict)
    for csv_name in (
        "kc_house_data.csv",
        "kc_house_train_data.csv",
        "kc_house_test_data.csv",
    )
)

In [3]:
# Show the first rows of the full sales table as a quick check of the load.
sales.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


In [4]:
# Baseline model: regress price on three basic features of the training set.
example_features = train_data[['sqft_living', 'bedrooms', 'bathrooms']]
example_model = lm.LinearRegression().fit(example_features, train_data[['price']])
# fit() hands back the estimator, so ending on the name shows the same repr.
example_model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [5]:
# Report the learned parameters of the baseline model, intercept first.
for label, value in (
    ("regression intercept", example_model.intercept_),
    ("regression coefficients", example_model.coef_),
):
    print(label, value)

regression intercept [87912.86581496]
regression coefficients [[   315.40669062 -65081.88711588   6942.16598637]]


In [6]:
# Predicted prices for the training rows; example_features is the same
# three-column slice of train_data the model was fitted on.
print(example_model.predict(example_features))

[[271789.26537997]
 [718882.27281845]
 [207554.4093435 ]
 ...
 [392594.85607494]
 [414673.32441803]
 [284670.54050078]]


In [7]:
#Compute SSR
def compute_SSR(model, X, y):
    """Return the sum of squared residuals of ``model`` on ``(X, y)``.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X)``.
    X : array-like or DataFrame
        Features, passed unchanged to ``model.predict``.
    y : array-like, Series or DataFrame
        Observed targets.

    Returns
    -------
    The result of ``((y - predictions) ** 2).sum()``. Its type follows ``y``:
    a DataFrame ``y`` yields a per-column Series, while a 1-D ``y`` yields a
    scalar.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of targets.
    """
    # Force the predictions into exactly y's shape before subtracting.
    # Otherwise a 1-D y combined with (n, 1) predictions (or the reverse)
    # broadcasts to an (n, n) matrix and silently inflates the result.
    y_predict = np.asarray(model.predict(X)).reshape(np.shape(y))
    residuals = y - y_predict
    return (residuals * residuals).sum()

In [8]:
# Sum of squared residuals of the baseline three-feature model on the test data.
print(compute_SSR(example_model, test_data[['sqft_living', 'bedrooms', 'bathrooms']], test_data[['price']]))

price    2.737619e+14
dtype: float64


In [9]:
#Create some new features
from math import log

train_data['bedrooms_squared'] = train_data['bedrooms'].apply(lambda x: x**2)
test_data['bedrooms_squared'] = test_data['bedrooms'].apply(lambda x: x**2)

train_data['bed_bath_rooms'] = train_data['bedrooms']*train_data['bathrooms']
test_data['bed_bath_rooms'] = test_data['bedrooms']*test_data['bathrooms']

train_data['log_sqft_living'] = train_data['sqft_living'].apply(lambda x: log(x))
test_data['log_sqft_living'] = test_data['sqft_living'].apply(lambda x: log(x))

train_data['lat_plus_long'] = train_data['lat']+train_data['long']
test_data['lat_plus_long'] = test_data['lat']+test_data['long']

train_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bedrooms_squared,bed_bath_rooms,log_sqft_living,lat_plus_long
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,0,98178,47.5112,-122.257,1340.0,5650.0,9.0,3.0,7.07327,-74.7458
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2,0,0,...,1991,98125,47.721,-122.319,1690.0,7639.0,9.0,6.75,7.851661,-74.598
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1,0,0,...,0,98028,47.7379,-122.233,2720.0,8062.0,4.0,2.0,6.646391,-74.4951
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1,0,0,...,0,98136,47.5208,-122.393,1360.0,5000.0,16.0,12.0,7.5807,-74.8722
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1,0,0,...,0,98074,47.6168,-122.045,1800.0,7503.0,9.0,6.0,7.426549,-74.4282


**Quiz Question: What is the mean (arithmetic average) value of your 4 new features on TEST data? (round to 2 digits)**

In [10]:
# Arithmetic mean of each of the four engineered columns on the test data.
print(test_data[['bedrooms_squared', 'bed_bath_rooms', 'log_sqft_living', 'lat_plus_long']].mean())

bedrooms_squared    12.446678
bed_bath_rooms       7.503902
log_sqft_living      7.550275
lat_plus_long      -74.653334
dtype: float64


# Learning Multiple Models

* Model 1: square feet, # bedrooms, # bathrooms, latitude & longitude
* Model 2: add bedrooms\*bathrooms
* Model 3: add log square feet, bedrooms squared, and the (nonsensical) latitude + longitude

In [11]:
# Nested feature sets: each model's inputs extend those of the previous model.
model_1_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
model_2_features = [*model_1_features, 'bed_bath_rooms']
model_3_features = [
    *model_2_features,
    'log_sqft_living',
    'bedrooms_squared',
    'lat_plus_long',
]

In [12]:
# Model 1: linear regression of price on the model_1_features columns.
model_1 = lm.LinearRegression().fit(train_data[model_1_features], train_data[['price']])

for label, value in (
    ("regression intercept", model_1.intercept_),
    ("regression coefficients", model_1.coef_),
):
    print(label, value)

regression intercept [-69075726.79256983]
regression coefficients [[ 3.12258646e+02 -5.95865332e+04  1.57067421e+04  6.58619264e+05
  -3.09374351e+05]]


In [13]:
# Model 2: linear regression of price on the model_2_features columns.
model_2 = lm.LinearRegression().fit(train_data[model_2_features], train_data[['price']])

for label, value in (
    ("regression intercept", model_2.intercept_),
    ("regression coefficients", model_2.coef_),
):
    print(label, value)

regression intercept [-66867968.87107886]
regression coefficients [[ 3.06610053e+02 -1.13446368e+05 -7.14613083e+04  6.54844630e+05
  -2.94298969e+05  2.55796520e+04]]


In [14]:
# Model 3: linear regression of price on the model_3_features columns.
model_3 = lm.LinearRegression().fit(train_data[model_3_features], train_data[['price']])

for label, value in (
    ("regression intercept", model_3.intercept_),
    ("regression coefficients", model_3.coef_),
):
    print(label, value)

regression intercept [-62036084.98609828]
regression coefficients [[ 5.29422820e+02  3.45142296e+04  6.70607813e+04  5.34085611e+05
  -4.06750711e+05 -8.57050439e+03 -5.61831484e+05 -6.78858667e+03
   1.27334900e+05]]


# Comparing multiple models

Now that you've learned three models and extracted the model weights, we want to evaluate which model is best.

In [16]:
# Sum of squared residuals on the training data, printed for models 1, 2, 3 in order.
for fitted_model, feature_names in (
    (model_1, model_1_features),
    (model_2, model_2_features),
    (model_3, model_3_features),
):
    print(compute_SSR(fitted_model, train_data[feature_names], train_data[['price']]))

price    9.678800e+14
dtype: float64
price    9.584196e+14
dtype: float64
price    9.034365e+14
dtype: float64


In [17]:
# Sum of squared residuals on the test data, printed for models 1, 2, 3 in order.
for fitted_model, feature_names in (
    (model_1, model_1_features),
    (model_2, model_2_features),
    (model_3, model_3_features),
):
    print(compute_SSR(fitted_model, test_data[feature_names], test_data[['price']]))

price    2.255005e+14
dtype: float64
price    2.233775e+14
dtype: float64
price    2.592363e+14
dtype: float64
