In [1]:
import pandas as pd
import numpy as np

In [2]:
# Column name -> Python type handed to pd.read_csv so every CSV is parsed the same way.
# Note that id, zipcode, date and floors are deliberately read as text (str).
dtype_dict = dict(
    bathrooms=float,
    waterfront=int,
    sqft_above=int,
    sqft_living15=float,
    grade=int,
    yr_renovated=int,
    price=float,
    bedrooms=float,
    zipcode=str,
    long=float,
    sqft_lot15=float,
    sqft_living=float,
    floors=str,
    condition=int,
    lat=float,
    date=str,
    sqft_basement=int,
    yr_built=int,
    id=str,
    sqft_lot=int,
    view=int,
)

In [3]:
# Read the full data set and the separate train/test CSVs, all with the dtypes from dtype_dict.
sales, train_data, test_data = [
    pd.read_csv(csv_name, dtype=dtype_dict)
    for csv_name in ('kc_house_data.csv', 'kc_house_train_data.csv', 'kc_house_test_data.csv')
]

In [4]:
# Show the first rows of the full data set as a quick check of the parsed columns.
sales.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


In [5]:
from math import log  # no longer used below; retained so `log` stays in scope for any other cell


def add_engineered_features(frame):
    """Add the four derived regression features to ``frame`` in place.

    The same transformation is applied to the training and the test data, so it
    lives in one function instead of being copy-pasted per data set.

    Columns written:
        bedrooms_squared -- bedrooms ** 2
        bed_bath_rooms   -- bedrooms * bathrooms
        log_sqft_living  -- natural log of sqft_living
        lat_plus_long    -- lat + long

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'bedrooms', 'bathrooms', 'sqft_living',
        'lat' and 'long'. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, for convenience.
    """
    # Vectorized column arithmetic replaces the per-row `.apply(lambda ...)` calls.
    frame['bedrooms_squared'] = frame['bedrooms'] ** 2
    frame['bed_bath_rooms'] = frame['bedrooms'] * frame['bathrooms']
    # np.log works element-wise on the whole column. Unlike math.log it yields
    # -inf / nan (with a RuntimeWarning) for non-positive input instead of raising.
    frame['log_sqft_living'] = np.log(frame['sqft_living'])
    frame['lat_plus_long'] = frame['lat'] + frame['long']
    return frame


# Assign the result back so the cell ends in a statement and displays no output.
train_data = add_engineered_features(train_data)
test_data = add_engineered_features(test_data)

In [6]:
# Print the test-set mean of each engineered feature, one value per line.
for engineered_column in ('bedrooms_squared', 'bed_bath_rooms', 'log_sqft_living', 'lat_plus_long'):
    print(test_data[engineered_column].mean())

12.4466777015843
7.5039016315913925
7.550274679645921
-74.65333355403185


In [7]:
# Training design matrices for three nested models: each column set extends the previous one.
base_columns = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
interaction_columns = base_columns + ['bed_bath_rooms']
extended_columns = interaction_columns + ['bedrooms_squared', 'log_sqft_living', 'lat_plus_long']

model_1_features = train_data[base_columns]
model_2_features = train_data[interaction_columns]
model_3_features = train_data[extended_columns]

In [8]:
from sklearn import linear_model

In [9]:
# Model 1: fit a linear regression of price on the first feature set and show its coefficients.
# `fit` returns the estimator itself, so construction and fitting can be chained.
regr = linear_model.LinearRegression().fit(model_1_features, train_data['price'])
print(regr.coef_)

[ 3.12258646e+02 -5.95865332e+04  1.57067421e+04  6.58619264e+05
 -3.09374351e+05]


In [10]:
# Model 2: fit a linear regression of price on the second feature set and show its coefficients.
# `fit` returns the estimator itself, so construction and fitting can be chained.
regr2 = linear_model.LinearRegression().fit(model_2_features, train_data['price'])
print(regr2.coef_)

[ 3.06610053e+02 -1.13446368e+05 -7.14613083e+04  6.54844630e+05
 -2.94298969e+05  2.55796520e+04]


In [11]:
# Model 3: fit a linear regression of price on the third feature set and show its coefficients.
# `fit` returns the estimator itself, so construction and fitting can be chained.
regr3 = linear_model.LinearRegression().fit(model_3_features, train_data['price'])
print(regr3.coef_)

[ 5.29422820e+02  3.45142296e+04  6.70607813e+04  5.34085611e+05
 -4.06750711e+05 -8.57050439e+03 -6.78858667e+03 -5.61831484e+05
  1.27334900e+05]


In [12]:
# Score each fitted model on the same training data it was fitted on (models 1, 2, 3 in order).
for fitted_model, training_features in (
    (regr, model_1_features),
    (regr2, model_2_features),
    (regr3, model_3_features),
):
    print("Model Performance: ", fitted_model.score(training_features, train_data['price']))

Model Performance:  0.5926022811353863
Model Performance:  0.5965842997575893
Model Performance:  0.6197276883725951


In [17]:
def get_residual_sum_of_squares(predicted, original):
    """Return the residual sum of squares (RSS) between predictions and observations.

    Both inputs are converted to float NumPy arrays before subtracting, which
    makes the comparison purely positional:

    * Two pandas Series with different index labels are no longer aligned by
      label (which produced NaN residuals that ``Series.sum()`` silently
      skipped, under-reporting the RSS).
    * Plain Python lists/tuples are accepted.
    * Integer inputs are promoted to float, so squaring large residuals
      cannot overflow a fixed-width integer.

    NaN values in either input propagate to the result instead of being dropped.

    Parameters
    ----------
    predicted : array-like
        Predicted values.
    original : array-like
        Observed values, in the same order as ``predicted``.

    Returns
    -------
    numpy.float64
        Sum over all elements of ``(original - predicted) ** 2``.

    Raises
    ------
    ValueError
        If the two inputs have shapes that NumPy cannot broadcast together.
    """
    predicted_values = np.asarray(predicted, dtype=float)
    original_values = np.asarray(original, dtype=float)
    residuals = original_values - predicted_values
    return (residuals * residuals).sum()

In [18]:
# Predict on the training features with each fitted model ...
output1 = regr.predict(model_1_features)
output2 = regr2.predict(model_2_features)
output3 = regr3.predict(model_3_features)

# ... then print the training RSS of models 1, 2 and 3, in that order.
for training_predictions in (output1, output2, output3):
    print(get_residual_sum_of_squares(training_predictions, train_data['price']))

967879963049546.4
958419635074069.2
903436455050479.0


In [19]:
# Build the same three column sets as used for training, this time from the test data.
shared_columns = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms']
model_1_test = test_data[shared_columns[:5]]
model_2_test = test_data[shared_columns]
model_3_test = test_data[shared_columns + ['bedrooms_squared', 'log_sqft_living', 'lat_plus_long']]

# Predict test prices with the models that were fitted on the training data.
output1_test = regr.predict(model_1_test)
output2_test = regr2.predict(model_2_test)
output3_test = regr3.predict(model_3_test)

# Print the test-set RSS of models 1, 2 and 3, in that order.
held_out_prices = test_data['price']
print(get_residual_sum_of_squares(output1_test, held_out_prices))
print(get_residual_sum_of_squares(output2_test, held_out_prices))
print(get_residual_sum_of_squares(output3_test, held_out_prices))

225500469795490.16
223377462976466.88
259236319207179.44
