In [22]:
import pandas as pd
import numpy as np
import scipy
from sklearn import linear_model

In [3]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':str, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}
train_data = pd.read_csv('kc_house_train_data.csv', dtype=dtype_dict)
test_data = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)

In [33]:
train_data['bedrooms_squared'] = train_data['bedrooms'] * train_data['bedrooms']
test_data['bedrooms_squared'] = test_data['bedrooms'].apply(lambda x: x**2)
train_data['bed_bath_rooms'] = train_data['bedrooms'] * train_data['bathrooms']
test_data['bed_bath_rooms'] = test_data['bedrooms'] * test_data['bathrooms']
train_data['log_sqft_living'] = np.log(train_data['sqft_living'])
test_data['log_sqft_living'] = np.log(test_data['sqft_living'])
train_data['lat_plus_long'] = train_data['lat'] + train_data['long']
test_data['lat_plus_long'] = test_data['lat'] + test_data['long']

In [40]:
print "Mean bedrooms_squared : %.2f" % np.mean(test_data['bedrooms_squared'])
print "Mean Bed Bath rooms : %.2f" % np.mean(test_data['bed_bath_rooms'])
print "Mean log sqft living : %.2f" % np.mean(test_data['log_sqft_living'])
print "Mean lat plus logn : %.2f" % np.mean(test_data['lat_plus_long'])

Mean bedrooms_squared : 12.45
Mean Bed Bath rooms : 7.50
Mean log sqft living : 7.55
Mean lat plus logn : -74.65


In [29]:
def train_linear_model(train_data, test_data, features, label):
    X_train = train_data[features]
    y_train = train_data[label]
    X_test = test_data[features]
    y_test = test_data[label]
    regr = linear_model.LinearRegression()
    regr.fit(X_train, y_train)
    print('Intercept: \n', regr.intercept_)
    print('Coefficients: \n', regr.coef_)
    print("Residual sum of squares on TRAIN: %.2f"
          % np.mean((regr.predict(X_train) - y_train) ** 2))
    print("Residual sum of squares on TEST: %.2f"
          % np.mean((regr.predict(X_test) - y_test) ** 2))
    

In [30]:
label = 'price'
model1_feat = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
train_linear_model(train_data, test_data, model1_feat, label)

('Intercept: \n', -69075726.792569786)
('Coefficients: \n', array([  3.12258646e+02,  -5.95865332e+04,   1.57067421e+04,
         6.58619264e+05,  -3.09374351e+05]))
Residual sum of squares on TRAIN: 55676481997.79
Residual sum of squares on TEST: 53322409504.73


In [31]:
label = 'price'
model2_feat = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms']
train_linear_model(train_data, test_data, model2_feat, label)

('Intercept: \n', -66867968.871078826)
('Coefficients: \n', array([  3.06610053e+02,  -1.13446368e+05,  -7.14613083e+04,
         6.54844630e+05,  -2.94298969e+05,   2.55796520e+04]))
Residual sum of squares on TRAIN: 55132284576.28
Residual sum of squares on TEST: 52820397960.86


In [32]:
label = 'price'
model3_feat = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms', 'bed_bath_rooms', 'bedrooms_squared', 'log_sqft_living', 'lat_plus_long']
train_linear_model(train_data, test_data, model3_feat, label)

('Intercept: \n', -62036084.98609814)
('Coefficients: \n', array([  5.29422820e+02,   3.45142296e+04,   6.70607813e+04,
         5.34085611e+05,  -4.06750711e+05,  -4.28525220e+03,
        -4.28525220e+03,  -6.78858667e+03,  -5.61831484e+05,
         1.27334900e+05]))
Residual sum of squares on TRAIN: 51969423323.20
Residual sum of squares on TEST: 61299673494.25
