#### Read Data

In [1]:
from math import log

import graphlab
import numpy as np
# Load the dataset stored at the relative path 'Data1/' into an SFrame.
sales = graphlab.SFrame('Data1/')

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1503817709.log


This non-commercial license of GraphLab Create for academic use is assigned to B140007@e.ntu.edu.sg and will expire on August 27, 2018.


#### Split train and test

In [4]:
# Randomly split the rows with fraction 0.8 going to the first frame; the fixed
# seed keeps the split identical across runs.
train_data, test_data = sales.random_split(0.8, seed=0)

#### Learn a multiple regression model

In [6]:
# Fit a linear regression of 'price' on three example features using the
# training data; validation_set=None is passed explicitly.
example_features = ['sqft_living', 'bedrooms', 'bathrooms']
example_model = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=example_features,
    validation_set=None
)

#### Make Predictions

In [8]:
# Predict with the example model on the training rows and show the first prediction.
example_predictions = example_model.predict(train_data)
print example_predictions[0]

271789.505878


#### Compute RSS

In [9]:
def get_residual_sum_of_squares(model, data, outcome):
    """Return the residual sum of squares (RSS) of `model` on `data`.

    Args:
        model: object exposing `predict(data)`, which returns one prediction
            per row of `data`.
        data: input passed unchanged to `model.predict`.
        outcome: observed target values, element-wise compatible with the
            predictions (must support `-`, `*` and `.sum()` on the result).

    Returns:
        The sum over all rows of (outcome - prediction) squared.
    """
    errors = outcome - model.predict(data)
    # Square by element-wise multiplication, then reduce to a single number.
    return (errors * errors).sum()

In [11]:
# RSS of the example model, computed with test_data as both the feature frame
# and the source of the observed prices.
# NOTE(review): the variable carries a `_train` suffix although the value is
# computed on test_data — confirm which split was intended, or rename.
rss_example_train = get_residual_sum_of_squares(example_model, test_data,test_data['price'])
print rss_example_train

2.7376153833e+14


#### Create new features

In [12]:
# Preview the element-wise product of the bedrooms and bathrooms columns.
train_data['bedrooms'] * train_data['bathrooms']

dtype: float
Rows: 17384
[3.0, 6.75, 2.0, 12.0, 6.0, 18.0, 6.75, 4.5, 3.0, 7.5, 7.5, 2.0, 5.25, 10.0, 6.0, 4.0, 2.0, 3.0, 7.0, 8.25, 12.5, 3.0, 6.75, 5.25, 3.0, 5.25, 10.0, 3.0, 7.0, 11.0, 7.5, 4.0, 8.0, 4.0, 10.0, 13.75, 3.0, 3.0, 7.5, 3.0, 10.0, 5.25, 7.5, 3.0, 12.5, 5.25, 3.5, 11.25, 10.0, 7.5, 16.25, 12.0, 6.75, 5.25, 5.25, 6.75, 8.25, 10.0, 10.0, 3.0, 11.25, 16.25, 6.0, 10.0, 16.0, 7.0, 3.0, 3.0, 10.0, 7.5, 2.0, 4.5, 3.0, 4.5, 11.25, 9.75, 4.5, 3.5, 13.75, 3.0, 4.5, 4.5, 10.0, 6.0, 5.25, 7.5, 7.5, 6.0, 6.75, 7.5, 3.0, 7.5, 3.0, 3.5, 10.0, 10.0, 5.25, 5.25, 5.25, 10.5, ... ]

In [15]:
# Add the same four engineered columns to both splits so that a model trained
# on train_data can be evaluated on test_data. A single loop replaces the
# previously duplicated train/test statements; `log` is imported with the
# other imports at the top of the notebook.
for frame in (train_data, test_data):
    # Square of the bedrooms column.
    frame['bedrooms_squared'] = frame['bedrooms'].apply(lambda x: x**2)
    # Interaction feature: bedrooms multiplied by bathrooms.
    frame['bed_bath_rooms'] = frame['bedrooms'] * frame['bathrooms']
    # Natural logarithm of the sqft_living column.
    frame['log_sqft_living'] = frame['sqft_living'].apply(lambda x: log(x))
    # Sum of the lat and long columns.
    frame['lat_plus_long'] = frame['lat'] + frame['long']

In [16]:
# Squaring a feature widens the gap between small and large values.
# Multiplying two features together creates an interaction feature.
# Taking the log of a feature narrows the gap between small and large values.

#### Learn Multiple Models

In [17]:
# Nested feature sets: each list extends the previous one.
model_1_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
model_2_features = model_1_features + ['bed_bath_rooms']
model_3_features = model_2_features + [
    'bedrooms_squared',
    'log_sqft_living',
    'lat_plus_long'
]

# Fit one linear regression of 'price' per feature set on the training data;
# validation_set=None is passed explicitly in every call.
model_1 = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=model_1_features,
    validation_set=None
)
model_2 = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=model_2_features,
    validation_set=None
)
model_3 = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=model_3_features,
    validation_set=None
)

In [18]:
print model_1.coefficients
print model_2.coefficients
print model_3.coefficients

+-------------+-------+----------------+---------------+
|     name    | index |     value      |     stderr    |
+-------------+-------+----------------+---------------+
| (intercept) |  None | -56140675.7444 | 1649985.42028 |
| sqft_living |  None | 310.263325778  | 3.18882960408 |
|   bedrooms  |  None | -59577.1160682 | 2487.27977322 |
|  bathrooms  |  None | 13811.8405418  | 3593.54213297 |
|     lat     |  None | 629865.789485  | 13120.7100323 |
|     long    |  None | -214790.285186 | 13284.2851607 |
+-------------+-------+----------------+---------------+
[6 rows x 4 columns]

+----------------+-------+----------------+---------------+
|      name      | index |     value      |     stderr    |
+----------------+-------+----------------+---------------+
|  (intercept)   |  None | -54410676.1152 | 1650405.16541 |
|  sqft_living   |  None | 304.449298057  | 3.20217535637 |
|    bedrooms    |  None | -116366.043231 | 4805.54966546 |
|   bathrooms    |  None | -77972.3305135 | 7565

In [20]:
print get_residual_sum_of_squares(model_1, train_data,train_data['price'])
print get_residual_sum_of_squares(model_2, train_data,train_data['price'])
print get_residual_sum_of_squares(model_3, train_data,train_data['price'])

9.71328233544e+14
9.61592067856e+14
9.05276314555e+14


In [21]:
print get_residual_sum_of_squares(model_1, test_data,test_data['price'])
print get_residual_sum_of_squares(model_2, test_data,test_data['price'])
print get_residual_sum_of_squares(model_3, test_data,test_data['price'])

2.26568089093e+14
2.24368799994e+14
2.51829318952e+14
