# Working with Multiple Regression


## Set Up


In [5]:
import graphlab
import math
# Path to the saved house-sales SFrame, relative to this notebook.
KC_HOUSE_DATA_PATH = './kc_house_data.gl/'

# Direct GraphLab Canvas output to the 'ipynb' target.
graphlab.canvas.set_target('ipynb')

# Load the sales table and preview its first rows as the cell output.
sales = graphlab.SFrame(KC_HOUSE_DATA_PATH)
sales.head()

id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront
7129300520,2014-10-13 00:00:00+00:00,221900.0,3.0,1.0,1180.0,5650,1,0
6414100192,2014-12-09 00:00:00+00:00,538000.0,3.0,2.25,2570.0,7242,2,0
5631500400,2015-02-25 00:00:00+00:00,180000.0,2.0,1.0,770.0,10000,1,0
2487200875,2014-12-09 00:00:00+00:00,604000.0,4.0,3.0,1960.0,5000,1,0
1954400510,2015-02-18 00:00:00+00:00,510000.0,3.0,2.0,1680.0,8080,1,0
7237550310,2014-05-12 00:00:00+00:00,1225000.0,4.0,4.5,5420.0,101930,1,0
1321400060,2014-06-27 00:00:00+00:00,257500.0,3.0,2.25,1715.0,6819,2,0
2008000270,2015-01-15 00:00:00+00:00,291850.0,3.0,1.5,1060.0,9711,1,0
2414600126,2015-04-15 00:00:00+00:00,229500.0,3.0,1.0,1780.0,7470,1,0
3793500160,2015-03-12 00:00:00+00:00,323000.0,3.0,2.5,1890.0,6560,2,0

view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat
0,3,7,1180,0,1955,0,98178,47.51123398
0,3,7,2170,400,1951,1991,98125,47.72102274
0,3,6,770,0,1933,0,98028,47.73792661
0,5,7,1050,910,1965,0,98136,47.52082
0,3,8,1680,0,1987,0,98074,47.61681228
0,3,11,3890,1530,2001,0,98053,47.65611835
0,3,7,1715,0,1995,0,98003,47.30972002
0,3,7,1060,0,1963,0,98198,47.40949984
0,3,7,1050,730,1960,0,98146,47.51229381
0,3,7,1890,0,2003,0,98038,47.36840673

long,sqft_living15,sqft_lot15
-122.25677536,1340.0,5650.0
-122.3188624,1690.0,7639.0
-122.23319601,2720.0,8062.0
-122.39318505,1360.0,5000.0
-122.04490059,1800.0,7503.0
-122.00528655,4760.0,101930.0
-122.32704857,2238.0,6819.0
-122.31457273,1650.0,9711.0
-122.33659507,1780.0,8113.0
-122.0308176,2390.0,7570.0


## Adding Columns
The assignment asks for 4 new variables in both train_data and test_data, so add them to `sales` directly and compute the train/test split afterwards.

In [6]:
def _natural_log(value):
    """Return the base-e logarithm of ``value`` after coercing it to float."""
    return math.log(float(value))


# The derived columns are written onto `sales` itself so that any split taken
# afterwards carries them in both halves.
bedrooms = sales['bedrooms']
sales['bedrooms_squared'] = bedrooms ** 2
sales['bed_bath_rooms'] = bedrooms * sales['bathrooms']  # bedrooms x bathrooms interaction
sales['log_sqft_living'] = sales['sqft_living'].apply(_natural_log)
sales['lat_plus_long'] = sales['lat'] + sales['long']

In [7]:
# Preview the table again to confirm that the four derived columns
# (bedrooms_squared, bed_bath_rooms, log_sqft_living, lat_plus_long) are present.
sales.head()

id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront
7129300520,2014-10-13 00:00:00+00:00,221900.0,3.0,1.0,1180.0,5650,1,0
6414100192,2014-12-09 00:00:00+00:00,538000.0,3.0,2.25,2570.0,7242,2,0
5631500400,2015-02-25 00:00:00+00:00,180000.0,2.0,1.0,770.0,10000,1,0
2487200875,2014-12-09 00:00:00+00:00,604000.0,4.0,3.0,1960.0,5000,1,0
1954400510,2015-02-18 00:00:00+00:00,510000.0,3.0,2.0,1680.0,8080,1,0
7237550310,2014-05-12 00:00:00+00:00,1225000.0,4.0,4.5,5420.0,101930,1,0
1321400060,2014-06-27 00:00:00+00:00,257500.0,3.0,2.25,1715.0,6819,2,0
2008000270,2015-01-15 00:00:00+00:00,291850.0,3.0,1.5,1060.0,9711,1,0
2414600126,2015-04-15 00:00:00+00:00,229500.0,3.0,1.0,1780.0,7470,1,0
3793500160,2015-03-12 00:00:00+00:00,323000.0,3.0,2.5,1890.0,6560,2,0

view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat
0,3,7,1180,0,1955,0,98178,47.51123398
0,3,7,2170,400,1951,1991,98125,47.72102274
0,3,6,770,0,1933,0,98028,47.73792661
0,5,7,1050,910,1965,0,98136,47.52082
0,3,8,1680,0,1987,0,98074,47.61681228
0,3,11,3890,1530,2001,0,98053,47.65611835
0,3,7,1715,0,1995,0,98003,47.30972002
0,3,7,1060,0,1963,0,98198,47.40949984
0,3,7,1050,730,1960,0,98146,47.51229381
0,3,7,1890,0,2003,0,98038,47.36840673

long,sqft_living15,sqft_lot15,bedrooms_squared,bed_bath_rooms,log_sqft_living,lat_plus_long
-122.25677536,1340.0,5650.0,9.0,3.0,7.07326971746,-74.74554138
-122.3188624,1690.0,7639.0,9.0,6.75,7.85166117789,-74.59783966
-122.23319601,2720.0,8062.0,4.0,2.0,6.64639051485,-74.4952694
-122.39318505,1360.0,5000.0,16.0,12.0,7.58069975222,-74.87236505
-122.04490059,1800.0,7503.0,9.0,6.0,7.4265490724,-74.42808831
-122.00528655,4760.0,101930.0,16.0,18.0,8.59785109443,-74.3491682
-122.32704857,2238.0,6819.0,9.0,6.75,7.4471683596,-75.01732855
-122.31457273,1650.0,9711.0,9.0,4.5,6.96602418711,-74.90507289
-122.33659507,1780.0,8113.0,9.0,3.0,7.48436864329,-74.82430126
-122.0308176,2390.0,7570.0,9.0,7.5,7.54433210805,-74.66241087


## Question 1
What are the mean (arithmetic average) values of the 4 new variables on the TEST data?
- 'bedrooms_squared' = 12.45
- 'bed_bath_rooms' = 7.50
- 'log_sqft_living' = 7.55 (natural log, base e)
- 'lat_plus_long' = -74.65

In [8]:
# Split train/test.
# The fraction and seed are named so the split is easy to find and reproduce.
TRAIN_FRACTION = .8
SPLIT_SEED = 0
train_data, test_data = sales.random_split(TRAIN_FRACTION, seed=SPLIT_SEED)

In [9]:
# Mean of each derived column on the test split, printed one per line in the
# same order as the Question 1 answers.
for column in ('bedrooms_squared', 'bed_bath_rooms',
               'log_sqft_living', 'lat_plus_long'):
    print(test_data[column].mean())

12.4466777016
7.50390163159
7.55027467965
-74.6533349722


In [10]:
# Three nested feature sets: each model's features extend the previous one's.
basic_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
mid_features = basic_features + ['bed_bath_rooms']
full_features = mid_features + ['bedrooms_squared', 'log_sqft_living',
                                'lat_plus_long']


def _fit_price_model(feature_names):
    """Fit a linear regression of 'price' on ``feature_names`` using train_data.

    validation_set=None is passed explicitly.
    NOTE(review): presumably this stops GraphLab from holding out part of
    train_data for validation -- confirm against the GraphLab docs.
    """
    return graphlab.linear_regression.create(train_data,
                                             target='price',
                                             validation_set=None,
                                             features=feature_names)


basic_model = _fit_price_model(basic_features)
mid_model = _fit_price_model(mid_features)
full_model = _fit_price_model(full_features)

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 17384
PROGRESS: Number of features          : 5
PROGRESS: Number of unpacked features : 5
PROGRESS: Number of coefficients    : 6
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Training-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | 1         | 2        | 1.051609     | 4074878.213096     | 236378.596455 |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: SUCCESS: Optimal solution found.
PROGRESS:
PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 17384
PROG

## Coefficients
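Sign of the coefficient for 'bathrooms' in basic_model: positive (+)

Sign of the coefficient for 'bathrooms' in mid_model: negative (-)

They are NOT the same! mid_model also contains the interaction feature 'bed_bath_rooms', so its 'bathrooms' coefficient no longer measures the effect of bathrooms on its own.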

In [11]:
# Display the fitted coefficient table for basic_model (name, index, value, stderr);
# the 'bathrooms' row is the one used for the sign comparison.
basic_model['coefficients']

name,index,value,stderr
(intercept),,-56140675.7444,1649985.42028
sqft_living,,310.263325778,3.18882960408
bedrooms,,-59577.1160682,2487.27977322
bathrooms,,13811.8405418,3593.54213297
lat,,629865.789485,13120.7100323
long,,-214790.285186,13284.2851607


In [12]:
# Display the fitted coefficient table for mid_model, which additionally
# includes the 'bed_bath_rooms' feature; compare its 'bathrooms' row with basic_model's.
mid_model['coefficients']

name,index,value,stderr
(intercept),,-54410676.1152,1650405.16541
sqft_living,,304.449298057,3.20217535637
bedrooms,,-116366.043231,4805.54966546
bathrooms,,-77972.3305135,7565.05991091
lat,,625433.834953,13058.3530972
long,,-203958.60296,13268.1283711
bed_bath_rooms,,26961.6249092,1956.36561555


## Observing Error
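Which model has the lowest RSS on TRAINING data: FULL (#3)

Which model has the lowest RSS on TESTING data: MID (#2)

Note: the cells below print RMSE rather than RSS. On a fixed dataset RSS = n * RMSE^2, so the model with the lowest RMSE is also the model with the lowest RSS.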

In [13]:
# Training-set RMSE for each model, printed in the order basic, mid, full.
# For a fixed dataset, ranking by RMSE is the same as ranking by RSS.
for model in (basic_model, mid_model, full_model):
    print(model.evaluate(train_data)['rmse'])

236378.596455
235190.935428
228200.043155


In [14]:
# Test-set RMSE for each model, printed in the order basic, mid, full.
# For a fixed dataset, ranking by RMSE is the same as ranking by RSS.
for model in (basic_model, mid_model, full_model):
    print(model.evaluate(test_data)['rmse'])

231462.4407
230336.301982
244024.989587


In [16]:
# Select the model feature columns from the full table, then convert them to
# a NumPy array (displayed as the cell output).
feature_frame = sales[full_features]
feature_frame.to_numpy()

array([[  1.18000000e+03,   3.00000000e+00,   1.00000000e+00, ...,
          9.00000000e+00,   7.07326972e+00,  -7.47455414e+01],
       [  2.57000000e+03,   3.00000000e+00,   2.25000000e+00, ...,
          9.00000000e+00,   7.85166118e+00,  -7.45978397e+01],
       [  7.70000000e+02,   2.00000000e+00,   1.00000000e+00, ...,
          4.00000000e+00,   6.64639051e+00,  -7.44952694e+01],
       ..., 
       [  1.02000000e+03,   2.00000000e+00,   7.50000000e-01, ...,
          4.00000000e+00,   6.92755791e+00,  -7.47042963e+01],
       [  1.60000000e+03,   3.00000000e+00,   2.50000000e+00, ...,
          9.00000000e+00,   7.37775891e+00,  -7.45345885e+01],
       [  1.02000000e+03,   2.00000000e+00,   7.50000000e-01, ...,
          4.00000000e+00,   6.92755791e+00,  -7.47045755e+01]])