# Compare Multiple Regression Algorithms


## Step 1 - Load Data

In [1]:
import os
import urllib.request

# Source of the house-sales CSV (an alternate URL is kept commented out below).
data_url = 'https://elephantscale-public.s3.amazonaws.com/data/house-prices/house-sales-full.csv'
#data_url = 'https://raw.githubusercontent.com/elephantscale/datasets/master/house-prices/house-sales-full.csv'

# The local copy lives in the working directory under the file name taken from the URL.
data_location = os.path.basename(data_url)

if not os.path.exists(data_location):
    print("Downloading : ", data_url)
    # Download under a temporary name and rename only on success, so an interrupted
    # transfer never leaves a truncated file that the existence check above would
    # later mistake for a complete download.
    partial_location = data_location + '.part'
    try:
        urllib.request.urlretrieve(data_url, partial_location)
        os.replace(partial_location, data_location)
    finally:
        # Clean up the leftover partial file if the download or rename failed.
        if os.path.exists(partial_location):
            os.remove(partial_location)
print('data_location:', data_location)


Downloading :  https://elephantscale-public.s3.amazonaws.com/data/house-prices/house-sales-full.csv
data_location: house-sales-full.csv


In [2]:
import pandas as pd
# Render floats with thousands separators and two decimals in displayed tables.
float_style = '{:,.2f}'.format
pd.options.display.float_format = float_style

# Read the downloaded CSV and preview five randomly chosen rows.
data = pd.read_csv(data_location)
data.sample(n=5)

Unnamed: 0,DocumentID,Date,SalePrice,PropertyID,PropertyType,ym,zhvi_px,zhvi_idx,AdjSalePrice,NbrLivingUnits,...,Bathrooms,Bedrooms,BldgGrade,YrBuilt,YrRenovated,TrafficNoise,LandVal,ImpsVal,ZipCode,NewConstruction
14136,14137,1/6/10,975000,4139430560,Single Family,1/1/10,350800,0.81,1209578.0,1,...,2.5,4,11,1998,0,0,461000,820000,98006,False
21401,21402,7/17/09,675000,7852070090,Single Family,7/1/09,354400,0.81,828894.0,1,...,2.5,3,9,2002,0,0,245000,423000,98065,False
17508,17509,8/14/13,332218,6181500210,Single Family,8/1/13,374300,0.86,386271.0,1,...,2.5,4,8,2013,0,0,71000,305000,-1,True
1819,1820,4/15/13,409000,730000020,Townhouse,4/1/13,356000,0.82,499991.0,1,...,3.0,3,8,2013,0,0,117000,360000,-1,True
1686,1687,1/13/10,440000,662310590,Single Family,1/1/10,350800,0.81,545861.0,1,...,2.5,4,9,1996,0,0,90000,299000,98023,False


## Step 2 - Explore Data (EDA)
EDA is a great way to get a sense of the data.  

Try to answer the following questions by looking at the output of `describe` below:

- What is the maximum number of bedrooms? :-)
- Find the min/max of 'SalePrice'
- Do you think we have outliers in the data?

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
summary_stats = data.describe()
summary_stats

Unnamed: 0,DocumentID,SalePrice,PropertyID,zhvi_px,zhvi_idx,AdjSalePrice,NbrLivingUnits,SqFtLot,SqFtTotLiving,SqFtFinBasement,Bathrooms,Bedrooms,BldgGrade,YrBuilt,YrRenovated,TrafficNoise,LandVal,ImpsVal,ZipCode
count,27063.0,27063.0,27063.0,27063.0,27063.0,27063.0,27063.0,27063.0,27063.0,27063.0,27063.0,27063.0,27063.0,27063.0,27063.0,27063.0,27063.0,27063.0,27063.0
mean,13532.0,511626.2,4680324882.08,390750.58,0.9,570918.76,1.02,10997.68,2122.96,275.3,2.26,3.38,7.78,1977.09,86.73,0.21,213820.07,317211.67,82223.04
std,7812.56,342821.17,2896350979.15,37024.46,0.09,380236.63,0.15,28110.66,939.84,428.71,0.77,0.9,1.18,30.92,407.32,0.55,177213.41,234038.34,36106.67
min,1.0,3000.0,1000102.0,311600.0,0.72,3368.0,1.0,494.0,370.0,0.0,0.0,0.0,3.0,1900.0,0.0,0.0,0.0,0.0,-1.0
25%,6766.5,329000.0,2213000057.5,357100.0,0.82,366918.5,1.0,4257.5,1440.0,0.0,1.75,3.0,7.0,1954.0,0.0,0.0,105000.0,183000.0,98019.0
50%,13532.0,425000.0,3972900140.0,400600.0,0.92,475664.0,1.0,6636.0,1940.0,0.0,2.5,3.0,8.0,1986.0,0.0,0.0,172000.0,261000.0,98053.0
75%,20297.5,590000.0,7504001385.0,421200.0,0.97,655061.0,1.0,9450.0,2610.0,510.0,2.5,4.0,8.0,2006.0,0.0,0.0,258000.0,380000.0,98115.0
max,27063.0,11000000.0,9906000035.0,435200.0,1.0,11644855.0,5.0,1024068.0,10740.0,3500.0,8.0,33.0,13.0,2016.0,2016.0,3.0,5538000.0,5772000.0,98354.0


In [4]:
## any correlated columns?
# Restrict the correlation matrix to numeric/boolean columns: the frame also holds text
# columns (e.g. Date, PropertyType), which newer pandas versions refuse to correlate
# unless numeric_only is set explicitly.
data.corr(numeric_only=True)

  data.corr()


Unnamed: 0,DocumentID,SalePrice,PropertyID,zhvi_px,zhvi_idx,AdjSalePrice,NbrLivingUnits,SqFtLot,SqFtTotLiving,SqFtFinBasement,Bathrooms,Bedrooms,BldgGrade,YrBuilt,YrRenovated,TrafficNoise,LandVal,ImpsVal,ZipCode,NewConstruction
DocumentID,1.0,-0.02,0.99,-0.02,-0.02,-0.02,-0.02,-0.11,-0.02,-0.03,-0.01,-0.03,-0.01,0.03,-0.01,-0.04,-0.03,-0.01,-0.01,0.05
SalePrice,-0.02,1.0,-0.02,0.12,0.12,0.98,0.02,0.14,0.68,0.29,0.52,0.32,0.66,0.08,0.09,-0.01,0.79,0.81,-0.02,0.04
PropertyID,0.99,-0.02,1.0,-0.02,-0.02,-0.02,-0.01,-0.11,-0.02,-0.03,-0.01,-0.03,-0.01,0.03,-0.01,-0.04,-0.03,-0.02,-0.01,0.05
zhvi_px,-0.02,0.12,-0.02,1.0,1.0,-0.03,0.02,-0.0,-0.01,0.02,-0.03,0.0,-0.04,-0.06,0.01,0.01,-0.0,-0.02,0.09,-0.02
zhvi_idx,-0.02,0.12,-0.02,1.0,1.0,-0.03,0.02,-0.0,-0.01,0.02,-0.03,0.0,-0.04,-0.06,0.01,0.01,-0.0,-0.02,0.09,-0.02
AdjSalePrice,-0.02,0.98,-0.02,-0.03,-0.03,1.0,0.02,0.14,0.69,0.29,0.52,0.32,0.67,0.08,0.09,-0.02,0.8,0.82,-0.03,0.04
NbrLivingUnits,-0.02,0.02,-0.01,0.02,0.02,0.02,1.0,-0.0,0.05,0.06,0.1,0.16,-0.05,-0.11,0.01,0.08,0.03,-0.0,0.04,-0.05
SqFtLot,-0.11,0.14,-0.11,-0.0,-0.0,0.14,-0.0,1.0,0.21,0.05,0.1,0.08,0.14,0.03,0.01,-0.01,0.08,0.11,0.06,-0.09
SqFtTotLiving,-0.02,0.68,-0.02,-0.01,-0.01,0.69,0.05,0.21,1.0,0.39,0.75,0.62,0.76,0.3,0.05,-0.07,0.47,0.76,-0.1,0.11
SqFtFinBasement,-0.03,0.29,-0.03,0.02,0.02,0.29,0.06,0.05,0.39,1.0,0.26,0.3,0.14,-0.22,0.11,0.05,0.29,0.28,0.1,-0.14


## Step 3 - Shape Data

In [5]:
# Input columns fed to every regressor, one per line for easy editing.
feature_columns = [
    'NbrLivingUnits',
    'SqFtLot',
    'SqFtTotLiving',
    'SqFtFinBasement',
    'Bathrooms',
    'Bedrooms',
    'BldgGrade',
    'YrBuilt',
    'YrRenovated',
    'TrafficNoise',
    'LandVal',
    'ImpsVal',
    'NewConstruction',
]

# Target column, kept as a one-element list so selecting it yields a single-column frame.
label_column = ['SalePrice']


In [6]:
# Carve the frame into the feature matrix and the label column.
X = data.loc[:, feature_columns]
y = data.loc[:, label_column]

# Report the dimensions of both pieces.
for name, frame in (("X", X), ("y", y)):
    print ("{}.shape = ".format(name), frame.shape)

X.shape =  (27063, 13)
y.shape =  (27063, 1)


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
splits = train_test_split(X, y, test_size=0.2, random_state=123)
X_train, X_test, y_train, y_test = splits

# Print the shape of each piece in the order returned by train_test_split.
for label, part in zip(("x_train", "x_test", "y_train", "y_test"), splits):
    print (label + " :", part.shape)

x_train : (21650, 13)
x_test : (5413, 13)
y_train : (21650, 1)
y_test : (5413, 1)


## Step 4 : Run Regression

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import  LassoLarsIC
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

# Candidate regressors to compare, from plain linear models to tree ensembles.
# The tree-based estimators are randomized, so they are seeded (with the same value
# used for the train/test split) to make the reported scores repeatable across runs.
algorithms = [
    LinearRegression(),
    LassoLarsIC(criterion='aic', max_iter=500),
    DecisionTreeRegressor(random_state=123),
    RandomForestRegressor(random_state=123),
    GradientBoostingRegressor(random_state=123),
]

In [9]:
import time
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

# Fit every candidate on the same split and report timing plus R2 / RMSE on both the
# training and the held-out data.

# The label frame has a single column; flatten it once instead of on every iteration.
y_train_flat = np.ravel(y_train)

for algo in algorithms:
    print ()
    print ("============== Running {} ======".format(algo))
    t1 = time.perf_counter()
    model = algo.fit (X_train, y_train_flat)
    t2 = time.perf_counter()
    # The prediction timing below covers both the test and the training predictions.
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    t3 = time.perf_counter()

    print ("Training time : {:,.2f} secs ({:,.1f} ms)".format ( (t2-t1), (t2-t1)*1e3))
    print ("Prediction time : {:,.2f} secs ({:,.1f} ms)".format ( (t3-t2), (t3-t2)*1e3))

    print ("Training R2 : {:,.2f}".format (r2_score(y_train, y_pred_train)))
    print ("Testing R2  : {:,.2f}".format (r2_score(y_test, y_pred_test)))
    # RMSE is the square root of the mean squared error, in the units of the label.
    print ("Training RMSE : {:,.2f}".format (sqrt(mean_squared_error(y_train, y_pred_train))))
    print ("Testing RMSE  : {:,.2f}".format (sqrt(mean_squared_error(y_test, y_pred_test))))
    
    



Training time : 0.06 secs (55.6 ms)
Prediction time : 0.02 secs (17.6 ms)
Training R2 : 0.83
Testing R2  : 0.82
Training RMSE : 140,710.35
Testing RMSE  : 153,564.46

Training time : 0.07 secs (71.7 ms)
Prediction time : 0.02 secs (22.7 ms)
Training R2 : 0.83
Testing R2  : 0.82
Training RMSE : 140,710.94
Testing RMSE  : 153,524.68

Training time : 0.27 secs (272.8 ms)
Prediction time : 0.02 secs (19.3 ms)
Training R2 : 0.99
Testing R2  : 0.75
Training RMSE : 24,198.97
Testing RMSE  : 182,887.14

Training time : 20.27 secs (20,266.5 ms)
Prediction time : 0.88 secs (875.1 ms)
Training R2 : 0.98
Testing R2  : 0.83
Training RMSE : 50,978.96
Testing RMSE  : 149,655.53

Training time : 3.35 secs (3,350.1 ms)
Prediction time : 0.04 secs (40.9 ms)
Training R2 : 0.90
Testing R2  : 0.86
Training RMSE : 106,519.92
Testing RMSE  : 139,067.50
