# Build Model

## Step-1: Load and Shape Data

In [1]:
import os
import urllib.request

# Preferred copy of the dataset (relative to this notebook) and the public
# URL to fetch it from when no local copy is available.
data_location = "../data/house-prices/house-sales-full.csv"
data_url = 'https://elephantscale-public.s3.amazonaws.com/data/house-prices/house-sales-full.csv'

if not os.path.exists(data_location):
    # No copy at the preferred path: fall back to a file with the same name
    # in the current working directory, downloading it if necessary.
    data_location = os.path.basename(data_location)
    if not os.path.exists(data_location):
        print("Downloading : ", data_url)
        # Download under a temporary name and rename only on success, so an
        # interrupted transfer never leaves a truncated CSV that the
        # existence check above would accept on the next run.
        partial_location = data_location + ".part"
        try:
            urllib.request.urlretrieve(data_url, partial_location)
            os.replace(partial_location, data_location)
        finally:
            # Clean up the leftover partial file if the download failed.
            if os.path.exists(partial_location):
                os.remove(partial_location)
print('data_location:', data_location)

Downloading :  https://elephantscale-public.s3.amazonaws.com/data/house-prices/house-sales-full.csv
data_location: house-sales-full.csv


In [2]:
import pandas as pd

# Load the whole CSV into a DataFrame, then show five randomly chosen rows
# as a quick sanity check of the columns and values.
data = pd.read_csv(filepath_or_buffer=data_location)
data.sample(n=5)

Unnamed: 0,DocumentID,Date,SalePrice,PropertyID,PropertyType,ym,zhvi_px,zhvi_idx,AdjSalePrice,NbrLivingUnits,...,Bathrooms,Bedrooms,BldgGrade,YrBuilt,YrRenovated,TrafficNoise,LandVal,ImpsVal,ZipCode,NewConstruction
11842,11843,10/20/06,482000,3582000130,Single Family,10/1/06,418300,0.961167,501474.0,1,...,2.25,3,8,1976,0,0,292000,257000,98028,False
13803,13804,5/8/06,495000,4046400270,Single Family,5/1/06,400600,0.920496,537753.0,1,...,2.5,4,8,1968,0,0,280000,265000,98008,False
277,278,9/8/08,335000,112900110,Single Family,9/1/08,394900,0.907399,369187.0,1,...,2.5,3,7,2001,0,0,90000,230000,98019,False
1627,1628,11/14/08,249965,629650350,Single Family,11/1/08,385800,0.886489,281972.0,1,...,2.5,4,7,2009,0,0,69000,152000,-1,True
25066,25067,9/27/06,406000,9195700110,Single Family,9/1/06,414800,0.953125,425967.0,1,...,1.0,3,7,1980,0,0,265000,108000,98027,False


In [3]:
from sklearn.model_selection import train_test_split

# Features: five columns of the sales table; target: the sale price.
X = data[['Bedrooms', 'Bathrooms', 'SqFtTotLiving', 'SqFtLot',  'LandVal']]
y = data['SalePrice']

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# partition (and every score computed from it) reproducible across
# Restart & Run All; without it each run shuffles the rows differently.
X_train,X_test,y_train, y_test = train_test_split(X,y,  test_size=0.2, random_state=42)

# Report the shapes to confirm the 80/20 split.
print ("X_train :" , X_train.shape )
print ("X_test :", X_test.shape)
print ("y_train :", y_train.shape)
print ("y_test :", y_test.shape)

X_train : (21650, 5)
X_test : (5413, 5)
y_train : (21650,)
y_test : (5413,)


## Step-2: Build a Model

In [4]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted trees with default hyperparameters. random_state is fixed
# so the estimator's internal randomness is seeded and repeated runs on the
# same training data produce the same fitted model.
gb = GradientBoostingRegressor(random_state=42)
# fit() returns the fitted estimator itself, so `model` and `gb` are the same object.
model = gb.fit(X_train, y_train)

In [5]:
# Report the model's built-in `score` metric on the training split and on
# the held-out split; a large gap between the two suggests overfitting.
for label, features, target in (
    ("Training score: ", X_train, y_train),
    ("Test score: ", X_test, y_test),
):
    print(label, model.score(features, target))

Training score:  0.8397508631293956
Test score:  0.8002964095661829


## Step-3: Save Model with Pickle or Joblib

Pickled models may only load back correctly under the same Python (and library) versions that were used to save them.

Joblib is a better choice here because it stores models that contain large NumPy arrays more efficiently.

In [6]:
# Output path for the serialized model (relative to the working directory).
model_file = "model.pkl"

In [None]:
# Alternative: save the model with the standard-library pickle module.
# This cell is intentionally disabled; the joblib cell below performs the
# save. Uncomment the lines below to write `model` to `model_file` with
# pickle instead.
# ## Use Pickle
# import pickle

# with open(model_file,'wb') as f:
#     pickle.dump(model,f)
# print ("model saved to : ", model_file)

In [7]:
## Use JobLib

import joblib

# Serialize the fitted model to disk at `model_file`, then confirm the path.
joblib.dump(value=model, filename=model_file)
print ("model saved to : ", model_file)

model saved to :  model.pkl
