# Importing Libraries/CSV

In [0]:
import numpy as np
import pandas as pd

In [74]:
!unzip c1_properties_2016.csv.zip

Archive:  c1_properties_2016.csv.zip
  inflating: c1_properties_2016.csv  
  inflating: __MACOSX/._c1_properties_2016.csv  


In [138]:
# Load the extracted 2016 properties CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer="c1_properties_2016.csv")

# (rows, columns) of the freshly loaded frame.
df.shape

(2471261, 8)

In [139]:
# Drop the 'Unnamed: 0' column (presumably a row index saved into the CSV).
# errors='ignore' keeps the cell idempotent: running it a second time, after
# the column is already gone, is a no-op instead of raising KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

Unnamed: 0,parcelid,bathroomcnt,bedroomcnt,regionidzip,yearbuilt,calculatedfinishedsquarefeet,taxvaluedollarcnt
0,11324547,2.0,4.0,97329.0,2005.0,3633.0,291973.0
1,11524947,2.0,2.0,96072.0,2004.0,1090.0,352198.0
2,11585547,2.0,3.0,96152.0,1926.0,2077.0,637046.0
3,12508747,3.0,0.0,96234.0,2004.0,9893.0,1716511.0
4,12606547,3.0,3.0,96161.0,2005.0,2166.0,659000.0


In [140]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

Unnamed: 0,parcelid,bathroomcnt,bedroomcnt,regionidzip,yearbuilt,calculatedfinishedsquarefeet,taxvaluedollarcnt
count,2471261.0,2471261.0,2471261.0,2471261.0,2471261.0,2471261.0,2471261.0
mean,12822000.0,2.253286,3.176577,96547.37,1963.883,1794.259,423101.9
std,2210337.0,0.9534987,0.9696219,3876.956,22.14628,930.6037,585933.1
min,10711720.0,1.0,0.0,95982.0,1801.0,1.0,23.0
25%,11550800.0,2.0,3.0,96186.0,1951.0,1219.0,181910.0
50%,12478050.0,2.0,3.0,96373.0,1962.0,1570.0,307614.0
75%,13974570.0,3.0,4.0,96967.0,1980.0,2118.0,493404.0
max,163275900.0,20.0,18.0,399675.0,2015.0,115554.0,88833750.0


# Train/Test Split

In [0]:
# Feature matrix: the four predictor columns, as a plain NumPy array.
X = df.loc[:, ['bathroomcnt', 'bedroomcnt',
               'calculatedfinishedsquarefeet', 'yearbuilt']].to_numpy()

# Regression target column, as a 1-D NumPy array.
y = df.loc[:, 'taxvaluedollarcnt'].to_numpy()

In [142]:
# Quick look at the target vector (NumPy abbreviates long arrays).
y

array([291973., 352198., 637046., ..., 402544., 256009., 428749.])

In [199]:
# Quick look at the feature matrix (NumPy abbreviates long arrays).
X

array([[2.000e+00, 4.000e+00, 3.633e+03, 2.005e+03],
       [2.000e+00, 2.000e+00, 1.090e+03, 2.004e+03],
       [2.000e+00, 3.000e+00, 2.077e+03, 1.926e+03],
       ...,
       [2.000e+00, 3.000e+00, 1.917e+03, 1.946e+03],
       [2.000e+00, 4.000e+00, 1.987e+03, 1.955e+03],
       [1.000e+00, 2.000e+00, 7.980e+02, 2.006e+03]])

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split BEFORE scaling. Fitting the scaler on the full data set would let the
# test rows influence the mean/variance used to transform the training rows
# (data leakage), making the test metrics optimistic.
# Only 15% / 3% of the rows are used for train / test; the rest is left out.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, train_size=0.15, test_size=0.03, random_state=42)

# Learn the scaling parameters from the training rows only, then apply the
# same transformation to both partitions. `X` itself is left unscaled, so
# this cell can be re-run safely.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [202]:
# Sanity check: shapes of the train/test features and targets, in that order.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((370689, 4), (74138, 4), (370689,), (74138,))

# Linear Regression Base Model

In [204]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Baseline: linear regression with the library's default settings.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [0]:
# Predictions of the linear baseline on the held-out test rows.
y_pred = model.predict(X=X_test)

In [206]:
# Report test-set error metrics for the linear baseline.
print('Linear Regression:')
for label, score_fn in (('Mean Absolute Error', mean_absolute_error),
                        ('R^2 score', r2_score)):
    print(label, score_fn(y_test, y_pred))

Linear Regression:
Mean Absolute Error 215390.62275749177
R^2 score 0.4160587636683326


# Random Forest Regressor Model

In [0]:
from sklearn.ensemble import RandomForestRegressor

In [0]:
# Random forest: 100 trees capped at depth 7, fitted on 5 parallel workers.
rfr_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=7,
    n_jobs=5,
    random_state=44,
)

In [209]:
# Train the forest on the training partition.
rfr_model.fit(X=X_train, y=y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=7,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=5,
                      oob_score=False, random_state=44, verbose=0,
                      warm_start=False)

In [0]:
# Predictions of the random forest on the held-out test rows
# (overwrites the previous model's `y_pred`).
y_pred = rfr_model.predict(X=X_test)

In [211]:
# Report test-set error metrics for the random forest.
print('Random Forest Regressor:')
for label, score_fn in (('Mean Absolute Error', mean_absolute_error),
                        ('R^2 score', r2_score)):
    print(label, score_fn(y_test, y_pred))

Random Forest Regressor:
Mean Absolute Error 195670.46037264873
R^2 score 0.46069830171410897


## RandomizedSearchCV

In [0]:
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from scipy.stats import randint, uniform

In [0]:
# Unfitted forest with library-default hyper-parameters and a fixed seed;
# presumably meant as the base estimator for the randomized search below.
rfr_model1 = RandomForestRegressor(random_state=42)

In [0]:

# Search space. Lists are sampled uniformly; randint(a, b) draws integers
# from the half-open range [a, b).
param_distributions = {
    "n_estimators": [100, 250],
    "max_depth": [3, 5],
    "min_samples_split": randint(2, 11),
    "min_samples_leaf": randint(1, 11),
    "bootstrap": [True, False],
}

# Use the dedicated base estimator `rfr_model1`, not `rfr_model`, which was
# configured for the stand-alone fit above (max_depth=7, n_jobs=5). Its inner
# n_jobs=5 would also be nested inside the search's own n_jobs=-1.
search = RandomizedSearchCV(
    rfr_model1,
    param_distributions=param_distributions,
    n_iter=10,                          # number of sampled candidates
    cv=5,                               # 5-fold cross-validation per candidate
    scoring='neg_mean_absolute_error',  # higher (closer to 0) is better
    verbose=10,
    return_train_score=True,
    n_jobs=-1,                          # parallelise over all available cores
    random_state=42,                    # reproducible candidate sampling
)

search.fit(X_train, y_train)

# XGB Regressor Model

In [0]:
import xgboost as xgb
from xgboost import XGBRegressor

In [0]:
# Gradient-boosted trees: 250 rounds, depth 7, learning rate 0.5, all cores.
xgb_model = XGBRegressor(
    n_estimators=250,
    max_depth=7,
    learning_rate=0.5,
    n_jobs=-1,
    random_state=42,
)

In [214]:
# Train the boosted model on the training partition.
xgb_model.fit(X_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.5, max_delta_step=0,
             max_depth=7, min_child_weight=1, missing=None, n_estimators=250,
             n_jobs=-1, nthread=None, objective='reg:linear', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [0]:
# Predictions of the boosted model on the held-out test rows
# (overwrites the previous model's `y_pred`).
y_pred = xgb_model.predict(X_test)

In [216]:
# Report test-set error metrics for the boosted model.
print('XGB Regressor:')
for label, score_fn in (('Mean Absolute Error', mean_absolute_error),
                        ('R^2 score', r2_score)):
    print(label, score_fn(y_test, y_pred))

XGB Regressor:
Mean Absolute Error 198233.66242429658
R^2 score 0.3783534393852276


# Pickle Models

In [0]:
import pickle

In [0]:
# Persist the fitted linear model. The `with` block guarantees the file is
# flushed and closed, which the bare open() call did not.
# NOTE(review): the models are trained on StandardScaler-transformed features;
# the fitted `scaler` is not saved anywhere in this notebook, so consumers of
# the pickles cannot reproduce the input transformation — confirm and persist it.
with open('zillow_lr_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)


In [0]:
# Persist the fitted random forest. The `with` block guarantees the file is
# flushed and closed, which the bare open() call did not.
with open('zillow_rfr_model.pkl', 'wb') as model_file:
    pickle.dump(rfr_model, model_file)

In [0]:
# Persist the fitted XGBoost model. The `with` block guarantees the file is
# flushed and closed, which the bare open() call did not.
with open('zillow_xgb_model.pkl', 'wb') as model_file:
    pickle.dump(xgb_model, model_file)