# Data Preprocessing

### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.dummy import DummyRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn import __version__ as sklearn_version
import datetime
import pickle
import os
#from library.sb_utils import save_file

In [2]:
#!pip install library

In [3]:
#!pip freeze

### Data Load

In [4]:
Housing_Data = pd.read_csv(r'..\Data\housing_data_step3_features.csv', index_col = 0)
Housing_Data.head()

Unnamed: 0,Id,Transaction_Date,Price,Bedroom_Count,Bathroom_Count,Sqft_Living,Sqft_Plot,Floor_Count,Waterfront,View,...,Change_In_Sqft_Plot,Listing_per_Zip_Code,Zipcode_Median_housePrice,sqft_living_zipcode_ratio,sqft_lot_zipcode_ratio,sqft_above_zipcode_ratio,sqft_basement_zipcode_ratio,sqft_living15_zipcode_ratio,sqft_lot15_zipcode_ratio,sqft_median_price_zipcode_ratio
0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,0,0,...,0,258,279500.0,0.002633,0.002636,0.00344,0.0,0.003147,0.002693,0.793918
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0,0,...,397,402,425000.0,0.003649,0.002342,0.003792,0.003032,0.002533,0.002595,1.265882
2,5631500400,2015-02-25,180000.0,2,1.0,770,10000,1.0,0,0,...,-1938,282,442500.0,0.001286,0.00298,0.001532,0.0,0.004613,0.002828,0.40678
3,2487200875,2014-12-09,604000.0,4,3.0,1960,5000,1.0,0,0,...,0,263,489950.0,0.004316,0.003178,0.002953,0.009234,0.003092,0.003304,1.232779
4,1954400510,2015-02-18,510000.0,3,2.0,1680,8080,1.0,0,0,...,-577,435,645000.0,0.001456,0.001273,0.001579,0.0,0.001598,0.001305,0.790698


#### Change in Data Type

In [5]:
Housing_Data['Transaction_Date'] = pd.to_datetime(Housing_Data['Transaction_Date'])
Housing_Data['Id'] = Housing_Data['Id'].astype('object')
Housing_Data['Zipcode'] = Housing_Data['Zipcode'].astype('object')

In [6]:
Housing_Data.shape

(21417, 32)

In [7]:
Housing_Data.columns

Index(['Id', 'Transaction_Date', 'Price', 'Bedroom_Count', 'Bathroom_Count',
       'Sqft_Living', 'Sqft_Plot', 'Floor_Count', 'Waterfront', 'View',
       'Condition', 'Grade', 'Sqft_Above', 'Sqft_Basement', 'Zipcode',
       'Latitude', 'Longitude', 'Sqft_Living_2015', 'Sqft_Plot_2015',
       'Plot-Living', 'Age', 'Change_In_Sqft_Living', 'Change_In_Sqft_Plot',
       'Listing_per_Zip_Code', 'Zipcode_Median_housePrice',
       'sqft_living_zipcode_ratio', 'sqft_lot_zipcode_ratio',
       'sqft_above_zipcode_ratio', 'sqft_basement_zipcode_ratio',
       'sqft_living15_zipcode_ratio', 'sqft_lot15_zipcode_ratio',
       'sqft_median_price_zipcode_ratio'],
      dtype='object')

### Train/Test Split for Final Evaluation

In [8]:
train, test = train_test_split(Housing_Data.drop(columns = ['Transaction_Date']), test_size = 0.01, random_state = 7)

In [9]:
(train.shape, test.shape)

((21202, 31), (215, 31))

In [10]:
path = r'..\Data\test_data.csv'
test.to_csv(path)
path2 = r'..\Data\train_data.csv'
train.to_csv(path2)

### Train/Test Split for model selection

In [11]:
X_train, X_test, y_train, y_test = train_test_split(train.drop(columns = ['Price']), train.Price, test_size = 0.1, random_state = 7)

In [12]:
X_train.columns

Index(['Id', 'Bedroom_Count', 'Bathroom_Count', 'Sqft_Living', 'Sqft_Plot',
       'Floor_Count', 'Waterfront', 'View', 'Condition', 'Grade', 'Sqft_Above',
       'Sqft_Basement', 'Zipcode', 'Latitude', 'Longitude', 'Sqft_Living_2015',
       'Sqft_Plot_2015', 'Plot-Living', 'Age', 'Change_In_Sqft_Living',
       'Change_In_Sqft_Plot', 'Listing_per_Zip_Code',
       'Zipcode_Median_housePrice', 'sqft_living_zipcode_ratio',
       'sqft_lot_zipcode_ratio', 'sqft_above_zipcode_ratio',
       'sqft_basement_zipcode_ratio', 'sqft_living15_zipcode_ratio',
       'sqft_lot15_zipcode_ratio', 'sqft_median_price_zipcode_ratio'],
      dtype='object')

In [13]:
name_list = ['Id', 'Zipcode']
names_train = X_train[name_list]
names_test = X_test[name_list]

In [14]:
X_train.dtypes

Id                                  object
Bedroom_Count                        int64
Bathroom_Count                     float64
Sqft_Living                          int64
Sqft_Plot                            int64
Floor_Count                        float64
Waterfront                           int64
View                                 int64
Condition                            int64
Grade                                int64
Sqft_Above                           int64
Sqft_Basement                        int64
Zipcode                             object
Latitude                           float64
Longitude                          float64
Sqft_Living_2015                     int64
Sqft_Plot_2015                       int64
Plot-Living                          int64
Age                                  int64
Change_In_Sqft_Living                int64
Change_In_Sqft_Plot                  int64
Listing_per_Zip_Code                 int64
Zipcode_Median_housePrice          float64
sqft_living

##### Mean Price of Training 

In [15]:
train_mean = y_train.mean()
train_mean

541369.4660133116

### Data Modelling 

#### Model Type 1: Dummy Regressor(Benchmark Model)

In [16]:
# Fitting Model
dumb_reg = DummyRegressor(strategy='mean')
dumb_reg.fit(X_train, y_train)
dumb_reg.constant_

array([[541369.46601331]])

In [17]:
# Predicting Value
y_tr_pred = dumb_reg.predict(X_train)
y_te_pred = dumb_reg.predict(X_test)

##### Printing Results

In [18]:
print("R-Squared value for Training Data: ", r2_score(y_train, y_tr_pred))
print("R-Squared value for Test Data: ", r2_score(y_test, y_te_pred))

R-Squared value for Training Data:  0.0
R-Squared value for Test Data:  -0.00040831701386956354


In [19]:
print("Mean Square Error for Training Data: ", mean_squared_error(y_train, y_tr_pred))
print("Mean Square Error for Test Data: ", mean_squared_error(y_test, y_te_pred))

Mean Square Error for Training Data:  133469432671.03212
Mean Square Error for Test Data:  154086719982.84085


In [20]:
print("Mean Absolute Error for Training Data: ", mean_absolute_error(y_train, y_tr_pred))
print("Mean Absolute Error for Test Data: ", mean_absolute_error(y_test, y_te_pred))

Mean Absolute Error for Training Data:  232971.33425403337
Mean Absolute Error for Test Data:  243007.7646908446


#### Data Scaling 

In [21]:
scaler = StandardScaler()
scaler.fit(X_train, X_test)
X_tr_scaled = scaler.transform(X_train)
X_te_scaled = scaler.transform(X_test)

#### Model 2: Linear Regression

In [22]:
lm = LinearRegression().fit(X_tr_scaled, y_train)
y_tr_pred = lm.predict(X_tr_scaled)
y_te_pred = lm.predict(X_te_scaled)

##### Printing Results

In [23]:
print("R-Squared value for Training Data: ", r2_score(y_train, y_tr_pred))
print("R-Squared value for Test Data: ", r2_score(y_test, y_te_pred))

R-Squared value for Training Data:  0.921431144174052
R-Squared value for Test Data:  0.918670621652827


In [24]:
print("Mean Square Error for Training Data: ", mean_squared_error(y_train, y_tr_pred))
print("Mean Square Error for Test Data: ", mean_squared_error(y_test, y_te_pred))

Mean Square Error for Training Data:  10486540612.701387
Mean Square Error for Test Data:  12526662298.41593


In [25]:
print("Mean Absolute Error for Training Data: ", mean_absolute_error(y_train, y_tr_pred))
print("Mean Absolute Error for Test Data: ", mean_absolute_error(y_test, y_te_pred))

Mean Absolute Error for Training Data:  53362.87598160564
Mean Absolute Error for Test Data:  56377.42218946571


#### Model 3: Linear Regression with SelectKBest(Basic)

In [26]:
# Fitting Model
pipe = make_pipeline(
        StandardScaler(),
    SelectKBest(f_regression),
    LinearRegression()
)
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('selectkbest',
                 SelectKBest(k=10,
                             score_func=<function f_regression at 0x000001CDB9495438>)),
                ('linearregression',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))],
         verbose=False)

In [27]:
# Predicting Price
y_tr_pred = pipe.predict(X_train)
y_te_pred = pipe.predict(X_test)

##### Printing Results

In [28]:
print("R-Squared value for Training Data: ", r2_score(y_train, y_tr_pred))
print("R-Squared value for Test Data: ", r2_score(y_test, y_te_pred))

R-Squared value for Training Data:  0.9177049079294216
R-Squared value for Test Data:  0.9182132658376132


In [29]:
print("Mean Square Error for Training Data: ", mean_squared_error(y_train, y_tr_pred))
print("Mean Square Error for Test Data: ", mean_squared_error(y_test, y_te_pred))

Mean Square Error for Training Data:  10983879250.270456
Mean Square Error for Test Data:  12597105992.488485


In [30]:
print("Mean Absolute Error for Training Data: ", mean_absolute_error(y_train, y_tr_pred))
print("Mean Absolute Error for Test Data: ", mean_absolute_error(y_test, y_te_pred))

Mean Absolute Error for Training Data:  52979.522866740954
Mean Absolute Error for Test Data:  56076.910340852526


##### Applying Cross Validation

In [31]:
cv_results = cross_validate(pipe, X_train, y_train, cv = 5, n_jobs = -1,scoring='r2')

In [32]:
cv_scores = cv_results['test_score']
cv_scores

array([0.92385655, 0.91101427, 0.90983876, 0.9297808 , 0.91138376])

In [33]:
print("Mean Cross Validation Scores: ", np.mean(cv_scores))
print("Std of Cross Validation Scores: ", np.std(cv_scores))

Mean Cross Validation Scores:  0.917174825882926
Std of Cross Validation Scores:  0.008110029050030916


##### Applying GridSearchCV for Hyper parameter Selection

In [34]:
k = [k+1 for k in range(len(X_train.columns))]
grid_params = {'selectkbest__k': k}
grid_params

{'selectkbest__k': [1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30]}

In [35]:
lr_grid_cv = GridSearchCV(pipe, param_grid=grid_params, cv=5, n_jobs=-1, scoring='r2')


In [36]:
lr_grid_cv.fit(X_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('selectkbest',
                                        SelectKBest(k=10,
                                                    score_func=<function f_regression at 0x000001CDB9495438>)),
                                       ('linearregression',
                                        LinearRegression(copy_X=True,
                                                         fit_intercept=True,
                                                         n_jobs=None,
                                                         normalize=False))],
                                verbose=False),
             iid='depreca

In [37]:
cv_results = cross_validate(lr_grid_cv.best_estimator_, X_train, y_train, cv=5, n_jobs = -1,scoring='r2')
cv_scores = cv_results['test_score']
cv_scores

array([0.92598983, 0.91515689, 0.91447526, 0.93240011, 0.91225004])

In [38]:
print("Mean Cross Validation Scores: ", np.mean(cv_scores))
print("Std of Cross Validation Scores: ", np.std(cv_scores))

Mean Cross Validation Scores:  0.9200544244028569
Std of Cross Validation Scores:  0.007793155468778798


In [39]:
# Printing Best value of hyper parameter
lr_grid_cv.best_params_

{'selectkbest__k': 26}

In [40]:
# Printing Top Features Contributing 
selected = lr_grid_cv.best_estimator_.named_steps.selectkbest.get_support()
selected
coefs = lr_grid_cv.best_estimator_.named_steps.linearregression.coef_
features = X_train.columns[selected]
pd.Series(coefs, index=features).sort_values(ascending=False)

sqft_median_price_zipcode_ratio    242600.749981
Zipcode_Median_housePrice          213827.733874
sqft_above_zipcode_ratio            59572.720143
sqft_lot15_zipcode_ratio            14304.737752
Sqft_Basement                       12580.575781
Sqft_Living                         10597.872565
Sqft_Living_2015                    10221.754847
Sqft_Above                           5051.784606
sqft_lot_zipcode_ratio               4284.347279
View                                 2686.327825
sqft_basement_zipcode_ratio          1878.784897
Listing_per_Zip_Code                 1850.717018
Waterfront                           -264.355526
Grade                                -842.189803
Bathroom_Count                      -1637.312906
Change_In_Sqft_Plot                 -1699.793086
Zipcode                             -2443.929954
Latitude                            -3047.893855
Change_In_Sqft_Living               -4580.332842
Floor_Count                         -7468.644011
Sqft_Plot           

#### Model 3: Linear Regression with SelectKBest(Best Parameters)

In [41]:
# Fitting Model
pipe21 = make_pipeline(
    StandardScaler(),
    SelectKBest(f_regression, k=21),
    LinearRegression()
)
pipe21.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('selectkbest',
                 SelectKBest(k=21,
                             score_func=<function f_regression at 0x000001CDB9495438>)),
                ('linearregression',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))],
         verbose=False)

In [42]:
y_tr_predict = pipe.predict(X_train)
y_te_predict = pipe.predict(X_test)

##### Printing Results

In [43]:
print("R-Squared value for Training Data: ", r2_score(y_train, y_tr_pred))
print("R-Squared value for Test Data: ", r2_score(y_test, y_te_pred))

R-Squared value for Training Data:  0.9177049079294216
R-Squared value for Test Data:  0.9182132658376132


In [44]:
print("Mean Absolute Error for Training Data: ", mean_absolute_error(y_train, y_tr_pred))
print("Mean Absolute Error for Test Data: ", mean_absolute_error(y_test, y_te_pred))

Mean Absolute Error for Training Data:  52979.522866740954
Mean Absolute Error for Test Data:  56076.910340852526


In [45]:
print("Mean Square Error for Training Data: ", mean_squared_error(y_train, y_tr_pred))
print("Mean Square Error for Test Data: ", mean_squared_error(y_test, y_te_pred))

Mean Square Error for Training Data:  10983879250.270456
Mean Square Error for Test Data:  12597105992.488485


#### Model 4: Random Forest

In [46]:
RFRmodel=RandomForestRegressor(random_state=1)
RFRmodel.fit(X_train,y_train)
RFRpred=RFRmodel.predict(X_test)
rfrScore = RFRmodel.score(X_test,y_test)
print("Mean absolute error:",mean_absolute_error(RFRpred,y_test))
print("Model score",rfrScore)

Mean absolute error: 7304.161221122114
Model score 0.9830496480100623


In [47]:
# Fitting Model
pipe = make_pipeline(
        StandardScaler(),
    SelectKBest(f_regression),
    RandomForestRegressor(random_state=1)
)
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('selectkbest',
                 SelectKBest(k=10,
                             score_func=<function f_regression at 0x000001CDB9495438>)),
                ('randomforestregressor',
                 RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                       criterion='mse', max_depth=None,
                                       max_features='auto', max_leaf_nodes=None,
                                       max_samples=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=100, n_jobs=None,
                                       o

In [48]:
# Predicting Price
y_tr_pred = pipe.predict(X_train)
y_te_pred = pipe.predict(X_test)

##### Printing Results

In [49]:
print("R-Squared value for Training Data: ", r2_score(y_train, y_tr_pred))
print("R-Squared value for Test Data: ", r2_score(y_test, y_te_pred))

R-Squared value for Training Data:  0.9977071490462202
R-Squared value for Test Data:  0.9865669148348859


In [50]:
print("Mean Square Error for Training Data: ", mean_squared_error(y_train, y_tr_pred))
print("Mean Square Error for Test Data: ", mean_squared_error(y_test, y_te_pred))

Mean Square Error for Training Data:  306025516.0002279
Mean Square Error for Test Data:  2069015218.2270267


In [51]:
print("Mean Square Error for Training Data: ", mean_absolute_error(y_train, y_tr_pred))
print("Mean Square Error for Test Data: ", mean_absolute_error(y_test, y_te_pred))

Mean Square Error for Training Data:  1980.5033745610817
Mean Square Error for Test Data:  6128.626402640263


##### Applying Cross Validation


In [52]:
cv_results = cross_validate(pipe, X_train, y_train, cv=5, n_jobs = -1,scoring='r2')
cv_scores = cv_results['test_score']
cv_scores

array([0.99418418, 0.98648856, 0.99096487, 0.97797082, 0.97407116])

In [53]:
print("Mean Cross Validation Scores: ", np.mean(cv_scores))
print("Std of Cross Validation Scores: ", np.std(cv_scores))

Mean Cross Validation Scores:  0.9847359179762547
Std of Cross Validation Scores:  0.007624234264294771


In [54]:
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'standardscaler', 'selectkbest', 'randomforestregressor', 'standardscaler__copy', 'standardscaler__with_mean', 'standardscaler__with_std', 'selectkbest__k', 'selectkbest__score_func', 'randomforestregressor__bootstrap', 'randomforestregressor__ccp_alpha', 'randomforestregressor__criterion', 'randomforestregressor__max_depth', 'randomforestregressor__max_features', 'randomforestregressor__max_leaf_nodes', 'randomforestregressor__max_samples', 'randomforestregressor__min_impurity_decrease', 'randomforestregressor__min_impurity_split', 'randomforestregressor__min_samples_leaf', 'randomforestregressor__min_samples_split', 'randomforestregressor__min_weight_fraction_leaf', 'randomforestregressor__n_estimators', 'randomforestregressor__n_jobs', 'randomforestregressor__oob_score', 'randomforestregressor__random_state', 'randomforestregressor__verbose', 'randomforestregressor__warm_start'])

In [55]:
n_est = [int(n) for n in np.logspace(start=1, stop=2, num=20)]
grid_params = {
        'randomforestregressor__n_estimators': n_est,
        'standardscaler': [StandardScaler(), None],
}
grid_params

{'randomforestregressor__n_estimators': [10,
  11,
  12,
  14,
  16,
  18,
  20,
  23,
  26,
  29,
  33,
  37,
  42,
  48,
  54,
  61,
  69,
  78,
  88,
  100],
 'standardscaler': [StandardScaler(copy=True, with_mean=True, with_std=True),
  None]}

#### Applying GridSearchCV for Hyper Parameter Selection

In [56]:
rf_grid_cv = GridSearchCV(pipe, param_grid=grid_params, cv=5, n_jobs=-1, scoring='r2')

The cell below will take time to run.

In [57]:
rf_grid_cv.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('selectkbest',
                                        SelectKBest(k=10,
                                                    score_func=<function f_regression at 0x000001CDB9495438>)),
                                       ('randomforestregressor',
                                        RandomForestRegressor(bootstrap=True,
                                                              ccp_alpha=0.0,
                                                              criterion='mse',
                                                              max_depth=None,
                                                  

In [58]:
rf_grid_cv.best_params_

{'randomforestregressor__n_estimators': 16,
 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True)}

In [59]:
cv_results = cross_validate(rf_grid_cv.best_estimator_, X_train, y_train, cv= 5, n_jobs = -1,scoring='r2')
cv_scores = cv_results['test_score']
cv_scores

array([0.99324368, 0.99008696, 0.98711712, 0.98107086, 0.97401646])

In [60]:
selected = rf_grid_cv.best_estimator_.named_steps.selectkbest.get_support()
features = rf_grid_cv.best_estimator_.named_steps.randomforestregressor.feature_importances_
columns = X_train.columns[selected]
pd.Series(features, index=columns).sort_values(ascending=False)

sqft_median_price_zipcode_ratio    0.374756
Zipcode_Median_housePrice          0.299436
Grade                              0.284169
Sqft_Living                        0.017250
sqft_living_zipcode_ratio          0.009623
Change_In_Sqft_Living              0.008560
Sqft_Above                         0.003833
Sqft_Living_2015                   0.001187
Bathroom_Count                     0.001070
View                               0.000116
dtype: float64

In [61]:
print("Mean Cross Validation Scores: ", np.mean(cv_scores))
print("Std of Cross Validation Scores: ", np.std(cv_scores))

Mean Cross Validation Scores:  0.9851070158696285
Std of Cross Validation Scores:  0.006845968716555976


#### Model 5: Random Forest with SelectKBest(best estimator)

In [62]:
# Fitting Model
pipe_rf_new_param = make_pipeline(
    StandardScaler(),
    SelectKBest(f_regression),
    RandomForestRegressor(random_state=29)
)
pipe_rf_new_param.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('selectkbest',
                 SelectKBest(k=10,
                             score_func=<function f_regression at 0x000001CDB9495438>)),
                ('randomforestregressor',
                 RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                       criterion='mse', max_depth=None,
                                       max_features='auto', max_leaf_nodes=None,
                                       max_samples=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=100, n_jobs=None,
                                       o

In [63]:
# Predicting Price
y_tr_pred = pipe_rf_new_param.predict(X_train)
y_te_pred = pipe_rf_new_param.predict(X_test)

##### Printing Results

In [64]:
print("R-Squared value for Training Data: ", r2_score(y_train, y_tr_pred))
print("R-Squared value for Test Data: ", r2_score(y_test, y_te_pred))

R-Squared value for Training Data:  0.9978388001678369
R-Squared value for Test Data:  0.9843191999980323


In [65]:
print("Mean Absolute Error for Training Data: ", mean_absolute_error(y_train, y_tr_pred))
print("Mean Absolute Error for Test Data: ", mean_absolute_error(y_test, y_te_pred))

Mean Absolute Error for Training Data:  1981.1812137728627
Mean Absolute Error for Test Data:  6502.9726025459695


In [66]:
print("Mean Square Error for Training Data: ", mean_squared_error(y_train, y_tr_pred))
print("Mean Square Error for Test Data: ", mean_squared_error(y_test, y_te_pred))

Mean Square Error for Training Data:  288454115.4875359
Mean Square Error for Test Data:  2415216864.8720126


### Saving Model

In [67]:
#Code task 28#
#This may not be "production grade ML deployment" practice, but adding some basic
#information to your saved models can save your bacon in development.
#Just what version model have you just loaded to reuse? What version of `sklearn`
#created it? When did you make it?
#Assign the pandas version number (`pd.__version__`) to the `pandas_version` attribute,
#the numpy version (`np.__version__`) to the `numpy_version` attribute,
#the sklearn version (`sklearn_version`) to the `sklearn_version` attribute,
#and the current datetime (`datetime.datetime.now()`) to the `build_datetime` attribute
#Let's call this model version '1.0'
best_model = rf_grid_cv.best_estimator_
best_model.version = rf_grid_cv
best_model.pandas_version = pd.__version__
best_model.numpy_version = np.__version__
best_model.sklearn_version = sklearn_version
best_model.X_columns = [col for col in X_train.columns]
best_model.build_datetime = datetime.datetime.now()

In [68]:
modelpath = r'..\Model\my_model.pkl'
output = open(modelpath, 'wb')
pickle.dump(best_model, output)
output.close()

**Top Features Given by Random Forest model**
* sqft_median_price_zipcode_ratio    0.374756
* Zipcode_Median_housePrice          0.299436
* Grade                              0.284169
* Sqft_Living                        0.017250
* sqft_living_zipcode_ratio          0.009623
* Change_In_Sqft_Living              0.008560
* Sqft_Above                         0.003833
* Sqft_Living_2015                   0.001187
* Bathroom_Count                     0.001070
* View  

**Top Features Given by Linear Regression Model**
* sqft_median_price_zipcode_ratio    242600.749981
* Zipcode_Median_housePrice          213827.733874
* sqft_above_zipcode_ratio            59572.720143
* sqft_lot15_zipcode_ratio            14304.737752
* Sqft_Basement                       12580.575781
* Sqft_Living                         10597.872565
* Sqft_Living_2015                    10221.754847
* Sqft_Above                           5051.784606
* sqft_lot_zipcode_ratio               4284.347279
* View                                 2686.327825