### Model Creation:

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import xgboost

from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge, Lasso, SGDRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

%matplotlib inline
pd.set_option('display.max_columns',None)
warnings.simplefilter(action='ignore')

  from pandas import MultiIndex, Int64Index


### Load Data:

In [2]:
df = pd.read_csv('final_zillow_dataset.csv')
df.shape

(70260, 21)

In [3]:
df.head()

Unnamed: 0,parcelid,bathroomcnt,bedroomcnt,buildingqualitytypeid,finishedsquarefeet12,fips,heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet,propertylandusetypeid,rawcensustractandblock,regionidcity,regionidcounty,regionidzip,roomcnt,unitcnt,taxamount,logerror,yeardifference,propertyzoningdesc_labels
0,10726315,3.0,3.0,4.0,2445.0,6037.0,2.0,34.1843,-118.657,63878.0,266.0,60.371352,12447.0,3101.0,96389.0,0.0,1.0,7170.22,0.0383,33.0,627
1,10727091,2.0,3.0,7.0,1160.0,6037.0,2.0,34.188121,-118.646361,7688.0,261.0,60.371352,12447.0,3101.0,96342.0,0.0,1.0,5421.96,0.003,55.0,638
2,10730788,2.0,4.0,7.0,1570.0,6037.0,2.0,34.188446,-118.616724,11308.0,261.0,60.371351,12447.0,3101.0,96342.0,0.0,1.0,5097.78,-0.002,56.0,638
3,10735394,3.0,4.0,4.0,2863.0,6037.0,2.0,34.171439,-118.646883,16376.0,261.0,60.371373,12447.0,3101.0,96389.0,0.0,1.0,7475.21,0.0129,52.0,632
4,10743512,3.0,2.0,4.0,1394.0,6037.0,2.0,34.15289,-118.791494,77543.0,269.0,60.378003,34278.0,3101.0,96385.0,0.0,1.0,5550.36,0.063,17.0,769


### Train-Test split:

In [4]:
X = df.drop('logerror', axis=1)
y = df['logerror']
new_df = df.copy()
X.shape, y.shape

((70260, 20), (70260,))

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 100)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((52695, 20), (52695,), (17565, 20), (17565,))

### Feature Scaling:

In [6]:
train_vars = [var for var in X_train.columns if var not in ['parcelid', 'logerror']]
len(train_vars)

19

In [7]:
scaler = StandardScaler()

scaler.fit(X_train[train_vars]) 

X_train[train_vars] = scaler.transform(X_train[train_vars])

X_test[train_vars] = scaler.transform(X_test[train_vars])

In [8]:
X_train.head()

Unnamed: 0,parcelid,bathroomcnt,bedroomcnt,buildingqualitytypeid,finishedsquarefeet12,fips,heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet,propertylandusetypeid,rawcensustractandblock,regionidcity,regionidcounty,regionidzip,roomcnt,unitcnt,taxamount,yeardifference,propertyzoningdesc_labels
15602,10934030,1.039614,1.077319,-1.450606,1.368148,-0.672408,-0.526789,0.533872,-0.840819,-0.282457,-0.50568,-0.703341,-0.813301,0.672408,-1.208086,-0.439343,-0.081247,-0.954568,1.254891,-0.470974
27292,12116223,-1.441419,0.059583,0.629159,0.535918,-0.672408,-0.526789,0.674431,-0.036819,-0.267228,-0.50568,-0.671488,0.974128,0.672408,-0.60415,-0.439343,-0.081247,-0.737606,2.552634,1.68683
37134,11533593,-0.200903,0.059583,0.629159,-0.728171,-0.672408,1.921281,0.016094,-1.104821,-0.334003,-0.50568,-0.690187,-0.813301,0.672408,-1.213056,-0.439343,-0.081247,1.636555,0.779052,-0.470974
19660,13078527,-0.200903,1.077319,0.629159,-0.591716,-0.672408,-0.526789,0.122551,0.902323,-0.274704,-0.50568,-0.677385,0.333907,0.672408,-0.032524,-0.439343,-0.081247,-0.229807,0.173439,2.541898
19926,12163237,-0.200903,-0.958153,0.629159,-0.860129,-0.672408,-0.526789,0.764055,-0.126408,-0.311506,-0.50568,-0.671698,0.103886,0.672408,-0.671254,-0.439343,-0.081247,-0.688036,1.903762,0.161445


### Dropping parcelid:

In [9]:
X_train_new = X_train.copy()
X_test_new = X_test.copy()

X_train.drop(columns='parcelid', axis=1, inplace=True)
X_test.drop(columns='parcelid', axis=1, inplace=True)

In [10]:
# !pip install mlfoundry==0.2.8

In [13]:
import mlfoundry as mlf

In [14]:
from getpass import getpass
api_token = getpass("TrueFoundry API Token:")

In [15]:
mlf_api = mlf.get_client(api_key=api_token)

### Linear Regression Model:

In [16]:
linear_reg = LinearRegression()

linear_reg.fit(X_train, y_train)

LinearRegression()

In [17]:
linear_reg_pred = linear_reg.predict(X_test)

print('Mean Absolute Error : {}'.format(mean_absolute_error(y_test, linear_reg_pred)))
print()
print('Mean Squared Error : {}'.format(mean_squared_error(y_test, linear_reg_pred)))
print()
print('Root Mean Squared Error : {}'.format(sqrt(mean_squared_error(y_test, linear_reg_pred))))

Mean Absolute Error : 0.052712690099218626

Mean Squared Error : 0.007219378757545292

Root Mean Squared Error : 0.0849669274338274


In [18]:
mlf_run = mlf_api.create_run(project_name='home-value-prediction', run_name="reg-model")

mlf_run.log_model(linear_reg, mlf.ModelFramework.SKLEARN)

metrics_dict = {
    "mse": mean_squared_error(y_test, linear_reg_pred),
    "mae": mean_absolute_error(y_test, linear_reg_pred),
    "rmse": sqrt(mean_squared_error(y_test, linear_reg_pred))
}
mlf_run.log_metrics(metrics_dict)

2023-03-21 21:21:22.417 INFO    mlfoundry.mlfoundry_api: Run is created with id 0bc9359b7c2c4e1e80f7149e2811121e and name reg-model_2023-03-21_15-51-22_utc
2023-03-21 21:21:26.788 INFO    mlfoundry.mlfoundry_run: Model logged Successfully
2023-03-21 21:21:26.794 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully


In [19]:
import shap
explainer = shap.LinearExplainer(linear_reg, X_train)
shap_values = explainer.shap_values(X_test)

feature_columns = list(X_test.columns)
X_test_df = X_test.copy()
X_test_df['targets'] = y_test
X_test_df['predictions'] = linear_reg.predict(X_test)

# compute and log stats for test data with shap
mlf_run.log_dataset_stats(
    X_test_df, 
    data_slice=mlf.DataSlice.TEST,
    data_schema=mlf.Schema(
        feature_column_names=feature_columns,
        prediction_column_name="predictions",
        actual_column_name="targets",
    ),
    shap_values=shap_values,
    model_type=mlf.ModelType.REGRESSION,
)

2023-03-21 21:21:27.421 INFO    whylogs.app.config: No config file loaded


WARN: Missing config


2023-03-21 21:21:40.927 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully
2023-03-21 21:21:40.940 INFO    mlfoundry.mlfoundry_run: Dataset stats have been successfully computed and logged


In [23]:
mlf_run.log_dataset(
   X_test_df,
   data_slice=mlf.DataSlice.TEST
)

### Elastic Net Model:

In [24]:
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)

elastic_net.fit(X_train, y_train)

ElasticNet(alpha=0.1)

In [25]:
elastic_net_pred = elastic_net.predict(X_test)

print('Mean Absolute Error : {}'.format(mean_absolute_error(y_test, elastic_net_pred)))
print()
print('Mean Squared Error : {}'.format(mean_squared_error(y_test, elastic_net_pred)))
print()
print('Root Mean Squared Error : {}'.format(sqrt(mean_squared_error(y_test, elastic_net_pred))))

Mean Absolute Error : 0.052772037640842616

Mean Squared Error : 0.007263564946868809

Root Mean Squared Error : 0.08522655071554175


In [26]:
mlf_run = mlf_api.create_run(project_name='home-value-prediction', run_name="elasticnet-model")

mlf_run.log_model(elastic_net, mlf.ModelFramework.SKLEARN)

metrics_dict = {
    "mse": mean_squared_error(y_test, elastic_net_pred),
    "mae": mean_absolute_error(y_test, elastic_net_pred),
    "rmse": sqrt(mean_squared_error(y_test, elastic_net_pred))
}
mlf_run.log_metrics(metrics_dict)

2023-03-21 21:25:41.615 INFO    mlfoundry.mlfoundry_api: Run is created with id 80bbcbe05c484b6295948be93a6e1ec3 and name elasticnet-model_2023-03-21_15-55-41_utc
2023-03-21 21:25:44.749 INFO    mlfoundry.mlfoundry_run: Model logged Successfully
2023-03-21 21:25:44.765 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully


In [27]:
import shap
explainer = shap.LinearExplainer(elastic_net, X_train)
shap_values = explainer.shap_values(X_test)

feature_columns = list(X_test.columns)
X_test_df = X_test.copy()
X_test_df['targets'] = y_test
X_test_df['predictions'] = elastic_net.predict(X_test)

# compute and log stats for train data with shap
mlf_run.log_dataset_stats(
    X_test_df, 
    data_slice=mlf.DataSlice.TEST,
    data_schema=mlf.Schema(
        feature_column_names=feature_columns,
        prediction_column_name="predictions",
        actual_column_name="targets",
    ),
    shap_values=shap_values,
    model_type=mlf.ModelType.REGRESSION,
)

2023-03-21 21:26:05.766 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully
2023-03-21 21:26:05.782 INFO    mlfoundry.mlfoundry_run: Dataset stats have been successfully computed and logged


In [28]:
mlf_run.log_dataset(
    X_test_df,
    data_slice=mlf.DataSlice.TEST   
)

### Ridge Regression Model:

In [29]:
ridge_reg = Ridge(alpha=1, solver='cholesky')

ridge_reg.fit(X_train, y_train)

Ridge(alpha=1, solver='cholesky')

In [30]:
ridge_reg_pred = ridge_reg.predict(X_test)

print('Mean Absolute Error : {}'.format(mean_absolute_error(y_test, ridge_reg_pred)))
print()
print('Mean Squared Error : {}'.format(mean_squared_error(y_test, ridge_reg_pred)))
print()
print('Root Mean Squared Error : {}'.format(sqrt(mean_squared_error(y_test, ridge_reg_pred))))

Mean Absolute Error : 0.05271298978108193

Mean Squared Error : 0.007219087862614829

Root Mean Squared Error : 0.08496521560388598


In [31]:
mlf_run = mlf_api.create_run(project_name='home-value-prediction', run_name="ridge-model")

mlf_run.log_model(ridge_reg, mlf.ModelFramework.SKLEARN)

metrics_dict = {
    "mse": mean_squared_error(y_test, ridge_reg_pred),
    "mae": mean_absolute_error(y_test, ridge_reg_pred),
    "rmse": sqrt(mean_squared_error(y_test, ridge_reg_pred))
}
mlf_run.log_metrics(metrics_dict)

2023-03-21 21:26:42.458 INFO    mlfoundry.mlfoundry_api: Run is created with id 661f09769c694f71b481a870535274e8 and name ridge-model_2023-03-21_15-56-42_utc
2023-03-21 21:26:45.517 INFO    mlfoundry.mlfoundry_run: Model logged Successfully
2023-03-21 21:26:45.527 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully


In [32]:
import shap
explainer = shap.LinearExplainer(ridge_reg, X_train)
shap_values = explainer.shap_values(X_test)

feature_columns = list(X_test.columns)
X_test_df = X_test.copy()
X_test_df['targets'] = y_test
X_test_df['predictions'] = ridge_reg.predict(X_test)

# compute and log stats for train data with shap
mlf_run.log_dataset_stats(
    X_test_df, 
    data_slice=mlf.DataSlice.TEST,
    data_schema=mlf.Schema(
        feature_column_names=feature_columns,
        prediction_column_name="predictions",
        actual_column_name="targets",
    ),
    shap_values=shap_values,
    model_type=mlf.ModelType.REGRESSION,
)

2023-03-21 21:26:57.114 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully
2023-03-21 21:26:57.130 INFO    mlfoundry.mlfoundry_run: Dataset stats have been successfully computed and logged


In [33]:
mlf_run.log_dataset(
    X_test_df,
    data_slice=mlf.DataSlice.TEST   
)

### Lasso Regression Model:

In [34]:
lasso_reg = Lasso(alpha=0.1)

lasso_reg.fit(X_train, y_train)

Lasso(alpha=0.1)

In [35]:
lasso_reg_pred = lasso_reg.predict(X_test)

print('Mean Absolute Error : {}'.format(mean_absolute_error(y_test, lasso_reg_pred)))
print()
print('Mean Squared Error : {}'.format(mean_squared_error(y_test, lasso_reg_pred)))
print()
print('Root Mean Squared Error : {}'.format(sqrt(mean_squared_error(y_test, lasso_reg_pred))))

Mean Absolute Error : 0.052772037640842616

Mean Squared Error : 0.007263564946868809

Root Mean Squared Error : 0.08522655071554175


In [36]:
mlf_run = mlf_api.create_run(project_name='home-value-prediction', run_name="Lasso-model")

mlf_run.log_model(lasso_reg, mlf.ModelFramework.SKLEARN)

metrics_dict = {
    "mse": mean_squared_error(y_test, lasso_reg_pred),
    "mae": mean_absolute_error(y_test, lasso_reg_pred),
    "rmse": sqrt(mean_squared_error(y_test, lasso_reg_pred))
}
mlf_run.log_metrics(metrics_dict)

2023-03-21 21:27:12.189 INFO    mlfoundry.mlfoundry_api: Run is created with id ab38487637aa437a847f12a741efe506 and name Lasso-model
2023-03-21 21:27:15.165 INFO    mlfoundry.mlfoundry_run: Model logged Successfully
2023-03-21 21:27:15.173 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully


In [37]:
feature_columns = list(X_test.columns)
X_test_df = X_test.copy()
X_test_df['targets'] = y_test
X_test_df['predictions'] = lasso_reg.predict(X_test)

mlf_run.log_dataset(
    X_test_df,
    data_slice=mlf.DataSlice.TEST   
)

### XG Boost Regression Model:

In [38]:
xgb_reg = xgboost.XGBRegressor()

xgb_reg.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [39]:
xgb_reg_pred = xgb_reg.predict(X_test)

print('Mean Absolute Error : {}'.format(mean_absolute_error(y_test, xgb_reg_pred)))
print()
print('Mean Squared Error : {}'.format(mean_squared_error(y_test, xgb_reg_pred)))
print()
print('Root Mean Squared Error : {}'.format(sqrt(mean_squared_error(y_test, xgb_reg_pred))))

Mean Absolute Error : 0.05380005751526278

Mean Squared Error : 0.007353353803794397

Root Mean Squared Error : 0.08575169854757629


In [40]:
mlf_run = mlf_api.create_run(project_name='home-value-prediction', run_name="XGB-model")

mlf_run.log_model(xgb_reg, mlf.ModelFramework.SKLEARN)

metrics_dict = {
    "mse": mean_squared_error(y_test, xgb_reg_pred),
    "mae": mean_absolute_error(y_test, xgb_reg_pred),
    "rmse": sqrt(mean_squared_error(y_test, xgb_reg_pred))
}
mlf_run.log_metrics(metrics_dict)

2023-03-21 21:27:24.540 INFO    mlfoundry.mlfoundry_api: Run is created with id 5b0dbe7f9e914127a13a614642c383aa and name XGB-model
2023-03-21 21:27:27.581 INFO    mlfoundry.mlfoundry_run: Model logged Successfully
2023-03-21 21:27:27.591 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully


In [41]:
import shap
explainer = shap.TreeExplainer(xgb_reg, X_train)
shap_values = explainer.shap_values(X_test)

feature_columns = list(X_test.columns)
X_test_df = X_test.copy()
X_test_df['targets'] = y_test
X_test_df['predictions'] = xgb_reg.predict(X_test)

# compute and log stats for train data with shap
mlf_run.log_dataset_stats(
    X_test_df, 
    data_slice=mlf.DataSlice.TEST,
    data_schema=mlf.Schema(
        feature_column_names=feature_columns,
        prediction_column_name="predictions",
        actual_column_name="targets",
    ),
    shap_values=shap_values,
    model_type=mlf.ModelType.REGRESSION,
)

2023-03-21 21:28:23.325 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully
2023-03-21 21:28:23.341 INFO    mlfoundry.mlfoundry_run: Dataset stats have been successfully computed and logged


In [42]:
mlf_run.log_dataset(
    X_test_df,
    data_slice=mlf.DataSlice.TEST   
)

### Ada Boost Regression Model:

In [43]:
adaboost_reg = AdaBoostRegressor()

adaboost_reg.fit(X_train, y_train)

AdaBoostRegressor()

In [44]:
adaboost_reg_pred = adaboost_reg.predict(X_test)

print('Mean Absolute Error : {}'.format(mean_absolute_error(y_test, adaboost_reg_pred)))
print()
print('Mean Squared Error : {}'.format(mean_squared_error(y_test, adaboost_reg_pred)))
print()
print('Root Mean Squared Error : {}'.format(sqrt(mean_squared_error(y_test, adaboost_reg_pred))))

Mean Absolute Error : 0.05603821043988978

Mean Squared Error : 0.007465795161402974

Root Mean Squared Error : 0.08640483297479935


In [45]:
mlf_run = mlf_api.create_run(project_name='home-value-prediction', run_name='adaboost-model')

mlf_run.log_model(adaboost_reg, mlf.ModelFramework.SKLEARN)

metrics_dict = {
    "mse": mean_squared_error(y_test, adaboost_reg_pred),
    "mae": mean_absolute_error(y_test, adaboost_reg_pred),
    "rmse": sqrt(mean_squared_error(y_test, adaboost_reg_pred))
}
mlf_run.log_metrics(metrics_dict)

2023-03-21 21:28:28.091 INFO    mlfoundry.mlfoundry_api: Run is created with id 6ee483d783b44f9a96ca692a060f3ce2 and name adaboost-model
2023-03-21 21:28:31.741 INFO    mlfoundry.mlfoundry_run: Model logged Successfully
2023-03-21 21:28:31.747 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully


In [46]:
feature_columns = list(X_test.columns)
X_test_df = X_test.copy()
X_test_df['targets'] = y_test
X_test_df['predictions'] = adaboost_reg.predict(X_test)

# compute and log stats for train data with shap
mlf_run.log_dataset_stats(
    X_test_df, 
    data_slice=mlf.DataSlice.TEST,
    data_schema=mlf.Schema(
        feature_column_names=feature_columns,
        prediction_column_name="predictions",
        actual_column_name="targets",
    ),
    model_type=mlf.ModelType.REGRESSION,
)

2023-03-21 21:28:46.539 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully
2023-03-21 21:28:46.552 INFO    mlfoundry.mlfoundry_run: Dataset stats have been successfully computed and logged


In [47]:
mlf_run.log_dataset(
    X_test_df,
    data_slice=mlf.DataSlice.TEST   
)

### Gradient Boosting Regression Model:

In [48]:
gb_reg = GradientBoostingRegressor()

gb_reg.fit(X_train, y_train)

GradientBoostingRegressor()

In [49]:
gb_reg_pred = gb_reg.predict(X_test)

print('Mean Absolute Error : {}'.format(mean_absolute_error(y_test, gb_reg_pred)))
print()
print('Mean Squared Error : {}'.format(mean_squared_error(y_test, gb_reg_pred)))
print()
print('Root Mean Squared Error : {}'.format(sqrt(mean_squared_error(y_test, gb_reg_pred))))

Mean Absolute Error : 0.05256414057499189

Mean Squared Error : 0.007178670266147259

Root Mean Squared Error : 0.08472703385665793


In [50]:
mlf_run = mlf_api.create_run(project_name='home-value-prediction', run_name='GB-model')


mlf_run.log_model(gb_reg, mlf.ModelFramework.SKLEARN)

metrics_dict = {
    "mse": mean_squared_error(y_test, gb_reg_pred),
    "mae": mean_absolute_error(y_test, gb_reg_pred),
    "rmse": sqrt(mean_squared_error(y_test, gb_reg_pred))
}
mlf_run.log_metrics(metrics_dict)

2023-03-21 21:29:21.888 INFO    mlfoundry.mlfoundry_api: Run is created with id f06d2679b80c4ea1a208041c746f031e and name GB-model
2023-03-21 21:29:25.740 INFO    mlfoundry.mlfoundry_run: Model logged Successfully
2023-03-21 21:29:25.756 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully


In [51]:
feature_columns = list(X_test.columns)
X_test_df = X_test.copy()
X_test_df['targets'] = y_test
X_test_df['predictions'] = gb_reg.predict(X_test)

# compute and log stats for train data with shap
mlf_run.log_dataset_stats(
    X_test_df, 
    data_slice=mlf.DataSlice.TEST,
    data_schema=mlf.Schema(
        feature_column_names=feature_columns,
        prediction_column_name="predictions",
        actual_column_name="targets",
    ),
    model_type=mlf.ModelType.REGRESSION,
)

2023-03-21 21:29:48.675 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully
2023-03-21 21:29:48.692 INFO    mlfoundry.mlfoundry_run: Dataset stats have been successfully computed and logged


In [52]:
mlf_run.log_dataset(
    X_test_df,
    data_slice=mlf.DataSlice.TEST   
)

### Decision Tree Regressor:

In [53]:
tree_reg = DecisionTreeRegressor(max_depth=5)

tree_reg.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=5)

In [54]:
tree_reg_pred = tree_reg.predict(X_test)

print('Mean Absolute Error : {}'.format(mean_absolute_error(y_test, tree_reg_pred)))
print()
print('Mean Squared Error : {}'.format(mean_squared_error(y_test, tree_reg_pred)))
print()
print('Root Mean Squared Error : {}'.format(sqrt(mean_squared_error(y_test, tree_reg_pred))))

Mean Absolute Error : 0.05290321940281464

Mean Squared Error : 0.007258230552748901

Root Mean Squared Error : 0.08519524959027294


### Random Forest Regression Model:

In [55]:
forest_reg = RandomForestRegressor(n_estimators= 500, max_depth=6)

forest_reg.fit(X_train, y_train)

RandomForestRegressor(max_depth=6, n_estimators=500)

In [56]:
forest_reg_pred = forest_reg.predict(X_test)

print('Mean Absolute Error : {}'.format(mean_absolute_error(y_test, forest_reg_pred)))
print()
print('Mean Squared Error : {}'.format(mean_squared_error(y_test, forest_reg_pred)))
print()
print('Root Mean Squared Error : {}'.format(sqrt(mean_squared_error(y_test, forest_reg_pred))))

Mean Absolute Error : 0.05248509061419572

Mean Squared Error : 0.007175566884175688

Root Mean Squared Error : 0.08470871787588151


In [57]:
mlf_run = mlf_api.create_run(project_name='home-value-prediction', run_name="random-forest")

mlf_run.log_model(forest_reg, mlf.ModelFramework.SKLEARN)

metrics_dict = {
    "mse": mean_squared_error(y_test, forest_reg_pred),
    "mae": mean_absolute_error(y_test, forest_reg_pred),
    "rmse": sqrt(mean_squared_error(y_test, forest_reg_pred))
}
mlf_run.log_metrics(metrics_dict)

2023-03-21 21:31:40.071 INFO    mlfoundry.mlfoundry_api: Run is created with id ad47a6e357624ce0a857776c10aca117 and name random-forest
2023-03-21 21:31:42.158 INFO    mlfoundry.mlfoundry_run: Model logged Successfully
2023-03-21 21:31:42.165 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully


In [58]:
feature_columns = list(X_test.columns)
X_test_df = X_test.copy()
X_test_df['targets'] = y_test
X_test_df['predictions'] = forest_reg.predict(X_test)

# compute and log stats for train data with shap
mlf_run.log_dataset_stats(
    X_test_df, 
    data_slice=mlf.DataSlice.TEST,
    data_schema=mlf.Schema(
        feature_column_names=feature_columns,
        prediction_column_name="predictions",
        actual_column_name="targets",
    ),
    model_type=mlf.ModelType.REGRESSION,
)

2023-03-21 21:31:48.756 INFO    mlfoundry.mlfoundry_run: Metrics logged successfully
2023-03-21 21:31:48.760 INFO    mlfoundry.mlfoundry_run: Dataset stats have been successfully computed and logged


In [59]:
mlf_run.log_dataset(
    X_test_df,
    data_slice=mlf.DataSlice.TEST   
)

### Cross Validation & Hyperparameter Optimization:

In [60]:
scores = cross_val_score(forest_reg, 
                         X_train, 
                         y_train, 
                         scoring="neg_mean_squared_error", 
                         cv = 5)

In [61]:
forest_reg_rmse_scores = np.sqrt(-scores)
forest_reg_rmse_scores

array([0.0860324 , 0.08502352, 0.08290707, 0.08320319, 0.08450128])

In [62]:
param_grid = [
    {'n_estimators': [300, 400, 500], 'max_features': [2, 4, 6]},
    {'bootstrap': [False], 'n_estimators': [300, 400, 500], 'max_features': [2, 4, 6]}
]

forest_regressor = RandomForestRegressor()

grid_search = GridSearchCV(forest_regressor, 
                           param_grid, 
                           scoring='neg_mean_squared_error',
                           return_train_score=True,
                           cv=3)


In [63]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6],
                          'n_estimators': [300, 400, 500]},
                         {'bootstrap': [False], 'max_features': [2, 4, 6],
                          'n_estimators': [300, 400, 500]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [64]:
grid_search.best_params_

{'max_features': 4, 'n_estimators': 500}

In [65]:
grid_search.best_estimator_

RandomForestRegressor(max_features=4, n_estimators=500)

In [66]:
final_predictor = grid_search.best_estimator_
final_predictor.fit(X_train, y_train)
final_pred = final_predictor.predict(X_test)

In [67]:
print('Mean Absolute Error : {}'.format(mean_absolute_error(y_test, final_pred)))
print()
print('Mean Squared Error : {}'.format(mean_squared_error(y_test, final_pred)))
print()
print('Root Mean Squared Error : {}'.format(sqrt(mean_squared_error(y_test, final_pred))))

Mean Absolute Error : 0.054792638213956746

Mean Squared Error : 0.007390612663187975

Root Mean Squared Error : 0.08596867256848843


### Checking feature importance:

In [68]:
feature_importances = grid_search.best_estimator_.feature_importances_

attrs = list(df.select_dtypes(include = ['float64','int64']))

sorted(zip(attrs, feature_importances), reverse=True)

[('unitcnt', 0.1405796605866549),
 ('taxamount', 0.09128268057201341),
 ('roomcnt', 0.0016133327012335774),
 ('regionidzip', 0.008412441233926302),
 ('regionidcounty', 0.058979559230169606),
 ('regionidcity', 0.00100211583811888),
 ('rawcensustractandblock', 0.030770038344498478),
 ('propertylandusetypeid', 0.07910219730317315),
 ('parcelid', 0.023506265320558814),
 ('lotsizesquarefeet', 0.008558020811701525),
 ('longitude', 0.11061633790557397),
 ('logerror', 0.0490322535353873),
 ('latitude', 0.11485804311509559),
 ('heatingorsystemtypeid', 0.11520666308888161),
 ('fips', 0.009847493676435237),
 ('finishedsquarefeet12', 0.001003430812685474),
 ('buildingqualitytypeid', 0.11678447061062543),
 ('bedroomcnt', 0.008646739868210487),
 ('bathroomcnt', 0.030198255445056114)]

### Saving Predictions:

In [69]:
model_pred = pd.DataFrame({'parcelid':X_test_new.parcelid, 'logerror':final_pred})
model_pred.to_csv('model-predictions.csv',index=False)
model_pred.head()

Unnamed: 0,parcelid,logerror
11056,11923769,0.01474
20399,14196198,0.005035
51034,12408039,0.005489
41560,10931254,0.02139
69896,14665147,-0.036781
