<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#model1" data-toc-modified-id="model1-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>model1</a></span></li><li><span><a href="#Feature-Selection-for-Random-Forest" data-toc-modified-id="Feature-Selection-for-Random-Forest-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Feature Selection for Random Forest</a></span></li><li><span><a href="#Random-forest-with-polynomial-features" data-toc-modified-id="Random-forest-with-polynomial-features-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Random forest with polynomial features</a></span></li><li><span><a href="#Best-Linear-Regression-Script" data-toc-modified-id="Best-Linear-Regression-Script-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Best Linear Regression Script</a></span></li></ul></div>

# model1

In [37]:
%%writefile ../models/regression_modelling_final_random_forest.py
#!/usr/bin/env python

__doc__ = """
Author: Bhishan Poudel

Task
-------------------
Regression modelling of King Country Seattle house price estimation.

Model used
-------------------------
Random forest 
n_estimators = 50
max_depth = 50
topN features = 40

Result:
---------------------------
Adjusted R-Squared (test): 0.890


"""
#=============================================================================
# Imports
#=============================================================================
import numpy as np
import pandas as pd

import os
import time
import collections
import itertools

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# random state
random_state=100
np.random.seed=random_state
np.random.set_state=random_state


#=============================================================================
# Utilities
#=============================================================================
def multiple_linear_regression(df,features,target,model,
                               verbose=1,cv=5,test_size=0.3):
    """ Multiple Linear Regression Modelling using given model.
    
    Depends:
    Depends on function adjusted r-squared.
    
    
    Returns:
    rmse, r2_train, ar2_train, r2_test, ar2_test, cv
    """
    def adjustedR2(rsquared,nrows,kcols):
        return rsquared- (kcols-1)/(nrows-kcols) * (1-rsquared)

    
    # train test split
    train, test = train_test_split(df, test_size=0.2, random_state=100)

    # train test values
    X = df[features].values
    y = df[target].values.ravel()

    Xtrain = train[features].values
    ytrain = train[target].values.ravel()

    Xtest = test[features].values
    ytest = test[target].values.ravel()
    
    # fitting
    model.fit(Xtrain,ytrain)

    # prediction
    ypreds = model.predict(Xtest)

    # metrics
    rmse = np.sqrt(mean_squared_error(ytest,ypreds)).round(3)
    r2_train = model.score(Xtrain, ytrain).round(3)
    r2_test = model.score(Xtest, ytest).round(3)

    cv = cross_val_score(model, X, y, cv=5,n_jobs=-1,
                         verbose=verbose).mean().round(3)

    ar2_train = adjustedR2(model.score(Xtrain,ytrain),
                           Xtrain.shape[0],
                           len(features)).round(3)
    ar2_test  = adjustedR2(model.score(Xtest,ytest),
                           Xtest.shape[0] ,
                           len(features)).round(3)
    
    return (model, rmse, r2_train, ar2_train, r2_test, ar2_test, cv)


df_eval = pd.DataFrame({'Model': [],
                           'Details':[],
                           'Root Mean Squared Error (RMSE)':[],
                           'R-squared (training)':[],
                           'Adjusted R-squared (training)':[],
                           'R-squared (test)':[],
                           'Adjusted R-squared (test)':[],
                           '5-Fold Cross Validation':[]})

t0 = time.time()
#=============================================================================
# Data Loading
#=============================================================================
# load the data
df_raw = pd.read_csv('../data/processed/data_cleaned_encoded.csv')

#=============================================================================
# Train test split
#=============================================================================
# train test split
train, test = train_test_split(df_raw,train_size = 0.8,random_state=random_state)

#=============================================================================
# Feature Selection
#=============================================================================
# feature selection
features_orig = ['bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15']

cols_num = ['bedrooms', 'bathrooms',
            'sqft_living', 'sqft_lot','sqft_above','sqft_basement',
            'yr_built', 'yr_renovated',
           'lat','long',
           'sqft_living15', 'sqft_lot15', 'yr_sales']

cols_bool = ['basement_bool', 'renovation_bool']

cols_new = ['zipcode_houses']

cols_cat = [
    # waterfront
    'waterfront_0', 'waterfront_1',
    
    #view
    'view_0', 'view_1', 'view_2', 'view_3','view_4',
    
    # condition
    'condition_1', 'condition_2', 'condition_3',
    'condition_4','condition_5',
    
    # grade
    'grade_1', 'grade_10', 'grade_11', 'grade_12','grade_13',
    'grade_3', 'grade_4', 'grade_5', 'grade_6', 'grade_7','grade_8', 'grade_9',
            
    # zipcode
    'zipcode_top10_98004', 'zipcode_top10_98006',
    'zipcode_top10_98033', 'zipcode_top10_98039',
    'zipcode_top10_98040','zipcode_top10_98102',
    'zipcode_top10_98105', 'zipcode_top10_98155',
    'zipcode_top10_98177']



cols_cat_age = [ 'age_cat_0', 'age_cat_1', 'age_cat_2',
                               'age_cat_3', 'age_cat_4', 'age_cat_5',
                               'age_cat_6', 'age_cat_7', 'age_cat_8',
                               'age_cat_9']

cols_cat_agernv = [
                'age_after_renovation_cat_0','age_after_renovation_cat_1',
                'age_after_renovation_cat_2', 'age_after_renovation_cat_3',
                'age_after_renovation_cat_4', 'age_after_renovation_cat_5',
                'age_after_renovation_cat_6', 'age_after_renovation_cat_7',
                'age_after_renovation_cat_8', 'age_after_renovation_cat_9']

features_all_encoded = cols_num + cols_bool + cols_new + cols_cat + cols_cat_age + cols_cat_agernv
target = ['price']

#=============================================================================
# Random Forest All encoded features after grid search best model
#=============================================================================
target = ['price']
features = features_all_encoded
df = df_raw[features + target]

model = RandomForestRegressor(n_estimators= 40,random_state=random_state,
                              max_features=69,
                              max_depth=50, bootstrap=True)

fitted_model, rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
    multiple_linear_regression(df, features, target,model,
                               verbose=2,test_size=0.2)


df_eval.loc[len(df_eval)] = ['Random Forest Regressor after grid search',
                             ' all encoded features, best grid search,\
                             n_estimators=40, max_features=69, max_depth=50',
                             rmse,r2_train,ar2_train,
                             r2_test,ar2_test,cv]

#=============================================================================
# Random Forest Feature Importance
#=============================================================================
importances = fitted_model.feature_importances_
df_imp = pd.DataFrame({'feature': features, 'importance': importances})
topN = 40
top_cols = df_imp.head(topN)['feature'].values.tolist()

features = top_cols
target = ['price']

df = df_raw[features + target]

model = RandomForestRegressor(n_estimators= 50,random_state=random_state,
                              max_depth=50, bootstrap=True)

fitted_model, rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
    multiple_linear_regression(df, features, target,model,
                               verbose=2,test_size=0.2)

df_eval.loc[len(df_eval)] = ['Random Forest Regressor',
                             'n_estimators = 50, max_depth = 50,\
                             topN features = '+str(topN),
                             rmse,r2_train,ar2_train,
                             r2_test,ar2_test,cv]

#=============================================================================
# Print Results
#=============================================================================
print('Features used:\n', df.columns.values)
print()
for k,v in df_eval.to_dict().items():
    print(k, ':', v)

t1 = time.time() - t0
print('\n\nTime taken: {:.0f} min {:.0f} secs'.format(*divmod(t1,60)))

Overwriting ../models/regression_modelling_final_random_forest.py


In [38]:
! /Users/poudel/miniconda3/envs/dataSc/bin/python ../models/regression_modelling_final_random_forest.py

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total=   9.4s
[CV]  ................................................................
[CV] ................................................. , total=   9.6s
[CV] ................................................. , total=   9.6s
[CV] ................................................. , total=   9.6s
[CV] ................................................. , total=   5.7s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   16.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[CV]  .............................................................

# Feature Selection for Random Forest
Feature selection, log transform and scaling did not help increasing the
R-squared score for random forest. Just the plain features do the best.
Hyperparameter tuning is important but not the feature selection.

In [8]:
# %%writefile ../models/regression_modelling_final_random_forest_attempts.py
#!/usr/bin/env python

__doc__ = """
Author: Bhishan Poudel

Task: Regression modelling of King Country Seattle house price estimation.

Model used: Random forest with n_estimators = 49
  

adjusted r-squared
-------------------
num + nologs + cats: 0.886 (plain)
num + nologs + cats_encoded : 0.883  (ENCODING IS BAD)
num + nologs + cats_age + cats_agernv: 0.847
num + nologs + cats_age + cats_agernv + cats :0.885

"""

# Imports
import numpy as np
import pandas as pd

import os
import time
import collections
import itertools

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score


def remove_outliers(df):
    df = df.drop(df[df["bedrooms"]>=10].index )
    df = df.drop(df[df["bathrooms"]>=7].index )
    df = df.drop(df[df["grade"].isin([3,1])].index )
    
    # we must reset index after removing outliers
    df = df.reset_index(drop=True)
    return df


def standard_scaling(df):
    from sklearn.preprocessing import StandardScaler

    ss = StandardScaler()
    array_scaled_feat = ss.fit_transform(df.drop('price',axis=1))
    df_feat = pd.DataFrame(array_scaled_feat,
                           columns = df.drop('price',axis=1).columns)
    df = pd.concat([df_feat, df[target]], axis=1)

    return df


def adjustedR2(rsquared,nrows,kcols):
    return rsquared- (kcols-1)/(nrows-kcols) * (1-rsquared)


def multiple_linear_regression(df,features,target,model,
                               verbose=1,cv=5,test_size=0.3):
    """ Multiple Linear Regression Modelling using given model.
    
    Depends:
    Depends on function adjusted r-squared.
    
    
    Returns:
    rmse, r2_train, ar2_train, r2_test, ar2_test, cv
    """
    
    # train test split
    train, test = train_test_split(df, test_size=test_size, random_state=100)

    # train test values
    X = df[features].values
    y = df[target].values.ravel()

    Xtrain = train[features].values
    ytrain = train[target].values.ravel()

    Xtest = test[features].values
    ytest = test[target].values.ravel()
    
    # fitting
    model.fit(Xtrain,ytrain)

    # prediction
    ypreds = model.predict(Xtest)

    # metrics
    rmse = np.sqrt(mean_squared_error(ytest,ypreds)).round(3)
    r2_train = model.score(Xtrain, ytrain).round(3)
    r2_test = model.score(Xtest, ytest).round(3)

    cv = cross_val_score(model, X, y, cv=5,n_jobs=-1,
                         verbose=verbose).mean().round(3)

    ar2_train = adjustedR2(model.score(Xtrain,ytrain),
                           Xtrain.shape[0],
                           len(features)).round(3)
    ar2_test  = adjustedR2(model.score(Xtest,ytest),
                           Xtest.shape[0] ,
                           len(features)).round(3)
    
    return (rmse, r2_train, ar2_train, r2_test, ar2_test, cv)


df_eval = pd.DataFrame({'Model': [],
                           'Details':[],
                           'Root Mean Squared Error (RMSE)':[],
                           'R-squared (training)':[],
                           'Adjusted R-squared (training)':[],
                           'R-squared (test)':[],
                           'Adjusted R-squared (test)':[],
                           '5-Fold Cross Validation':[]})
#-----------------------------------------------------------------------------
if __name__ == '__main__':

    t0 = time.time()
     
    # load the data
    df = pd.read_csv('../data/processed/data_cleaned_encoded.csv')
    
    
    target = ['price']

    # plain features
    features_num = ['bedrooms', 'bathrooms',  'yr_built', 'lat', 'long']
    features_cat = ['waterfront', 'view', 'condition', 'grade','zipcode']
    features_no_log = ['sqft_living','sqft_lot','sqft_above',
                       'sqft_basement','sqft_living15','sqft_lot15']
    
    
    # log
    features_log = ['log1p_sqft_living','log1p_sqft_lot',
                    'log1p_sqft_above','log1p_sqft_basement',
                    'log1p_sqft_living15','log1p_sqft_lot15']

    # categorical encoding
    features_cat_age = [ 'age_cat_0', 'age_cat_1', 'age_cat_2',
                         'age_cat_3', 'age_cat_4', 'age_cat_5',
                         'age_cat_6', 'age_cat_7', 'age_cat_8',
                         'age_cat_9']

    feature_cat_agernv = [
                    'age_after_renovation_cat_0','age_after_renovation_cat_1',
                    'age_after_renovation_cat_2', 'age_after_renovation_cat_3',
                    'age_after_renovation_cat_4', 'age_after_renovation_cat_5',
                    'age_after_renovation_cat_6', 'age_after_renovation_cat_7',
                    'age_after_renovation_cat_8', 'age_after_renovation_cat_9']
    
    # newly created boolean features
    features_bool = ['basement_bool', 'renovation_bool']
    
    # newly created number of houses in given zipcode
    features_zipcode_extra = ['zipcode_houses']
    
    
    # all categorical features encoded.
    features_cat_encoded = [
        # waterfront
        'waterfront_0', 'waterfront_1',
        # view
        'view_0', 'view_1', 'view_2','view_3','view_4',
        # condition
        'condition_1', 'condition_2','condition_3', 'condition_4',
        'condition_5',
        # grade
        'grade_1', 'grade_10', 'grade_11', 'grade_12',
        'grade_13', 'grade_3', 'grade_4', 'grade_5', 'grade_6', 'grade_7',
        'grade_8', 'grade_9',
        # zipcode
        'zipcode_top10_98004', 'zipcode_top10_98006','zipcode_top10_98033',
        'zipcode_top10_98039', 'zipcode_top10_98040','zipcode_top10_98102',
        'zipcode_top10_98105', 'zipcode_top10_98155','zipcode_top10_98177',
        # age
        'age_cat_0', 'age_cat_1', 'age_cat_2','age_cat_3', 'age_cat_4',
        'age_cat_5', 'age_cat_6', 'age_cat_7','age_cat_8', 'age_cat_9',
        # age after renovation
        'age_after_renovation_cat_0',
        'age_after_renovation_cat_1', 'age_after_renovation_cat_2',
        'age_after_renovation_cat_3', 'age_after_renovation_cat_4',
        'age_after_renovation_cat_5', 'age_after_renovation_cat_6',
        'age_after_renovation_cat_7', 'age_after_renovation_cat_8',
        'age_after_renovation_cat_9']

    
    features = features_num + features_no_log + features_cat
    df = df[features + target]
    
    # options
    use_scaling = True
    use_remove_outliers = False
  
    text = "use_scaling = {}, remove_outliers = {} ".format(
        use_scaling, use_remove_outliers)
        
    if use_scaling:
        df = standard_scaling(df)
        
    if use_remove_outliers:
        df = remove_outliers(df)

    model = RandomForestRegressor(n_estimators= 50,random_state=100)
    rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
        multiple_linear_regression(df, features, target,model,
                                   verbose=0,test_size=0.2)


    df_eval.loc[len(df_eval)] = ['Random Forest Regressor',
                                 text, rmse,r2_train,ar2_train,
                                 r2_test,ar2_test,cv]
    print( 'used features\n', df.columns.values)
    print()
    for k,v in df_eval.to_dict().items():
        print(k, ':', v[0])

    t1 = time.time() - t0
    print('\n\nTime taken: {:.0f} min {:.0f} secs'.format(*divmod(t1,60)))

['bedrooms' 'bathrooms' 'yr_built' 'lat' 'long' 'sqft_living' 'sqft_lot'
 'sqft_above' 'sqft_basement' 'sqft_living15' 'sqft_lot15' 'waterfront'
 'view' 'condition' 'grade' 'zipcode' 'price']
Model : {0: 'Random Forest Regressor'}
Details : {0: 'use_scaling = True, remove_outliers = False '}
Root Mean Squared Error (RMSE) : {0: 123663.449}
R-squared (training) : {0: 0.981}
Adjusted R-squared (training) : {0: 0.981}
R-squared (test) : {0: 0.887}
Adjusted R-squared (test) : {0: 0.886}
5-Fold Cross Validation : {0: 0.873}


Time taken: 0 min 19 secs


# Random forest with polynomial features

Polynomial featrues did not increase the r2-score for random forest method.

In [21]:
# %%writefile ../models/regression_modelling_final_random_forest.py
#!/usr/bin/env python

__doc__ = """
Author: Bhishan Poudel

Task: Regression modelling of King Country Seattle house price estimation.

Model used: Random forest with n_estimators = 49

Note:
- In random forest log transform, and scaling are not much important.

adjusted r-squared
-------------------
0.883 No change with polynomial features.

"""

# Imports
import numpy as np
import pandas as pd

import os
import time
import collections
import itertools

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score


def multiple_linear_regression(df, features, target,model,
                               test_size=0.2, cv=5,verbose=0,deg=2):
    """ Multiple Linear Regression Modelling using given model.
    
    
    Returns:
    rmse, r2_train, ar2_train, r2_test, ar2_test, cv
    """
    def adjustedR2(rsquared,nrows,kcols):
        return rsquared- (kcols-1)/(nrows-kcols) * (1-rsquared)
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import cross_val_score
    
    # train test split
    train, test = train_test_split(df, test_size=test_size, random_state=100)
    
    polyfeat = PolynomialFeatures(degree=deg)

    X = polyfeat.fit_transform(df[features])
    y = df[target].values.ravel()
    
    Xtrain = polyfeat.fit_transform(train[features])
    Xtest = polyfeat.fit_transform(test[features])

    
    ytrain = train[target].values.ravel()
    ytest = test[target].values.ravel()

    
    # fitting
    model.fit(Xtrain,ytrain)

    # prediction
    ypreds = model.predict(Xtest)

    # metrics
    rmse = np.sqrt(mean_squared_error(ytest,ypreds)).round(3)
    r2_train = model.score(Xtrain, ytrain).round(3)
    r2_test = model.score(Xtest, ytest).round(3)

    cv = cross_val_score(model, X, y, cv=5,n_jobs=-1,
                         verbose=verbose).mean().round(3)

    ar2_train = adjustedR2(model.score(Xtrain,ytrain),
                           Xtrain.shape[0],
                           len(features)).round(3)
    ar2_test  = adjustedR2(model.score(Xtest,ytest),
                           Xtest.shape[0] ,
                           len(features)).round(3)
    
    return (rmse, r2_train, ar2_train, r2_test, ar2_test, cv)


if __name__ == '__main__':

    t0 = time.time()
     
    # load the data
    df = pd.read_csv('../data/raw/kc_house_data.csv')
    
    df.drop(['id','date'],axis=1,inplace=True)
    
    
    target = ['price']
    features = df.drop(['price'],axis=1).columns
    

    model = RandomForestRegressor(n_estimators= 50,random_state=100)
    rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
        multiple_linear_regression(df, features, target,model,
                                   verbose=0,test_size=0.2,deg=2)


    df_eval.loc[len(df_eval)] = ['Random Forest Regressor',
                                 'polynomial features deg=2, all raw features',
                                 rmse,r2_train,ar2_train,
                                 r2_test,ar2_test,cv]

    for k,v in df_eval.to_dict().items():
        print(k, ':', v[0])

    t1 = time.time() - t0
    print('\n\nTime taken: {:.0f} min {:.0f} secs'.format(*divmod(t1,60)))

Model : Polynomial Regression
Details : deg=2, all features,                              unprocessed, no regularization
Root Mean Squared Error (RMSE) : 158822.055
R-squared (training) : 0.831
Adjusted R-squared (training) : 0.831
R-squared (test) : 0.813
Adjusted R-squared (test) : 0.812
5-Fold Cross Validation : 0.813


Time taken: 3 min 50 secs


# Best Linear Regression Script

In [17]:
# %%writefile ../models/regression_modelling_polynomial_regression.py
#!/usr/bin/env python

__doc__ = """
Author: Bhishan Poudel

Task: Regression modelling of King Country Seattle house price estimation.

Model used: Polynomial regression deg=2 only raw features

Result:
--------
Adjusted R-squared (test) :  0.813

"""
#=============================================================================
# Imports
#=============================================================================
import numpy as np
import pandas as pd
import os
import time

# random state
random_state=100
np.random.seed=random_state
np.random.set_state=random_state


# scale and split
from sklearn.model_selection import train_test_split

# regressors
from sklearn import linear_model
from sklearn.linear_model import LinearRegression

# regressor preprocessing
from sklearn.preprocessing import PolynomialFeatures

# metrics
from sklearn import metrics
from sklearn.metrics import mean_squared_error


features_raw_all = ['bedrooms','bathrooms','sqft_living','sqft_lot',
                    'floors','waterfront','view','condition','grade',
                    'sqft_above','yr_built','yr_renovated',
                    'zipcode','lat','long','sqft_living15','sqft_lot15']

# cross validation
from sklearn.model_selection import cross_val_score

df_eval = pd.DataFrame({'Model': [],
                           'Details':[],
                           'Root Mean Squared Error (RMSE)':[],
                           'R-squared (training)':[],
                           'Adjusted R-squared (training)':[],
                           'R-squared (test)':[],
                           'Adjusted R-squared (test)':[],
                           '5-Fold Cross Validation':[]})
#=============================================================================
# Data Loading
#=============================================================================
# load the data
df = pd.read_csv('../data/processed/data_cleaned_encoded.csv')

#=============================================================================
# Train test split
#=============================================================================
# train test split
train, test = train_test_split(df,train_size = 0.8,random_state=random_state)

#=============================================================================
# Feature Selection
#=============================================================================
# feature selection
target = ['price']
features_raw_all = ['bedrooms','bathrooms','sqft_living','sqft_lot',
                    'floors','waterfront','view','condition','grade',
                    'sqft_above','yr_built','yr_renovated',
                    'zipcode','lat','long','sqft_living15','sqft_lot15']

features = features_raw_all


polyfeat = PolynomialFeatures(degree=2)

X = polyfeat.fit_transform(df[features])

Xtrain = polyfeat.fit_transform(train[features])
Xtest = polyfeat.fit_transform(test[features])

y = df[target].values.reshape(-1,1)
ytrain = train[target].values.reshape(-1,1)
ytest = test[target].values.reshape(-1,1)


#=============================================================================
# Modelling
#=============================================================================
model = linear_model.LinearRegression(n_jobs=-1)

# modelling
def multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest,cv=5,verbose=0):
    """ Multiple Linear Regression Modelling using given model.
    
    
    Returns:
    rmse, r2_train, ar2_train, r2_test, ar2_test, cv
    """
    def adjustedR2(rsquared,nrows,kcols):
        return rsquared- (kcols-1)/(nrows-kcols) * (1-rsquared)
    
    # fitting
    model.fit(Xtrain,ytrain)

    # prediction
    ypreds = model.predict(Xtest)

    # metrics
    rmse = np.sqrt(mean_squared_error(ytest,ypreds)).round(3)
    r2_train = model.score(Xtrain, ytrain).round(3)
    r2_test = model.score(Xtest, ytest).round(3)

    cv = cross_val_score(model, X, y, cv=5,n_jobs=-1,verbose=verbose).mean().round(3)

    ar2_train = adjustedR2(model.score(Xtrain,ytrain),
                           Xtrain.shape[0],
                           len(features)).round(3)
    ar2_test  = adjustedR2(model.score(Xtest,ytest),
                           Xtest.shape[0] ,
                           len(features)).round(3)
    
    return (rmse, r2_train, ar2_train, r2_test, ar2_test, cv)

#=============================================================================
# Model Evaluation
#=============================================================================
rmse, r2_train, ar2_train, r2_test, ar2_test, cv = \
    multiple_linear_regression(model,X,y, Xtrain, ytrain, Xtest,ytest,verbose=0)


df_eval.loc[len(df_eval)] = ['Polynomial Regression','deg=2, all features,\
                              unprocessed, no regularization',
                             rmse,r2_train,ar2_train,r2_test,ar2_test,cv]

for k,v in df_eval.to_dict().items():
    print(k, ' : ', v[0])

Model  :  Polynomial Regression
Details  :  deg=2, all features,                              unprocessed, no regularization
Root Mean Squared Error (RMSE)  :  158822.055
R-squared (training)  :  0.831
Adjusted R-squared (training)  :  0.831
R-squared (test)  :  0.813
Adjusted R-squared (test)  :  0.812
5-Fold Cross Validation  :  0.813
