### Evaluation

Aim: make Kaggle submission and evalute model performance seperately

In [64]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold 

from sklearn.metrics import explained_variance_score, max_error, mean_absolute_error, r2_score

import os

import pickle

import xgboost as xgb

import matplotlib.pyplot as plt

In [2]:
path_to_data = os.getenv('Documents') + '/House Prices - Advanced Regression Techniques/Datasets/'

In [3]:
test_data = pd.read_csv(path_to_data + 'test.csv')

In [4]:
test_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [5]:
ids = test_data['Id']
test_data = test_data.drop(columns=['Id'], axis=1)

### Preprocessing on test data

In [6]:
test_data = test_data.drop(columns=['Fence', 'PoolQC', 'Alley', 'MiscFeature', 'BsmtHalfBath', 'KitchenAbvGr'], axis=1)

In [7]:
# Categorical Columns

cat_cols = ['GarageQual',
             'BsmtCond',
             'Foundation',
             'LandContour',
             'Neighborhood',
             'RoofMatl',
             'GarageFinish',
             'Functional',
             'LandSlope',
             'Exterior2nd',
             'ExterQual',
             'Exterior1st',
             'MSZoning',
             'PavedDrive',
             'LotConfig',
             'BsmtQual',
             'BsmtFinType1',
             'FireplaceQu',
             'RoofStyle',
             'Street',
             'Utilities',
             'BsmtExposure',
             'LotShape',
             'Condition2',
             'HeatingQC',
             'GarageCond',
             'KitchenQual',
             'HouseStyle',
             'GarageType',
             'Heating',
             'MasVnrType',
             'Electrical',
             'Condition1',
             'CentralAir',
             'SaleType',
             'SaleCondition',
             'BldgType',
             'ExterCond',
             'BsmtFinType2']

In [8]:
test_data = pd.get_dummies(data=test_data, columns=cat_cols)

In [9]:
test_data.shape

(1459, 257)

In [34]:
# For selected features

train_data = pd.read_csv(path_to_data + 'train_clean_stp1.csv')
sel_features = list(train_data.columns)
sel_features.remove('SalePrice')

In [11]:
len(sel_features)

94

In [12]:
drop_cols = list(set(test_data.columns) - set(sel_features))

In [13]:
test_data = test_data.drop(columns=drop_cols, axis=1)

In [14]:
test_data.shape

(1459, 93)

In [15]:
train_data.shape

(1459, 95)

In [16]:
train_data.columns

Index(['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
       'FullBath', 'HalfBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
       'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', 'ScreenPorch', 'PoolArea', 'MoSold', 'YrSold',
       'GarageQual_Fa', 'GarageQual_TA', 'BsmtCond_Fa', 'BsmtCond_TA',
       'Foundation_CBlock', 'LandContour_Bnk', 'LandContour_HLS',
       'Neighborhood_BrkSide', 'Neighborhood_ClearCr', 'Neighborhood_CollgCr',
       'Neighborhood_Crawfor', 'Neighborhood_Edwards', 'Neighborhood_NAmes',
       'Neighborhood_OldTown', 'Neighborhood_Somerst', 'Neighborhood_StoneBr',
       'Neighborhood_Timber', 'RoofMatl_CompShg', 'GarageFinish_Unf',
       'Functional_Maj2', 'Functional_Typ', 'LandSlope_Gtl',
       'Exterior2nd_Brk Cmn', 'Exterior2nd_HdBoard', 'E

In [17]:
test_data.columns

Index(['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
       'FullBath', 'HalfBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
       'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', 'ScreenPorch', 'PoolArea', 'MoSold', 'YrSold',
       'GarageQual_Fa', 'GarageQual_TA', 'BsmtCond_Fa', 'BsmtCond_TA',
       'Foundation_CBlock', 'LandContour_Bnk', 'LandContour_HLS',
       'Neighborhood_BrkSide', 'Neighborhood_ClearCr', 'Neighborhood_CollgCr',
       'Neighborhood_Crawfor', 'Neighborhood_Edwards', 'Neighborhood_NAmes',
       'Neighborhood_OldTown', 'Neighborhood_Somerst', 'Neighborhood_StoneBr',
       'Neighborhood_Timber', 'RoofMatl_CompShg', 'GarageFinish_Unf',
       'Functional_Maj2', 'Functional_Typ', 'LandSlope_Gtl',
       'Exterior2nd_Brk Cmn', 'Exterior2nd_HdBoard', 'E

In [18]:
dif = set(list(train_data.columns)) - set(list(test_data.columns))

In [19]:
dif

{'Heating_OthW', 'SalePrice'}

In [20]:
train_data['Heating_OthW'].value_counts()

0    1457
1       2
Name: Heating_OthW, dtype: int64

In [21]:
test_data['Heating_OthW'] = 0

In [22]:
test_data['Heating_OthW'].value_counts()

0    1459
Name: Heating_OthW, dtype: int64

### Importing final model

In [62]:
# Didn't close the file? TO DO .... rerun hyperparameter tuning with the file closed

with open(os.getenv('Documents') + '/House Prices - Advanced Regression Techniques/' + 'model_stp1.pkl', 'rb') as file:
    model = pickle.load(file)

EOFError: Ran out of input

In [63]:
print(os.path.getsize(os.getenv('Documents') + '/House Prices - Advanced Regression Techniques/' + 'model_stp1.pkl'))

114455


### Model Training 

In [35]:
y_train = train_data['SalePrice']
X_train = train_data.drop(columns=['SalePrice'], axis=1)

In [24]:
# Will use quick fix for the moment

parameters = {'objective': 'reg:squarederror',
             'base_score': 0.5,
             'booster': 'gbtree',
             'colsample_bylevel': 1,
             'colsample_bynode': 1,
             'colsample_bytree': 1,
             'enable_categorical': False,
             'gamma': 0,
             'gpu_id': -1,
             'importance_type': None,
             'interaction_constraints': '',
             'learning_rate': 0.300000012,
             'max_delta_step': 0,
             'max_depth': 3,
             'min_child_weight': 3,
             'monotone_constraints': '()',
             'n_estimators': 100,
             'n_jobs': 8,
             'num_parallel_tree': 1,
             'predictor': 'auto',
             'random_state': 0,
             'reg_alpha': 3,
             'reg_lambda': 2,
             'scale_pos_weight': 1,
             'subsample': 1,
             'tree_method': 'hist',
             'validate_parameters': 1,
             'verbosity': None,
             'learning_rage': 0.3}

In [25]:
model = xgb.XGBRegressor()

In [26]:
model.set_params(**parameters)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rage=0.3,
             learning_rate=0.300000012, max_delta_step=0, max_depth=3,
             min_child_weight=3, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, predictor='auto',
             random_state=0, reg_alpha=3, reg_lambda=2, scale_pos_weight=1,
             subsample=1, tree_method='hist', validate_parameters=1,
             verbosity=None)

In [27]:
model.fit(X_train, y_train)

Parameters: { "learning_rage" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rage=0.3,
             learning_rate=0.300000012, max_delta_step=0, max_depth=3,
             min_child_weight=3, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, predictor='auto',
             random_state=0, reg_alpha=3, reg_lambda=2, scale_pos_weight=1,
             subsample=1, tree_method='hist', validate_parameters=1,
             verbosity=None)

### Kaggle Submission

In [28]:
prediction = model.predict(test_data)

In [29]:
prediction_df = pd.DataFrame(columns=['Id', 'SalePrice'])
prediction_df['Id'] = ids
prediction_df['SalePrice'] = prediction

In [30]:
prediction_df.head()

Unnamed: 0,Id,SalePrice
0,1461,122130.242188
1,1462,156187.5625
2,1463,181485.515625
3,1464,200744.890625
4,1465,206020.09375


In [93]:
prediction_df.to_csv(os.getenv('Documents') + '/House Prices - Advanced Regression Techniques/submission_1.csv', index=False)

### Model Evaluation

In [67]:
n_splits =5

fivefold = KFold(n_splits=n_splits)
explained_variance_score, max_error, mean_absolute_error, r2_score
results_dict = {}

split = 1
for train_index, test_index in fivefold.split(X_train):
    
    X_tr, X_te = X_train.iloc[train_index], X_train.iloc[test_index]
    y_tr, y_te = y_train.iloc[train_index], y_train.iloc[test_index]
    
    model = xgb.XGBRegressor()
    
    model.set_params(**parameters)
    
    model.fit(X_tr, y_tr)
    
    prediction = model.predict(X_te)
    
    row = {'explained_variance_score': explained_variance_score(y_te, prediction), 
           'max_error': max_error(y_te, prediction), 
           'mean_absolute_error': mean_absolute_error(y_te, prediction), 
           'r2_score': r2_score(y_te, prediction)}
    
    results_dict[split] = row
    
    split += 1

Parameters: { "learning_rage" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "learning_rage" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "learning_rage" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "learning_rage" } might no

In [53]:
results_dict

{1: {'explained_variance_score': 0.9012811098501311,
  'max_error': 110857.0625,
  'mean_absolute_error': 15729.072586686643,
  'r2_score': 0.9012306409795778},
 2: {'explained_variance_score': 0.8660023251933515,
  'max_error': 298885.96875,
  'mean_absolute_error': 16800.25975224743,
  'r2_score': 0.8652967902431217},
 3: {'explained_variance_score': 0.895399444814744,
  'max_error': 161179.9375,
  'mean_absolute_error': 17095.90063142123,
  'r2_score': 0.8938226850996438},
 4: {'explained_variance_score': 0.9003266090192033,
  'max_error': 121836.5,
  'mean_absolute_error': 15643.327630029966,
  'r2_score': 0.899104398269532},
 5: {'explained_variance_score': 0.8714887015881428,
  'max_error': 220622.375,
  'mean_absolute_error': 17645.261289196736,
  'r2_score': 0.8713074530621358}}

In [54]:
results_df = pd.DataFrame(data=results_dict)

In [55]:
results_df

Unnamed: 0,1,2,3,4,5
explained_variance_score,0.901281,0.866002,0.895399,0.900327,0.871489
max_error,110857.0625,298885.96875,161179.9375,121836.5,220622.375
mean_absolute_error,15729.072587,16800.259752,17095.900631,15643.32763,17645.261289
r2_score,0.901231,0.865297,0.893823,0.899104,0.871307


In [62]:
results_df.mean(axis=1)

explained_variance_score         0.886900
max_error                   182676.368750
mean_absolute_error          16582.764378
r2_score                         0.886152
dtype: float64

In [65]:
# Will take last cross validation for evaluation graphs

