# Kaggle
## House Prices - Advanced Regression Techniques

September 2022

In [1]:
# Import libraries
import datetime as dt
from pathlib import Path

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split

In [2]:
# Load both Kaggle files: train.csv carries the SalePrice label, test.csv is the
# unlabelled competition set (kept in `val`).
train = pd.read_csv('data/train.csv')
print('train shape: ', train.shape)
val = pd.read_csv('data/test.csv')
print('test shape: ', val.shape)

train shape:  (1460, 81)
test shape:  (1459, 80)


In [3]:
# Preview the first rows of the labelled training data
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Preview the first rows of the competition data loaded from data/test.csv
val.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [5]:
# Preliminary-model imputation: give missing values in non-numeric columns their own
# '999' level so they can be used as a category.
# Numeric gaps are deliberately left as NaN: writing the string '999' into them would
# turn the column into object dtype and feed the sentinel to the model as a real value.
# CatBoost accepts NaN in numeric features.
# TODO: go back and impute missing values more intelligently for the next model.
train = train.fillna({col: '999' for col in train.select_dtypes(exclude='number').columns})
val = val.fillna({col: '999' for col in val.select_dtypes(exclude='number').columns})

In [6]:
# Hold out 20% of the labelled rows as `test` so the model can be scored on data it
# was not fitted on; the remaining 80% stay in `train`. The seed makes the split repeatable.
train, test = train_test_split(
    train,
    test_size=0.2,
    random_state=1234,
)

In [7]:
# Summary statistics of the target in the training sample
train['SalePrice'].describe()

count      1168.000000
mean     181528.605308
std       80709.778721
min       39300.000000
25%      129900.000000
50%      162700.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [8]:
# Summary statistics of the target in the hold-out sample, for comparison with the training sample
test['SalePrice'].describe()

count       292.000000
mean     178491.558219
std       74239.301193
min       34900.000000
25%      130000.000000
50%      165075.000000
75%      212000.000000
max      485000.000000
Name: SalePrice, dtype: float64

In [9]:
# Choose the model inputs and pull out the target for the train and hold-out samples
keep_vars = [
    'Id', '1stFlrSF', '2ndFlrSF', 'SaleType', 'Fireplaces', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageCars', 'GarageArea', 'Neighborhood',
    'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt',
]
train_y = train['SalePrice']
test_y = test['SalePrice']
# 'Id' is selected with the features and then dropped so it is not used as a predictor
train_x = train[keep_vars].drop(columns=['Id'])
test_x = test[keep_vars].drop(columns=['Id'])
# The competition frame keeps 'Id' as its index so predictions stay keyed by it
val = val[keep_vars].set_index('Id')

In [10]:
# list(train_x.columns)

In [11]:
# Inspect the feature dtypes; the non-int64 columns are treated as categorical below
train_x.dtypes

1stFlrSF         int64
2ndFlrSF         int64
SaleType        object
Fireplaces       int64
FireplaceQu     object
GarageType      object
GarageFinish    object
GarageCars       int64
GarageArea       int64
Neighborhood    object
BldgType        object
HouseStyle      object
OverallQual      int64
OverallCond      int64
YearBuilt        int64
dtype: object

In [12]:
# Positions of every column whose dtype is not int64 (np.where returns them wrapped in a tuple)
not_int64 = train_x.dtypes != np.int64
cat_features = np.where(not_int64)
cat_features

(array([ 2,  4,  5,  6,  9, 10, 11]),)

In [13]:
# Wrap the train and hold-out samples in CatBoost Pools.
# Every non-int64 column is declared categorical, identified by its column position.
is_categorical = train_x.dtypes != np.int64
cat_features = np.where(is_categorical)[0]
train_pool = Pool(data=train_x, label=train_y, cat_features=cat_features)
test_pool = Pool(data=test_x, label=test_y, cat_features=cat_features)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


In [14]:
# Initialize the regressor with CatBoost's default hyperparameters
model = CatBoostRegressor()
# Alternative explicit configuration, kept for reference:
# model = CatBoostRegressor(iterations=2000, depth=12, learning_rate=0.1, loss_function='RMSE')

In [15]:
# Train on the training sample and track the metric on the hold-out pool,
# logging progress every 100 iterations
model.fit(
    X=train_x,
    y=train_y,
    cat_features=cat_features,
    eval_set=test_pool,
    verbose=100,
)

Learning rate set to 0.052178
0:	learn: 78092.4952974	test: 71794.2009492	best: 71794.2009492 (0)	total: 61.6ms	remaining: 1m 1s
100:	learn: 23926.3670619	test: 28829.1825215	best: 28829.1825215 (100)	total: 339ms	remaining: 3.02s
200:	learn: 19021.0646425	test: 27083.6870659	best: 27083.6870659 (200)	total: 611ms	remaining: 2.43s
300:	learn: 16585.7184865	test: 26477.8993688	best: 26477.8993688 (300)	total: 936ms	remaining: 2.17s
400:	learn: 14927.1758513	test: 26212.6972037	best: 26205.7942922 (398)	total: 1.43s	remaining: 2.13s
500:	learn: 13481.5674466	test: 26002.2673511	best: 26002.2673511 (500)	total: 1.86s	remaining: 1.85s
600:	learn: 12434.7076485	test: 25879.7275269	best: 25869.5116690 (591)	total: 2.19s	remaining: 1.45s
700:	learn: 11596.0187140	test: 25826.6971211	best: 25824.6893491 (685)	total: 2.52s	remaining: 1.08s
800:	learn: 10801.2806275	test: 25733.8484348	best: 25733.8484348 (800)	total: 2.82s	remaining: 700ms
900:	learn: 10168.0565590	test: 25749.7309914	best: 257

<catboost.core.CatBoostRegressor at 0x7fd12fd5b550>

In [16]:
# Rank the features by their importance in the fitted model.
# `type` is the supported keyword; `fstr_type` is its deprecated alias.
varimportance = model.get_feature_importance(train_pool, type='FeatureImportance')
variables = list(train_x)
# Pair each feature name with its score and list the most important first
variable_importance = (
    pd.DataFrame({'feature': variables, 'importance': varimportance})
    .sort_values('importance', ascending=False)
)
variable_importance.head(50)

Unnamed: 0,feature,importance
12,OverallQual,22.774389
0,1stFlrSF,17.147623
1,2ndFlrSF,12.952707
7,GarageCars,9.203536
14,YearBuilt,5.703667
9,Neighborhood,5.620284
8,GarageArea,4.68235
6,GarageFinish,4.581621
4,FireplaceQu,4.525886
3,Fireplaces,3.175216


In [17]:
# Predict sale prices for the competition rows; the frame reuses val's 'Id' index
predicted_prices = model.predict(val)
submission = pd.DataFrame({'SalePrice': predicted_prices}, index=val.index)
submission.head()

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,131675.089159
1462,145890.421033
1463,175522.627285
1464,176830.834496
1465,190344.943204


In [18]:
# Write the predictions to a timestamped CSV file so successive runs do not overwrite each other
filename = 'submissions/submission_' + dt.datetime.now().strftime("%Y%m%d-%H%M%S") + '.csv'
# to_csv does not create missing folders, so make sure the output directory exists first
Path(filename).parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(filename, header=True, index=True)