In [1]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp38-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 1.2 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [3]:
import pandas as pd
import numpy as np

# Visualisations
import matplotlib.pyplot as plt 

# Statistics
from scipy import stats
from scipy.stats import norm, skew
from statistics import mode
from scipy.special import boxcox1p

# Machine Learning
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from catboost import Pool, CatBoostRegressor, cv

In [4]:
dftrain = pd.read_csv('train.csv')
dftest = pd.read_csv('test.csv')

In [None]:
dftrain.count()

Id               1460
MSSubClass       1460
MSZoning         1460
LotFrontage      1201
LotArea          1460
                 ... 
MoSold           1460
YrSold           1460
SaleType         1460
SaleCondition    1460
SalePrice        1460
Length: 81, dtype: int64

In [None]:
dftrain.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [None]:
dftrain.shape,dftest.shape

((1460, 81), (1459, 80))

In [6]:
test  = dftest.copy()
train  = dftrain.copy()
SalesPrice=dftrain.iloc[:,80]
# dftrain=dftrain.drop(["SalePrice"],axis=1)
train=train.drop(["SalePrice"],axis=1)

data= pd.concat([train,test], keys=['train', 'test'])
data=data.drop(["Id"],axis=1)



In [None]:
data.head()

Unnamed: 0,Unnamed: 1,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
train,0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
train,1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
train,2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
train,3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
train,4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [None]:
data.shape


(2919, 79)

Removing features which have more than 50% missing values

In [7]:
Nullvals = data.isnull().sum().sort_values(ascending=False) 
percentageofnulls = (data.isnull().sum()/data.isnull().count()).sort_values(ascending=False) 
missing= pd.concat([Nullvals, percentageofnulls], axis=1, keys=['Total', 'Percentage'])
data = data.drop((missing[missing["Percentage"] > 0.05]).index,1)
data.isnull().sum()

  data = data.drop((missing[missing["Percentage"] > 0.05]).index,1)


MSSubClass       0
MSZoning         4
LotArea          0
Street           0
LotShape         0
                ..
MiscVal          0
MoSold           0
YrSold           0
SaleType         1
SaleCondition    0
Length: 68, dtype: int64

In [8]:
numerical_cols=data._get_numeric_data().columns.tolist()
numerical_cols

['MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [9]:
categorical_cols=set(data.columns)-set(numerical_cols)
categorical_cols

{'BldgType',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'BsmtQual',
 'CentralAir',
 'Condition1',
 'Condition2',
 'Electrical',
 'ExterCond',
 'ExterQual',
 'Exterior1st',
 'Exterior2nd',
 'Foundation',
 'Functional',
 'Heating',
 'HeatingQC',
 'HouseStyle',
 'KitchenQual',
 'LandContour',
 'LandSlope',
 'LotConfig',
 'LotShape',
 'MSZoning',
 'MasVnrType',
 'Neighborhood',
 'PavedDrive',
 'RoofMatl',
 'RoofStyle',
 'SaleCondition',
 'SaleType',
 'Street',
 'Utilities'}

In [10]:
for col in numerical_cols:
    data[col].fillna(data[col].mean(),inplace=True)

In [11]:
for col in categorical_cols:
  data[col].fillna(data[col].mode()[0],inplace=True)

In [12]:
for col in categorical_cols:
    print(data[col].value_counts())

Typ     2719
Min2      70
Min1      65
Mod       35
Maj1      19
Maj2       9
Sev        2
Name: Functional, dtype: int64
Lvl    2622
HLS     120
Bnk     117
Low      60
Name: LandContour, dtype: int64
Unf    930
GLQ    849
ALQ    429
Rec    288
BLQ    269
LwQ    154
Name: BsmtFinType1, dtype: int64
Ex    1493
TA     857
Gd     474
Fa      92
Po       3
Name: HeatingQC, dtype: int64
Unf    2573
Rec     105
LwQ      87
BLQ      68
ALQ      52
GLQ      34
Name: BsmtFinType2, dtype: int64
TA    1493
Gd    1151
Ex     205
Fa      70
Name: KitchenQual, dtype: int64
Gtl    2778
Mod     125
Sev      16
Name: LandSlope, dtype: int64
No    1986
Av     418
Gd     276
Mn     239
Name: BsmtExposure, dtype: int64
Y    2723
N     196
Name: CentralAir, dtype: int64
Norm      2889
Feedr       13
Artery       5
PosN         4
PosA         4
RRNn         2
RRAn         1
RRAe         1
Name: Condition2, dtype: int64
GasA     2874
GasW       27
Grav        9
Wall        6
OthW        2
Floor       1
Name

Removing Categorical features which have very low variance i.e. majority of population in one category

In [13]:
data=data.drop(["RoofMatl","Heating","Condition2","BsmtCond","CentralAir","Functional","Electrical",
              "LandSlope","ExterCond","Condition1","GarageArea","BsmtUnfSF","3SsnPorch","MiscVal",
              "BsmtFinType2","Utilities","Street","Exterior2nd","Neighborhood"],axis=1) 

In [14]:
dftrain[dftrain.columns[1:]].corr()['SalePrice'][:].sort_values(ascending=False)

SalePrice        1.000000
OverallQual      0.790982
GrLivArea        0.708624
GarageCars       0.640409
GarageArea       0.623431
TotalBsmtSF      0.613581
1stFlrSF         0.605852
FullBath         0.560664
TotRmsAbvGrd     0.533723
YearBuilt        0.522897
YearRemodAdd     0.507101
GarageYrBlt      0.486362
MasVnrArea       0.477493
Fireplaces       0.466929
BsmtFinSF1       0.386420
LotFrontage      0.351799
WoodDeckSF       0.324413
2ndFlrSF         0.319334
OpenPorchSF      0.315856
HalfBath         0.284108
LotArea          0.263843
BsmtFullBath     0.227122
BsmtUnfSF        0.214479
BedroomAbvGr     0.168213
ScreenPorch      0.111447
PoolArea         0.092404
MoSold           0.046432
3SsnPorch        0.044584
BsmtFinSF2      -0.011378
BsmtHalfBath    -0.016844
MiscVal         -0.021190
LowQualFinSF    -0.025606
YrSold          -0.028923
OverallCond     -0.077856
MSSubClass      -0.084284
EnclosedPorch   -0.128578
KitchenAbvGr    -0.135907
Name: SalePrice, dtype: float64

Removing Features which have very low correlation with the target sales price

In [15]:
data=data.drop(["MoSold","BsmtFinSF2","BsmtHalfBath","OverallCond","YrSold",
            "MSSubClass","EnclosedPorch","KitchenAbvGr","ScreenPorch","2ndFlrSF","OverallQual","GrLivArea"],axis=1)

In [None]:
data.shape

(2919, 37)

In [16]:
n_numfeatures = data.select_dtypes(exclude = ["object"]).columns

In [17]:
# encoding the categorical vars
data_final=pd.get_dummies(data)
data_final.shape

(2919, 125)

In [18]:
# standardizing the features
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
data_final[data_final.columns] = scaler.fit_transform(data_final[data_final.columns])

In [19]:
train=data_final.loc["train"]
train.shape

(1460, 125)

In [20]:
test=data_final.loc["test"]
test.shape

(1459, 125)

In [21]:
train.insert(2,column="SalePrice",value=SalesPrice)
train.head()

Unnamed: 0,LotArea,YearBuilt,SalePrice,YearRemodAdd,MasVnrArea,BsmtFinSF1,TotalBsmtSF,1stFlrSF,LowQualFinSF,BsmtFullBath,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,-0.217879,1.046258,208500,0.896833,0.525202,0.580907,-0.444328,-0.773861,-0.101197,1.087023,...,-0.052423,-0.298629,-0.049029,0.394439,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
1,-0.072044,0.154764,181500,-0.395604,-0.57225,1.178112,0.477111,0.261075,-0.101197,-0.819679,...,-0.052423,-0.298629,-0.049029,0.394439,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
2,0.137197,0.980221,223500,0.848965,0.334828,0.097873,-0.299076,-0.610718,-0.101197,1.087023,...,-0.052423,-0.298629,-0.049029,0.394439,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
3,-0.078385,-1.859351,140000,-0.682812,-0.57225,-0.494941,-0.671283,-0.506205,-0.101197,1.087023,...,-0.052423,-0.298629,-0.049029,0.394439,3.789876,-0.064249,-0.09105,-0.126535,-2.155466,-0.302693
4,0.518903,0.947203,250000,0.753229,1.387486,0.468931,0.211573,-0.03717,-0.101197,1.087023,...,-0.052423,-0.298629,-0.049029,0.394439,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693


In [22]:
# Log transforming to remove the skewness of the target and make it near to normal
train["SalePrice"] = np.log1p(train["SalePrice"])

In [23]:
xtrain=train.drop(["SalePrice"],axis=True)
ytrain=train["SalePrice"]

In [24]:
x_train,x_insampletest,y_train,y_insampletest = train_test_split(xtrain,ytrain,test_size=0.25,random_state=40)

In [25]:
def rmse_cv(model):
    rmse = np.sqrt(-cross_val_score(model, x_train, y_train, scoring="neg_mean_squared_error", cv = 10))
    return(rmse)

In [28]:
parameters = {'n_estimators':[800,1200,1600],'random_state':[5], "min_samples_split":[10,15,20],
              'max_depth':[4,5,6],'min_samples_leaf':[10,15]}
clf = GridSearchCV(GradientBoostingRegressor(), parameters)
gridsearched = clf.fit(x_train, y_train)

In [None]:
clf.get_params()

In [62]:
# Initiating Gradient Boosting Regressor using the above params
model_gbr = GradientBoostingRegressor(n_estimators=1200, 
                                      learning_rate=0.01,
                                      max_depth=3, 
                                      max_features='sqrt',
                                      min_samples_leaf=15, 
                                      min_samples_split=10, 
                                      loss='huber',
                                      random_state=5)

In [38]:
# Initiating CatBoost Regressor model
model_cat = CatBoostRegressor(iterations=1000,
                              learning_rate=0.10,
                              depth=5,
                              l2_leaf_reg=4,
                              border_count=10,
                              loss_function='RMSE',
                              verbose=200)

# Initiating parameters ready for CatBoost's CV function, which I will use below
params = {'iterations':1000,
          'learning_rate':0.10,
          'depth':5,
          'l2_leaf_reg':4,
          'border_count':10,
          'loss_function':'RMSE',
          'verbose':200}

Gridsearch for CatBoost

In [36]:
clf = CatBoostRegressor()
params = {'iterations': [500,1000,2000],
          'depth': [3,4, 5],
          'loss_function': ['RMSE'],
          'l2_leaf_reg': [3,4,5],
          'border_count':[10,15],
          'random_seed': [40]
         }
clf_grid = GridSearchCV(estimator=clf, param_grid=params,cv=10)

In [37]:
clf_grid.fit(x_train, y_train)
best_param = clf_grid.best_params_
best_param

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
0:	learn: 0.3966574	total: 1.02ms	remaining: 2.05s
1:	learn: 0.3900322	total: 2.16ms	remaining: 2.16s
2:	learn: 0.3834021	total: 3.27ms	remaining: 2.17s
3:	learn: 0.3772567	total: 4.33ms	remaining: 2.16s
4:	learn: 0.3709661	total: 5.35ms	remaining: 2.13s
5:	learn: 0.3653873	total: 6.4ms	remaining: 2.13s
6:	learn: 0.3596291	total: 8.41ms	remaining: 2.39s
7:	learn: 0.3542405	total: 9.52ms	remaining: 2.37s
8:	learn: 0.3486038	total: 10.6ms	remaining: 2.34s
9:	learn: 0.3432431	total: 11.6ms	remaining: 2.31s
10:	learn: 0.3378594	total: 12.7ms	remaining: 2.29s
11:	learn: 0.3327650	total: 13.7ms	remaining: 2.27s
12:	learn: 0.3280142	total: 14.7ms	remaining: 2.25s
13:	learn: 0.3242540	total: 15.5ms	remaining: 2.2s
14:	learn: 0.3195848	total: 16.5ms	remaining: 2.19s
15:	learn: 0.3154083	total: 17.5ms	remaining: 2.16s
16:	learn: 0.3114036	total: 18.5ms	remaining: 2.15s
17:	learn: 0.3073771	total: 19.5ms	remaining: 2.15s
18:	learn: 

{'border_count': 10,
 'depth': 5,
 'iterations': 1000,
 'l2_leaf_reg': 4,
 'loss_function': 'RMSE',
 'random_seed': 40}

In [33]:
# Initiating parameters ready for CatBoost's CV function, which I will use below
params = {'iterations':[1000,1500,2000],
          'learning_rate':0.10,
          'depth':[3,4,5],
          'l2_leaf_reg':[3,4,5],
          'border_count':15,
          'loss_function':'RMSE',
          'verbose':200}

In [39]:
# Define pool
pool = Pool(x_train, y_train)

# CV Catboost algorithm with Gridsearch in the params
cv_cat = cv(pool=pool, params=params, fold_count=10, shuffle=True)

Training on fold [0/10]
0:	learn: 10.8908197	test: 10.9295000	best: 10.9295000 (0)	total: 1.21ms	remaining: 1.21s
200:	learn: 0.1275132	test: 0.2137132	best: 0.2137132 (200)	total: 186ms	remaining: 741ms
400:	learn: 0.0898878	test: 0.2013263	best: 0.2013250 (399)	total: 356ms	remaining: 532ms
600:	learn: 0.0687558	test: 0.1966041	best: 0.1966041 (600)	total: 536ms	remaining: 356ms
800:	learn: 0.0552153	test: 0.1946236	best: 0.1945491 (792)	total: 717ms	remaining: 178ms
999:	learn: 0.0452016	test: 0.1939432	best: 0.1937828 (994)	total: 924ms	remaining: 0us

bestTest = 0.1937827803
bestIteration = 994

Training on fold [1/10]
0:	learn: 10.8778851	test: 10.8975971	best: 10.8975971 (0)	total: 1.04ms	remaining: 1.04s
200:	learn: 0.1396502	test: 0.2850970	best: 0.2850970 (200)	total: 168ms	remaining: 669ms
400:	learn: 0.0963616	test: 0.2603835	best: 0.2603835 (400)	total: 345ms	remaining: 515ms
600:	learn: 0.0722644	test: 0.2558522	best: 0.2558509 (595)	total: 525ms	remaining: 348ms
800:	lea

In [40]:
# Select best model
cv_cat = cv_cat.at[999, 'train-RMSE-mean']


In [41]:
# Performance of both Catboost and GBM
cv_gbr = rmse_cv(model_gbr).mean()
results = pd.DataFrame({
    'Model': ['Gradient Boosting Regressor',
              'CatBoost'],
    'RMSE_CV': [cv_gbr,
              cv_cat]})

# Build dataframe of values
result_df = results.sort_values(by='RMSE_CV', ascending=True).reset_index(drop=True)
result_df.head(8)

Unnamed: 0,Model,RMSE_CV
0,CatBoost,0.044956
1,Gradient Boosting Regressor,0.152786


In [66]:
model_gbr.fit(x_train, y_train)
gbr_pred = model_gbr.predict(x_insampletest)

model_cat.fit(x_train, y_train)
cat_pred = model_cat.predict(x_insampletest)

0:	learn: 0.3846201	total: 1.01ms	remaining: 1.01s
200:	learn: 0.0949277	total: 137ms	remaining: 545ms
400:	learn: 0.0703398	total: 281ms	remaining: 420ms
600:	learn: 0.0547385	total: 408ms	remaining: 271ms
800:	learn: 0.0451363	total: 534ms	remaining: 133ms
999:	learn: 0.0369285	total: 670ms	remaining: 0us


In [58]:
gbr_pred_train = model_gbr.predict(x_train)
cat_pred_train = model_cat.predict(x_train)
stacked_train = ((0.5*gbr_pred_train)+(0.5*cat_pred_train)) 

In [59]:
stacked_test = ((0.5*gbr_pred)+(0.5*cat_pred))

In [60]:
print('Root Mean Square Error test = ' + str(np.sqrt(mean_squared_error(y_insampletest, stacked_test))))
print('Root Mean Square Error train = ' + str(np.sqrt(mean_squared_error(y_train, stacked_train))))

Root Mean Square Error test = 0.12756920420618956
Root Mean Square Error test = 0.07791506728343885


In [67]:
gbr_pred_test = model_gbr.predict(test)
cat_pred_test = model_cat.predict(test)
stacked = ((0.5*gbr_pred_test)+(0.5*cat_pred_test)) 
test['SalePrice']=stacked
test.to_csv('house_price_predictions_test.csv',index=False)