# CatBoost Analysis

## Setup

In [52]:
# Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostRegressor, Pool, cv
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

%matplotlib inline

In [53]:
# Load data
data = pd.read_csv('data/ames_cat.csv')

In [54]:
# Create the independent variables and the target
predictors = data.drop('SalePrice', axis=1)
target = data['SalePrice']

In [55]:
# Check columns
print(predictors.columns)

Index(['GrLivArea', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea',
       'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'BsmtFullBath', 'BsmtHalfBath',
       'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'En

In [56]:
predictors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2576 entries, 0 to 2575
Data columns (total 80 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   GrLivArea        2576 non-null   int64  
 1   MSSubClass       2576 non-null   int64  
 2   MSZoning         2576 non-null   object 
 3   LotFrontage      2576 non-null   float64
 4   LotArea          2576 non-null   int64  
 5   Street           2576 non-null   object 
 6   Alley            2576 non-null   object 
 7   LotShape         2576 non-null   object 
 8   LandContour      2576 non-null   object 
 9   Utilities        2576 non-null   object 
 10  LotConfig        2576 non-null   object 
 11  LandSlope        2576 non-null   object 
 12  Neighborhood     2576 non-null   object 
 13  Condition1       2576 non-null   object 
 14  Condition2       2576 non-null   object 
 15  BldgType         2576 non-null   object 
 16  HouseStyle       2576 non-null   object 
 17  OverallQual   

## Perform Train-Test Split

In [57]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(predictors, target, random_state=0, test_size = 0.2)

## Train the Model

In [58]:
nominative_categoricals = X_train.select_dtypes(include=['object']).columns.tolist()

In [59]:
# Define the parameter grid for the learning rate and depth
param_grid = {
    'learning_rate': [0.03, 0.1, 0.3],
    'depth': [4, 6, 10]
}

# Initialize a Pool object
train_pool = Pool(X_train, y_train, cat_features=nominative_categoricals)

# Initialize empty dictionary to store results
cv_results = {}

# Loop over all combinations of hyperparameters
for learning_rate in param_grid['learning_rate']:
    for depth in param_grid['depth']:
        
        # Define the parameters for the model
        params = {
            'loss_function': 'RMSE',
            'iterations': 100, 
            'learning_rate': learning_rate,
            'depth': depth
        }

        # Perform cross-validation and store the results
        cv_data = cv(train_pool, params, fold_count=5, plot=False)
        cv_results[(learning_rate, depth)] = np.min(cv_data['test-RMSE-mean'])

# Print the best parameters and RMSE
best_params = min(cv_results, key=cv_results.get)
print(f'Best parameters: learning_rate={best_params[0]}, depth={best_params[1]}')
print(f'Best RMSE: {cv_results[best_params]}')


Training on fold [0/5]
0:	learn: 187699.4775889	test: 178568.8805461	best: 178568.8805461 (0)	total: 2.63ms	remaining: 261ms
1:	learn: 182696.1988337	test: 173623.0084139	best: 173623.0084139 (1)	total: 5.38ms	remaining: 264ms
2:	learn: 177708.1457959	test: 168836.4016199	best: 168836.4016199 (2)	total: 8.64ms	remaining: 279ms
3:	learn: 172969.8397152	test: 164176.6526266	best: 164176.6526266 (3)	total: 11.2ms	remaining: 269ms
4:	learn: 168304.9840740	test: 159510.3630841	best: 159510.3630841 (4)	total: 12.8ms	remaining: 243ms
5:	learn: 163815.0041830	test: 155179.1200106	best: 155179.1200106 (5)	total: 15.3ms	remaining: 240ms
6:	learn: 159469.2582134	test: 150894.1183453	best: 150894.1183453 (6)	total: 16.9ms	remaining: 224ms
7:	learn: 155198.7276340	test: 146653.2212557	best: 146653.2212557 (7)	total: 19.5ms	remaining: 225ms
8:	learn: 151055.5348301	test: 142608.4160166	best: 142608.4160166 (8)	total: 22.1ms	remaining: 224ms
9:	learn: 146996.9967715	test: 138702.2431616	best: 138702.

## Model Evaluation

In [60]:
cb_best = CatBoostRegressor(iterations=100,
                            depth=best_params[1],
                            learning_rate=best_params[0],
                            loss_function='RMSE',
                            cat_features=nominative_categoricals
                            )

In [61]:
cb_best.fit(X_train, y_train)

0:	learn: 59680.0601808	total: 2.54ms	remaining: 251ms
1:	learn: 49594.6998910	total: 4.95ms	remaining: 242ms
2:	learn: 42347.0526831	total: 7.73ms	remaining: 250ms
3:	learn: 36561.5505099	total: 11ms	remaining: 264ms
4:	learn: 32848.9180635	total: 12.8ms	remaining: 244ms
5:	learn: 30017.6142056	total: 14.6ms	remaining: 229ms
6:	learn: 28092.1196657	total: 16.3ms	remaining: 217ms
7:	learn: 26592.5395664	total: 18.6ms	remaining: 214ms
8:	learn: 25219.1536593	total: 20.2ms	remaining: 205ms
9:	learn: 24184.6344144	total: 21.8ms	remaining: 197ms
10:	learn: 23450.1712800	total: 23.4ms	remaining: 190ms
11:	learn: 22804.0973954	total: 25.1ms	remaining: 184ms
12:	learn: 22100.1760762	total: 27.7ms	remaining: 185ms
13:	learn: 21653.5558560	total: 29.3ms	remaining: 180ms
14:	learn: 21333.8881567	total: 31ms	remaining: 175ms
15:	learn: 21028.3034500	total: 32.6ms	remaining: 171ms
16:	learn: 20719.1743066	total: 34.2ms	remaining: 167ms
17:	learn: 20396.0939203	total: 35.9ms	remaining: 163ms
18:	le

<catboost.core.CatBoostRegressor at 0x7fcdb88afd90>

In [62]:
# Predict on the train set
y_pred_train = cb_best.predict(X_train)

# Compute the R^2 score
r2_train = r2_score(y_train, y_pred_train)
print(f"R^2 Score: {r2_train}")

R^2 Score: 0.9676505355645786


In [63]:
# Predict on the test set
y_pred_test = cb_best.predict(X_test)

# Compute the R^2 score
r2_test = r2_score(y_test, y_pred_test)
print(f"R^2 Score: {r2_test}")

R^2 Score: 0.9325318039796777


## Feature Importance

In [34]:
feature_importances = cb_best.get_feature_importance(prettified=True)
print(feature_importances)

     Feature Id  Importances
0     GrLivArea    19.255260
1   OverallQual    18.794239
2   TotalBsmtSF    10.038362
3     ExterQual     9.659737
4    GarageArea     7.103835
..          ...          ...
75        Fence     0.000000
76  MiscFeature     0.000000
77      MiscVal     0.000000
78       YrSold     0.000000
79     SaleType     0.000000

[80 rows x 2 columns]
