# CatBoost Analysis

## Setup

In [29]:
# Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostRegressor, Pool, cv
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

%matplotlib inline

In [5]:
# Load data
data = pd.read_csv('data/Ames_Housing_Price_Data_engineered.csv')

In [6]:
# Create the independent variables and the target
predictors = data.drop('SalePrice', axis=1)
target = data['SalePrice']

## Preprocessing

In [7]:
# Check columns
print(predictors.columns)

Index(['GrLivArea', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea',
       'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond',
       'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3

In [8]:
# Check to see if there are any missing values
print(predictors.isnull().sum().sum())

0


## Perform Train-Test Split

In [9]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(predictors, target, random_state=0, test_size = 0.2)

## Train the Model

In [12]:
cat_features = ['MSZoning', 
                    'Street', 
                    'Alley', 
                    'LotShape',
                    'LandContour',
                    'Utilities',
                    'LotConfig',
                    'LandSlope',
                    'Condition1',
                    'Condition2',
                    'BldgType',
                    'HouseStyle',
                    'RoofStyle',
                    'RoofMatl',
                    'Exterior1st',
                    'Exterior2nd',
                    'MasVnrType',
                    'ExterQual',
                    'ExterCond',
                    'Foundation',
                    'BsmtQual',
                    'BsmtCond',
                    'BsmtExposure',
                    'BsmtFinType1',
                    'BsmtFinType2',
                    'Heating',
                    'HeatingQC',
                    'CentralAir',
                    'Electrical',
                    'KitchenQual',
                    'Functional',
                    'FireplaceQu',
                    'GarageType',
                    'GarageFinish',
                    'GarageQual',
                    'GarageCond',
                    'PavedDrive',
                    'PoolQC',
                    'Fence',
                    'MiscFeature',
                    'SaleType',
                    'SaleCondition']   

In [24]:
# Define the parameter grid for the learning rate and depth
param_grid = {
    'learning_rate': [0.03, 0.1, 0.3],
    'depth': [4, 6, 10]
}

# Initialize a Pool object
train_pool = Pool(X_train, y_train, cat_features=cat_features)

# Initialize empty dictionary to store results
cv_results = {}

# Loop over all combinations of hyperparameters
for learning_rate in param_grid['learning_rate']:
    for depth in param_grid['depth']:
        
        # Define the parameters for the model
        params = {
            'loss_function': 'RMSE',
            'iterations': 100, 
            'learning_rate': learning_rate,
            'depth': depth
        }

        # Perform cross-validation and store the results
        cv_data = cv(train_pool, params, fold_count=5, plot=False)
        cv_results[(learning_rate, depth)] = np.min(cv_data['test-RMSE-mean'])

# Print the best parameters and RMSE
best_params = min(cv_results, key=cv_results.get)
print(f'Best parameters: learning_rate={best_params[0]}, depth={best_params[1]}')
print(f'Best RMSE: {cv_results[best_params]}')


Training on fold [0/5]
0:	learn: 185501.9862330	test: 196403.3109621	best: 196403.3109621 (0)	total: 6.93ms	remaining: 686ms
1:	learn: 180521.6537052	test: 191515.3108817	best: 191515.3108817 (1)	total: 10.7ms	remaining: 523ms
2:	learn: 175700.9628222	test: 186763.9779563	best: 186763.9779563 (2)	total: 13.1ms	remaining: 423ms
3:	learn: 170970.0055107	test: 182035.6935262	best: 182035.6935262 (3)	total: 16.7ms	remaining: 401ms
4:	learn: 166311.3265763	test: 177262.0039240	best: 177262.0039240 (4)	total: 20.4ms	remaining: 388ms
5:	learn: 161926.2073255	test: 172861.2316689	best: 172861.2316689 (5)	total: 23.3ms	remaining: 366ms
6:	learn: 157524.2477208	test: 168385.7691893	best: 168385.7691893 (6)	total: 26.4ms	remaining: 351ms
7:	learn: 153396.0955446	test: 164291.3912679	best: 164291.3912679 (7)	total: 29.4ms	remaining: 338ms
8:	learn: 149367.8457321	test: 160208.7174128	best: 160208.7174128 (8)	total: 33.7ms	remaining: 340ms
9:	learn: 145444.4989097	test: 156199.1598390	best: 156199.

## Model Evaluation

In [25]:
model = CatBoostRegressor(
    iterations=100,
    depth=best_params[1],
    learning_rate=best_params[0],
    loss_function='RMSE',
    cat_features=cat_features
)


In [26]:
model.fit(X_train, y_train)

0:	learn: 60896.4821639	total: 3.12ms	remaining: 309ms
1:	learn: 51203.4907276	total: 6.14ms	remaining: 301ms
2:	learn: 44111.4217842	total: 8.55ms	remaining: 277ms
3:	learn: 38826.9796079	total: 11.8ms	remaining: 284ms
4:	learn: 34941.8366468	total: 13.7ms	remaining: 261ms
5:	learn: 32255.6588145	total: 15.8ms	remaining: 248ms
6:	learn: 29856.8876262	total: 18.2ms	remaining: 242ms
7:	learn: 28167.1066709	total: 19.7ms	remaining: 227ms
8:	learn: 26945.4719102	total: 22ms	remaining: 222ms
9:	learn: 26354.9328176	total: 23.7ms	remaining: 213ms
10:	learn: 25656.9300015	total: 25.4ms	remaining: 206ms
11:	learn: 24672.1858082	total: 27.4ms	remaining: 201ms
12:	learn: 24147.0700044	total: 31.8ms	remaining: 213ms
13:	learn: 23507.4587161	total: 33.7ms	remaining: 207ms
14:	learn: 23071.9886380	total: 35.4ms	remaining: 201ms
15:	learn: 22727.3589634	total: 37.7ms	remaining: 198ms
16:	learn: 22482.5837261	total: 40.1ms	remaining: 196ms
17:	learn: 22137.9726389	total: 41.7ms	remaining: 190ms
18:	

<catboost.core.CatBoostRegressor at 0x7fd5a1ea74f0>

In [27]:
# Predict the target variable for the test set
y_pred = model.predict(X_test)

# Compute the R^2 score
r2 = r2_score(y_test, y_pred)
print(f"R^2 Score: {r2}")

R^2 Score: 0.9247089329995434


## Feature Importance

In [28]:
feature_importances = model.get_feature_importance(prettified=True)
print(feature_importances)

         Feature Id  Importances
0       OverallQual    26.684795
1         GrLivArea    18.894591
2       TotalBsmtSF     7.447136
3   YearsSinceBuilt     7.235183
4        GarageArea     6.465649
..              ...          ...
75        3SsnPorch     0.000000
76            Fence     0.000000
77          MiscVal     0.000000
78         SaleType     0.000000
79           Crisis     0.000000

[80 rows x 2 columns]
