# CatBoost

CatBoost is a powerful gradient boosting library developed by Yandex. It is designed to handle categorical features efficiently and provides high performance for both classification and regression tasks. Some key features of CatBoost include:

1. Automatic handling of categorical features
2. Fast training and prediction
3. Built-in support for GPU acceleration
4. Robust to overfitting
5. Supports various loss functions and evaluation metrics

CatBoost is particularly useful when dealing with datasets that contain categorical variables, as it can automatically encode these features without the need for manual preprocessing.



In [10]:
# import the necessary libraries
from catboost import CatBoostClassifier
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score , precision_score , recall_score , f1_score


# hide warnings
import warnings
warnings.filterwarnings('ignore')



In [11]:
# import the dataset
df = sns.load_dataset('titanic')
df.head()


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## Data Preprocessing


In [12]:
# check for missing values
df.isnull().sum()


survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [13]:
# impute age and fare null values with knn imputer
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)
df['age'] = imputer.fit_transform(df[['age']])
df['fare'] = imputer.fit_transform(df[['fare']])

# impute embarked and emabark_town null values with mode
df['embarked'].fillna(df['embarked'].mode()[0], inplace=True)
df['embark_town'].fillna(df['embark_town'].mode()[0], inplace=True)

# drop the deck column
df.drop(columns=['deck'], inplace=True)


# check for missing values
df.isnull().sum()


survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          891 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     891 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  embark_town  891 non-null    object  
 12  alive        891 non-null    object  
 13  alone        891 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(5)
memory usage: 79.4+ KB


In [18]:
categorical_col = df.select_dtypes(include=['object', 'category']).columns

df[categorical_col] = df[categorical_col].astype('category')

# check the data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    category
 3   age          891 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     891 non-null    category
 8   class        891 non-null    category
 9   who          891 non-null    category
 10  adult_male   891 non-null    bool    
 11  embark_town  891 non-null    category
 12  alive        891 non-null    category
 13  alone        891 non-null    bool    
dtypes: bool(2), category(6), float64(2), int64(4)
memory usage: 49.6 KB


In [19]:
# split the data into X and y
X = df.drop(columns=['survived'] , axis=1)
y = df['survived']

# split the data into training and testing sets
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size=0.2 , random_state=42)




In [23]:
%%time
# run the catboost classifier
catboost_model = CatBoostClassifier(verbose=False , 
                                    iterations=100 , 
                                    depth=3 ,
                                    learning_rate=0.1 ,
                                    loss_function='Logloss',
                                    eval_metric='Accuracy' ,
                                    random_seed=42)

# fit the model
catboost_model.fit(X_train , y_train , cat_features=categorical_col.tolist())

# predict the test set
y_pred = catboost_model.predict(X_test)

# evaluate the model
print(f'Accuracy: {accuracy_score(y_test , y_pred)}')
print(f'Precision: {precision_score(y_test , y_pred)}')
print(f'Recall: {recall_score(y_test , y_pred)}')
print(f'F1 Score: {f1_score(y_test , y_pred)}')

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
CPU times: total: 1.42 s
Wall time: 1.63 s
