# Notebook supporting the tests for the CatBoost engine, based on the Titanic dataset

In [18]:
import catboost
import numbers
import numpy as np
import pandas as pd
import os

In [19]:
# Directory that receives the artifacts produced by this notebook.
ARTIFACTS_PATH = '../../artifacts/catboost/'

# Create the directory (including missing parents); an already existing
# directory is not an error.
os.makedirs(name=ARTIFACTS_PATH, exist_ok=True)

## Load dataset

In [20]:
def load_data(csv_file, target_column, cat_columns, del_columns=None):
    """Read a CSV file and split it into a cleaned frame, features and target.

    Parameters
    ----------
    csv_file : str, path or file-like
        Passed unchanged to ``pd.read_csv``.
    target_column : str
        Name of the column holding the label.
    cat_columns : collection of str
        Names of the columns to treat as categorical.
    del_columns : iterable of str, optional
        Columns removed right after loading. ``None`` (the default) removes
        nothing.

    Returns
    -------
    tuple
        ``(df, X_data, y_data)``: the cleaned DataFrame, the feature values
        (every column except ``target_column``) as a NumPy array, and the
        target values as a 1-D NumPy array.
    """
    # ``None`` stands in for "no columns" so the signature carries no mutable default.
    if del_columns is None:
        del_columns = ()

    df = pd.read_csv(csv_file)
    df = df.drop(columns=list(del_columns))

    # Categorical features: the non-target columns that are listed in cat_columns.
    cat_columns_names = {
        c for c in df.columns if c != target_column and c in cat_columns
    }

    for col_name in df.columns:
        if col_name in cat_columns_names:
            # Missing categorical values become empty strings.
            df[col_name] = df[col_name].replace(np.nan, '')
        else:
            # Every other column (the target included) is cast to float32.
            # NOTE(review): the effect of replace(np.nan, None) appears to depend
            # on the pandas version (older releases ignore an explicit None and
            # fill from the previous row instead) - confirm the intended behaviour.
            df[col_name] = df[col_name].replace(np.nan, None).astype('float32')

    X_data = df.loc[:, df.columns != target_column].to_numpy()
    y_data = df[target_column].to_numpy().reshape(-1)
    return df, X_data, y_data

In [21]:
# Label column.
target_column = 'Survived'

# Feature columns handled as categorical values.
cat_columns = [
    'Sex',
    'Ticket',
    'Cabin',
    'Embarked',
]

# Columns removed from the data right after loading.
del_columns = [
    'PassengerId',
    'Name',
]

In [22]:
# Training split: cleaned frame, feature array and target array.
df_train, X_train, y_train = load_data(
    '../../data/titanic/train.csv',
    target_column=target_column,
    cat_columns=cat_columns,
    del_columns=del_columns,
)

# Preview the first rows of the cleaned training frame.
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0.0,3.0,male,22.0,1.0,0.0,A/5 21171,7.25,,S
1,1.0,1.0,female,38.0,1.0,0.0,PC 17599,71.283302,C85,C
2,1.0,3.0,female,26.0,0.0,0.0,STON/O2. 3101282,7.925,,S
3,1.0,1.0,female,35.0,1.0,0.0,113803,53.099998,C123,S
4,0.0,3.0,male,35.0,0.0,0.0,373450,8.05,,S


In [23]:
# Test split, loaded with the same column configuration as the training split.
df_test, X_test, y_test = load_data(
    '../../data/titanic/test.csv',
    target_column=target_column,
    cat_columns=cat_columns,
    del_columns=del_columns,
)

# Preview the first rows of the cleaned test frame.
df_test.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0.0,3.0,male,34.5,0.0,0.0,330911,7.8292,,Q
1,0.0,3.0,female,47.0,1.0,0.0,363272,7.0,,S
2,0.0,2.0,male,62.0,0.0,0.0,240276,9.6875,,Q
3,0.0,3.0,male,27.0,0.0,0.0,315154,8.6625,,S
4,0.0,3.0,female,22.0,1.0,1.0,3101298,12.2875,,S


## Train model

In [24]:
# Positions, counted over the feature columns only (target excluded), of the
# object-dtype columns; they are handed to CatBoost as categorical features.
# The builtin `object` is used instead of the `np.object` alias, which was
# deprecated in NumPy 1.20 and later removed.
feature_dtypes = df_train.loc[:, df_train.columns != target_column].dtypes
cat_columns_idx = [i for i, dtype in enumerate(feature_dtypes) if dtype == object]
cat_columns_idx

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  cat_columns_idx = [i for i, x in enumerate(df_train.loc[:, df_train.columns != target_column].dtypes) if x == np.object]


[1, 5, 7, 8]

In [25]:
# Show the first five feature rows as they will be passed to CatBoost.
X_train[:5]

array([[3.0, 'male', 22.0, 1.0, 0.0, 'A/5 21171', 7.25, '', 'S'],
       [1.0, 'female', 38.0, 1.0, 0.0, 'PC 17599', 71.2833023071289,
        'C85', 'C'],
       [3.0, 'female', 26.0, 0.0, 0.0, 'STON/O2. 3101282',
        7.925000190734863, '', 'S'],
       [1.0, 'female', 35.0, 1.0, 0.0, '113803', 53.099998474121094,
        'C123', 'S'],
       [3.0, 'male', 35.0, 0.0, 0.0, '373450', 8.050000190734863, '',
        'S']], dtype=object)

In [26]:
# Bundle features, labels and the categorical column positions into one Pool.
train_data = catboost.Pool(
    data=X_train,
    label=y_train,
    cat_features=cat_columns_idx,
)

In [27]:
# Classifier configuration: fixed seed, accuracy as the evaluation metric,
# and training output switched off.
model = catboost.CatBoostClassifier(
    eval_metric='Accuracy',
    random_seed=42,
    verbose=False,
)

In [28]:
# Train the classifier on the training pool; the value returned by fit() is
# the cell's last expression and is therefore displayed.
model.fit(train_data)

<catboost.core.CatBoostClassifier at 0x7fb457be1e80>

## Save & load

See: https://catboost.ai/docs/concepts/python-reference_catboost_save_model.html

In [29]:
# The trained model is stored inside the artifacts directory.
output_file = os.path.join(ARTIFACTS_PATH, 'titanic.cbm')

# Write the model to disk, passing the training pool along.
model.save_model(fname=output_file, pool=train_data)

In [30]:
# Start from a fresh classifier: no constructor parameters are needed
# before loading a saved model.
model = catboost.CatBoostClassifier()

# Restore the model that was just written to disk.
model.load_model(fname=output_file)

<catboost.core.CatBoostClassifier at 0x7fb457bee100>

## Predictions

In [31]:
# Run the reloaded model on the test features; the result is compared
# against y_test in the next cell.
y_pred = model.predict(X_test)

In [32]:
# Fraction of test rows whose prediction equals the true label.
is_correct = (y_pred == y_test)
accuracy = is_correct.mean()

print('Accuracy: %.4f' % accuracy)

Accuracy: 0.8445
