# Notebook to support tests for Catboost engine based on Flowers dataset

Source: https://www.kaggle.com/arshid/iris-flower-dataset

In [1]:
import os

import catboost
import numbers
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
ARTIFACTS_PATH = '../../artifacts/catboost/'
os.makedirs(ARTIFACTS_PATH, exist_ok=True) # Create path if not exists

## Load dataset

In [3]:
def load_data(df, data_columns, target_column, cat_columns):
    df = df[data_columns]
    
    df_columns = [c for c in df.columns if c != target_column]
    cat_columns_idx = [i for i, c in enumerate(df_columns) if c in cat_columns]
    cat_columns_names = [c for i, c in enumerate(df_columns) if i in cat_columns_idx]

    for col_name in df.columns:
        if col_name in cat_columns:
            df[col_name].replace(np.nan, '', inplace=True)
        else:
            df[col_name].replace(np.nan, None, inplace=True)
    
    X_data = df.loc[:, df.columns != target_column].to_numpy()
    y_data = df[[target_column]].to_numpy().reshape(-1)
    return df, X_data, y_data

In [4]:
data_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
target_column = 'species'

cat_columns = ['species']

In [5]:
df = pd.read_csv('../../data/iris/IRIS.csv')
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [6]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [7]:
df_train, X_train, y_train = load_data(df_train, data_columns, target_column, cat_columns)
df_train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
22,4.6,3.6,1.0,0.2,Iris-setosa
15,5.7,4.4,1.5,0.4,Iris-setosa
65,6.7,3.1,4.4,1.4,Iris-versicolor
11,4.8,3.4,1.6,0.2,Iris-setosa
42,4.4,3.2,1.3,0.2,Iris-setosa


In [8]:
df_test, X_test, y_test = load_data(df_test, data_columns, target_column, cat_columns)
df_test.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
73,6.1,2.8,4.7,1.2,Iris-versicolor
18,5.7,3.8,1.7,0.3,Iris-setosa
118,7.7,2.6,6.9,2.3,Iris-virginica
78,6.0,2.9,4.5,1.5,Iris-versicolor
76,6.8,2.8,4.8,1.4,Iris-versicolor


## Train model

In [9]:
cat_columns_idx = [i for i, x in enumerate(df_train.loc[:, df_train.columns != target_column].dtypes) if x == np.object]
cat_columns_idx

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  cat_columns_idx = [i for i, x in enumerate(df_train.loc[:, df_train.columns != target_column].dtypes) if x == np.object]


[]

In [10]:
X_train[0:5]

array([[4.6, 3.6, 1. , 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [6.7, 3.1, 4.4, 1.4],
       [4.8, 3.4, 1.6, 0.2],
       [4.4, 3.2, 1.3, 0.2]])

In [11]:
train_data = catboost.Pool(data=X_train, label=y_train, cat_features=cat_columns_idx)

In [12]:
model = catboost.CatBoostClassifier(
    random_seed=42,
    eval_metric='Accuracy',
    loss_function='MultiClass',
    verbose=False,)

In [13]:
model.fit(train_data)

<catboost.core.CatBoostClassifier at 0x7f690ee071f0>

## Save & load

See: https://catboost.ai/docs/concepts/python-reference_catboost_save_model.html

In [14]:
output_file = os.path.join(ARTIFACTS_PATH, 'iris.cbm')
model.save_model(output_file, pool=train_data)

In [15]:
model = catboost.CatBoostClassifier() # Params not required
model.load_model(output_file)

<catboost.core.CatBoostClassifier at 0x7f690ee07a60>

## Predictions

In [16]:
y_pred = model.predict(X_test)

In [17]:
y_pred = y_pred.reshape(-1)

In [18]:
accuracy = (y_test == y_pred).mean()
print('Accuracy: %.4f' % accuracy)

Accuracy: 1.0000
