# Codecup 6: Data Analysis | Part 2
### Author: Arash Hajian nezhad

---

#### Imports

In [1]:
import numpy as np
import pandas as pd

#### Data loading and processing

In [2]:
"""
The test dataset does not contain labels,
as it was used for scorings in the competition.
"""

df = pd.read_csv('data/train.csv')
df.head()

Unnamed: 0,Customer Id,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance
0,3JUN0VW6F043,34,Private Sector/Self Employed,Yes,1300000,6,0,Yes,No,No
1,VLHY2ABIR4QL,28,Private Sector/Self Employed,Yes,750000,7,0,Yes,No,No
2,6E3F7UNXYNFF,28,Private Sector/Self Employed,Yes,750000,6,0,Yes,No,No
3,JJ8R0ZRYWR31,32,Government Sector,Yes,800000,6,1,No,No,No
4,2WGFUEX6IEHM,34,Private Sector/Self Employed,Yes,700000,4,1,No,No,No


#### Dropping the id column for training

In [3]:
# The customer id is only an identifier, so it is removed before training.
# A non in-place drop with `errors='ignore'` keeps this cell re-runnable: the
# original in-place drop raised KeyError when the cell was executed a second
# time, because the column was already gone.
df = df.drop(columns='Customer Id', errors='ignore')
df.head()

Unnamed: 0,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance
0,34,Private Sector/Self Employed,Yes,1300000,6,0,Yes,No,No
1,28,Private Sector/Self Employed,Yes,750000,7,0,Yes,No,No
2,28,Private Sector/Self Employed,Yes,750000,6,0,Yes,No,No
3,32,Government Sector,Yes,800000,6,1,No,No,No
4,34,Private Sector/Self Employed,Yes,700000,4,1,No,No,No


#### Specifying categorical features' indices for CatBoost

In [4]:
# Positions of the categorical columns once `Customer Id` has been dropped:
#   1 -> Employment Type, 2 -> GraduateOrNot, 5 -> ChronicDiseases,
#   6 -> FrequentFlyer,   7 -> EverTravelledAbroad
# `ChronicDiseases` is either 0 or 1, hence it is treated as categorical.
categorical_features_indices = (1, 2, 5, 6, 7)

#### Label separation & data splitting for training & validation

In [5]:
from sklearn.model_selection import train_test_split


# Separate the features from the `TravelInsurance` label
X = df.drop('TravelInsurance', axis=1)
y = df['TravelInsurance']

# 0.85 is used because the dataset is very small.
# A fixed `random_state` keeps the split (and every metric computed on it)
# identical across kernel restarts.
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, train_size=0.85, random_state=42
)

#### `CatBoost` model hyperparameter tuning using `Optuna`

In [6]:
import optuna
from catboost import CatBoostClassifier, Pool, cv
# optuna.logging.set_verbosity(optuna.logging.ERROR)


def optuna_objective(trial):
    """Score one Optuna trial by cross-validated CatBoost accuracy.

    Samples a set of CatBoost hyperparameters from ``trial``, runs CatBoost's
    ``cv`` on the full labelled data (``X``, ``y``) and reports the best mean
    test accuracy reached over the boosting iterations.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The maximum of the ``test-Accuracy-mean`` column of the CV results
        (higher is better, matching the study's ``maximize`` direction).
    """
    hyperparameters = {
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.001, 2, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.5),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.01, 0.1),  # can't use with GPU training / CPU only
        'depth': trial.suggest_int('depth', 1, 12),
        'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
        'bootstrap_type': trial.suggest_categorical(
            'bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']
        ),

        # constant variables
        'iterations': 500,
        'verbose': False,
        'eval_metric': 'Accuracy',
        'use_best_model': True,
        # `objective` is an alias of `loss_function` in CatBoost, so the loss
        # is specified once (as in the final-model cell) instead of twice.
        'loss_function': 'Logloss',
        # 'task_type': 'GPU',  # can't use with cross-validation
    }

    # These two parameters are only sampled for the bootstrap type they are
    # paired with here; `MVS` gets no extra parameter.
    if hyperparameters['bootstrap_type'] == 'Bayesian':
        hyperparameters['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
    elif hyperparameters['bootstrap_type'] == 'Bernoulli':
        hyperparameters['subsample'] = trial.suggest_float('subsample', 0.1, 1)

    # Building the classifier first lets CatBoost reject unknown parameter
    # names before the (much slower) cross-validation starts.
    model = CatBoostClassifier(**hyperparameters)

    # NOTE(review): CV runs on the full `X`/`y`, which includes the rows later
    # used as the validation split — confirm this is intended.
    cv_data = cv(
        Pool(X, y, cat_features=categorical_features_indices),
        model.get_params(),
        logging_level='Silent',
        plot=False,
    )

    best_accuracy = np.max(cv_data['test-Accuracy-mean'])

    return best_accuracy  # as we use optuna's `maximize` direction


# Seed the sampler so that repeated runs propose the same hyperparameters.
# NOTE: `timeout` is wall-clock based, so the number of trials that fit into
# the 600 s budget can still differ between machines.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(optuna_objective, n_trials=100, timeout=600)

[32m[I 2022-10-31 21:57:58,315][0m A new study created in memory with name: no-name-d9a06aa8-d125-4703-980b-14a6bd94107e[0m
[32m[I 2022-10-31 21:58:00,394][0m Trial 0 finished with value: 0.7949787655049407 and parameters: {'l2_leaf_reg': 0.005798851364794512, 'learning_rate': 0.3592009934136991, 'colsample_bylevel': 0.019054426329361697, 'depth': 7, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.5931427234405031}. Best is trial 0 with value: 0.7949787655049407.[0m
[32m[I 2022-10-31 21:58:02,537][0m Trial 1 finished with value: 0.8289399123985141 and parameters: {'l2_leaf_reg': 0.014423626517853345, 'learning_rate': 0.47811975700397186, 'colsample_bylevel': 0.09057840730126115, 'depth': 3, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 1 with value: 0.8289399123985141.[0m
[32m[I 2022-10-31 21:58:08,670][0m Trial 2 finished with value: 0.7943450745887962 and parameters: {'l2_leaf_reg': 0.0011516996607183433, 'learning_rate': 0.18205

#### Getting optimization results

In [7]:
# `study.trials` holds trials in every state, so only the completed ones are
# counted to match the "successfully finished" label.
completed_trials = [
    trial for trial in study.trials
    if trial.state == optuna.trial.TrialState.COMPLETE
]
print('Number of Trials successfully finished: ', len(completed_trials))

best_trial = study.best_trial
print('Best Trial\'s score:', best_trial.value)

# Work on a copy: the next cell adds the constant settings to this dict, and
# that should not alter the parameters recorded on the trial object.
best_hyperparams = dict(best_trial.params)
print('Best hyperparameters:', best_hyperparams)

Number of Trials successfully finished:  100
Best Trial's score: 0.8415031356757273
Best hyperparameters: {'l2_leaf_reg': 0.01762052909719668, 'learning_rate': 0.31194403178074587, 'colsample_bylevel': 0.06273091164357764, 'depth': 4, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}


#### Cross-Validation using the best hyperparameters obtained from `Optuna`

In [8]:
# Re-attach the fixed (non-tuned) settings to the tuned hyperparameters
best_hyperparams.update(
    iterations=500,
    verbose=False,
    eval_metric='Accuracy',
    use_best_model=True,
    loss_function='Logloss',
)

model = CatBoostClassifier(**best_hyperparams)

# Cross-validate the chosen configuration on the full labelled data,
# this time with CatBoost's interactive metric plot enabled
cv_data = cv(
    pool=Pool(data=X, label=y, cat_features=categorical_features_indices),
    params=model.get_params(),
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/3]

bestTest = 0.8531073446
bestIteration = 111

Training on fold [1/3]

bestTest = 0.8320754717
bestIteration = 185

Training on fold [2/3]

bestTest = 0.8468809074
bestIteration = 477



#### Training the Model

In [9]:
# Wrap the train/validation splits in CatBoost pools, marking the categorical columns
train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=categorical_features_indices,
)
validation_pool = Pool(
    data=X_validation,
    label=y_validation,
    cat_features=categorical_features_indices,
)

# Fit on the training pool; the validation pool is passed as the evaluation set
model.fit(train_pool, eval_set=validation_pool, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x1eeeb58b340>

#### Generating output for the competition

In [19]:
comp_df = pd.read_csv('data/test.csv')

# Keep the ids for the submission file; they are not a model feature
customers_id = comp_df['Customer Id']
comp_df = comp_df.drop(columns='Customer Id')

prediction_probs = model.predict_proba(comp_df)

# Take the second probability column with a vectorised slice instead of a
# Python-level loop over the rows.
# NOTE(review): column 1 is presumably the probability of `TravelInsurance == 'Yes'`
# — confirm against `model.classes_`.
prediction = pd.DataFrame({'prediction': prediction_probs[:, 1]})

#### Concatenating the `Customer Id` dataframe and `prediction` dataframe

In [20]:
# Give the ids the same index as the predictions (both have the same length)
# so that the column-wise concat pairs them row by row
customers_id.index = prediction.index
output = pd.concat((customers_id, prediction), axis='columns')
output.head(n=5)

Unnamed: 0,Customer Id,prediction
0,Q5OK4L2Q5XNH,0.844227
1,CBHVMVE7TRP5,0.923276
2,9UGHQ91U4F0M,0.473026
3,3UWXWMZK4ZGL,0.905362
4,5CGEMLMBOPES,0.163574


#### Saving the output

In [21]:
# Write the ids and their predictions to CSV; `index=False` leaves the
# DataFrame index out of the file
output.to_csv('part_2_output.csv', index=False)