In [45]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk /kaggle/input and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [46]:
# Random seed shared by both models (XGBoost `random_state`, CatBoost `random_seed`).
SEED = 1

In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from pathlib import Path
import catboost as cb
import numpy as np

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

# Utils for data preprocessing

- Setting all features as categorical provides a huge performance boost for CatBoost.
- Setting object/string-typed features as categorical provides a decent performance boost for XGBoost.

Referenced from https://www.kaggle.com/code/rohanrao/automl-grand-prix-1st-place-solution/notebook

In [33]:
# Directory holding train.csv and test.csv; Path('') resolves to the current working directory.
DATA_DIR = Path('')

In [34]:
# preprocessing so that catboost can recognize them as categorical features
def float64_to_int64(df: pd.DataFrame):
    """Cast every float64 column of ``df`` to int64, modifying ``df`` in place.

    Columns of any other dtype are left untouched. Nothing is returned; the
    caller's frame is updated directly.

    Args:
        df: Frame whose float64 columns should become integers.
    """
    # Pick out the float64 columns only.
    float_columns = df.select_dtypes(include=['float64']).columns

    # Cast them as a group and write the result back into the same frame.
    as_integers = df[float_columns].astype('int64')
    df[float_columns] = as_integers

def retrieve_train_dev_test_for_catboost():
    """Load train.csv / test.csv and build the train, dev and test frames for CatBoost.

    The first 1% of the rows of train.csv (by position) become the dev set and
    the remaining rows the training set. The ``id`` column is dropped from all
    three frames and every float64 column is cast to int64 so that CatBoost
    can recognize the columns as categorical features.

    Returns:
        Tuple ``(train, dev, test)`` of DataFrames.
    """
    full_train = pd.read_csv(DATA_DIR / 'train.csv')
    test = pd.read_csv(DATA_DIR / 'test.csv')

    # Positional split point: rows before it form dev, rows from it on form train.
    dev_size = int(0.01 * len(full_train))

    # Oversampling the minority class with the Response == 1 rows of
    # train_extra.csv is not applied here.
    prepared = []
    for frame in (full_train[dev_size:], full_train[:dev_size], test):
        frame = frame.drop(columns=['id'])  # drop index column (yields a new frame)
        float64_to_int64(frame)             # convert float64 to int64, in place
        prepared.append(frame)

    train, dev, test = prepared
    return train, dev, test


def retrieve_train_dev_test_as_category_for_xgboost():
    """Load train.csv / test.csv and build the train, dev and test frames for XGBoost.

    The first 1% of the rows of train.csv (by position) become the dev set and
    the remaining rows the training set. The ``id`` column is dropped from all
    three frames, and the columns ``Vehicle_Age``, ``Gender`` and
    ``Vehicle_Damage`` (when present) are converted to pandas categoricals.

    All three frames share ONE categorical dtype per column, so a given value
    maps to the same category code in train, dev and test. Casting each frame
    independently with ``astype('category')`` would derive the categories from
    that frame alone; a split that lacks a value would then get shifted codes.

    Returns:
        Tuple ``(train, dev, test)`` of DataFrames.
    """
    full_train = pd.read_csv(DATA_DIR / 'train.csv')
    test = pd.read_csv(DATA_DIR / 'test.csv')

    # Positional split point: rows before it form dev, rows from it on form train.
    dev_size = int(0.01 * len(full_train))
    dev = full_train[:dev_size]
    train = full_train[dev_size:]

    # drop index column
    train = train.drop(columns=['id'])
    dev = dev.drop(columns=['id'])
    test = test.drop(columns=['id'])

    for col in ('Vehicle_Age', 'Gender', 'Vehicle_Damage'):
        if col not in train.columns:
            continue

        # Collect the distinct values seen in any split (cheap: uniques only)
        # and derive a single categorical dtype from them.
        observed = pd.concat(
            [pd.Series(frame[col].unique()) for frame in (train, dev, test)],
            ignore_index=True,
        )
        shared_dtype = observed.astype('category').dtype

        train[col] = train[col].astype(shared_dtype)
        dev[col] = dev[col].astype(shared_dtype)
        test[col] = test[col].astype(shared_dtype)

    return train, dev, test

def split_input_output(df: pd.DataFrame):
    """Separate the ``Response`` target column from the feature columns.

    Args:
        df: Frame that contains a ``Response`` column. It is not modified.

    Returns:
        Tuple ``(x, y)`` where ``x`` is ``df`` without the ``Response`` column
        and ``y`` is the ``Response`` column.
    """
    target = df['Response']
    features = df.drop(columns=['Response'])
    return features, target

# XGBoost 
- scale_pos_weight = count_of_negative (majority class) / count_of_positive (minority class)
- enable_categorical=True

In [35]:
# Load the XGBoost flavour of the data (selected string columns as pandas
# categoricals), then separate the features from the `Response` target.
train_df, dev_df, test_df = retrieve_train_dev_test_as_category_for_xgboost()
X_train, Y_train = split_input_output(train_df)
X_dev, Y_dev = split_input_output(dev_df)

In [36]:
# Keyword arguments for xgb.XGBClassifier.
# Every key appears exactly once: a dict literal silently keeps only the LAST
# value of a repeated key, so the earlier duplicate entries were dead and
# misleading. The values below are the ones that were actually in effect.
xgboost_params = {
    # const params used in training
    # (n_estimators / early_stopping_rounds carry the final override values;
    #  the earlier settings were 500 and 50)
    'n_estimators': 5000,
    'objective': 'binary:logistic',
    'tree_method': 'hist',
    'eval_metric': 'auc',
    'device': 'cuda',
    'random_state': SEED,
    'early_stopping_rounds': 200,

    # fixed after first case study
    'learning_rate': 0.05,
    'gamma': 0.001,
    'subsample': 0.8,
    'reg_lambda': 0.2,
    'max_bin': 32767,

    # fixed after second optimization
    'colsample_bytree': 0.21039136022674532,
    'max_depth': 27,
    'min_child_weight': 36,

    # categorical features
    'enable_categorical': True,
}

In [37]:
# scale_pos_weight for imbalanced data boosts performance a bit for xgboost
# Computed as (number of Response == 0 rows) / (number of Response == 1 rows) in the training split.
scale_pos_weight = np.sum(Y_train == 0) / np.sum(Y_train == 1)

In [38]:
# The dev split is the only evaluation set; its metric is what gets logged
# during training (reported as `validation_0`).
evals = [(X_dev, Y_dev)]
model = xgb.XGBClassifier(
    **xgboost_params,
    scale_pos_weight=scale_pos_weight,
)
# verbose=100: print the evaluation metric every 100 boosting rounds.
model.fit(X_train, Y_train, eval_set=evals, verbose=100)

[0]	validation_0-auc:0.80454
[100]	validation_0-auc:0.88469
[200]	validation_0-auc:0.88970
[300]	validation_0-auc:0.89098
[400]	validation_0-auc:0.89155
[500]	validation_0-auc:0.89192
[600]	validation_0-auc:0.89214
[700]	validation_0-auc:0.89229
[800]	validation_0-auc:0.89241
[900]	validation_0-auc:0.89243
[1000]	validation_0-auc:0.89244
[1100]	validation_0-auc:0.89247
[1200]	validation_0-auc:0.89241
[1300]	validation_0-auc:0.89235
[1307]	validation_0-auc:0.89232


In [39]:
# Grab the underlying Booster of the fitted classifier; it is used for test-set prediction.
booster = model.get_booster()

In [40]:
# Wrap the test features in a DMatrix, keeping the pandas categorical columns categorical.
test_dmatrix = xgb.DMatrix(test_df, enable_categorical=True)
# Predict using only the trees up to and including the booster's best iteration.
xgboost_output = booster.predict(test_dmatrix, iteration_range=(0, booster.best_iteration + 1))

# Catboost 

- cat_features = all features (every column is passed to CatBoost as categorical)

In [41]:
# Reload the data in the CatBoost flavour (float64 columns cast to int64).
# NOTE: this rebinds train_df / dev_df / test_df and X_* / Y_*, which held the
# XGBoost frames until now; cells below this point see the CatBoost versions.
train_df, dev_df, test_df = retrieve_train_dev_test_for_catboost()
X_train, Y_train = split_input_output(train_df)
X_dev, Y_dev = split_input_output(dev_df)

In [42]:
# Every feature column is declared categorical for CatBoost.
cat_features=X_train.columns.values

In [43]:
# Training pool: features, labels, and the list of columns to treat as categorical.
dataset = cb.Pool(X_train, label=Y_train, cat_features=cat_features)

NameError: name 'cb' is not defined

In [15]:
# Keyword arguments for cb.CatBoostClassifier.
# Every key appears exactly once: a dict literal silently keeps only the LAST
# value of a repeated key, so the earlier duplicate entries were dead and
# misleading. The values below are the ones that were actually in effect.
tuned_params = {
    # -------------- Constant params used in tuning --------
    'task_type': 'GPU',
    'loss_function': 'Logloss',
    # eval_metric / custom_metric carry the final override values (AUC);
    # the earlier setting for both was Logloss.
    'eval_metric': 'AUC',
    'custom_metric': ['AUC'],
    'random_seed': SEED,
    'use_best_model': True,

    # --------------- Speed Up Training -------------
    'bootstrap_type': 'Bayesian',
    # Ordered — Usually provides better quality on small datasets, but it may be slower than the Plain scheme.
    # Plain — The classic gradient boosting scheme.
    'boosting_type': 'Plain',
    # Try to set border_count of this parameter to 32 if training is performed on GPU.
    # In many cases, this does not affect the quality of the model but significantly speeds up the training.
    'border_count': 32,

    # --------------- Tuned Params ------------------
    'learning_rate': 0.095,
    'depth': 8,
    'bagging_temperature': 0.8,
    'l2_leaf_reg': 0.00255,
    'random_strength': 0,
    'n_estimators': 3000,
    'early_stopping_rounds': 200,
    'auto_class_weights': 'None',
    'leaf_estimation_iterations': 5,
}

In [16]:
# At least in my experiment, scale_pos_weight does not help in catboost
# Train on the prepared pool and evaluate on the dev split; verbose=100 logs every 100 iterations.
model = cb.CatBoostClassifier(**tuned_params, cat_features=cat_features)
model.fit(dataset, verbose=100, eval_set=(X_dev, Y_dev))

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8730030	best: 0.8730030 (0)	total: 2.79s	remaining: 2h 19m 38s
100:	test: 0.8918580	best: 0.8918580 (100)	total: 3m 24s	remaining: 1h 38m 1s
200:	test: 0.8930877	best: 0.8930877 (200)	total: 6m 23s	remaining: 1h 29m 6s
300:	test: 0.8936544	best: 0.8936544 (300)	total: 9m 14s	remaining: 1h 22m 52s
400:	test: 0.8939801	best: 0.8939801 (400)	total: 11m 56s	remaining: 1h 17m 25s
500:	test: 0.8942422	best: 0.8942422 (500)	total: 14m 41s	remaining: 1h 13m 18s
600:	test: 0.8943807	best: 0.8943807 (600)	total: 17m 22s	remaining: 1h 9m 19s
700:	test: 0.8945042	best: 0.8945042 (700)	total: 20m 5s	remaining: 1h 5m 55s
800:	test: 0.8946012	best: 0.8946015 (794)	total: 22m 50s	remaining: 1h 2m 41s
900:	test: 0.8946748	best: 0.8946748 (900)	total: 25m 24s	remaining: 59m 11s
1000:	test: 0.8947195	best: 0.8947195 (1000)	total: 28m 2s	remaining: 55m 59s
1100:	test: 0.8947760	best: 0.8947772 (1095)	total: 30m 39s	remaining: 52m 52s
1200:	test: 0.8948178	best: 0.8948178 (1200)	total: 33m 12s	r

<catboost.core.CatBoostClassifier at 0x7fc2e8127a30>

In [17]:
# Build the test pool once, with every column declared categorical, and predict
# from it. Previously the pool was constructed but never used: the raw
# DataFrame was passed to predict() instead.
test_pool = cb.Pool(test_df, cat_features=X_train.columns.values)
# Class probabilities from the trees up to and including the best iteration.
cb_output = model.predict(test_pool, prediction_type='Probability', ntree_end=model.get_best_iteration() + 1)

In [18]:
# Two-dimensional numpy.ndarray of shape (number_of_objects, number_of_classes) with the probability for every class for each object.
# Printed here as a sanity check of the CatBoost test-set predictions.
print(cb_output)

[[9.94749744e-01 5.25025616e-03]
 [3.73193939e-01 6.26806061e-01]
 [7.71100509e-01 2.28899491e-01]
 ...
 [9.99671561e-01 3.28438525e-04]
 [3.93177913e-01 6.06822087e-01]
 [9.99943263e-01 5.67371797e-05]]


In [19]:
# since it is binary classification, we only need the second column
# NOTE: this rebinds cb_output from 2-D to 1-D, so the cell is not idempotent —
# re-running it without re-running the prediction cell fails on the 1-D array.
cb_output = cb_output[:, 1]

# Ensemble Catboost and XGBoost

In [20]:
# Only the `id` column is needed for the submission, so parse just that column
# instead of loading the whole test file a second time.
id_column = pd.read_csv(DATA_DIR / 'test.csv', usecols=['id'])['id']
# Blend the two models with an unweighted mean of their predicted probabilities.
output = pd.DataFrame({'id': id_column, 'Response': (xgboost_output + cb_output) / 2})
output.head()

Unnamed: 0,id,Response
0,11504798,0.016614
1,11504799,0.772606
2,11504800,0.460942
3,11504801,0.000417
4,11504802,0.333679


In [21]:
# Write the submission file to the working directory, without the DataFrame index column.
output.to_csv('submission.csv', index=False)