# Gradient boosting

## Setup

In [1]:
import platform; print(platform.platform())
import sys; print("Python", sys.version)

Windows-10-10.0.19045-SP0
Python 3.10.5 (tags/v3.10.5:f377153, Jun  6 2022, 16:14:13) [MSC v.1929 64 bit (AMD64)]


In [2]:
import os
import numpy as np
import pandas as pd

import optuna
from catboost import CatBoostClassifier, Pool

from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
train = pd.read_csv('../data/final/train.csv')
test = pd.read_csv('../data/final/test.csv')

train.sample(5)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Group_count,Cabin_deck,Cabin_side
1193,Mars,False,TRAPPIST-1e,21.0,False,2066.0,0.0,15.0,0.0,1.0,False,1.0,E,P
8466,Earth,True,PSO J318.5-22,30.0,False,0.0,0.0,0.0,,0.0,True,1.0,G,P
3471,Europa,False,55 Cancri e,30.0,False,0.0,787.0,116.0,1924.0,2.0,False,2.0,D,S
7989,Earth,False,TRAPPIST-1e,21.0,False,129.0,106.0,280.0,183.0,18.0,False,1.0,F,P
4523,Earth,True,TRAPPIST-1e,1.0,False,0.0,0.0,,0.0,0.0,True,3.0,G,S


In [4]:
TARGET = 'Transported'
FEATURES = [col for col in train.columns if col not in [TARGET]]

numerical = train[FEATURES].select_dtypes(include=np.number).columns.to_list()
categorical = train[FEATURES].select_dtypes(exclude=np.number).columns.to_list()

train[categorical] = train[categorical].astype(str)
test[categorical] = test[categorical].astype(str)

print(f'Target: {TARGET}')
print(f'Fetaures:\n\tnumerical: {numerical}\n\tcategorical:{categorical}')
print(f'Shapes:\n\ttrain: {train.shape}\n\ttest: {test.shape}')

Target: Transported
Fetaures:
	numerical: ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Group_count']
	categorical:['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Cabin_deck', 'Cabin_side']
Shapes:
	train: (8693, 14)
	test: (4277, 13)


## Models

In [5]:
# data splitting
x, x_val, y, y_val = train_test_split(
    train[FEATURES],
    train[TARGET],
    test_size = 0.2,
    random_state = 42
)

# pool preparation
pool_train = Pool(x, y, cat_features=categorical)
pool_val = Pool(x_val, y_val, cat_features=categorical)

In [6]:
# default 
model = CatBoostClassifier()
model.fit(
    pool_train,
    eval_set = pool_val,
    verbose = 250
)

Learning rate set to 0.051161
0:	learn: 0.6722551	test: 0.6732528	best: 0.6732528 (0)	total: 175ms	remaining: 2m 54s
250:	learn: 0.3651991	test: 0.4132755	best: 0.4132346 (248)	total: 6.44s	remaining: 19.2s
500:	learn: 0.3239834	test: 0.4098225	best: 0.4092053 (457)	total: 13.9s	remaining: 13.9s
750:	learn: 0.2953005	test: 0.4082152	best: 0.4080510 (738)	total: 21.2s	remaining: 7.01s
999:	learn: 0.2698131	test: 0.4104423	best: 0.4080510 (738)	total: 28.2s	remaining: 0us

bestTest = 0.4080510107
bestIteration = 738

Shrink model to first 739 iterations.


<catboost.core.CatBoostClassifier at 0x2608776ded0>

In [7]:
# prediction
pool_test = Pool(test[FEATURES], cat_features=categorical)
preds = model.predict(pool_test)

## Submission

In [8]:
sub = pd.read_csv('../data/raw/sample_submission.csv')
sub[TARGET] = preds.astype(bool)
sub

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,True
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,True
4274,9271_01,True
4275,9273_01,True


In [9]:
os.makedirs('../submissions', exist_ok=True)
sub.to_csv('../submissions/catboost_default.csv', index=False)