# Gradient boosting

## Setup

In [1]:
# Record the OS and interpreter this notebook was executed with.
import platform
import sys

print(platform.platform())
print("Python", sys.version)

Windows-10-10.0.19045-SP0
Python 3.10.5 (tags/v3.10.5:f377153, Jun  6 2022, 16:14:13) [MSC v.1929 64 bit (AMD64)]


In [2]:
import os
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier, Pool

from sklearn.model_selection import train_test_split

In [3]:
# Read the prepared train and test tables from the `final` data folder.
train, test = [
    pd.read_csv(csv_path)
    for csv_path in ('../data/final/train.csv', '../data/final/test.csv')
]

In [4]:
# Column roles: every column of `train` except the target is a model feature.
TARGET = 'Transported'
FEATURES = [col for col in train.columns if col != TARGET]

# Split the features by dtype as inferred from the training frame:
# numeric dtypes vs. everything else (bool/object columns end up categorical).
numerical = train[FEATURES].select_dtypes(include=np.number).columns.to_list()
categorical = train[FEATURES].select_dtypes(exclude=np.number).columns.to_list()

# Cast both frames identically: numeric features to float, the remaining
# features to str (these are the columns later passed as `cat_features`).
# NOTE: `astype(str)` turns missing values into the literal string 'nan'.
train[numerical] = train[numerical].astype(float)
train[categorical] = train[categorical].astype(str)
train[TARGET] = train[TARGET].astype(float)

test[numerical] = test[numerical].astype(float)
test[categorical] = test[categorical].astype(str)

print(f'Target: {TARGET}')
print(f'Features:\n\tnumerical: {numerical}\n\tcategorical:{categorical}')
print(f'Shapes:\n\ttrain: {train.shape}\n\ttest: {test.shape}')

# Seeded so the preview shows the same rows on every Restart & Run All.
train.sample(5, random_state=42)

Target: Transported
Features:
	numerical: ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Expenditure', 'NoSpending', 'CabinNum', 'GroupSize', 'FamilySize']
	categorical:['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'CabinDeck', 'CabinSide', 'Solo']
Shapes:
	train: (8693, 19)
	test: (4277, 18)


Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Expenditure,NoSpending,CabinDeck,CabinNum,CabinSide,GroupSize,Solo,FamilySize,Transported
3332,Earth,False,PSO J318.5-22,20.0,False,0.0,0.0,1.0,0.0,2260.0,4522.0,0.0,F,675.0,S,1.0,True,14.0,0.0
7284,Earth,False,PSO J318.5-22,43.0,False,0.0,0.0,110.0,538.0,0.0,1296.0,0.0,F,1488.0,S,1.0,True,15.0,0.0
4814,Earth,False,TRAPPIST-1e,26.0,False,1528.0,1.0,0.0,0.0,0.0,3058.0,0.0,F,1046.0,P,8.0,False,17.0,0.0
2218,Europa,True,PSO J318.5-22,32.0,False,0.0,0.0,0.0,0.0,0.0,0.0,1.0,C,83.0,P,1.0,True,2.0,1.0
5461,Earth,True,TRAPPIST-1e,12.0,False,0.0,0.0,0.0,0.0,0.0,0.0,1.0,G,939.0,P,1.0,True,5.0,1.0


## Models

In [5]:
# Hold out 20% of the training rows for validation; the seed fixes the split.
split_options = {'test_size': 0.2, 'random_state': 42}
x, x_val, y, y_val = train_test_split(train[FEATURES], train[TARGET], **split_options)

# Wrap each part in a CatBoost Pool, marking the string-typed columns as categorical.
pool_train = Pool(data=x, label=y, cat_features=categorical)
pool_val = Pool(data=x_val, label=y_val, cat_features=categorical)

In [6]:
# Baseline CatBoost classifier: explicit iteration budget and learning rate,
# with Accuracy, AUC and F1 tracked alongside the optimized loss.
# `allow_writing_files=False` stops CatBoost from writing its own files during training.
model_params = {
    'iterations': 1500,
    'learning_rate': 0.03,
    'custom_metric': ['Accuracy', 'AUC', 'F1'],
    'allow_writing_files': False,
}
model = CatBoostClassifier(**model_params)

# Train on the training pool, score against the validation pool, and log every
# 250 iterations. Binding the result to `_` keeps the return value out of the cell output.
_ = model.fit(pool_train, eval_set=pool_val, verbose=250)

0:	learn: 0.6782815	test: 0.6783597	best: 0.6783597 (0)	total: 168ms	remaining: 4m 11s
250:	learn: 0.3645196	test: 0.3980172	best: 0.3980172 (250)	total: 6.44s	remaining: 32.1s
500:	learn: 0.3231009	test: 0.3833277	best: 0.3833277 (500)	total: 12.9s	remaining: 25.8s
750:	learn: 0.2942071	test: 0.3813303	best: 0.3812536 (651)	total: 19.4s	remaining: 19.4s
1000:	learn: 0.2710857	test: 0.3819111	best: 0.3804874 (802)	total: 26.2s	remaining: 13.1s
1250:	learn: 0.2509277	test: 0.3822678	best: 0.3804874 (802)	total: 34.4s	remaining: 6.86s
1499:	learn: 0.2329131	test: 0.3840601	best: 0.3804874 (802)	total: 43.5s	remaining: 0us

bestTest = 0.3804874363
bestIteration = 802

Shrink model to first 803 iterations.


In [7]:
# Evaluation accuracy: share of validation rows whose predicted class equals the label.
val_matches = y_val == model.predict(pool_val)
acc = val_matches.mean()
print(f'Accuracy on evaluation set {acc:.2%}')

Accuracy on evaluation set 80.10%


In [8]:
# Build an unlabeled pool from the test features (same categorical columns
# as in training) and predict a class for every test row.
pool_test = Pool(data=test[FEATURES], cat_features=categorical)
preds = model.predict(pool_test)

## Submission

In [9]:
# Start from the sample submission and set the target column to the
# predictions cast to bool.
# NOTE(review): assumes the sample submission rows are in the same order as `test` — confirm.
sub = (
    pd.read_csv('../data/raw/sample_submission.csv')
    .assign(**{TARGET: preds.astype(bool)})
)
sub

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


In [10]:
# Make sure the output folder exists, then write the submission without the index column.
os.makedirs(name='../submissions', exist_ok=True)
sub.to_csv(path_or_buf='../submissions/catboost_default.csv', index=False)