# Implementation of CatBoost

In [128]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

In [129]:
# ---- Model hyper-parameters (consumed when the classifier is built) ----
iterations = 1200
depth = 11
learningRate = 0.01
seed = 1  # random_state for the train/validation split

# ---- Optional feature-engineering switches ----
addHexVars = False             # True: parse the hexVars columns into integers
addIntCategorcalVars = False   # True: also treat the intVars columns as categorical

# Feature-column indices (relative to X) passed to CatBoost as cat_features.
# The trailing six entries (2, 3, 13, 18, 20, 26) are the columns listed in hexVars.
categoricalVars = [
    0, 1, 5, 7, 9, 11, 12, 14, 15, 23, 24, 27,
    2, 3, 13, 18, 20, 26,
]

# ---- Training data ----
dataTrain = pd.read_csv("data/challenge1_train.csv")
noNans = dataTrain.fillna(0)  # every missing value becomes 0
values = noNans.values

# Column 1 is used as the label vector, columns 2..30 as the feature matrix.
# Column 0 is skipped (presumably a row id -- confirm against the CSV).
Y = values[:, 1]
X = values[:, 2:31]


In [130]:
# Feature columns (indices relative to X) that can optionally be handed to
# CatBoost as categorical features after being coerced to int.
intVars = [6, 8, 21, 22, 28]

if addIntCategorcalVars:
    for col in intVars:
        # categoricalVars doubles as the "already processed" marker: it is
        # reset together with X by the data-loading cell, so skipping columns
        # that are already listed keeps this cell safe to re-run without
        # appending duplicate indices.
        if col in categoricalVars:
            continue
        categoricalVars.append(col)
        # CatBoost rejects floating-point values in categorical columns, so
        # every entry is passed through int() before training.
        X[:, col] = [int(item) for item in X[:, col]]

In [137]:
# Feature columns (indices relative to X) whose values are parsed as
# hexadecimal text. They start out in categoricalVars; with addHexVars enabled
# they are converted to integers and used as numeric features instead.
hexVars = [2, 3, 13, 18, 20, 26]


def hexToInt(item):
    """Parse one cell of a hex column as a base-16 integer.

    Values whose text contains "E+" (scientific notation such as "1.2E+15")
    are first collapsed to a plain integer via float(); the decimal digits of
    that integer are then read as base 16, exactly like every other value.

    Raises:
        ValueError: if the text is not valid base-16 (or not a valid float
            when it contains "E+").
    """
    text = str(item)
    if "E+" in text:
        text = str(int(float(text)))
    return int(text, 16)


if addHexVars:
    for col in hexVars:
        # A column that has already left categoricalVars was converted by an
        # earlier run of this cell. Skipping it avoids the ValueError that
        # list.remove() raises for a missing entry, and avoids re-reading
        # already-converted integers as hex text.
        if col not in categoricalVars:
            continue
        categoricalVars.remove(col)
        X[:, col] = [hexToInt(item) for item in X[:, col]]

In [132]:
# Collect the classifier settings in one mapping so they can be inspected
# before the model is constructed.
modelParams = {
    "iterations": iterations,        # upper bound on the number of boosting rounds
    "depth": depth,
    "learning_rate": learningRate,
    "loss_function": "Logloss",      # objective that is optimised
    "eval_metric": "AUC",            # metric reported on the evaluation set
    "early_stopping_rounds": 100,
    "verbose": 100,                  # print a progress line every 100 iterations
}
model = CatBoostClassifier(**modelParams)

In [133]:
# Hold out 15% of the rows; the split is reproducible through `seed`.
xTrain, xTest, yTrain, yTest = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=seed,
)

# The held-out slice is passed as eval_set, so the "test:" values in the
# training log are computed on it.
validationSet = (xTest, yTest)
model.fit(
    xTrain,
    yTrain,
    cat_features=categoricalVars,
    eval_set=validationSet,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.6204529	best: 0.6204529 (0)	total: 639ms	remaining: 12m 46s
100:	test: 0.7240517	best: 0.7240686 (99)	total: 1m 37s	remaining: 17m 38s
200:	test: 0.7388633	best: 0.7388633 (200)	total: 5m 4s	remaining: 25m 12s
300:	test: 0.7451189	best: 0.7451189 (300)	total: 7m 58s	remaining: 23m 49s
400:	test: 0.7477999	best: 0.7477999 (400)	total: 10m 8s	remaining: 20m 12s
500:	test: 0.7500954	best: 0.7500954 (500)	total: 12m 20s	remaining: 17m 13s
600:	test: 0.7515287	best: 0.7516091 (595)	total: 14m 22s	remaining: 14m 19s
700:	test: 0.7527182	best: 0.7527524 (697)	total: 16m 29s	remaining: 11m 44s
800:	test: 0.7533922	best: 0.7534176 (798)	total: 18m 49s	remaining: 9m 22s
900:	test: 0.7538686	best: 0.7539546 (894)	total: 20m 52s	remaining: 6m 55s
1000:	test: 0.7538888	best: 0.7541111 (980)	total: 23m 2s	remaining: 4m 34s
1100:	test: 0.7542331	best: 0.7542331 (1100)	total: 25m 19s	remaining: 2m 16s
1199:	test: 0.7544502	best: 0.7544809 (1192)	total: 27m 46s	remaining: 0us

bestTest = 0.7

<catboost.core.CatBoostClassifier at 0x267dd722a60>

In [134]:
# Load the test set with the same NaN -> 0 fill used for training, then
# discard the first column (presumably the row id -- the file is assumed to
# carry no label column) and continue with the raw value array.
testFrame = pd.read_csv("data/challenge1_test.csv").fillna(0)
testData = testFrame.drop(columns=testFrame.columns[0]).values

# Mirror the optional training-time conversions on the test features.
if addIntCategorcalVars:
    for col in intVars:
        testData[:, col] = [int(item) for item in testData[:, col]]

if addHexVars:
    for col in hexVars:
        parsed = []
        for item in testData[:, col]:
            text = str(item)
            # Scientific-notation text is collapsed to a plain integer first;
            # its digits are then read as base 16 like any other value.
            if "E+" in text:
                text = str(int(float(text)))
            parsed.append(int(text, 16))
        testData[:, col] = parsed


In [135]:

# Class-probability estimates for every test row: one row per sample, one
# column per class (the rendered output below shows two columns).
predictions = model.predict_proba(testData)
# Bare last expression so the notebook renders the array.
predictions

array([[0.70340081, 0.29659919],
       [0.87834605, 0.12165395],
       [0.86493175, 0.13506825],
       ...,
       [0.92790231, 0.07209769],
       [0.87551131, 0.12448869],
       [0.90200731, 0.09799269]])

In [136]:
# Assemble the submission table in one step: ids count up from 50000 in row
# order, and "target" is the second probability column of each prediction.
df = pd.DataFrame(
    {
        "id": range(50000, 50000 + len(predictions)),
        "target": predictions[:, 1],
    }
)
df.to_csv('Predictions.csv', index=False)