In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from catboost import CatBoostClassifier, Pool
import numpy as np

from eda import assign_time_control

# Read and prepare training and test datasets

In [2]:
# read data
headers = pd.read_parquet("../data/headers")

In [3]:
# clean data
headers = headers[headers['Result'] != "*"]
headers = headers[headers['TimeControl'] != "-"]
headers = headers[headers['FEN'] != "?"]
headers = headers[headers['BlackElo'] != "?"]
headers = headers[headers['WhiteElo'] != "?"]

In [4]:
# add column with initial position
headers['position'] = headers['FEN'].map(lambda x: x[:8])

In [6]:
# add column with time control
headers['time_class'] = headers['TimeControl'].map(assign_time_control)

# Train / Test split

In [7]:
train = headers[headers['UTCDate'] < "2022.06.01"]
test = headers[headers['UTCDate'] >= "2022.06.01"]

In [8]:
len(train) / len(headers)

0.8295027025172017

# Machine Learning model

In [9]:
def build_pools(df, columns, cat_columns):
    y = df['Result']
    X = df[columns]
    return Pool(X, y, cat_features=cat_columns)

In [10]:
train_pool = build_pools(df=train, 
                         columns=['WhiteElo', 'BlackElo', 'time_class', 'position'], 
                         cat_columns=['time_class', 'position'])
train_pool_no_pos = build_pools(df=train, 
                                columns=['WhiteElo', 'BlackElo', 'time_class'], 
                                cat_columns=['time_class'])

test_pool = build_pools(df=test, 
                        columns=['WhiteElo', 'BlackElo', 'time_class', 'position'], 
                        cat_columns=['time_class', 'position'])

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


# Define and train a CatBoost model

In [11]:
params = {"iterations": 100,
          "depth": 6,
          "loss_function": "MultiClass",
          "learning_rate": .01,
          "train_dir": "multiclass"
          }

In [12]:
model = CatBoostClassifier(**params)
model.fit(train_pool, early_stopping_rounds=50)

0:	learn: 1.0928814	total: 8.1s	remaining: 13m 22s
1:	learn: 1.0872800	total: 14.9s	remaining: 12m 10s
2:	learn: 1.0818093	total: 25s	remaining: 13m 29s
3:	learn: 1.0764190	total: 30.7s	remaining: 12m 16s
4:	learn: 1.0711550	total: 37.7s	remaining: 11m 56s
5:	learn: 1.0660021	total: 43.2s	remaining: 11m 16s
6:	learn: 1.0609268	total: 50.3s	remaining: 11m 7s
7:	learn: 1.0559648	total: 56.1s	remaining: 10m 44s
8:	learn: 1.0510313	total: 1m 1s	remaining: 10m 21s
9:	learn: 1.0462584	total: 1m 6s	remaining: 10m
10:	learn: 1.0415125	total: 1m 12s	remaining: 9m 46s
11:	learn: 1.0368831	total: 1m 17s	remaining: 9m 31s
12:	learn: 1.0323492	total: 1m 23s	remaining: 9m 18s
13:	learn: 1.0279133	total: 1m 28s	remaining: 9m 3s
14:	learn: 1.0235329	total: 1m 34s	remaining: 8m 56s
15:	learn: 1.0192293	total: 1m 39s	remaining: 8m 44s
16:	learn: 1.0149961	total: 1m 45s	remaining: 8m 34s
17:	learn: 1.0108049	total: 1m 51s	remaining: 8m 25s
18:	learn: 1.0067741	total: 1m 56s	remaining: 8m 14s
19:	learn: 1

<catboost.core.CatBoostClassifier at 0x17fbdfb80>

In [13]:
predictions = model.predict(test_pool)
predictions_proba = model.predict_proba(test_pool)

# Dummy model

In [None]:
def f(r):
    if r["WhiteElo"] > r["BlackElo"]:
        return "1-0"
    if r["WhiteElo"] == r["BlackElo"]:
        return "1/2-1/2"
    return "0-1"
dummy_preds = test.apply(f, axis=1)

# Evaluation

In [None]:
print(classification_report(test["Result"], predictions))

In [None]:
print(classification_report(test["Result"], dummy_preds))