# Naive Bayes Classification on Lichess January 2013 database

In [1]:
from collections import Counter
from re import findall
import numpy as np
from numpy import sign
import csv

## Preprocessing

In [2]:
chunks = []

with open('./01-2013.pgn', 'r') as file:

    #filter into chunks representing individual games
    begin_chunk = '[Event'
    chunk = ''
    #features = []
    for line in file:
        if line[0:6] != begin_chunk:
            chunk += line
        else:
            chunks.append(chunk[0:len(chunk)-2].split('\n'))
            chunk = line

# filter by games with eval
eval_chunks = [c for c in chunks if '{' in c[-1][0:10]]
# write eval chunks to filtered pgn file
with open('./filtered.pgn', 'w') as filtered:
    for chunk in eval_chunks:
        for e in chunk:
            filtered.write(str(e) + '\n')
        filtered.write('\n\n')

# feature engineering + extraction
feature_vectors = []
for c in eval_chunks:
    # game_type, white_elo, black_elo, termination_type, move_seq
    features = [c[0].split()[2], c[7], c[8], c[14], c[16]]
    features[1] = int(features[1].split()[1].strip('"]'))
    features[2] = int(features[2].split()[1].strip('"]'))
    features[3] = features[3].split()[1].strip('"]')
    features[4] = [x.strip('\%eval []') for x in findall(r'\%eval.{1,5}', features[4])]
    feature_vectors.append(features)
    # now in format str, int, int, str, list[str]

    
games_maps = {
    'Bullet': 0,
    'Blitz': 1,
    'Classical': 2,
    'Correspondence': 3
}

end_maps = {
    'Normal': 0,
    'Time': 1
}

def bins(stat):
    if stat < 1650:
        return 0
    elif stat < 1950:
        return 1
    else:
        return 2


# let's get the eval into a more meaningful format
for i, v in enumerate(feature_vectors):
    a_0 = games_maps[v[0]]
    a_1 = int(bins((v[1] + v[2]) / 2))
    a_2 = int(end_maps[v[3]])
    a = [float(x) if x[0] != '#' else sign(float(x[1:]))*100 for x in v[4]]
    diffs = [round(a[x] - a[x+1], 2) for x in range(0,len(a)-1)]
    stdv = np.std(diffs)
    mean_ = np.mean(diffs)
    a_3 = stdv
    a_4 = mean_
    feature_vectors[i] = [a_1, a_0, a_2, a_3, a_4]
    
print(feature_vectors[0])

with open('games.csv', 'w') as games:
    writer = csv.writer(games)
    writer.writerows(feature_vectors)

[0, 0, 0, 20.700816468659774, 1.8555555555555554]


## Classifier

### Classify elo bin based on other features

In [3]:
!pip3 install mixed-naive-bayes



In [7]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, cross_val_score
from mixed_naive_bayes import MixedNB

y = [v[0] for v in feature_vectors]
X = np.array([x[1:] for x in feature_vectors])

model = MixedNB(categorical_features=[0,1])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test, y_test)

print("Accuracy: " + str(((y_test == y_pred).sum()) / X_test.shape[0] ))


[4 2]
Accuracy: 0.5769230769230769
