Imports

In [40]:
import pandas as pd
import re
import numpy as np

Shorten the opening names so that variations are grouped under their main opening, which reduces the number of columns created when the openings are dummy-encoded later.

In [59]:
def clean_opening(opening, max_parts=2):
    """Shorten a hyphen-separated opening name to its leading components.

    Parameters
    ----------
    opening : str
        Opening name whose words are joined by "-",
        e.g. "Sicilian-Defense-Najdorf-Variation".
    max_parts : int, default 2
        Number of leading hyphen-separated components to keep.

    Returns
    -------
    str
        The first ``max_parts`` components re-joined with "-". Names that
        already have ``max_parts`` components or fewer are returned unchanged.
        Non-string values (e.g. ``None`` or NaN for a missing opening) are
        returned as-is instead of raising ``AttributeError``.
    """
    # Missing values reach this function as None/float NaN when it is applied
    # to a pandas column; they have no ``split`` method, so pass them through.
    if not isinstance(opening, str):
        return opening

    parts = opening.split("-")
    return "-".join(parts[:max_parts])

# Load the data
games = pd.read_json("chess_data.json")

# Apply the shortening function to the 'opening' column
games["opening"] = games["opening"].apply(clean_opening)


Check if data has loaded properly.

In [60]:
# Preview the first rows of the loaded frame.
games.head()

Unnamed: 0,rated,turns,victory_status,winner,time_class,white_id,white_rating,black_id,black_rating,opening
0,True,53,Time forfeit,White,blitz,Pablo_810,1162,ahmed8909,838,Pirc-Defense
1,True,67,Resign,White,blitz,MichaelMikeCorleone,1099,Pablo_810,987,Van-t
2,True,30,Resign,Black,blitz,POLCIE,997,Pablo_810,1095,Kings-Fianchetto
3,True,44,Resign,Black,blitz,Pablo_810,1033,contakto,1181,Bishops-Opening
4,True,69,Time forfeit,White,blitz,Fernando2017p,1088,Pablo_810,976,Queens-Pawn


In [61]:
# Show the dtype of every column.
games.dtypes

Unnamed: 0,0
rated,bool
turns,int64
victory_status,object
winner,object
time_class,object
white_id,object
white_rating,int64
black_id,object
black_rating,int64
opening,object


Categorize each game by the color I played.

In [62]:
# True where the tracked account sat on the white side of the board.
# Every other row is labelled "Black", whatever the black_id is.
is_white = games["white_id"] == "Pablo_810"
games["played_as"] = np.where(is_white, "White", "Black")

games.head()

Unnamed: 0,rated,turns,victory_status,winner,time_class,white_id,white_rating,black_id,black_rating,opening,played_as
0,True,53,Time forfeit,White,blitz,Pablo_810,1162,ahmed8909,838,Pirc-Defense,White
1,True,67,Resign,White,blitz,MichaelMikeCorleone,1099,Pablo_810,987,Van-t,Black
2,True,30,Resign,Black,blitz,POLCIE,997,Pablo_810,1095,Kings-Fianchetto,Black
3,True,44,Resign,Black,blitz,Pablo_810,1033,contakto,1181,Bishops-Opening,White
4,True,69,Time forfeit,White,blitz,Fernando2017p,1088,Pablo_810,976,Queens-Pawn,Black


Convert the categorical columns (object and bool dtype) into float indicator columns using dummy (one-hot) encoding.

In [63]:
# Drop the player identifier columns before encoding.
games = games.drop(columns=['white_id', 'black_id'])

# Columns to dummy-encode, listed in the order their indicator columns are appended.
categorical_columns = ['victory_status', 'rated', 'time_class', 'played_as']

# One float indicator frame per categorical column; every level is kept.
victory_dummy, rated_dummy, time_dummy, played_as_dummy = [
    pd.get_dummies(data=games[column], drop_first=False, dtype=float)
    for column in categorical_columns
]

# Swap the original categorical columns for their indicator frames in one concat.
games = pd.concat(
    [
        games.drop(columns=categorical_columns),
        victory_dummy,
        rated_dummy,
        time_dummy,
        played_as_dummy,
    ],
    axis=1,
)

games.head()

Unnamed: 0,turns,winner,white_rating,black_rating,opening,Draw,Mate,Resign,Time forfeit,False,True,blitz,bullet,daily,rapid,Black,White
0,53,White,1162,838,Pirc-Defense,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
1,67,White,1099,987,Van-t,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,30,Black,997,1095,Kings-Fianchetto,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,44,Black,1033,1181,Bishops-Opening,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
4,69,White,1088,976,Queens-Pawn,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0


Replace the two rating columns (white and black) with a single column holding their difference (white rating minus black rating).

In [64]:
# Rating gap from White's point of view: positive when White is the higher-rated player.
white_rating = games['white_rating']
black_rating = games['black_rating']
games['rating_diff'] = white_rating - black_rating

In [65]:
# The individual ratings are now summarised by rating_diff, so remove them.
rating_columns = ['black_rating', 'white_rating']
games = games.drop(rating_columns, axis=1)

games.head()

Unnamed: 0,turns,winner,opening,Draw,Mate,Resign,Time forfeit,False,True,blitz,bullet,daily,rapid,Black,White,rating_diff
0,53,White,Pirc-Defense,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,324
1,67,White,Van-t,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,112
2,30,Black,Kings-Fianchetto,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,-98
3,44,Black,Bishops-Opening,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,-148
4,69,White,Queens-Pawn,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,112


In [66]:
# One float indicator column per shortened opening name.
opening_dummy = pd.get_dummies(games['opening'], dtype=float)

# Replace the text column with its indicator columns.
games_without_opening = games.drop(columns=['opening'])
games = pd.concat([games_without_opening, opening_dummy], axis=1)

games.head()

Unnamed: 0,turns,winner,Draw,Mate,Resign,Time forfeit,False,True,blitz,bullet,...,Tarrasch-Defense,The-Wrongcloud,Three-Knights,Torre-Attack,Trompowsky-Attack,Undefined,Van-Geet,Van-t,Vienna-Game,Ware-Opening
0,53,White,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,67,White,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,30,Black,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,44,Black,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,69,White,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
# Show the dtype of every column after encoding.
games.dtypes

Unnamed: 0,0
turns,int64
winner,object
Draw,float64
Mate,float64
Resign,float64
...,...
Undefined,float64
Van-Geet,float64
Van-t,float64
Vienna-Game,float64


In [67]:
from sklearn.model_selection import train_test_split

# NOTE(review): the victory_status indicators (e.g. the "Draw" column) and
# `turns` look like information only known once a game is over, so they may
# leak the target -- confirm they are meant to be features.

# Target is the `winner` column; every other column is a feature.
# Both are taken as plain NumPy arrays.
y = games['winner'].values
X = games.drop(columns=['winner']).values

# Hold out 30% of the games for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [68]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree: scikit-learn randomly permutes the candidate features at each
# split, so an unseeded tree can differ from run to run. 42 is the same seed
# the train/test split uses.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)

In [69]:
# Predict the winner for every game in the held-out test set.
predictions = dtree.predict(X_test)

In [70]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision, recall and F1 of the decision tree on the test set.
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

       Black       0.71      0.72      0.72      1249
        Draw       0.84      0.83      0.84       136
       White       0.73      0.71      0.72      1299

    accuracy                           0.72      2684
   macro avg       0.76      0.76      0.76      2684
weighted avg       0.72      0.72      0.72      2684



In [71]:
# Rows are the true winners and columns the predicted winners, both in the
# order Black, Draw, White (each row sums to that class's support).
matrix = confusion_matrix(y_test, predictions)
print(matrix)

[[904   9 336]
 [ 10 113  13]
 [361  12 926]]


In [73]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# The column labels mix strings with booleans (the `rated` indicators are
# labelled False/True); cast them all to str so the feature names are uniform.
games.columns = games.columns.astype(str)

# Target is `winner`; every other column is a feature (kept as pandas objects here).
y = games['winner']
X = games.drop('winner', axis=1)

# Hold out 20% of the games for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# `fit` returns the estimator itself, so construction and training are chained.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")


Accuracy: 0.73
