In [149]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [52]:
#Preprocecing
#First of all we will check how many null values there are in each column
df = pd.read_csv('chess_games.csv')

df_first_lines = df.head(500000)
df_first_lines.to_csv('shorterChessDf.csv', index=False)


In [54]:
# Define the function to split chess moves
def split_chess_moves(moves, turns):
    move_list = moves.split()
    separated_moves = []
    turn_number = 1
    for i in range(0, len(move_list), 3):
        if turn_number > turns:
            break
        separated_moves.append(move_list[i + 1] if i + 1 < len(move_list) else "")
        separated_moves.append(move_list[i + 2] if i + 2 < len(move_list) else "")
        turn_number += 1
    return separated_moves

In [55]:
def determine_result(result):
    if result == '1-0':
        return 1
    elif result == '0-1':
        return 0
    elif result == '1/2-1/2':
        return 2
    else:
        return -1  # for any unexpected result format

In [56]:
df = pd.read_csv('shorterChessDf.csv')  # Load the DataFrame

df['Result'] = df['Result'].apply(lambda x: determine_result(x))  # Apply the determine_result function

number_of_turns = 20  # Specify the number of turns we want to extract

# Create new columns for the specified number of turns
columns = [f"Turn {i + 1} White" for i in range(number_of_turns)] + [f"Turn {i + 1} Black" for i in range(number_of_turns)]

# Apply the split_chess_moves function to each row in the DataFrame
moves_split = df['AN'].apply(lambda x: pd.Series(split_chess_moves(x, number_of_turns)))

df[columns] = moves_split  # Combine the new columns with the original DataFrame

# Drop specified columns
df.drop(columns=['Event', 'White', 'Black', 'UTCDate', 'UTCTime', 'WhiteElo', 'BlackElo', 'WhiteRatingDiff',
                 'BlackRatingDiff', 'ECO', 'Opening', 'TimeControl', 'Termination', 'AN'], inplace=True)

# Save the modified DataFrame to a CSV file
output_file = 'modified_dataframe.csv'
df.to_csv(output_file, index=False)

df.head()

Unnamed: 0,Result,Turn 1 White,Turn 2 White,Turn 3 White,Turn 4 White,Turn 5 White,Turn 6 White,Turn 7 White,Turn 8 White,Turn 9 White,...,Turn 11 Black,Turn 12 Black,Turn 13 Black,Turn 14 Black,Turn 15 Black,Turn 16 Black,Turn 17 Black,Turn 18 Black,Turn 19 Black,Turn 20 Black
0,1,d4,d5,c4,c6,e3,a6,Nf3,e5,cxd5,...,Rac1,Qd6,Qc2,Qe6,Nb1,Bd6,a3,Nb6,Qc6,Nfd5
1,0,e4,e5,b3,Nf6,Bb2,Nc6,Nf3,d6,d3,...,Rxc7,Nxa2,Ra1,Nb4,Raxa7,Rxa7,Rxa7,Nxe4,Nxe4,Rxe4
2,1,e4,d5,exd5,Qxd5,Nf3,Bg4,Be2,Nf6,Nc3,...,Qxb7,Bg4,Qc6+,Ke7,Rae1,Rxh4,Nd5+,Kf8,Nxf6,Qxf6
3,1,e3,Nf6,Bc4,d6,e4,e6,Nf3,Nxe4,Nd4,...,Nxh3,g5+,Nxg5,fxg5+,Kxg5,Rh5+,Qxh5+,Kd7,Qf7+,Kd6
4,0,e4,c5,Nf3,d6,d4,cxd4,Nxd4,Nf6,Nc3,...,Ne2,Qd8,Bb6,Qe8,Be3,Ra6,c4,Nh5,Ng3,Nxg3


after splitting the games to  moves, we want to remove all null values because there are 2 options:
1. the game is shorter that the number of turns, and we do not want to consider it (null moves for not existing moves)
2. there are missing moves during the game. i.e. not complete data - we want to remove it.

In [101]:
dfFiltered = df.copy()
dfFiltered = dfFiltered.dropna()
dfFilteredAndCompacted = dfFiltered.head(50000)

In [103]:
rows_with_nulls = dfFilteredAndCompacted[dfFilteredAndCompacted.isnull().any(axis=1)]
rows_with_nulls

Unnamed: 0,Result,Turn 1 White,Turn 2 White,Turn 3 White,Turn 4 White,Turn 5 White,Turn 6 White,Turn 7 White,Turn 8 White,Turn 9 White,...,Turn 11 Black,Turn 12 Black,Turn 13 Black,Turn 14 Black,Turn 15 Black,Turn 16 Black,Turn 17 Black,Turn 18 Black,Turn 19 Black,Turn 20 Black


Here, we have verified that the data contains no NaNs or Nulls

In [110]:
# Group by these columns and identify groups with more than one entry
duplicate_groups = dfFilteredAndCompacted.groupby(list(dfFilteredAndCompacted.columns[1:])).size()
duplicate_groups = duplicate_groups[duplicate_groups > 1]

# Show the duplicate groups
duplicate_groups

Turn 1 White  Turn 2 White  Turn 3 White  Turn 4 White  Turn 5 White  Turn 6 White  Turn 7 White  Turn 8 White  Turn 9 White  Turn 10 White  Turn 11 White  Turn 12 White  Turn 13 White  Turn 14 White  Turn 15 White  Turn 16 White  Turn 17 White  Turn 18 White  Turn 19 White  Turn 20 White  Turn 1 Black  Turn 2 Black  Turn 3 Black  Turn 4 Black  Turn 5 Black  Turn 6 Black  Turn 7 Black  Turn 8 Black  Turn 9 Black  Turn 10 Black  Turn 11 Black  Turn 12 Black  Turn 13 Black  Turn 14 Black  Turn 15 Black  Turn 16 Black  Turn 17 Black  Turn 18 Black  Turn 19 Black  Turn 20 Black
e4            c5            Nf3           e6            d4            cxd4          Nxd4          Nf6           Nc3           Bb4            Bg5            Qa5            Bd2            Qe5            Ndb5           Nxe4           Nxe4           Qxe4+          Be2            Na6            Bxb4          Qxb4+         c3            Qxb2          Rb1           Qxa2          Nd6+          Ke7           O-O           Qd

We are looking for duplicated data. we found 1 duplicate which appears twice in the data. <br>
there are 2 scenarios where this may happen: <br>
   1. the whole game is duplicated by chance or by an error in the data (human or otherwise)
   2. The first 20 moves in the two games are coincidentally the same. <br>


Whichever of the cases is correct, this is only 1 duplicate from the 50,000 records in the data, so it has little to no impact on the results for our purposes.<br><br>
Since our features are not numeric, and we do not want to limit the data by confiding it to be numeric, linear correlation and various other analysis techniques cannot be performed.

In [127]:
y = dfFilteredAndCompacted['Result']
x = dfFilteredAndCompacted.drop(['Result'], axis=1)

In [134]:
# Extract all unique moves from the DataFrame
moves = set(x.values.flatten())
moves = {move for move in moves if pd.notnull(move)}

# Create a mapping of moves to numeric values
move_to_num = {move: i for i, move in enumerate(moves)}

# Function to convert moves to numeric values using the mapping
def convert_moves_to_numeric(move):
    return move_to_num.get(move, -1)  # Use -1 for any missing or unknown moves

# Apply the conversion to the DataFrame
x = x.map(convert_moves_to_numeric)

In [135]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [136]:
dummyModel = DummyClassifier(strategy="most_frequent")
dummyModel.fit(X_train, y_train)
predictionsDummy = dummyModel.predict(X_test)

accuracyDummy = metrics.accuracy_score(y_test, predictionsDummy)
accuracyDummy

0.4846

In [145]:


# Create a pipeline that scales the data and then applies logistic regression
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(max_iter=1000, solver='lbfgs'))  # Handle multi-class
])

pipeline.fit(X_train, y_train)  # Fit the pipeline on the training data
predictionsLogReg = pipeline.predict(X_test)  # Make predictions
accuracyLogReg = accuracy_score(y_test, predictionsLogReg)  # Calculate accuracy
print("Logistic Regression Accuracy:", accuracyLogReg)  # Print the accuracy

Logistic Regression Accuracy: 0.4886


In [146]:
import time
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

#AdaBoost
start_timeAdaBoost = time.time()
modelDecTree = DecisionTreeClassifier(random_state=0, max_depth=2)
modelAdaBoost = AdaBoostClassifier(modelDecTree, n_estimators = 1000, random_state = 0, learning_rate = 1)
modelAdaBoost.fit(X_train, y_train)
predictionsAdaBoost = modelAdaBoost.predict(X_test)
accuracyAdaBoost = metrics.accuracy_score(y_test, predictionsAdaBoost)
print("Accuracy (AdaBoost): ", accuracyAdaBoost)
timeAdaBoost = time.time() - start_timeAdaBoost
print("Time taken to achieve result: %s seconds" % timeAdaBoost)



Accuracy (AdaBoost):  0.5034
Time taken to achieve result: 167.02821230888367 seconds


In [152]:
from sklearn.ensemble import RandomForestClassifier

#Random Forests
start_timeRandForests = time.time()
modelRandForests = RandomForestClassifier(n_estimators = 1000, n_jobs = -1, random_state = 0)
modelRandForests.fit(X_train, y_train)
predictionsRandForests = modelRandForests.predict(X_test)
accuracyRandForests = metrics.accuracy_score(y_test, predictionsRandForests)
print("Accuracy (Random Forests): ", accuracyRandForests)
timeRandForests = time.time() - start_timeRandForests
print("Time taken to achieve result: %s seconds" % (timeRandForests))

Accuracy (Random Forests):  0.5193
Time taken to achieve result: 38.084845542907715 seconds


In [157]:
import warnings

#XGBoost
start_timeXGBoost = time.time()
warnings.filterwarnings('ignore')
modelXGBoost = XGBClassifier(learning_rate = 0.2, n_estimators = 2000, verbosity = 0, use_label_encoder = False, n_jobs = -1)
modelXGBoost.fit(X_train, y_train)
predictionsXGBoost = modelXGBoost.predict(X_test)
accuracyXGBoost = metrics.accuracy_score(y_test, predictionsXGBoost)
print("Accuracy (XGBoost): ", accuracyXGBoost)
timeXGBoost = time.time() - start_timeXGBoost
print("Time taken to achieve result: %s seconds" % (timeXGBoost))

Accuracy (XGBoost):  0.5124
Time taken to achieve result: 38.73734736442566 seconds


In [None]:
# Define the model with regularization and dropout
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model with a lower learning rate
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train_scaled, y_train, epochs=10, batch_size=512, validation_data=(X_val_scaled, y_val))

# Evaluate model performance on validation data
val_loss, val_accuracy = model.evaluate(X_val_scaled, y_val)
print("Validation Loss:", val_loss)
print("Validation Accuracy:", val_accuracy)