In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

df = pd.read_csv('games.csv')
df = df[['winner', 'rated', 'created_at', 'last_move_at', 'turns', 'victory_status', 'increment_code',
         'white_rating', 'black_rating', 'opening_eco', 'opening_ply']]
df.winner.replace({"white": 1, "black": -1, "draw": 0}, inplace=True)

dfDum = pd.get_dummies(df)

X = dfDum.drop('winner', axis=1)
y = dfDum.winner.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 


# KNN

In [12]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
pred = knn.predict(X_test)
from sklearn.metrics import classification_report
print(classification_report(y_test, pred))
'''
.46 accuracy
'''

              precision    recall  f1-score   support

          -1       0.43      0.47      0.45      1816
           0       0.09      0.02      0.03       192
           1       0.49      0.49      0.49      2004

    accuracy                           0.46      4012
   macro avg       0.34      0.33      0.33      4012
weighted avg       0.45      0.46      0.45      4012



'\n.47 accuracy\n'

In [13]:
knn2 = KNeighborsClassifier()
param_grid = {'n_neighbors': [i for i in range(1, 100)]}
gs = GridSearchCV(knn2, param_grid, cv=10, scoring='accuracy')
gs.fit(X_train, y_train)
pred = gs.best_estimator_.predict(X_test)

print(gs.best_estimator_)
print(classification_report(y_test, pred))
'''
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=73, p=2,
                     weights='uniform')
.48 accuracy
'''

KNeighborsClassifier(n_neighbors=73)
              precision    recall  f1-score   support

          -1       0.45      0.38      0.41      1816
           0       0.00      0.00      0.00       192
           1       0.50      0.61      0.55      2004

    accuracy                           0.48      4012
   macro avg       0.31      0.33      0.32      4012
weighted avg       0.45      0.48      0.46      4012



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


"\nKNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,\n                     weights='uniform')\n.48 accuracy\n"

# Decision tree

In [10]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
pred = dt.predict(X_test)
print(classification_report(y_test, pred))
'''
.65 accuracy
'''

              precision    recall  f1-score   support

          -1       0.62      0.59      0.61      1816
           0       0.99      0.94      0.97       192
           1       0.64      0.67      0.66      2004

    accuracy                           0.65      4012
   macro avg       0.75      0.74      0.74      4012
weighted avg       0.65      0.65      0.65      4012



'\n.65 accuracy\n'

In [11]:
dt2 = DecisionTreeClassifier()
param_grid = {'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random']}
gs = GridSearchCV(dt2, param_grid, cv=10, scoring='accuracy')
gs.fit(X_train, y_train)

print(gs.best_estimator_)
pred = gs.best_estimator_.predict(X_test)
print(classification_report(y_test, pred))
'''
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
.64 accuracy
'''

DecisionTreeClassifier(criterion='entropy')
              precision    recall  f1-score   support

          -1       0.60      0.60      0.60      1816
           0       0.98      0.95      0.96       192
           1       0.64      0.64      0.64      2004

    accuracy                           0.64      4012
   macro avg       0.74      0.73      0.73      4012
weighted avg       0.64      0.64      0.64      4012



"\nDecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',\n                       max_depth=None, max_features=None, max_leaf_nodes=None,\n                       min_impurity_decrease=0.0, min_impurity_split=None,\n                       min_samples_leaf=1, min_samples_split=2,\n                       min_weight_fraction_leaf=0.0, presort='deprecated',\n                       random_state=None, splitter='best')\n.64 accuracy\n"

# Naive Bayes

In [33]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
pred = gnb.predict(X_test)
print(classification_report(y_test, pred))
'''
.47 accuracy
'''

              precision    recall  f1-score   support

          -1       0.45      0.57      0.50      1816
           0       0.00      0.00      0.00       192
           1       0.51      0.43      0.47      2004

    accuracy                           0.47      4012
   macro avg       0.32      0.33      0.32      4012
weighted avg       0.46      0.47      0.46      4012



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'\naccuracy of .47\n'

In [None]:
from sklearn.naive_bayes import BernoulliNB
bnb = GaussianNB()
bnb.fit(X_train, y_train)
pred = bnb.predict(X_test)
print(classification_report(y_test, pred))
'''
.47 accuracy
'''

In [17]:
gnb2 = GaussianNB()
param_grid = {'var_smoothing': [i for i in range(1, 10)]}
gsnb = GridSearchCV(gnb2, param_grid, cv=10, scoring='accuracy')
gsnb.fit(X_train, y_train)
pred = gsnb.best_estimator_.predict(X_test)
print(classification_report(y_test, pred))
print(gsnb.best_estimator_)
'''
GaussianNB(var_smoothing=1)
.5 accuracy
'''

  n_ij = - 0.5 * np.sum(np.log(2. * np.pi * self.sigma_[i, :]))
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) /
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) /
  n_ij = - 0.5 * np.sum(np.log(2. * np.pi * self.sigma_[i, :]))
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) /
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) /
  n_ij = - 0.5 * np.sum(np.log(2. * np.pi * self.sigma_[i, :]))
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) /
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) /
  n_ij = - 0.5 * np.sum(np.log(2. * np.pi * self.sigma_[i, :]))
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) /
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) /
  n_ij = - 0.5 * np.sum(np.log(2. * np.pi * self.sigma_[i, :]))
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) /
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) /
  n_ij = - 0.5 * np.sum(np.log(2. * np.pi * self.sigma_[i, :]))
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) /


              precision    recall  f1-score   support

          -1       0.45      0.57      0.50      1816
           0       0.00      0.00      0.00       192
           1       0.51      0.43      0.47      2004

    accuracy                           0.47      4012
   macro avg       0.32      0.33      0.32      4012
weighted avg       0.46      0.47      0.46      4012



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'\nGaussianNB(var_smoothing=1)\naccuracy of .5\n'

# Random Forest 

In [8]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
pred = rfc.predict(X_test)
print(classification_report(y_test, pred))
'''
.68 accuracy
'''

              precision    recall  f1-score   support

          -1       0.66      0.62      0.64      1816
           0       1.00      0.94      0.97       192
           1       0.67      0.71      0.69      2004

    accuracy                           0.68      4012
   macro avg       0.78      0.76      0.77      4012
weighted avg       0.68      0.68      0.68      4012



'\n.68 accuracy\n'

In [32]:
rfc2 = RandomForestClassifier()
param_grid = {'n_estimators': [i for i in range(1, 50)]}
gsnb = GridSearchCV(rfc2, param_grid, cv=10, scoring='accuracy')
gsnb.fit(X_train, y_train)
pred = gsnb.best_estimator_.predict(X_test)
print(gsnb.best_estimator_)
print(classification_report(y_test, pred))
''''
RandomForestClassifier(n_estimators=45)
.68 accuracy
'''

RandomForestClassifier(n_estimators=45)
              precision    recall  f1-score   support

          -1       0.65      0.61      0.63      1816
           0       1.00      0.94      0.97       192
           1       0.67      0.71      0.69      2004

    accuracy                           0.68      4012
   macro avg       0.77      0.75      0.76      4012
weighted avg       0.68      0.68      0.68      4012



# Logistic Regression

In [29]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(multi_class = 'multinomial')
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
print(classification_report(y_test, pred))
'''
.5 accuracy
'''

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00      1816
           0       0.00      0.00      0.00       192
           1       0.50      1.00      0.67      2004

    accuracy                           0.50      4012
   macro avg       0.17      0.33      0.22      4012
weighted avg       0.25      0.50      0.33      4012



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'\naccuracy of .5\n'

In [31]:
lr2 = LogisticRegression(multi_class='multinomial')
param_grid = {'C': [i/10 for i in range(1, 10)]}
lrgs = GridSearchCV(lr2, param_grid, cv=10, scoring='accuracy')
lrgs.fit(X_train, y_train)
pred = lrgs.best_estimator_.predict(X_test)
print(classification_report(y_test, pred))
print(lrgs.best_estimator_)
'''
LogisticRegression(C=0.1, multi_class='multinomial')
.5 accuracy
'''

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00      1816
           0       0.00      0.00      0.00       192
           1       0.50      1.00      0.67      2004

    accuracy                           0.50      4012
   macro avg       0.17      0.33      0.22      4012
weighted avg       0.25      0.50      0.33      4012

LogisticRegression(C=0.1, multi_class='multinomial')


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Gradient Boost

In [4]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
pred = gbc.predict(X_test)
print(classification_report(y_test, pred))
'''
.69 accuracy
'''

              precision    recall  f1-score   support

          -1       0.68      0.61      0.64      1816
           0       0.99      0.94      0.97       192
           1       0.68      0.74      0.71      2004

    accuracy                           0.69      4012
   macro avg       0.78      0.76      0.77      4012
weighted avg       0.69      0.69      0.69      4012



'\naccuracy of .69\n'

In [35]:
gbc2 = GradientBoostingClassifier()
param_grid = {'n_estimators': [i*100 for i in range(1, 5)]}
gbcgs = GridSearchCV(gbc2, param_grid=param_grid, scoring='accuracy', cv=10)
gbcgs.fit(X_train, y_train)
pred = gbcgs.best_estimator_.predict(X_test)
print(gbcgs.best_estimator_)
print(classification_report(y_test, pred))
'''
GradientBoostingClassifier(n_estimators=400)
.81 accuracy
'''

GradientBoostingClassifier(n_estimators=400)
              precision    recall  f1-score   support

          -1       0.81      0.75      0.78      1816
           0       0.99      0.94      0.97       192
           1       0.79      0.84      0.81      2004

    accuracy                           0.81      4012
   macro avg       0.86      0.84      0.85      4012
weighted avg       0.81      0.81      0.80      4012



In [5]:
gbc3 = GradientBoostingClassifier()
param_grid = {'n_estimators': [i*100 for i in range(5, 10)]}
print('starting grid search')
gbcgs = GridSearchCV(gbc3, param_grid=param_grid, scoring='accuracy', cv=10)
print('grid search done')
gbcgs.fit(X_train, y_train)
pred = gbcgs.best_estimator_.predict(X_test)
print(gbcgs.best_estimator_)
print(classification_report(y_test, pred))
'''
GradientBoostingClassifier(n_estimators=900)
.88 accuracy
'''

starting grid search
grid search done
GradientBoostingClassifier(n_estimators=900)
              precision    recall  f1-score   support

          -1       0.88      0.85      0.87      1816
           0       0.99      0.94      0.97       192
           1       0.87      0.90      0.88      2004

    accuracy                           0.88      4012
   macro avg       0.91      0.90      0.91      4012
weighted avg       0.88      0.88      0.88      4012



In [7]:
gbc4 = GradientBoostingClassifier(n_estimators = 1000)
gbc4.fit(X_train, y_train)
pred = gbc4.predict(X_test)
print(classification_report(y_test, pred))
'''
Best accuracy with 88% 
'''

              precision    recall  f1-score   support

          -1       0.88      0.86      0.87      1816
           0       0.99      0.94      0.97       192
           1       0.87      0.90      0.89      2004

    accuracy                           0.88      4012
   macro avg       0.92      0.90      0.91      4012
weighted avg       0.88      0.88      0.88      4012

