In [118]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
data = pd.read_csv('data/krkopt.data', names=['wkf', 'wkr', 'wrf', 'wrr', 'bkf', 'bkr', 'result'])

data.head(5)

Unnamed: 0,wkf,wkr,wrf,wrr,bkf,bkr,result
0,a,1,b,3,c,2,draw
1,a,1,c,1,c,2,draw
2,a,1,c,1,d,1,draw
3,a,1,c,1,d,2,draw
4,a,1,c,2,c,1,draw


In [119]:
data.isnull().sum()

wkf       0
wkr       0
wrf       0
wrr       0
bkf       0
bkr       0
result    0
dtype: int64

In [120]:
from collections import Counter
Counter(data.result)

Counter({'draw': 2796,
         'zero': 27,
         'one': 78,
         'two': 246,
         'three': 81,
         'four': 198,
         'five': 471,
         'six': 592,
         'seven': 683,
         'eight': 1433,
         'nine': 1712,
         'ten': 1985,
         'eleven': 2854,
         'twelve': 3597,
         'thirteen': 4194,
         'fourteen': 4553,
         'fifteen': 2166,
         'sixteen': 390})

In [121]:
file_encoder = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6,  'g': 7, 'h': 8}

data.columns
data.wkf.replace(to_replace=file_encoder, inplace=True)
data.wrf.replace(to_replace=file_encoder, inplace=True)
data.bkf.replace(to_replace=file_encoder, inplace=True)

In [122]:
num_moves = {'draw': 17,
 'zero': 0,
 'one': 1,
 'two': 2,
 'three': 3,
 'four': 4,
 'five': 5,
 'six': 6,
 'seven': 7,
 'eight': 8,
'nine': 9,
 'ten': 10,
 'eleven': 11,
 'twelve': 12,
 'thirteen': 13,
 'fourteen': 14,
 'fifteen': 15,
 'sixteen': 16}

In [123]:
type_test = {}
for i in data.wkf:
    tp = type(i)
    if tp not in type_test:
        type_test[tp] = 1
    else:
        type_test[tp] += 1
type_test

{int: 28056}

In [124]:
data.wkf = data.wkf.astype('int64')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28056 entries, 0 to 28055
Data columns (total 7 columns):
wkf       28056 non-null int64
wkr       28056 non-null int64
wrf       28056 non-null int64
wrr       28056 non-null int64
bkf       28056 non-null int64
bkr       28056 non-null int64
result    28056 non-null object
dtypes: int64(6), object(1)
memory usage: 1.5+ MB


In [125]:
data.result.replace(to_replace=num_moves, inplace=True)
#Counter(data.result)

X = data.iloc[:,:-1]
y = data.result
X.head()
#data.head()

Unnamed: 0,wkf,wkr,wrf,wrr,bkf,bkr
0,1,1,2,3,3,2
1,1,1,3,1,3,2
2,1,1,3,1,4,1
3,1,1,3,1,4,2
4,1,1,3,2,3,1


In [126]:
Counter(data.result)

Counter({17: 2796,
         0: 27,
         1: 78,
         2: 246,
         3: 81,
         4: 198,
         5: 471,
         6: 592,
         7: 683,
         8: 1433,
         9: 1712,
         10: 1985,
         11: 2854,
         12: 3597,
         13: 4194,
         14: 4553,
         15: 2166,
         16: 390})

In [127]:
  X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28056 entries, 0 to 28055
Data columns (total 6 columns):
wkf    28056 non-null int64
wkr    28056 non-null int64
wrf    28056 non-null int64
wrr    28056 non-null int64
bkf    28056 non-null int64
bkr    28056 non-null int64
dtypes: int64(6)
memory usage: 1.3 MB


In [128]:
X.shape

(28056, 6)

In [129]:
y.shape

(28056,)

In [132]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)



In [133]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

accuracy_score(y_predict, y_test)

0.7744119743406985

In [134]:
model.feature_importances_

array([0.05099288, 0.08886133, 0.24740484, 0.24232771, 0.20520361,
       0.16520963])

In [135]:
random_grid = {
               'max_depth': [10, 15, 20],
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1,2, 4],
               'bootstrap': [True, False]
}

In [136]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(model, random_grid, cv=5)

In [137]:
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=10, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='wa

In [138]:
grid.best_estimator_

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
                       max_depth=20, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [140]:
model = grid.best_estimator_

y_predict = model.predict(X_test)
accuracy_score(y_test, y_predict)

0.7986457590876693

In [145]:
from xgboost import XGBClassifier

model = XGBClassifier(random_state=42, n_estimators=500, learning_rate=1)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
accuracy_score(y_test, y_predict)


0.8273342836778332