In [38]:
import pandas as pd
import numpy as np
import matplotlib as plot
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
import warnings

In [39]:
df = pd.read_csv("../data/games.csv")

In [40]:
df.head()

Unnamed: 0,id,rated,created_at,last_move_at,turns,victory_status,winner,increment_code,white_id,white_rating,black_id,black_rating,moves,opening_eco,opening_name,opening_ply
0,TZJHLljE,False,1504210000000.0,1504210000000.0,13,outoftime,white,15+2,bourgris,1500,a-00,1191,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5...,D10,Slav Defense: Exchange Variation,5
1,l1NXvwaE,True,1504130000000.0,1504130000000.0,16,resign,black,5+10,a-00,1322,skinnerua,1261,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...,B00,Nimzowitsch Defense: Kennedy Variation,4
2,mIICvQHh,True,1504130000000.0,1504130000000.0,61,mate,white,5+10,ischia,1496,a-00,1500,e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...,C20,King's Pawn Game: Leonardis Variation,3
3,kWKvrqYL,True,1504110000000.0,1504110000000.0,61,mate,white,20+0,daniamurashov,1439,adivanov2009,1454,d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...,D02,Queen's Pawn Game: Zukertort Variation,3
4,9tXo1AUZ,True,1504030000000.0,1504030000000.0,95,mate,white,30+3,nik221107,1523,adivanov2009,1469,e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...,C41,Philidor Defense,5


In [41]:
df.shape

(20058, 16)

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20058 entries, 0 to 20057
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              20058 non-null  object 
 1   rated           20058 non-null  bool   
 2   created_at      20058 non-null  float64
 3   last_move_at    20058 non-null  float64
 4   turns           20058 non-null  int64  
 5   victory_status  20058 non-null  object 
 6   winner          20058 non-null  object 
 7   increment_code  20058 non-null  object 
 8   white_id        20058 non-null  object 
 9   white_rating    20058 non-null  int64  
 10  black_id        20058 non-null  object 
 11  black_rating    20058 non-null  int64  
 12  moves           20058 non-null  object 
 13  opening_eco     20058 non-null  object 
 14  opening_name    20058 non-null  object 
 15  opening_ply     20058 non-null  int64  
dtypes: bool(1), float64(2), int64(4), object(9)
memory usage: 2.3+ MB


In [43]:
cols_to_drop = ['winner', 'id', 'white_id', 'black_id', 'moves', 'last_move_at', 'victory_status', 'turns']
x = df.drop(columns=cols_to_drop, axis=1)
y = df['winner']

In [44]:
from sklearn.calibration import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(y)
print(f"Classes mapeadas: {le.classes_}")

Classes mapeadas: ['black' 'draw' 'white']


In [45]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer

num_features = x.select_dtypes(exclude='object').columns
cat_features = x.select_dtypes(include='object').columns

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", OneHotEncoder(handle_unknown='ignore'), cat_features),
        ("StandardScaler", StandardScaler(), num_features)
    ]
)

In [46]:
x_processed = preprocessor.fit_transform(x)

In [47]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    x_processed, y, test_size=0.3, random_state=42
)
print(X_train.shape)
print(X_test.shape)

(14040, 2247)
(6018, 2247)


In [48]:
from sklearn.metrics import log_loss

def evaluate_model_proba(true, predict_labels, predict_probs):
    acc = accuracy_score(true, predict_labels)
    loss = log_loss(true, predict_probs) 
    return acc, loss

In [None]:
from sklearn.metrics import log_loss, accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV


params = {
    "KNeighborsClassifier": {
        'n_neighbors': [5, 11, 21],
        'weights': ['uniform'], 
        'algorithm': ['auto']
    },
    "DecisionTreeClassifier": {
        'criterion': ['gini', 'entropy'],
        'max_depth': [5, 8, 12],
        'min_samples_leaf': [10, 20],
        'class_weight': ['balanced']
    },
    "RandomForestClassifier": {
        'n_estimators': [100, 200],
        'max_depth': [10, 15], 
        'min_samples_split': [5, 10],
        'class_weight': ['balanced']
    },
    "AdaBoostClassifier": {
        'n_estimators': [50, 100],
        'learning_rate': [0.1, 0.5, 1.0]
    },
    "XGBClassifier": {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8],
        'max_delta_step': [1] 
    }
}

models = {
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "RandomForestClassifier": RandomForestClassifier(n_jobs=1),
    "XGBClassifier": XGBClassifier(n_jobs=1)
}

model_list = []
acc_test_list = []
log_loss_list = []

print("Iniciando Treinamento com Tuning e Probabilidades")

for model_name, model_obj in models.items():
    para = params[model_name]

    rs = RandomizedSearchCV(model_obj, para, n_iter=5, cv=3, n_jobs=-1, verbose=0, random_state=42)
    rs.fit(X_train, y_train)

    best_model = rs.best_estimator_
    
    y_test_pred = best_model.predict(X_test)
    y_test_probs = best_model.predict_proba(X_test)

    
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_log_loss = log_loss(y_test, y_test_probs)

    model_list.append(model_name)
    acc_test_list.append(test_accuracy)
    log_loss_list.append(test_log_loss)
    
    print(f"Modelo: {model_name}")
    print(f"Accuracy: {test_accuracy:.4f} Log Loss: {test_log_loss:.4f}")
    print("-" * 40)

results_df = pd.DataFrame({
    'Model Name': model_list,
    'Accuracy': acc_test_list,
    'Log Loss': log_loss_list
}).sort_values(by='Log Loss', ascending=True)

print("Ordenado por Log Loss:")
print(results_df)

Iniciando Treinamento com Tuning e Probabilidades...





✅ Modelo: KNeighborsClassifier
   Accuracy: 0.5788 | Log Loss: 1.2198
----------------------------------------
✅ Modelo: DecisionTreeClassifier
   Accuracy: 0.4840 | Log Loss: 1.0972
----------------------------------------
✅ Modelo: AdaBoostClassifier
   Accuracy: 0.6130 | Log Loss: 1.0675
----------------------------------------
✅ Modelo: RandomForestClassifier
   Accuracy: 0.5548 | Log Loss: 1.0507
----------------------------------------
✅ Modelo: XGBClassifier
   Accuracy: 0.6198 | Log Loss: 0.7650
----------------------------------------

RANKING FINAL (Ordenado por Log Loss):
               Model Name  Accuracy  Log Loss
4           XGBClassifier  0.619807  0.764975
3  RandomForestClassifier  0.554835  1.050689
2      AdaBoostClassifier  0.612994  1.067517
1  DecisionTreeClassifier  0.484048  1.097195
0    KNeighborsClassifier  0.578764  1.219820


In [50]:
print(df['winner'].value_counts(normalize=True))
print(pd.Series(y_test_pred).value_counts(normalize=True))

winner
white    0.498604
black    0.454033
draw     0.047363
Name: proportion, dtype: float64
2    0.575440
0    0.423895
1    0.000665
Name: proportion, dtype: float64
