# EDA and Model Performance

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import json
import pickle
import joblib
import numpy as np

In [2]:
# Load the tic-tac-toe games CSV and preview its first rows.
df = pd.read_csv("optimal_steps_tic_tac_toe_games_dataset.csv")
df.head(15)

Unnamed: 0.1,Unnamed: 0,point_1,point_2,point_3,point_4,point_5,point_6,point_7,point_8,point_9,best_step
0,0,h,c,2,3,h,5,6,c,8,[8]
1,1,h,h,c,3,c,h,6,7,8,[6]
2,2,0,1,h,3,4,5,c,h,8,"[0, 1, 4]"
3,3,0,1,2,c,4,h,h,7,c,"[0, 1, 2, 4, 7]"
4,4,h,c,h,c,h,5,6,h,c,[6]
5,5,0,c,2,3,c,5,h,h,8,[8]
6,6,h,1,2,3,4,c,h,c,8,[3]
7,7,0,c,2,3,h,5,h,c,8,[2]
8,8,0,c,c,h,4,c,h,h,8,"[0, 8]"
9,9,0,1,c,3,c,h,6,7,h,[6]


In [4]:
# List the column names of the loaded frame.
df.columns

Index(['Unnamed: 0', 'point_1', 'point_2', 'point_3', 'point_4', 'point_5',
       'point_6', 'point_7', 'point_8', 'point_9', 'best_step'],
      dtype='object')

In [3]:
# Reload the raw games so this cell can be re-run on its own: the encoding
# below overwrites the board columns of `df` in place.
df = pd.read_csv("optimal_steps_tic_tac_toe_games_dataset.csv")
df = df.drop(columns=['Unnamed: 0'])  # presumably a leftover exported index -- TODO confirm
print(df.shape)

# Encode every board cell (string) as an integer, one LabelEncoder per column.
# Each column gets its own encoder because each column has its own set of values.
label_encoders = {}
for col in df.columns[:-1]:  # every column except 'best_step'
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype('str'))
    label_encoders[col] = le  # keep the fitted encoder for this column

# Persist the fitted encoders to disk.
joblib.dump(label_encoders, "label_encoders.pkl")

# 'best_step' is stored as JSON text (e.g. "[0, 1, 4]"); parse it into a list.
df['best_step'] = df['best_step'].apply(json.loads)

# Expand to one row per (board, optimal move) so a single-label classifier can
# be trained. `original_index` records which row of `df` each expanded row
# came from, so the full list of optimal moves can be looked up later.
expanded_df = (
    df.explode('best_step')
      .dropna(subset=['best_step'])   # an empty move list explodes to NaN: emit no row for it
      .rename_axis('original_index')
      .reset_index()
)
expanded_df['best_step'] = expanded_df['best_step'].astype('int64')  # explode() leaves object dtype
expanded_df = expanded_df[[*df.columns, 'original_index']]  # board cells, best_step, original_index

expanded_df.head(15)

(1000, 10)


Unnamed: 0,point_1,point_2,point_3,point_4,point_5,point_6,point_7,point_8,point_9,best_step,original_index
0,2,1,0,0,2,0,0,1,0,8,0
1,2,2,1,0,1,2,0,0,0,6,1
2,0,0,2,0,0,0,1,2,0,0,2
3,0,0,2,0,0,0,1,2,0,1,2
4,0,0,2,0,0,0,1,2,0,4,2
5,0,0,0,1,0,2,2,0,1,0,3
6,0,0,0,1,0,2,2,0,1,1,3
7,0,0,0,1,0,2,2,0,1,2,3
8,0,0,0,1,0,2,2,0,1,4,3
9,0,0,0,1,0,2,2,0,1,7,3


In [11]:
# define the features (X) and the target variable (y)
X = expanded_df.drop(columns=['best_step', 'original_index']).to_numpy()
y = expanded_df['best_step'].to_numpy()
original_indices = expanded_df['original_index'].to_numpy()

# train y test data
X_train, X_test, y_train, y_test, train_indices, test_indices = train_test_split(
    X, y, original_indices, test_size=0.2, random_state=42)

In [12]:
# Inspect the training feature matrix.
X_train

array([[0, 0, 0, ..., 0, 2, 0],
       [2, 1, 0, ..., 0, 2, 0],
       [2, 2, 0, ..., 1, 0, 2],
       ...,
       [1, 0, 2, ..., 2, 1, 0],
       [1, 2, 2, ..., 0, 0, 0],
       [1, 1, 0, ..., 2, 0, 0]])

## Classifier Selection

In [29]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, hamming_loss, jaccard_score, f1_score
from sklearn.naive_bayes import GaussianNB

In [44]:
def classifier_performance(model, param_grid, train_data, train_labels, test_data, test_labels,
                           test_label_sets=None):
    """Tune `model` with a grid search and report its accuracy on the test data.

    A prediction counts as correct when it is contained in the collection of
    acceptable labels for that test row, so the printed accuracy differs from
    the exact-match 'accuracy' used to rank candidates inside the grid search.

    Parameters
    ----------
    model : estimator passed to GridSearchCV.
    param_grid : dict (or list of dicts) of hyperparameters to search.
    train_data, train_labels : data the grid search is fitted on.
    test_data : data the selected estimator predicts on.
    test_labels : single-label test targets. Not used for scoring; kept so
        existing calls keep working.
    test_label_sets : optional sequence with one collection of acceptable
        labels per test row. When omitted, it is looked up from the notebook
        globals as `df.loc[test_indices, 'best_step']`, which is only valid
        when `test_data` is the notebook's own test split.

    Returns
    -------
    The best estimator found by the grid search, fitted on all of `train_data`.

    Raises
    ------
    ValueError
        If the number of label collections differs from the number of predictions.
    """
    # Search for the best hyperparameters (cv=None uses GridSearchCV's default CV).
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=None, scoring='accuracy', verbose=1)
    grid_search.fit(train_data, train_labels)

    # GridSearchCV refits the winning candidate on the full training data by
    # default (refit=True), so fitting it a second time is unnecessary.
    best_model = grid_search.best_estimator_

    # Predict on the test data.
    y_pred = best_model.predict(test_data)

    if test_label_sets is None:
        # Backward-compatible fallback to the notebook-level split.
        test_label_sets = df.loc[test_indices, 'best_step'].tolist()

    if len(test_label_sets) != len(y_pred):
        raise ValueError(
            f"got {len(test_label_sets)} label sets for {len(y_pred)} predictions"
        )

    # Share of predictions that are one of the acceptable labels for their row.
    hits = sum(1 for pred, actual in zip(y_pred, test_label_sets) if pred in actual)
    accuracy = hits / len(y_pred)
    print(f"Accuracy: {accuracy:.2f}")

    return best_model

### Random Forest Classifier

In [48]:
# Hyperparameter grid for the random forest: 5 x 2 x 3 = 30 candidates.
rfc_param_grid = dict(
    n_estimators=[100, 300, 500, 700, 900],
    max_features=[1.0, 'sqrt'],
    criterion=['gini', 'entropy', 'log_loss'],
)

# Tune on the training split, report accuracy on the test split, and keep the
# selected estimator under the name `rfc`.
rfc = classifier_performance(
    RandomForestClassifier(random_state=42),
    rfc_param_grid,
    X_train, y_train,
    X_test, y_test,
)

# Persist the selected model to disk.
joblib.dump(rfc, "rfc.pkl")

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Accuracy: 0.80


['rfc.pkl']

In [14]:
# Show the hyperparameters of the estimator currently bound to `rfc`.
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 500,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

### SVM

In [49]:
svm = SVC(random_state=42)
# Hyperparameter grid for the SVM: 3 x 2 x 4 x 3 x 3 = 216 candidates.
svm_param_grid = {
    "C": [1, 10, 100],
    "decision_function_shape": ['ovo', 'ovr'],
    "kernel": ['linear', 'rbf', 'poly', 'sigmoid'],
    "gamma": [0.01, 0.1, 'scale'],
    "coef0": [1, -1, 0]
}

# Scale features to [0, 1]. The scaler is fitted on the training split only
# and then reused for the test split: fitting it again on the test data would
# scale the two splits with different min/max values and leak test statistics.
min_max_scaler = MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train)
X_test_minmax = min_max_scaler.transform(X_test)

# NOTE(review): only the model is saved below; anything loading svm.pkl must
# apply the same scaling to its inputs -- confirm how the consumer handles it.
svm = classifier_performance(svm, svm_param_grid, X_train_minmax, y_train, X_test_minmax, y_test)
joblib.dump(svm, "svm.pkl")

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Accuracy: 0.68


['svm.pkl']

In [27]:
# Show the hyperparameters of the estimator currently bound to `svm`.
svm.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': 42,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

### Naive Bayes

In [47]:
# Gaussian Naive Bayes: the grid holds a single candidate, so the grid search
# only serves to run it through the same evaluation path as the other models.
gnb_param_grid = dict(
    priors=[None],
    var_smoothing=[1e-09],
)

gnb = classifier_performance(
    GaussianNB(),
    gnb_param_grid,
    X_train, y_train,
    X_test, y_test,
)

# Persist the fitted model to disk.
joblib.dump(gnb, "gnb.pkl")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Accuracy: 0.65


['gnb.pkl']

In [31]:
# Show the hyperparameters of the estimator currently bound to `gnb`.
gnb.get_params()

{'priors': None, 'var_smoothing': 1e-09}

### XGBoost

In [46]:
# NOTE(review): the variable name `xgboost` is the same as the package name;
# it is kept because other cells may refer to it.
xgboost = XGBClassifier(random_state=42)

# `best_step` takes more than two distinct values (board positions), so the
# multiclass objective is requested explicitly; 'binary:logistic' describes a
# two-class problem. The scikit-learn wrapper is expected to switch to a
# multiclass objective on its own for such targets -- confirm against the
# installed xgboost version.
xgboost_param_grid = {
    'objective': ['multi:softprob']
}

xgboost = classifier_performance(xgboost, xgboost_param_grid, X_train, y_train, X_test, y_test)
joblib.dump(xgboost, "xgboost.pkl")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Accuracy: 0.89


['xgboost.pkl']