# Machine learning module


In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from xgboost import XGBClassifier

In [None]:
# Name of the CSV column that is used as the input signal.
feature = "angles_right_arm"
# Root folder (Google Drive mount) that holds the exported result CSVs.
csv_path = "/content/drive/MyDrive/results_csv/"

In [None]:
# Load the windowed dataset from the CSV files of every video
def load_dataset(csv_path, feature, janela, stride):
  """Build a labelled, windowed dataset from the per-video CSV files.

  Every ``.csv`` file inside the ``correct`` and ``incorrect`` sub-folders of
  ``csv_path`` is read, its ``feature`` column is handed to
  ``time_series_window_embedding`` and each item produced by that helper
  becomes one sample.

  Args:
    csv_path: Folder that contains the ``correct`` and ``incorrect``
      sub-folders.
    feature: Name of the CSV column to use as the signal.
    janela: Window length, forwarded to the helper as ``window``.
    stride: Step between windows, forwarded to the helper as ``stride``.

  Returns:
    A tuple ``(X, y)``. ``X`` is the samples stacked row-wise with
    ``np.vstack`` (one row per window, not per video); ``y`` is a 1-D NumPy
    array with one label per row: 1 for ``correct``, 0 for ``incorrect``.

  Raises:
    ValueError: If no window could be extracted from any file.
  """
  windows, labels = [], []
  for label in ("correct", "incorrect"):
    label_id = 0 if label == "incorrect" else 1
    results_path = os.path.join(csv_path, label)
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore any seeded split made later) reproducible.
    for results_name in sorted(os.listdir(results_path)):
      # Skip anything that is not a CSV file (e.g. notebook checkpoint folders).
      if not results_name.lower().endswith(".csv"):
        continue
      df = pd.read_csv(os.path.join(results_path, results_name))
      series_windows = time_series_window_embedding(df[feature].values, window=janela, stride=stride)
      for window in series_windows:
        windows.append(window)
        labels.append(label_id)

  if not windows:
    raise ValueError(f"No windows were extracted from the CSV files under {csv_path!r}")

  X = np.vstack(windows)   # one row per window
  y = np.array(labels)     # one label per row of X
  return X, y

In [None]:
# Build the dataset used by the training cells below (window argument 2**8 = 256, stride 2).
X, Y = load_dataset(
  csv_path,
  feature,
  janela=2 ** 8,
  stride=2,
)

In [None]:
# Training with a Decision Tree classifier
def train_clf(X, Y):
  """Tune, fit and evaluate a ``DecisionTreeClassifier`` on ``(X, Y)``.

  A stratified 20% test split is held out. A 5-fold ``GridSearchCV`` scored
  by accuracy is run on the remaining data, and the best estimator it found
  is evaluated on the held-out split. Everything is reported with ``print``;
  nothing is returned.

  Args:
    X: Feature matrix; ``X.shape[0]`` is printed as the number of samples.
    Y: Labels aligned with the rows of ``X``; also used to stratify the split.
  """
  # NOTE(review): if the rows of X are overlapping windows cut from the same
  # recording, a random row-wise split can place near-duplicate windows in
  # both train and test and inflate the scores - consider a group-wise split.
  # TODO confirm against how the caller builds X.
  held_out_fraction = 0.20
  X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=held_out_fraction, random_state=42, stratify=Y
  )
  print(f"Total: {X.shape[0]} amostras")

  # Decision tree hyperparameters explored by the grid search.
  search_space = {
    "criterion": ["gini", "entropy"],
    "max_depth": [None, 3, 5, 10],
    "min_samples_split": [2, 5, 10],
  }

  # Exhaustive search over the grid above on the training portion only.
  base_tree = DecisionTreeClassifier(random_state=42)
  search = GridSearchCV(base_tree, search_space, scoring="accuracy", cv=5, n_jobs=-1, verbose=1)
  search.fit(X_train, y_train)

  tuned_tree = search.best_estimator_
  print("Melhores parâmetros:", search.best_params_)
  print("Acurácia média (CV):", search.best_score_)

  # Final evaluation on the held-out test split.
  test_predictions = tuned_tree.predict(X_test)
  test_accuracy = tuned_tree.score(X_test, y_test)
  print("Acurácia:", test_accuracy)
  report = classification_report(y_test, test_predictions)
  print("\nRelatório de classificação Teste:\n", report)
  matrix = confusion_matrix(y_test, test_predictions)
  print("\nMatriz de confusão Teste:\n", matrix)

In [None]:
# Run the Decision Tree grid search and evaluation on the loaded dataset; results are printed.
train_clf(X,Y)

Total: 35191 amostras
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Melhores parâmetros: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 10}
Acurácia média (CV): 0.9512291542662368
Acurácia: 0.9538286688450064

Relatório de classificação Teste:
               precision    recall  f1-score   support

           0       0.98      0.90      0.94      2763
           1       0.94      0.99      0.96      4276

    accuracy                           0.95      7039
   macro avg       0.96      0.94      0.95      7039
weighted avg       0.96      0.95      0.95      7039


Matriz de confusão Teste:
 [[2483  280]
 [  45 4231]]


In [None]:
# Training with XGBoost
def train_xgboost(X, Y):
  """Tune, fit and evaluate an ``XGBClassifier`` on ``(X, Y)``.

  A stratified 20% test split is held out. A 5-fold ``GridSearchCV`` scored
  by accuracy is run on the remaining data, and the best estimator it found
  is evaluated on the held-out split. Everything is reported with ``print``;
  nothing is returned.

  Args:
    X: Feature matrix, one sample per row.
    Y: Labels aligned with the rows of ``X``; also used to stratify the split.
  """
  # NOTE(review): if the rows of X are overlapping windows cut from the same
  # recording, a random row-wise split can place near-duplicate windows in
  # both train and test and inflate the scores - consider a group-wise split.
  # TODO confirm against how the caller builds X.
  held_out_fraction = 0.20
  X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=held_out_fraction, random_state=42, stratify=Y
  )

  # XGBoost hyperparameters explored by the grid search.
  search_space = {
    "max_depth": [3, 5, 10],
    "learning_rate": [0.001, 0.01, 0.1],
    "subsample": [0.5, 0.7, 1],
  }

  # Exhaustive search over the grid above on the training portion only.
  base_model = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
  search = GridSearchCV(base_model, search_space, cv=5, scoring="accuracy", n_jobs=-1, verbose=1)
  search.fit(X_train, y_train)

  tuned_model = search.best_estimator_
  print("Melhores parâmetros:", search.best_params_)
  print("Acurácia média (CV):", search.best_score_)

  # Final evaluation on the held-out test split.
  test_predictions = tuned_model.predict(X_test)
  test_accuracy = tuned_model.score(X_test, y_test)
  print("Acurácia:", test_accuracy)
  report = classification_report(y_test, test_predictions)
  print("\nRelatório de classificação Teste:\n", report)
  matrix = confusion_matrix(y_test, test_predictions)
  print("\nMatriz de confusão Teste:\n", matrix)

In [None]:
# Run the XGBoost grid search and evaluation on the loaded dataset; results are printed.
train_xgboost(X,Y)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Melhores parâmetros: {'learning_rate': 0.1, 'max_depth': 10, 'subsample': 1}
Acurácia média (CV): 0.9736786748565492
=== Teste Final ===
Acurácia no teste: 0.9761

Relatório de classificação:
               precision    recall  f1-score   support

           0       1.00      0.94      0.97      2763
           1       0.96      1.00      0.98      4276

    accuracy                           0.98      7039
   macro avg       0.98      0.97      0.97      7039
weighted avg       0.98      0.98      0.98      7039

Matriz de confusão:
 [[2601  162]
 [   6 4270]]
