In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.io.arff import loadarff 

#Funções de avaliação dos modelos
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    roc_curve,
    auc,
    roc_auc_score,
)
from sklearn.model_selection import(
    train_test_split, 
    cross_val_predict,
    KFold,
    GridSearchCV,
)

from sklearn.preprocessing import label_binarize

#utils
from utils.num import Num

import warnings
warnings.filterwarnings('ignore')

In [None]:
#Classes do modelo de aprendizado
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import SVC

## Definição de funções auxiliares

### Métricas

In [None]:
def compute_metrics(y_true, y_pred, y_score=None, classes=None):
    """Compute classification metrics for one evaluation split.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    y_score : array-like of shape (n_samples, n_classes), optional
        Per-class scores (for example the output of ``predict_proba``).
        When given, one-vs-rest ROC curves are computed as well.
    classes : sequence, optional
        Class labels, in the column order of ``y_score``. Defaults to the
        digits 0-9, which is the class set this function was written for.

    Returns
    -------
    tuple
        ``(accuracy, f1, conf_matrix)`` when ``y_score`` is None, otherwise
        ``(accuracy, f1, conf_matrix, (fpr, tpr, roc_auc))``. The three
        dicts are keyed by the column index of each class plus ``"micro"``
        for the micro-averaged curve.

    Raises
    ------
    ValueError
        If ``y_score`` is not 2-D with exactly one column per class.
    """
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='macro')
    conf_matrix = confusion_matrix(y_true, y_pred)

    if y_score is None:
        return accuracy, f1, conf_matrix

    classes = list(range(10)) if classes is None else list(classes)
    n_classes = len(classes)

    # Fail early with a readable message instead of an IndexError deep in
    # the per-class loop when the score matrix does not match the classes.
    y_score = np.asarray(y_score)
    if y_score.ndim != 2 or y_score.shape[1] != n_classes:
        raise ValueError(
            f"y_score must have shape (n_samples, {n_classes}); "
            f"got {y_score.shape}"
        )

    y_bin = label_binarize(y_true, classes=classes)
    if n_classes == 2:
        # label_binarize returns a single column (indicator of classes[1])
        # for binary problems; expand it to one indicator column per class.
        y_bin = np.hstack((1 - y_bin, y_bin))

    # One-vs-rest ROC curve and area for each class
    fpr, tpr, roc_auc = {}, {}, {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Micro-average: pool every (sample, class) decision into one curve
    fpr["micro"], tpr["micro"], _ = roc_curve(y_bin.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    return accuracy, f1, conf_matrix, (fpr, tpr, roc_auc)

def plot_metrics(model_name:str, acc:list,
                 f1:list, cm:list, roc:list) -> None:
    """Print a cross-validation summary and draw the mean confusion matrix.

    Parameters
    ----------
    model_name : str
        Name shown in the summary header.
    acc, f1 : list
        Per-fold accuracy and F1 values; mean and standard deviation are
        printed for each.
    cm : list
        Per-fold confusion matrices, averaged element-wise for the heatmap.
    roc : list
        Per-fold ROC data. Accepted for symmetry with the other metrics
        but not used by this function.
    """
    print(f"Resumo das métricas do modelo {model_name}\n")
    for label, values in (("Accurácia média", acc), ("F1-Score média", f1)):
        print(f"{label} (desvio): {np.mean(values):.3f} ({np.std(values):.3f})")

    # Element-wise mean of the per-fold confusion matrices
    mean_cm = np.mean(cm, axis=0)
    heatmap_ax = sns.heatmap(mean_cm, linewidths=0.5, annot=True, fmt='g')
    # Keep the row labels horizontal so the class ids stay readable
    heatmap_ax.set_yticklabels(heatmap_ax.get_yticklabels(), rotation=0)
    heatmap_ax.set_title('Matriz de Confusão Média')
    heatmap_ax.set_xlabel('Classes preditas')
    heatmap_ax.set_ylabel('Classes verdadeiras')
    plt.show()
    

### Para K-Fold CV e GridSearch

In [None]:
def _class_scores(model, X):
    """Return per-class scores from a fitted model, or None if it has none."""
    if hasattr(model, 'predict_proba'):
        return model.predict_proba(X)

    if hasattr(model, 'decision_function'):
        scores = np.asarray(model.decision_function(X))
        classes = getattr(model, 'classes_', None)
        # Only usable for ROC when there is exactly one column per class
        # (binary models return a 1-D array, one-vs-one SVMs return pairs).
        if scores.ndim == 2 and (classes is None or scores.shape[1] == len(classes)):
            return scores

    return None


def kfold_run(kf, model, X, y):
    """Fit and evaluate ``model`` on every split produced by ``kf``.

    Parameters
    ----------
    kf : cross-validation splitter
        Object whose ``split(X, y)`` yields ``(train, test)`` index arrays.
    model : estimator
        Object with ``fit`` and ``predict``. ``predict_proba`` is used for
        the ROC data when available, then ``decision_function``; models
        with neither still run, with ``None`` stored as their ROC entry.
    X, y : pandas objects
        Features and target; both are indexed positionally with ``.iloc``.

    Returns
    -------
    tuple of four lists
        Per-fold accuracies, F1 scores, confusion matrices and ROC data
        (``None`` for folds where no class scores were available).
    """
    accuracies_list = []
    f1_list = []
    conf_matrix_list = []
    roc_list = []

    for fold, (train, test) in enumerate(kf.split(X, y)):
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.iloc[train], y.iloc[test]

        # ravel() flattens a single-column target into the 1-D shape
        # expected by fit()
        model.fit(X_train, y_train.values.ravel())
        y_pred = model.predict(X_test)
        y_score = _class_scores(model, X_test)

        if y_score is None:
            # compute_metrics returns only three values without scores
            acc, f1, conf_matrix = compute_metrics(y_test, y_pred)
            roc = None
        else:
            acc, f1, conf_matrix, roc = compute_metrics(y_test, y_pred, y_score)

        print(f"Fold {fold}\t Accuracy: {acc:.3f}")

        # Present when the model is a fitted hyper-parameter search
        if hasattr(model, 'best_params_'):
            print(F"Model best params: {model.best_params_}")

        accuracies_list.append(acc)
        f1_list.append(f1)
        conf_matrix_list.append(conf_matrix)
        roc_list.append(roc)

    return accuracies_list, f1_list, conf_matrix_list, roc_list


def grid_Search(kf, model, params, X, y):
    """Evaluate a grid-searched version of ``model`` with ``kfold_run``.

    The estimator is wrapped in a ``GridSearchCV`` (accuracy scoring,
    5 inner folds, all cores, refit on the best candidate) and that wrapper
    is evaluated on the splits of ``kf``, so the search is repeated inside
    every outer fold.

    Returns whatever ``kfold_run`` returns for the wrapped estimator.
    """
    search_options = dict(
        estimator=model,
        param_grid=params,
        scoring='accuracy',
        n_jobs=-1,
        cv=5,
        refit=True,
    )
    tuned_model = GridSearchCV(**search_options)

    return kfold_run(kf, tuned_model, X, y)

### Visualização de instâncias do DS

In [None]:
# plot multiple numbers

def plot_images(images):
    """Show every image of ``images`` on a near-square grid.

    Parameters
    ----------
    images : sequence of 2-D arrays
        Images to draw, each rendered with the inverted gray colormap.

    Notes
    -----
    The grid has ``ceil(sqrt(n))`` columns and as many rows as needed, so
    no image is dropped when ``n`` is not a perfect square. For perfect
    squares the layout is the same square grid as before. Nothing is
    returned, so the figure is not rendered twice when the call is the
    last expression of a notebook cell.
    """
    n_images = len(images)

    fig = plt.figure()
    if n_images:
        cols = int(np.ceil(np.sqrt(n_images)))
        rows = int(np.ceil(n_images / cols))

        for position, image in enumerate(images, start=1):
            ax = fig.add_subplot(rows, cols, position)
            ax.imshow(image, cmap='gray_r')
            ax.axis('off')

    fig.suptitle('Representação de números aleatórios do dataset')

### Carregando o DataSet

In [None]:
# Load the dataset from the ARFF file in the working directory. Only element 0
# of loadarff's result (the data records, per the SciPy docs) is kept.
# NOTE(review): SciPy loads nominal attributes as bytes, so the `class` column
# presumably holds b'0'..b'9' until it is cast to int — confirm.
raw_data = loadarff('mnist_784.arff')
df = pd.DataFrame(raw_data[0])

In [None]:
# Preview 36 digits: pick a random class, then a random sample of that class
nums = df['class'].unique()

imgs = list()
for _ in range(36):
    rnd_choice = np.random.choice(nums)
    class_rows = df[df['class'] == rnd_choice]
    # Draw the row position from the real class size instead of assuming
    # every class has at least 1000 samples.
    i = np.random.randint(len(class_rows))
    # Drop the trailing label column and rebuild the 28x28 pixel grid as a
    # plain NumPy array for imshow.
    img = class_rows.iloc[i, :-1].to_numpy(dtype=int).reshape((28, 28))
    imgs.append(img)

plot_images(imgs)


In [None]:
# Class balance: number of samples per digit, ordered by digit
n_dist = df['class'].astype('int').value_counts().sort_index()

# NOTE: `fig` is the matplotlib Axes returned by pandas, not a Figure
fig = n_dist.plot.bar(rot=0)
fig.set_xlabel('Classe')
fig.set_ylabel('Frequência')

fig.set_title('Frequência de cada número (classe)')

#### Separando os dados de treino e teste

In [None]:
# Hold-out split and cross-validation folds.
# A fixed seed makes the split, the folds and therefore every reported
# metric reproducible across "Restart & Run All".
RANDOM_STATE = 42

X = df.iloc[:, 0:-1]                # every column but the last is a feature
y = df.iloc[:, [-1]].astype('int')  # target, kept as a one-column DataFrame

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=RANDOM_STATE
)

kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Experimentos

### KNN

In [None]:
# k-NN: search over the neighbourhood size and the distance metric
knn = KNeighborsClassifier()
knn_params = dict(
    n_neighbors=[3, 5, 7],
    metric=['euclidean', 'manhattan'],
)

knn_acc, knn_f1, knn_cm, knn_roc = grid_Search(
    kf, knn, knn_params, X_train, y_train
)

print("\n-----------------------------------------------------\n")
plot_metrics(knn.__class__.__name__, knn_acc, knn_f1, knn_cm, knn_roc)

### Decision Tree

In [None]:
# Decision tree: search over split criterion, depth, minimum split size and
# the number of features considered at each split
DT = DecisionTreeClassifier()
dt_params = dict(
    criterion=['gini', 'entropy'],
    max_depth=[5, 50, 100],
    min_samples_split=[5, 10, 30],
    max_features=[10, 30, 100, 'sqrt'],
)
dt_acc, dt_f1, dt_cm, dt_roc = grid_Search(
    kf, DT, dt_params, X_train, y_train
)

print("\n-----------------------------------------------------\n")
plot_metrics(DT.__class__.__name__, dt_acc, dt_f1, dt_cm, dt_roc)


### Naive Bayes

#### Gaussian NB

In [None]:
# Gaussian NB: nothing is tuned, so the empty grid makes the search evaluate
# the default estimator only.
GNB = GaussianNB()
gnb_params = {}

# Capture the results and summarise them like the other experiments instead
# of letting the cell dump the raw per-fold arrays.
gnb_acc, gnb_f1, gnb_cm, gnb_roc = grid_Search(kf, GNB, gnb_params, X_train, y_train)

print("\n-----------------------------------------------------\n")
plot_metrics(type(GNB).__name__, gnb_acc, gnb_f1, gnb_cm, gnb_roc)

#### Multinomial NB

In [None]:
# Multinomial NB: nothing is tuned, so the empty grid makes the search
# evaluate the default estimator only.
MNNB = MultinomialNB()
mnnb_params = {}

# Capture the results and summarise them like the other experiments instead
# of letting the cell dump the raw per-fold arrays.
mnnb_acc, mnnb_f1, mnnb_cm, mnnb_roc = grid_Search(kf, MNNB, mnnb_params, X_train, y_train)

print("\n-----------------------------------------------------\n")
plot_metrics(type(MNNB).__name__, mnnb_acc, mnnb_f1, mnnb_cm, mnnb_roc)

### SVC

In [None]:
# SVC: two sub-grids, because `degree` only matters for the polynomial kernel.
# probability=True gives the estimator a predict_proba method, which the
# evaluation loop uses for the ROC data. It makes fitting noticeably slower.
svc = SVC(probability=True)
svc_params = [
    {'C': [0.1, 10, 100],
     'kernel': ['linear', 'rbf', 'sigmoid']},

    # The key must be lowercase 'kernel': 'Kernel' is not an SVC parameter
    # and makes GridSearchCV raise an invalid-parameter error.
    {'C': [0.1, 10, 100],
     'kernel': ['poly'],
     'degree': [2, 3, 5]},
]

svc_acc, svc_f1, svc_cm, svc_roc = grid_Search(kf, svc, svc_params, X_train, y_train)

print("\n-----------------------------------------------------\n")
plot_metrics(type(svc).__name__, svc_acc, svc_f1, svc_cm, svc_roc)
