In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.io.arff import loadarff 

#Funções de avaliação dos modelos
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    roc_curve,
    auc,
    roc_auc_score,
)
from sklearn.model_selection import(
    train_test_split, 
    cross_val_predict,
    KFold,
    GridSearchCV,
)

#utils
from utils.num import Num

import warnings
warnings.filterwarnings('ignore')

In [None]:
#Classes do modelo de aprendizado
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import SVC

In [None]:
# plot multiple numbers

def plot_images(images):
    """Draw the given images on a near-square grid inside a new figure.

    Parameters
    ----------
    images : sequence
        Indexable collection; each element is handed to ``Axes.imshow``
        with the ``gray_r`` colormap.

    Returns
    -------
    None
        The figure is created through ``plt.figure()`` and not returned.

    Notes
    -----
    The grid has ``int(sqrt(n))`` rows and as many columns as are needed to
    hold all ``n`` images, so a count that is not a perfect square is shown
    in full instead of being cut down to ``rows * rows`` images. For
    perfect-square counts this is exactly a square grid.
    """
    n_images = len(images)

    fig = plt.figure()
    if n_images:
        rows = int(np.sqrt(n_images))
        # Ceiling division: guarantees rows * cols >= n_images.
        cols = -(-n_images // rows)

        for i in range(n_images):
            ax = fig.add_subplot(rows, cols, i + 1)
            ax.imshow(images[i], cmap='gray_r')
            ax.axis('off')

    fig.suptitle('Representação de números aleatórios do dataset')

In [37]:
def kfold_run(kf, model, X, y):
    """Fit and evaluate ``model`` once per split produced by ``kf``.

    Parameters
    ----------
    kf : object
        Splitter exposing ``split(X, y)`` that yields ``(train, test)``
        pairs of positional indices.
    model : object
        Estimator exposing ``fit`` and ``predict``. If, after fitting, it
        has a ``best_params_`` attribute, that value is printed per fold.
    X, y : pandas objects
        Features and target; both are sliced with ``.iloc``.

    Returns
    -------
    tuple of (list, list, list)
        One entry per fold in each list: the accuracy, the per-class F1
        scores (``average=None``) and the confusion matrix.
    """
    accuracies_list = []
    f1_list = []
    conf_matrix_list = []

    for fold, (train, test) in enumerate(kf.split(X, y)):
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.iloc[train], y.iloc[test]

        model.fit(X_train, y_train)
        # Only hard predictions are needed for the metrics below, so
        # predict_proba is not called; estimators that do not provide it
        # can therefore be evaluated as well.
        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average=None)
        conf_matrix = confusion_matrix(y_test, y_pred)

        print(f"Fold {fold}\t Accuracy: {acc:.3f}")

        # Present when `model` is a fitted hyper-parameter search object.
        if hasattr(model, 'best_params_'):
            print(f"Model best params: {model.best_params_}")

        accuracies_list.append(acc)
        f1_list.append(f1)
        conf_matrix_list.append(conf_matrix)

    return accuracies_list, f1_list, conf_matrix_list


def grid_Search(kf, model, params, X, y):
    """Wrap ``model`` in an accuracy-scored grid search and evaluate it.

    The search uses 5 internal folds, all available cores (``n_jobs=-1``)
    and refits the best candidate. The resulting search object is then
    passed to ``kfold_run`` together with ``kf``, ``X`` and ``y``, and
    that function's return value is returned unchanged.
    """
    search_settings = {
        'estimator': model,
        'param_grid': params,
        'scoring': 'accuracy',
        'n_jobs': -1,
        'cv': 5,
        'refit': True,
    }
    tuned_model = GridSearchCV(**search_settings)

    return kfold_run(kf, tuned_model, X, y)

In [None]:
# Load the dataset from the ARFF file in the working directory.
# Only the first element of what loadarff returns is used; it becomes
# the DataFrame that the rest of the notebook works on.
raw_data = loadarff('mnist_784.arff')
df = pd.DataFrame(raw_data[0])

In [None]:
# Quick look at the first rows of the loaded frame.
df.head()

In [None]:
# Visualize a few instances of the dataset
nums = df['class'].unique()

imgs = list()
# Draw 36 random positions in [0, 1000); each one is used as a row position
# inside the subset of rows belonging to one randomly chosen class.
# NOTE(review): this assumes every class has at least 1000 rows, otherwise
# .iloc[i] raises IndexError — confirm against the class distribution.
for i in np.random.randint(1000, size=36):
    rnd_choice = np.random.choice(nums)
    # Drop the last entry of the row (presumably the 'class' label — verify
    # the column order) and reshape the remaining values to a 28x28 image.
    img = pd.array(df[df['class'] == rnd_choice].iloc[i][:-1],
                 dtype=int).reshape((28,28))
    imgs.append(img)

plot_images(imgs)


In [None]:
# Bar chart with the number of instances per class, ordered by class label.
n_dist = df['class'].value_counts().sort_index()
n_dist.index = n_dist.index.astype('int')

# `fig` is the object returned by Series.plot.bar(); the axis setters below
# are called directly on it.
fig = n_dist.plot.bar()
fig.set_xticklabels(fig.get_xticklabels(), rotation=0)
fig.set(xlabel='Classe', ylabel='Frequência')

fig.set_title('Frequência de cada número (classe)')

In [None]:
# Train / validation split and the cross-validation splitter.
# A fixed seed makes the shuffled hold-out split and the fold assignment
# identical on every "Restart & Run All", so results can be compared.
SEED = 42

X = df.iloc[:, 0:-1]                 # every column except the last one
y = df.iloc[:, [-1]].astype('int')   # target: last column, kept as a one-column frame

# 10% of the rows are held out as a validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=SEED)

kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

# Experiments with the Models

### KNN

In [None]:
# k-nearest neighbours: search over the neighbourhood size and the
# distance metric, then evaluate on the folds of `kf`.
model = KNeighborsClassifier()
parameters = dict(
    n_neighbors=[3, 5, 7],
    metric=['euclidean', 'manhattan'],
)

grid_Search(kf, model, parameters, X_train, y_train)

### Decision Tree

In [40]:
# Decision tree: grid over split criterion, depth limit, minimum samples
# required to split and number of features considered per split.
DT = DecisionTreeClassifier()
dt_params = {
    'criterion': ['gini', 'entropy'],
    'max_depth':[5, 50, 100],
    'min_samples_split': [5, 10, 30],
    'max_features':[10, 30, 100, 'sqrt'],
}
# grid_Search returns three per-fold lists: accuracies, per-class F1 arrays
# and confusion matrices.
acc, f1, mc = grid_Search(kf, DT, dt_params, X_train, y_train)

# Aggregate over folds. axis=0 keeps the per-class (and per-cell) layout, so
# each class / matrix cell gets its own mean and standard deviation.
print(f"Accuracy mean (std): {np.mean(acc)} ({np.std(acc)})")
print(f"f1-scores mean (std): {np.mean(f1, axis=0)} ({np.std(f1, axis=0)})")
print(f"Confusion Matrix mean (std): {np.mean(mc, axis=0)} ({np.std(mc, axis=0)})")

Fold 0	 Accuracy: 0.860
Model best params: {'criterion': 'entropy', 'max_depth': 100, 'max_features': 100, 'min_samples_split': 10}
Fold 1	 Accuracy: 0.856
Model best params: {'criterion': 'entropy', 'max_depth': 100, 'max_features': 100, 'min_samples_split': 5}
Fold 2	 Accuracy: 0.859
Model best params: {'criterion': 'entropy', 'max_depth': 100, 'max_features': 100, 'min_samples_split': 10}
Fold 3	 Accuracy: 0.861
Model best params: {'criterion': 'entropy', 'max_depth': 50, 'max_features': 100, 'min_samples_split': 10}
Fold 4	 Accuracy: 0.861
Model best params: {'criterion': 'entropy', 'max_depth': 50, 'max_features': 100, 'min_samples_split': 5}
Accuracy mean (std): 0.8594126984126984 (0.0017515065681439968)
f1-scores mean (std): [0.91749073 0.94976625 0.83605597 0.81934019 0.85406586 0.79696759
 0.89212593 0.88513165 0.798792   0.82305385] ([0.00596544 0.00385507 0.00273592 0.00960325 0.00681043 0.00368597
 0.00678707 0.00510509 0.00807065 0.00665181])
Confusion Matrix mean (std): [

### Naive Bayes

#### Gaussian NB

In [None]:
# Gaussian Naive Bayes. The parameter grid is intentionally empty: no
# hyper-parameter is varied, the estimator is evaluated as constructed.
# NOTE(review): the grid search still runs its internal folds for that
# single configuration — confirm this extra cost is acceptable.
GNB = GaussianNB()
gnb_params = {
    
}

grid_Search(kf, GNB, gnb_params, X_train, y_train)

#### Multinomial NB

In [None]:
# Multinomial Naive Bayes. The parameter grid is intentionally empty: no
# hyper-parameter is varied, the estimator is evaluated as constructed.
MNNB = MultinomialNB()
mnnb_params = {
    
}

grid_Search(kf, MNNB, mnnb_params, X_train, y_train)

### SVC

In [None]:
# Support vector classifier. The search space is a list of two sub-grids so
# that 'degree' is only explored together with the polynomial kernel.
svc = SVC()
svc_params = [
    {'C': [0.1, 10, 100],
     'kernel': ['linear', 'rbf', 'sigmoid']},

    # Grid keys must match the estimator's parameter names exactly; the
    # kernel is selected through the lowercase 'kernel' parameter.
    {'C': [0.1, 10, 100],
     'kernel': ['poly'],
     'degree': [2, 3, 5]},
]

grid_Search(kf, svc, svc_params, X_train, y_train)
