# Naive Bayes
-------------------------------------------------------------------------------------------------------------

# Bibliotecas Necessárias

In [1]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
import seaborn as sns # visualize
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

# Funções Auxiliares

describe_dataset() : realiza o cálculo das proporções de classes do dataset original

In [2]:
def describe_dataset(X, y, k):
    """Print the class balance of the whole dataset on a single line.

    Parameters
    ----------
    X : array with a two-element ``shape``
        Only its first dimension (number of rows) is used.
    y : array-like
        Target labels. The first two entries of ``np.unique(y)`` are reported
        as "negativas" and "positivas" respectively, so at least two distinct
        labels are required.
    k : int
        Number of folds; only echoed in the printed line.
    """
    n_rows, _ = X.shape
    _, class_counts = np.unique(y, return_counts=True)
    n_neg, n_pos = class_counts[0], class_counts[1]

    # int() truncates, so the two percentages may add up to 99 instead of 100.
    pct_neg = int(n_neg/n_rows*100)
    pct_pos = int(n_pos/n_rows*100)

    summary = "k = {}, Dataset: {} positivas, {} negativas ({}% x {}%)".format(
        k, n_pos, n_neg, pct_pos, pct_neg)
    print(summary)

get_classes_from_index() : realiza o cálculo das proporções de classes dos folds criados

In [3]:
def get_classes_from_index(y, skf):
    """Print, for every fold, how many positive and negative samples it receives.

    The labels are encoded by order of first appearance in ``y``, sorted, and
    dealt round-robin into ``skf.n_splits`` folds; the per-fold class counts
    are then printed together with their truncated percentages.
    NOTE(review): this looks like a copy of the fold-allocation arithmetic of
    scikit-learn's StratifiedKFold -- confirm against the installed version.

    Parameters
    ----------
    y : array-like
        Target labels. Binary labels are assumed: the smallest label is
        reported as "Neg" and the second smallest as "Pos".
    skf : object
        Only its ``n_splits`` attribute is read.
    """
    _, y_idx, y_inv = np.unique(y, return_index=True, return_inverse=True)
    # class_perm[j] is the appearance-order code given to the j-th smallest label.
    _, class_perm = np.unique(y_idx, return_inverse=True)
    y_order = np.sort(class_perm[y_inv])
    n_classes = len(y_idx)
    allocation = np.asarray(
            [np.bincount(y_order[i::skf.n_splits], minlength=n_classes)
             for i in range(skf.n_splits)])

    # The allocation columns follow appearance order, not label order. Map them
    # back so "Neg"/"Pos" stay correct even when y starts with the positive label.
    neg_col, pos_col = class_perm[0], class_perm[1]

    for idx, f in enumerate(allocation):
        count_neg = int(f[neg_col])
        count_pos = int(f[pos_col])
        total = count_neg+count_pos
        # An empty fold (more splits than samples) would otherwise divide by zero.
        prop_temp_neg = int(count_neg/total*100) if total else 0
        prop_temp_pos = int(count_pos/total*100) if total else 0
        print("Fold {}: Pos: {}, Neg: {}, Total: {}, Proporção: {}% x {}%".format(idx, count_pos, count_neg, total, prop_temp_pos, prop_temp_neg))

# Função que aplica o Naive Bayes

In [4]:
def stratified_k_fold(X, y, list_c, k):
    """Cross-validate Gaussian Naive Bayes for every smoothing value in ``list_c``.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features. It is indexed with the
        integer index arrays produced by the splitter.
    y : array-like, of length n_samples
        The target variable for supervised learning problems. Indexed the
        same way as ``X``.
    list_c : iterable
        Values passed, one at a time, as ``var_smoothing`` to ``GaussianNB``.
    k : int
        Determines the number of folds.

    Returns
    -------
    list
        One ``[c, reports]`` pair per value in ``list_c``. ``reports`` is a
        DataFrame with one row per fold holding the classification-report
        entries of the positive class plus an ``accuracy`` column.
    """
    # One splitter is shared by every value of c; no shuffle/random_state is passed.
    skf = StratifiedKFold(n_splits=k)
    describe_dataset(X, y, k)
    get_classes_from_index(y, skf)

    # Key of the positive (largest) label inside the classification-report dict.
    # It is derived from y instead of being hard-coded as '1.0', so integer
    # labels work too.
    # NOTE(review): assumes the report keys each class by the label's string
    # form -- confirm for label dtypes other than float/int.
    positive_key = "%s" % np.unique(y)[-1]

    # One [c, per-fold DataFrame] pair is collected for each value of c.
    result = []

    for c in list_c:
        print("c =  {}" .format(c))

        # The estimator is re-fitted on every fold below.
        clf = GaussianNB(var_smoothing=c)

        # Classification-report dicts, one per fold.
        result_k = []
        for fold_k, (train_index, test_index) in enumerate(skf.split(X, y), start=1):

            print("fold_k: {}" .format(fold_k))
            print("\nTRAIN: {}  TEST: {}".format(len(train_index), len(test_index)))
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            clf.fit(X_train, y_train)
            y_predicted = clf.predict(X_test)

            # Keep the dict form for aggregation and print the text form for the log.
            report_dict = metrics.classification_report(y_test, y_predicted, output_dict=True)
            report_str = metrics.classification_report(y_test, y_predicted)

            result_k.append(report_dict)
            print(report_str)

        # Expand the positive-class entries into columns, one row per fold.
        fold_frame = pd.DataFrame(result_k)
        reports = pd.DataFrame(fold_frame[positive_key].to_list())
        reports['accuracy'] = fold_frame['accuracy']
        print(reports)

        result.append([c, reports])

    return result
       

# Função para calcular a média das medidas

In [5]:
# Calcula a média das medidas de cada c
def calcula_media(lista_result):
    """Average the per-fold metrics of each smoothing value.

    Parameters
    ----------
    lista_result : sequence
        Entries whose item 0 is the value of c and whose item 1 is a
        DataFrame with the columns 'precision', 'recall', 'f1-score',
        'support' and 'accuracy' (one row per fold).

    Returns
    -------
    pandas.DataFrame
        One row per entry: c followed by the mean of each metric column.
    """
    metric_columns = ['precision', 'recall', 'f1-score', 'support', 'accuracy']
    name_columns = ['c', 'precision_mean', 'recall_mean', 'f1_score_mean', 'support_mean', 'accuracy_mean']

    # Each row is [c, mean(precision), mean(recall), ...] in metric_columns order.
    rows = [
        [entry[0]] + [entry[1][metric].mean() for metric in metric_columns]
        for entry in lista_result
    ]
    return pd.DataFrame(rows, columns=name_columns)

##### Parâmetros de execução do Naive Bayes
list_c : valores testados para o parâmetro `var_smoothing` do GaussianNB (suavização das variâncias)

k_folds : número de folds para a estratificação do dataset

In [6]:
# Values tried for the `c` argument of stratified_k_fold, which forwards each
# one to GaussianNB as var_smoothing.
list_c = [0.001, 0.10, 0.25, 0.50, 0.75, 1]
# Number of folds given to stratified_k_fold.
k_folds = 10

# Execução base: Todas as características

In [7]:
# Baseline run on the normalised dataset with every feature column kept.
df = pd.read_csv('dataset-normalizado.csv', header=0)
y = df['is_approved'].to_numpy()  # class labels
X = df.drop(columns='is_approved').to_numpy()  # feature matrix
result_all_features = stratified_k_fold(X, y, list_c, k=k_folds)

k = 10, Dataset: 348 positivas, 301 negativas (53% x 46%)
Fold 0: Pos: 34, Neg: 31, Total: 65, Proporção: 52% x 47%
Fold 1: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 2: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 3: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 4: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 5: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 6: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 7: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 8: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 9: Pos: 34, Neg: 30, Total: 64, Proporção: 53% x 46%
c =  0.001
fold_k: 1

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.80      0.13      0.22        31
         1.0       0.55      0.97      0.70        34

    accuracy                           0.57        65
   macro avg       0.68      0.55      0.46        65
weighted avg       0.67      0.57      0.47    

              precision    recall  f1-score   support

         0.0       0.80      0.13      0.22        31
         1.0       0.55      0.97      0.70        34

    accuracy                           0.57        65
   macro avg       0.68      0.55      0.46        65
weighted avg       0.67      0.57      0.47        65

fold_k: 2

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.85      0.37      0.51        30
         1.0       0.63      0.94      0.76        35

    accuracy                           0.68        65
   macro avg       0.74      0.65      0.64        65
weighted avg       0.73      0.68      0.64        65

fold_k: 3

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.80      0.67      0.73        30
         1.0       0.75      0.86      0.80        35

    accuracy                           0.77        65
   macro avg       0.78      0.76      0.76        65
weighted a

fold_k: 4

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.69      0.37      0.48        30
         1.0       0.61      0.86      0.71        35

    accuracy                           0.63        65
   macro avg       0.65      0.61      0.60        65
weighted avg       0.65      0.63      0.61        65

fold_k: 5

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.68      0.57      0.62        30
         1.0       0.68      0.77      0.72        35

    accuracy                           0.68        65
   macro avg       0.68      0.67      0.67        65
weighted avg       0.68      0.68      0.67        65

fold_k: 6

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.96      0.80      0.87        30
         1.0       0.85      0.97      0.91        35

    accuracy                           0.89        65
   macro avg       0.91      0.8

Resultado dos k-fold para cada valor de c

In [8]:
# Show the raw result: one [c, per-fold metrics DataFrame] pair per value in list_c.
result_all_features

[[0.001,
     precision    recall  f1-score  support  accuracy
  0   0.550000  0.970588  0.702128       34  0.569231
  1   0.625000  0.857143  0.722892       35  0.646154
  2   0.763158  0.828571  0.794521       35  0.769231
  3   0.666667  0.800000  0.727273       35  0.676923
  4   0.717949  0.800000  0.756757       35  0.723077
  5   0.775000  0.885714  0.826667       35  0.800000
  6   0.846154  0.942857  0.891892       35  0.876923
  7   0.685714  0.685714  0.685714       35  0.661538
  8   0.888889  0.457143  0.603774       35  0.676923
  9   0.730769  0.558824  0.633333       34  0.656250],
 [0.1,
     precision    recall  f1-score  support  accuracy
  0   0.559322  0.970588  0.709677       34  0.584615
  1   0.640000  0.914286  0.752941       35  0.676923
  2   0.743590  0.828571  0.783784       35  0.753846
  3   0.666667  0.857143  0.750000       35  0.692308
  4   0.710526  0.771429  0.739726       35  0.707692
  5   0.833333  0.857143  0.845070       35  0.830769
  6   0.89

Calcula a média das medidas de cada parâmetro c

In [9]:
# Collapse the per-fold tables into one row of mean metrics per value of c.
result_all_features_mean = calcula_media(result_all_features)
result_all_features_mean

Unnamed: 0,c,precision_mean,recall_mean,f1_score_mean,support_mean,accuracy_mean
0,0.001,0.72493,0.778655,0.734495,34.8,0.705625
1,0.1,0.733687,0.781597,0.739795,34.8,0.71488
2,0.25,0.724827,0.793025,0.739326,34.8,0.711779
3,0.5,0.711727,0.807395,0.740652,34.8,0.705625
4,0.75,0.713436,0.824538,0.750086,34.8,0.71024
5,1.0,0.704294,0.838824,0.75275,34.8,0.705625


Obtém as medidas da maior média de acurácia

In [10]:
# Pick the row (value of c) with the highest mean accuracy. idxmax() returns an
# index label, so the row is fetched with the label-based .loc, not .iloc.
best_accuracy_all_features = result_all_features_mean.loc[
    result_all_features_mean['accuracy_mean'].idxmax()
].rename('All Features')
best_all_features = best_accuracy_all_features.to_frame()
best_all_features

Unnamed: 0,All Features
c,0.1
precision_mean,0.733687
recall_mean,0.781597
f1_score_mean,0.739795
support_mean,34.8
accuracy_mean,0.71488


# Execução Base: PCA

In [11]:
# NOTE(review): this cell loads 'dataset-normalizado.csv', the same file used by
# the "all features" run, so result_pca is computed on identical data (the
# recorded metrics match that run exactly). Presumably a PCA-transformed
# dataset was intended here -- confirm the correct file name and fix the path.
df = pd.read_csv('dataset-normalizado.csv', header = 0)
X = df.drop('is_approved', axis=1).to_numpy() # DATASET
y = df['is_approved'].to_numpy() # target
result_pca = stratified_k_fold(X, y, list_c, k=k_folds)

k = 10, Dataset: 348 positivas, 301 negativas (53% x 46%)
Fold 0: Pos: 34, Neg: 31, Total: 65, Proporção: 52% x 47%
Fold 1: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 2: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 3: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 4: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 5: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 6: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 7: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 8: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 9: Pos: 34, Neg: 30, Total: 64, Proporção: 53% x 46%
c =  0.001
fold_k: 1

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.80      0.13      0.22        31
         1.0       0.55      0.97      0.70        34

    accuracy                           0.57        65
   macro avg       0.68      0.55      0.46        65
weighted avg       0.67      0.57      0.47    

fold_k: 1

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.80      0.13      0.22        31
         1.0       0.55      0.97      0.70        34

    accuracy                           0.57        65
   macro avg       0.68      0.55      0.46        65
weighted avg       0.67      0.57      0.47        65

fold_k: 2

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.85      0.37      0.51        30
         1.0       0.63      0.94      0.76        35

    accuracy                           0.68        65
   macro avg       0.74      0.65      0.64        65
weighted avg       0.73      0.68      0.64        65

fold_k: 3

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.80      0.67      0.73        30
         1.0       0.75      0.86      0.80        35

    accuracy                           0.77        65
   macro avg       0.78      0.7


fold_k: 2

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.88      0.23      0.37        30
         1.0       0.60      0.97      0.74        35

    accuracy                           0.63        65
   macro avg       0.74      0.60      0.55        65
weighted avg       0.73      0.63      0.57        65

fold_k: 3

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.79      0.63      0.70        30
         1.0       0.73      0.86      0.79        35

    accuracy                           0.75        65
   macro avg       0.76      0.75      0.75        65
weighted avg       0.76      0.75      0.75        65

fold_k: 4

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.69      0.37      0.48        30
         1.0       0.61      0.86      0.71        35

    accuracy                           0.63        65
   macro avg       0.65      0.

Resultado dos k-fold para cada valor de c

In [12]:
# Show the raw result: one [c, per-fold metrics DataFrame] pair per value in list_c.
result_pca

[[0.001,
     precision    recall  f1-score  support  accuracy
  0   0.550000  0.970588  0.702128       34  0.569231
  1   0.625000  0.857143  0.722892       35  0.646154
  2   0.763158  0.828571  0.794521       35  0.769231
  3   0.666667  0.800000  0.727273       35  0.676923
  4   0.717949  0.800000  0.756757       35  0.723077
  5   0.775000  0.885714  0.826667       35  0.800000
  6   0.846154  0.942857  0.891892       35  0.876923
  7   0.685714  0.685714  0.685714       35  0.661538
  8   0.888889  0.457143  0.603774       35  0.676923
  9   0.730769  0.558824  0.633333       34  0.656250],
 [0.1,
     precision    recall  f1-score  support  accuracy
  0   0.559322  0.970588  0.709677       34  0.584615
  1   0.640000  0.914286  0.752941       35  0.676923
  2   0.743590  0.828571  0.783784       35  0.753846
  3   0.666667  0.857143  0.750000       35  0.692308
  4   0.710526  0.771429  0.739726       35  0.707692
  5   0.833333  0.857143  0.845070       35  0.830769
  6   0.89

Calcula a média das medidas de cada parâmetro c

In [13]:
# Collapse the per-fold tables into one row of mean metrics per value of c.
result_pca_mean = calcula_media(result_pca)
result_pca_mean

Unnamed: 0,c,precision_mean,recall_mean,f1_score_mean,support_mean,accuracy_mean
0,0.001,0.72493,0.778655,0.734495,34.8,0.705625
1,0.1,0.733687,0.781597,0.739795,34.8,0.71488
2,0.25,0.724827,0.793025,0.739326,34.8,0.711779
3,0.5,0.711727,0.807395,0.740652,34.8,0.705625
4,0.75,0.713436,0.824538,0.750086,34.8,0.71024
5,1.0,0.704294,0.838824,0.75275,34.8,0.705625


Obtém o resultado da maior média de acurácia 

In [14]:
# Pick the row (value of c) with the highest mean accuracy. idxmax() returns an
# index label, so the row is fetched with the label-based .loc, not .iloc.
best_accuracy_pca = result_pca_mean.loc[
    result_pca_mean['accuracy_mean'].idxmax()
].rename('PCA')
best_pca = best_accuracy_pca.to_frame()
best_pca

Unnamed: 0,PCA
c,0.1
precision_mean,0.733687
recall_mean,0.781597
f1_score_mean,0.739795
support_mean,34.8
accuracy_mean,0.71488


# Execução Base: Chi Squared (K-Best)

In [15]:
# Run on the dataset stored in the chi-squared feature-selection file.
df = pd.read_csv('dataset-fs-chi-squared.csv', header=0)
y = df['is_approved'].to_numpy()  # class labels
X = df.drop(columns='is_approved').to_numpy()  # feature matrix
result_chi = stratified_k_fold(X, y, list_c, k=k_folds)

k = 10, Dataset: 348 positivas, 301 negativas (53% x 46%)
Fold 0: Pos: 34, Neg: 31, Total: 65, Proporção: 52% x 47%
Fold 1: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 2: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 3: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 4: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 5: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 6: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 7: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 8: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 9: Pos: 34, Neg: 30, Total: 64, Proporção: 53% x 46%
c =  0.001
fold_k: 1

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.83      0.16      0.27        31
         1.0       0.56      0.97      0.71        34

    accuracy                           0.58        65
   macro avg       0.70      0.57      0.49        65
weighted avg       0.69      0.58      0.50    

fold_k: 3

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.75      0.70      0.72        30
         1.0       0.76      0.80      0.78        35

    accuracy                           0.75        65
   macro avg       0.75      0.75      0.75        65
weighted avg       0.75      0.75      0.75        65

fold_k: 4

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.75      0.50      0.60        30
         1.0       0.67      0.86      0.75        35

    accuracy                           0.69        65
   macro avg       0.71      0.68      0.68        65
weighted avg       0.71      0.69      0.68        65

fold_k: 5

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.70      0.63      0.67        30
         1.0       0.71      0.77      0.74        35

    accuracy                           0.71        65
   macro avg       0.71      0.7

fold_k: 6

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.92      0.77      0.84        30
         1.0       0.82      0.94      0.88        35

    accuracy                           0.86        65
   macro avg       0.87      0.85      0.86        65
weighted avg       0.87      0.86      0.86        65

fold_k: 7

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       1.00      0.90      0.95        30
         1.0       0.92      1.00      0.96        35

    accuracy                           0.95        65
   macro avg       0.96      0.95      0.95        65
weighted avg       0.96      0.95      0.95        65

fold_k: 8

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.77      0.77      0.77        30
         1.0       0.80      0.80      0.80        35

    accuracy                           0.78        65
   macro avg       0.78      0.7

Resultado dos k-fold para cada valor de c

In [16]:
# Show the raw result: one [c, per-fold metrics DataFrame] pair per value in list_c.
result_chi

[[0.001,
     precision    recall  f1-score  support  accuracy
  0   0.559322  0.970588  0.709677       34  0.584615
  1   0.647059  0.942857  0.767442       35  0.692308
  2   0.810811  0.857143  0.833333       35  0.815385
  3   0.688889  0.885714  0.775000       35  0.723077
  4   0.710526  0.771429  0.739726       35  0.707692
  5   0.842105  0.914286  0.876712       35  0.861538
  6   0.871795  0.971429  0.918919       35  0.907692
  7   0.742857  0.742857  0.742857       35  0.723077
  8   0.944444  0.485714  0.641509       35  0.707692
  9   0.750000  0.617647  0.677419       34  0.687500],
 [0.1,
     precision    recall  f1-score  support  accuracy
  0   0.540984  0.970588  0.694737       34  0.553846
  1   0.615385  0.914286  0.735632       35  0.646154
  2   0.756757  0.800000  0.777778       35  0.753846
  3   0.666667  0.857143  0.750000       35  0.692308
  4   0.710526  0.771429  0.739726       35  0.707692
  5   0.842105  0.914286  0.876712       35  0.861538
  6   0.97

Calcula a média das medidas de cada parâmetro c

In [17]:
# Collapse the per-fold tables into one row of mean metrics per value of c.
result_chi_mean = calcula_media(result_chi)
result_chi_mean

Unnamed: 0,c,precision_mean,recall_mean,f1_score_mean,support_mean,accuracy_mean
0,0.001,0.756781,0.815966,0.76826,34.8,0.741058
1,0.1,0.75726,0.795966,0.755794,34.8,0.72875
2,0.25,0.749808,0.795882,0.752734,34.8,0.722548
3,0.5,0.746037,0.818824,0.760984,34.8,0.727187
4,0.75,0.736571,0.824538,0.75862,34.8,0.722548
5,1.0,0.725369,0.827395,0.75524,34.8,0.716394


Obtém o resultado da maior média de acurácia

In [18]:
# Pick the row (value of c) with the highest mean accuracy. idxmax() returns an
# index label, so the row is fetched with the label-based .loc, not .iloc.
best_accuracy_chi = result_chi_mean.loc[
    result_chi_mean['accuracy_mean'].idxmax()
].rename('Chi Squared')
best_chi = best_accuracy_chi.to_frame()
best_chi

Unnamed: 0,Chi Squared
c,0.001
precision_mean,0.756781
recall_mean,0.815966
f1_score_mean,0.76826
support_mean,34.8
accuracy_mean,0.741058


# Execução Base: recursive-feature

In [19]:
# Run on the dataset stored in the recursive-feature selection file.
df = pd.read_csv('dataset-fs-recursive-feature.csv', header=0)
y = df['is_approved'].to_numpy()  # class labels
X = df.drop(columns='is_approved').to_numpy()  # feature matrix
result_recursive = stratified_k_fold(X, y, list_c, k=k_folds)

k = 10, Dataset: 348 positivas, 301 negativas (53% x 46%)
Fold 0: Pos: 34, Neg: 31, Total: 65, Proporção: 52% x 47%
Fold 1: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 2: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 3: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 4: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 5: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 6: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 7: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 8: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 9: Pos: 34, Neg: 30, Total: 64, Proporção: 53% x 46%
c =  0.001
fold_k: 1

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       1.00      0.13      0.23        31
         1.0       0.56      1.00      0.72        34

    accuracy                           0.58        65
   macro avg       0.78      0.56      0.47        65
weighted avg       0.77      0.58      0.48    

              precision    recall  f1-score   support

         0.0       0.88      0.23      0.37        30
         1.0       0.60      0.97      0.74        35

    accuracy                           0.63        65
   macro avg       0.74      0.60      0.55        65
weighted avg       0.73      0.63      0.57        65

fold_k: 3

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.85      0.57      0.68        30
         1.0       0.71      0.91      0.80        35

    accuracy                           0.75        65
   macro avg       0.78      0.74      0.74        65
weighted avg       0.78      0.75      0.74        65

fold_k: 4

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.85      0.37      0.51        30
         1.0       0.63      0.94      0.76        35

    accuracy                           0.68        65
   macro avg       0.74      0.65      0.64        65
weighted a


fold_k: 4

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.82      0.30      0.44        30
         1.0       0.61      0.94      0.74        35

    accuracy                           0.65        65
   macro avg       0.71      0.62      0.59        65
weighted avg       0.71      0.65      0.60        65

fold_k: 5

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.94      0.50      0.65        30
         1.0       0.69      0.97      0.81        35

    accuracy                           0.75        65
   macro avg       0.82      0.74      0.73        65
weighted avg       0.81      0.75      0.74        65

fold_k: 6

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       1.00      0.57      0.72        30
         1.0       0.73      1.00      0.84        35

    accuracy                           0.80        65
   macro avg       0.86      0.

Resultado dos k-fold para cada valor de c 

In [20]:
# Show the raw result: one [c, per-fold metrics DataFrame] pair per value in list_c.
result_recursive

[[0.001,
     precision    recall  f1-score  support  accuracy
  0   0.557377  1.000000  0.715789       34  0.584615
  1   0.640000  0.914286  0.752941       35  0.676923
  2   0.761905  0.914286  0.831169       35  0.800000
  3   0.636364  0.800000  0.708861       35  0.646154
  4   0.727273  0.914286  0.810127       35  0.769231
  5   0.761905  0.914286  0.831169       35  0.800000
  6   0.850000  0.971429  0.906667       35  0.892308
  7   0.644444  0.828571  0.725000       35  0.661538
  8   0.857143  0.685714  0.761905       35  0.769231
  9   0.750000  0.705882  0.727273       34  0.718750],
 [0.1,
     precision    recall  f1-score  support  accuracy
  0   0.532258  0.970588  0.687500       34  0.538462
  1   0.629630  0.971429  0.764045       35  0.676923
  2   0.711111  0.914286  0.800000       35  0.753846
  3   0.640000  0.914286  0.752941       35  0.676923
  4   0.738095  0.885714  0.805195       35  0.769231
  5   0.772727  0.971429  0.860759       35  0.830769
  6   0.85

Calcula a média das medidas de cada parâmetro c

In [21]:
# Collapse the per-fold tables into one row of mean metrics per value of c.
result_recursive_mean = calcula_media(result_recursive)
result_recursive_mean

Unnamed: 0,c,precision_mean,recall_mean,f1_score_mean,support_mean,accuracy_mean
0,0.001,0.718641,0.864874,0.77709,34.8,0.731875
1,0.1,0.716993,0.879076,0.778003,34.8,0.728774
2,0.25,0.706082,0.896218,0.77861,34.8,0.724183
3,0.5,0.690841,0.913445,0.777125,34.8,0.716466
4,0.75,0.68155,0.91916,0.774406,34.8,0.710288
5,1.0,0.670595,0.927731,0.771593,34.8,0.702572


Obtém o resultado da maior média de acurácia

In [22]:
# Pick the row (value of c) with the highest mean accuracy. idxmax() returns an
# index label, so the row is fetched with the label-based .loc, not .iloc.
best_accuracy_recursive = result_recursive_mean.loc[
    result_recursive_mean['accuracy_mean'].idxmax()
].rename('Recursive Feature')
best_recursive = best_accuracy_recursive.to_frame()
best_recursive

Unnamed: 0,Recursive Feature
c,0.001
precision_mean,0.718641
recall_mean,0.864874
f1_score_mean,0.77709
support_mean,34.8
accuracy_mean,0.731875


# Junta todos os resultados

In [23]:
# Place the best configuration of every dataset variant side by side (one
# column each), then flip the table so each variant becomes a row.
result = pd.concat(
    [best_all_features, best_pca, best_chi, best_recursive],
    axis='columns',
)

print("Média das métricas geradas pelo processamento de cada dataset")
result.T

Média das métricas geradas pelo processamento de cada dataset


Unnamed: 0,c,precision_mean,recall_mean,f1_score_mean,support_mean,accuracy_mean
All Features,0.1,0.733687,0.781597,0.739795,34.8,0.71488
PCA,0.1,0.733687,0.781597,0.739795,34.8,0.71488
Chi Squared,0.001,0.756781,0.815966,0.76826,34.8,0.741058
Recursive Feature,0.001,0.718641,0.864874,0.77709,34.8,0.731875
