# Naive Bayes
-------------------------------------------------------------------------------------------------------------

# Bibliotecas Necessárias

In [1]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
import seaborn as sns # visualize
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

# Funções Auxiliares

describe_dataset() : realiza o cálculo das proporções de classes do dataset original

In [2]:
def describe_dataset(X, y, k):
    # get dataset rows: instances , columns: features
    rows, columns = X.shape
    # get proportion from target
    (unique, counts) = np.unique(y, return_counts=True) 
    # calculate proportion
    prop_neg = int(counts[0]/rows*100)
    prop_pos = int(counts[1]/rows*100)

    print("k = {}, Dataset: {} positivas, {} negativas ({}% x {}%)".format(k, counts[1], counts[0], prop_pos, prop_neg))

get_classes_from_index() : realiza o cálculo das proporções de classes dos folds criados

In [3]:
def get_classes_from_index(y, skf):
    _, y_idx, y_inv = np.unique(y, return_index=True, return_inverse=True)
    y_counts = np.bincount(y_inv)
    _, class_perm = np.unique(y_idx, return_inverse=True)
    y_encoded = class_perm[y_inv]
    y_order = np.sort(y_encoded)
    n_classes = len(y_idx)
    allocation = np.asarray(
            [np.bincount(y_order[i::skf.n_splits], minlength=n_classes)
             for i in range(skf.n_splits)])

    for idx, f in enumerate(allocation):
        count_neg = int(f[0])
        count_pos = int(f[1])
        total = count_neg+count_pos
        prop_temp_neg = int(count_neg/total*100)
        prop_temp_pos = int(count_pos/total*100)
        print("Fold {}: Pos: {}, Neg: {}, Total: {}, Proporção: {}% x {}%".format(idx, count_pos, count_neg, total, prop_temp_pos, prop_temp_neg))

In [253]:
def stratified_k_fold(X, y, k, reports):
    """
    Parameters
    ----------    
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, of length n_samples
        The target variable for supervised learning problems.
    k : int
        Determines the number of folds.

    """

    skf = StratifiedKFold(n_splits=k)
    describe_dataset(X, y, k)
    get_classes_from_index(y, skf)
    print(skf)
    
    ### create naive bayes classifier
    clf = GaussianNB()
        
    
    ### Executa o treino e teste para k folds
    for train_index, test_index in skf.split(X, y):
        
        print("\nTRAIN: {}  TEST: {}".format(len(train_index), len(test_index)))
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        ### train classifier
        clf.fit(X_train, y_train)
                  
        ### calculate metrics
        y_predicted = clf.predict(X_test)
        report_dict = metrics.classification_report(y_test, y_predicted, output_dict=True)
        report_str = metrics.classification_report(y_test, y_predicted)
        reports.append(report_dict)
        print(report_str)
    
    

# Execução base: Todas as características

In [278]:
reports_dict_all_features = []

df = pd.read_csv('dataset-normalizado.csv', header = 0)
X = df.drop('is_approved', axis=1).to_numpy() # DATASET
y = df['is_approved'].to_numpy() # target
stratified_k_fold(X, y, k=10, reports=reports_dict_all_features)

k = 10, Dataset: 348 positivas, 301 negativas (53% x 46%)
Fold 0: Pos: 34, Neg: 31, Total: 65, Proporção: 52% x 47%
Fold 1: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 2: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 3: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 4: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 5: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 6: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 7: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 8: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 9: Pos: 34, Neg: 30, Total: 64, Proporção: 53% x 46%
StratifiedKFold(n_splits=10, random_state=None, shuffle=False)

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.80      0.13      0.22        31
         1.0       0.55      0.97      0.70        34

    accuracy                           0.57        65
   macro avg       0.68      0.55      0.46        65
weigh

Gera o resultado do processamento da base considerando todas as características

In [315]:
result_all_features = pd.DataFrame(pd.DataFrame(reports_dict_all_features)['1.0'].to_list())
accuracy_all_features = pd.DataFrame(pd.DataFrame(reports_dict_all_features)['accuracy'])
result_all_features['accuracy'] = accuracy_all_features
result_all_features

Unnamed: 0,precision,recall,f1-score,support,accuracy
0,0.55,0.970588,0.702128,34,0.569231
1,0.625,0.857143,0.722892,35,0.646154
2,0.783784,0.828571,0.805556,35,0.784615
3,0.666667,0.8,0.727273,35,0.676923
4,0.717949,0.8,0.756757,35,0.723077
5,0.775,0.885714,0.826667,35,0.8
6,0.846154,0.942857,0.891892,35,0.876923
7,0.685714,0.685714,0.685714,35,0.661538
8,0.888889,0.457143,0.603774,35,0.676923
9,0.730769,0.558824,0.633333,34,0.65625


In [316]:
result_mean_all_features = pd.DataFrame(result_all_features.mean(), columns=['All Features'])
result_mean_all_features
# result_mean_all_features.transpose()

Unnamed: 0,All Features
precision,0.726993
recall,0.778655
f1-score,0.735598
support,34.8
accuracy,0.707163


# Execução Base: PCA

In [317]:
reports_dict_pca = []

df = pd.read_csv('dataset-pca.csv', header = 0)
X = df.drop('is_approved', axis=1).to_numpy() # DATASET
y = df['is_approved'].to_numpy() # target
stratified_k_fold(X, y, k=10, reports=reports_dict_pca)

k = 10, Dataset: 348 positivas, 301 negativas (53% x 46%)
Fold 0: Pos: 34, Neg: 31, Total: 65, Proporção: 52% x 47%
Fold 1: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 2: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 3: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 4: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 5: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 6: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 7: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 8: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 9: Pos: 34, Neg: 30, Total: 64, Proporção: 53% x 46%
StratifiedKFold(n_splits=10, random_state=None, shuffle=False)

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       1.00      0.16      0.28        31
         1.0       0.57      1.00      0.72        34

    accuracy                           0.60        65
   macro avg       0.78      0.58      0.50        65
weigh

In [318]:
# Gera a tabela com os indicadores do resultados dos k-fold

result_pca = pd.DataFrame(pd.DataFrame(reports_dict_pca)['1.0'].to_list())
accuracy_pca = pd.DataFrame(pd.DataFrame(reports_dict_pca)['accuracy'])
result_pca['accuracy'] = accuracy_pca
result_pca

Unnamed: 0,precision,recall,f1-score,support,accuracy
0,0.566667,1.0,0.723404,34,0.6
1,0.615385,0.914286,0.735632,35,0.646154
2,0.810811,0.857143,0.833333,35,0.815385
3,0.617021,0.828571,0.707317,35,0.630769
4,0.666667,0.685714,0.676056,35,0.646154
5,0.823529,0.8,0.811594,35,0.8
6,0.828571,0.828571,0.828571,35,0.815385
7,0.692308,0.514286,0.590164,35,0.615385
8,0.923077,0.342857,0.5,35,0.630769
9,0.736842,0.411765,0.528302,34,0.609375


In [319]:
result_mean_pca = pd.DataFrame(result_pca.mean(), columns=['PCA'])
result_mean_pca
# result_mean_pca.transpose()

Unnamed: 0,PCA
precision,0.728088
recall,0.718319
f1-score,0.693437
support,34.8
accuracy,0.680937


# Execução Base: Chi Squared (K-Best)

In [320]:
reports_dict_chi = []

df = pd.read_csv('dataset-fs-chi-squared.csv', header = 0)
X = df.drop('is_approved', axis=1).to_numpy() # DATASET
y = df['is_approved'].to_numpy() # target
stratified_k_fold(X, y, k=10, reports=reports_dict_chi)

k = 10, Dataset: 348 positivas, 301 negativas (53% x 46%)
Fold 0: Pos: 34, Neg: 31, Total: 65, Proporção: 52% x 47%
Fold 1: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 2: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 3: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 4: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 5: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 6: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 7: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 8: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 9: Pos: 34, Neg: 30, Total: 64, Proporção: 53% x 46%
StratifiedKFold(n_splits=10, random_state=None, shuffle=False)

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.83      0.16      0.27        31
         1.0       0.56      0.97      0.71        34

    accuracy                           0.58        65
   macro avg       0.70      0.57      0.49        65
weigh

Gera a tabela com os indicadores do resultados dos k-fold

In [321]:
result_chi = pd.DataFrame(pd.DataFrame(reports_dict_chi)['1.0'].to_list())
accuracy_chi = pd.DataFrame(pd.DataFrame(reports_dict_chi)['accuracy'])
result_chi['accuracy'] = accuracy_chi
result_chi

Unnamed: 0,precision,recall,f1-score,support,accuracy
0,0.559322,0.970588,0.709677,34,0.584615
1,0.647059,0.942857,0.767442,35,0.692308
2,0.810811,0.857143,0.833333,35,0.815385
3,0.673913,0.885714,0.765432,35,0.707692
4,0.710526,0.771429,0.739726,35,0.707692
5,0.842105,0.914286,0.876712,35,0.861538
6,0.871795,0.971429,0.918919,35,0.907692
7,0.742857,0.742857,0.742857,35,0.723077
8,0.947368,0.514286,0.666667,35,0.723077
9,0.75,0.617647,0.677419,34,0.6875


In [327]:
result_mean_chi = pd.DataFrame(result_chi.mean(), columns=['Chi Squared'])
result_mean_chi
# result_mean_pca.transpose()

Unnamed: 0,Chi Squared
precision,0.755576
recall,0.818824
f1-score,0.769819
support,34.8
accuracy,0.741058


# Execução Base: recursive-feature

In [332]:
reports_dict_recursive = []

df = pd.read_csv('dataset-fs-recursive-feature.csv', header = 0)
X = df.drop('is_approved', axis=1).to_numpy() # DATASET
y = df['is_approved'].to_numpy() # target
stratified_k_fold(X, y, k=10, reports=reports_dict_recursive)

k = 10, Dataset: 348 positivas, 301 negativas (53% x 46%)
Fold 0: Pos: 34, Neg: 31, Total: 65, Proporção: 52% x 47%
Fold 1: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 2: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 3: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 4: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 5: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 6: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 7: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 8: Pos: 35, Neg: 30, Total: 65, Proporção: 53% x 46%
Fold 9: Pos: 34, Neg: 30, Total: 64, Proporção: 53% x 46%
StratifiedKFold(n_splits=10, random_state=None, shuffle=False)

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       1.00      0.13      0.23        31
         1.0       0.56      1.00      0.72        34

    accuracy                           0.58        65
   macro avg       0.78      0.56      0.47        65
weigh

In [333]:
result_recursive = pd.DataFrame(pd.DataFrame(reports_dict_recursive)['1.0'].to_list())
accuracy_recursive = pd.DataFrame(pd.DataFrame(reports_dict_recursive)['accuracy'])
result_recursive['accuracy'] = accuracy_chi
result_recursive

Unnamed: 0,precision,recall,f1-score,support,accuracy
0,0.557377,1.0,0.715789,34,0.584615
1,0.64,0.914286,0.752941,35,0.692308
2,0.761905,0.914286,0.831169,35,0.815385
3,0.636364,0.8,0.708861,35,0.707692
4,0.727273,0.914286,0.810127,35,0.707692
5,0.761905,0.914286,0.831169,35,0.861538
6,0.85,0.971429,0.906667,35,0.907692
7,0.644444,0.828571,0.725,35,0.723077
8,0.857143,0.685714,0.761905,35,0.723077
9,0.75,0.705882,0.727273,34,0.6875


In [337]:
result_mean_recursive = pd.DataFrame(result_recursive.mean(), columns=['Recursive Feature'])
result_mean_recursive
# result_mean_pca.transpose()

Unnamed: 0,Recursive Feature
precision,0.718641
recall,0.864874
f1-score,0.77709
support,34.8
accuracy,0.741058


# Junta os resultados gerados de cada base

In [347]:
result = pd.concat([result_mean_all_features, result_mean_pca, result_mean_chi, result_mean_recursive], axis=1)

print("Média das métricas geradas pelo processamento de cada dataset")
result.iloc[[0,1,2,4]]

Média das métricas geradas pelo processamento de cada dataset


Unnamed: 0,All Features,PCA,Chi Squared,Recursive Feature
precision,0.726993,0.728088,0.755576,0.718641
recall,0.778655,0.718319,0.818824,0.864874
f1-score,0.735598,0.693437,0.769819,0.77709
accuracy,0.707163,0.680937,0.741058,0.741058
