# Atividade 6 - Grupo 7G

## Bibliotecas utilizadas

In [9]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics


## Funções auxiliares

`describe_dataset()`: calcula e imprime a quantidade e a proporção de instâncias positivas e negativas do dataset original.

In [10]:
def describe_dataset(X, y, k):
    """Print the class balance of the whole dataset.

    Parameters
    ----------
    X : object exposing a two-element ``shape``
        Feature matrix; only its number of rows is used.
    y : array-like
        Target labels. Two classes are expected: in sorted label order the
        first one is reported as negative and the second as positive.
    k : int
        Number of folds; only echoed in the printed message.
    """
    n_rows, _ = X.shape
    _, class_counts = np.unique(y, return_counts=True)
    n_neg, n_pos = class_counts[0], class_counts[1]

    # int() truncates, so the two percentages may add up to less than 100.
    pct_neg = int(n_neg / n_rows * 100)
    pct_pos = int(n_pos / n_rows * 100)

    message = "k = {}, Dataset: {} positivas, {} negativas ({}% x {}%)".format(
        k, n_pos, n_neg, pct_pos, pct_neg)
    print(message)

`get_classes_from_index()`: calcula e imprime a quantidade e a proporção de instâncias positivas e negativas de cada fold criado.

In [11]:
def get_classes_from_index(y, skf):
    """Print the class counts and proportions expected in each fold.

    The per-fold allocation is computed by encoding the classes by order of
    first appearance in ``y``, sorting the encoded labels and dealing them out
    round-robin over ``skf.n_splits`` folds.
    NOTE(review): this appears to mirror the allocation step of scikit-learn's
    ``StratifiedKFold``; confirm against the installed version.

    Parameters
    ----------
    y : array-like of length n_samples
        Target labels. Two classes are expected: in sorted label order the
        first one is reported as negative and the second as positive.
    skf : object with an ``n_splits`` attribute
        The cross-validator whose number of folds is used.
    """
    n_splits = skf.n_splits

    # Classes in sorted order: index of each class's first occurrence and the
    # sorted-class index of every sample.
    _, first_seen, sorted_codes = np.unique(y, return_index=True, return_inverse=True)
    # class_perm[j] is the rank of sorted class j by order of first appearance.
    _, class_perm = np.unique(first_seen, return_inverse=True)
    y_order = np.sort(class_perm[sorted_codes])
    n_classes = len(first_seen)

    allocation = np.asarray(
            [np.bincount(y_order[i::n_splits], minlength=n_classes)
             for i in range(n_splits)])

    for idx, fold_counts in enumerate(allocation):
        # The allocation columns follow order of appearance, not label order.
        # Map back through class_perm so "negative" is always the smaller label
        # even when the first sample of y belongs to the positive class.
        count_neg = int(fold_counts[class_perm[0]])
        count_pos = int(fold_counts[class_perm[1]])
        total = count_neg+count_pos
        # int() truncates the percentages; an empty fold reports 0% x 0%.
        prop_temp_neg = int(count_neg/total*100) if total else 0
        prop_temp_pos = int(count_pos/total*100) if total else 0
        print("Fold {}: Pos: {}, Neg: {}, Total: {}, Proporção: {}% x {}%".format(idx, count_pos, count_neg, total, prop_temp_pos, prop_temp_neg))

## Função final - Stratified K-Folds cross-validator

In [12]:
def stratified_k_fold(X, y, k):
    """Evaluate a Gaussian Naive Bayes classifier with stratified k-fold CV.

    Prints the class balance of the dataset and of each fold and then, for
    every train/test split, the split sizes and a classification report
    computed on the test part.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, of length n_samples
        The target variable for supervised learning problems.
    k : int
        Determines the number of folds.

    Returns
    -------
    None
        All results are printed.
    """
    # The splits are applied with positional index arrays, which only works
    # on ndarrays: a DataFrame would be indexed by column label, a Series by
    # index label, and a plain list would not accept an index array at all.
    X = np.asarray(X)
    y = np.asarray(y)

    skf = StratifiedKFold(n_splits=k)
    describe_dataset(X, y, k)
    get_classes_from_index(y, skf)

    # A single estimator object is refitted on each fold's training split.
    clf = GaussianNB()

    for train_index, test_index in skf.split(X, y):
        print("\nTRAIN: {}  TEST: {}".format(len(train_index), len(test_index)))
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Train on the k-1 training folds.
        clf.fit(X_train, y_train)

        # Report precision/recall/F1 on the held-out fold.
        y_predicted = clf.predict(X_test)
        print(metrics.classification_report(y_test, y_predicted))

## Preparação do dataset

In [13]:
# Load the dataset (presumably already normalised, judging by the file name);
# the first CSV row holds the column names.
df = pd.read_csv('dataset/dataset-normalizado.csv', header=0)

# Target vector: the `is_approved` column.
y = df['is_approved'].to_numpy()

# Feature matrix: every column except the target.
X = df.drop(columns='is_approved').to_numpy()

In [14]:
# Run the evaluation with 10 stratified folds on the loaded dataset.
stratified_k_fold(X, y, k=10)

k = 10, Dataset: 41 positivas, 608 negativas (6% x 93%)
Fold 0: Pos: 4, Neg: 61, Total: 65, Proporção: 6% x 93%
Fold 1: Pos: 4, Neg: 61, Total: 65, Proporção: 6% x 93%
Fold 2: Pos: 4, Neg: 61, Total: 65, Proporção: 6% x 93%
Fold 3: Pos: 4, Neg: 61, Total: 65, Proporção: 6% x 93%
Fold 4: Pos: 4, Neg: 61, Total: 65, Proporção: 6% x 93%
Fold 5: Pos: 4, Neg: 61, Total: 65, Proporção: 6% x 93%
Fold 6: Pos: 4, Neg: 61, Total: 65, Proporção: 6% x 93%
Fold 7: Pos: 4, Neg: 61, Total: 65, Proporção: 6% x 93%
Fold 8: Pos: 5, Neg: 60, Total: 65, Proporção: 7% x 92%
Fold 9: Pos: 4, Neg: 60, Total: 64, Proporção: 6% x 93%

TRAIN: 584  TEST: 65
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99        61
         1.0       1.00      0.75      0.86         4

    accuracy                           0.98        65
   macro avg       0.99      0.88      0.92        65
weighted avg       0.98      0.98      0.98        65


TRAIN: 584  TEST: 65
             