In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB

In [2]:
import warnings

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, make_scorer
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2

In [3]:
def load_wine(normalizar, path='../Data/wine.data'):
    """Load the wine dataset as a binary (class 1 vs. class 2) problem.

    Reads the header-less CSV at ``path``, drops every row whose ``class``
    is 3, and splits the remainder into a feature frame and a target series.

    Parameters
    ----------
    normalizar : bool
        When truthy, the features are standardized with ``StandardScaler``
        (fitted on the filtered rows); otherwise they are returned as read.
    path : str, optional
        Location of the data file. Defaults to the path this function
        previously had hard-coded.

    Returns
    -------
    X : pandas.DataFrame
        The 13 feature columns (everything except ``class``).
    y : pandas.Series
        The ``class`` column. ``X`` and ``y`` always share the same index.
    """
    names = ['class', 'alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium',
             'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins',
             'color_intensity', 'hue', 'OD280_OD315_of_diluted_wines', 'proline']
    data = pd.read_csv(path, names=names)
    data_binario = data.loc[data['class'] != 3, :]

    features = data_binario.drop('class', axis=1)
    y = data_binario.loc[:, 'class']

    if normalizar:
        # Reuse the filtered row labels: without index= the new frame would get
        # a fresh RangeIndex and stop aligning with y whenever the dropped rows
        # are not all at the end of the file.
        X = pd.DataFrame(
            StandardScaler().fit_transform(features),
            columns=features.columns,
            index=features.index,
        )
    else:
        X = features
    return X, y

In [4]:
def metricas(classificador, X, y, folds, seed=42):
    """Cross-validate a classifier and return its mean recall, precision and accuracy.

    All three metrics are computed in a single ``cross_validate`` pass, so the
    estimator is fitted once per fold and every metric is scored on the same
    fitted models.

    Parameters
    ----------
    classificador : estimator
        Classifier passed to ``cross_validate``.
    X, y : array-like
        Features and target passed through unchanged.
    folds : int or CV splitter
        Forwarded as ``cv``.
    seed : int, optional
        Seed for NumPy's global RNG, set before cross-validating.
        NOTE(review): this only matters for estimators/splitters that draw
        from the global RNG; with an integer ``cv`` the splits are presumably
        deterministic already — confirm against the scikit-learn docs.

    Returns
    -------
    tuple
        ``(mean_recall, mean_precision, mean_accuracy)`` over the folds.
    """
    np.random.seed(seed)
    scores = cross_validate(
        classificador, X, y, cv=folds,
        scoring=['recall', 'precision', 'accuracy'],
    )
    # cross_validate reports each requested metric under a 'test_<name>' key.
    return (
        np.mean(scores['test_recall']),
        np.mean(scores['test_precision']),
        np.mean(scores['test_accuracy']),
    )

In [5]:
# Fit Gaussian Naive Bayes on the raw (un-normalized) features and count how
# many of those same points it mislabels.
# NOTE(review): predictions are made on the data used for fitting, so this
# count is a training-set figure, not a held-out estimate.
X, y = load_wine(normalizar=False)
gnb = GaussianNB()
gnb.fit(X, y)
y_pred = gnb.predict(X)
print(
    "Number of mislabeled points out of a total %d points : %d"
    % (X.shape[0], (y != y_pred).sum())
)

Number of mislabeled points out of a total 130 points : 1


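#### As we can see, even without normalizing the dataset, the classifier mislabels only 1 of the 130 points. Note that this is measured on the same data used for fitting, so it is an optimistic estimate; the cross-validated metrics below are a fairer measure.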

In [6]:
# Reference: "Unfolding Naïve Bayes from Scratch" — https://towardsdatascience.com/unfolding-na%C3%AFve-bayes-from-scratch-2e86dcae4b01

### Testing with all features

In [10]:
# 10-fold cross-validated recall / precision / accuracy using every feature.
# Uncomment the next line to silence warnings raised during scoring.
# warnings.filterwarnings('ignore')
revocacao, precisao, acuracia = metricas(gnb, X, y, folds=10)
print(
    f"Revocação: {revocacao}",
    f"Precisão: {precisao}",
    f"Acurácia: {acuracia}",
    sep="\n",
)

Revocação: 0.9666666666666668
Precisão: 0.9714285714285715
Acurácia: 0.9697802197802197


### Testing with PCA

In [27]:
# Project the feature matrix onto a single principal component.
# X is expected to be the un-normalized frame from load_wine(normalizar=False).
# NOTE(review): PCA is scale-sensitive, so on raw features the component is
# probably dominated by the largest-variance columns — confirm whether
# standardized input was intended here.
# NOTE(review): the projection is fitted on all rows before cross-validation,
# so each test fold has influenced the transform — consider a Pipeline.
pca = PCA(n_components=1)
X_PCA = pca.fit_transform(X)

In [29]:
# 4-fold cross-validated recall / precision / accuracy on the PCA projection.
# NOTE(review): the all-features run uses folds=10, so these numbers are not
# directly comparable with it — confirm the intended fold count.
# Uncomment the next line to silence warnings raised during scoring.
# warnings.filterwarnings('ignore')
revocacao, precisao, acuracia = metricas(gnb, X_PCA, y, folds=4)
print(
    f"Revocação: {revocacao}",
    f"Precisão: {precisao}",
    f"Acurácia: {acuracia}",
    sep="\n",
)

Revocação: 0.9
Precisão: 0.9366515837104072
Acurácia: 0.9242424242424243


In [31]:
# Testing with the top-5 features selected by Relief: keep only those columns
# of X and preview the first rows.
relief_features = [
    'alcohol',
    'alcalinity_of_ash',
    'nonflavanoid_phenols',
    'OD280_OD315_of_diluted_wines',
    'proline',
]
X_relief = X[relief_features]
X_relief.head()

Unnamed: 0,alcohol,alcalinity_of_ash,nonflavanoid_phenols,OD280_OD315_of_diluted_wines,proline
0,14.23,15.6,0.28,3.92,1065
1,13.2,11.2,0.26,3.4,1050
2,13.16,18.6,0.3,3.17,1185
3,14.37,16.8,0.24,3.45,1480
4,13.24,21.0,0.39,2.93,735


In [32]:
# 4-fold cross-validated recall / precision / accuracy on the Relief subset.
# NOTE(review): the all-features run uses folds=10, so these numbers are not
# directly comparable with it — confirm the intended fold count.
# Uncomment the next line to silence warnings raised during scoring.
# warnings.filterwarnings('ignore')
revocacao, precisao, acuracia = metricas(gnb, X_relief, y, folds=4)
print(
    f"Revocação: {revocacao}",
    f"Precisão: {precisao}",
    f"Acurácia: {acuracia}",
    sep="\n",
)

Revocação: 0.95
Precisão: 0.9705882352941176
Acurácia: 0.9621212121212122
