In [1]:
# PCA 

In [2]:
import warnings
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

plt.rcParams['figure.figsize'] = [10, 5]

In [3]:
def CrossValidacaoEstratificada(dataset, y, folds=4):
    npY = np.array(y)
    fold_classes = list()

    #construindo as estruturas
    dataset_fold = list()
    y_fold = list()
    for i in range(folds):
        dataset_fold.append(list())
        y_fold.append(list())
    fold_atual = 0
    unicos = np.unique(npY)
    for i in range(len(unicos)):
        # cria uma lista das classe e os valores os índices(posições) delas no vetor y
        fold_classes.append(np.where(npY == unicos[i])[0].tolist())
        
        while len(fold_classes[i])>0:
            # sorteia um elemento do vetor de elementos da mesma classe
            if (fold_atual >= folds):
                fold_atual = 0
            index_elemento = np.random.randint(len(fold_classes[i]))
            index = fold_classes[i].pop(index_elemento)
            # Adiciona o elemento sorteado no bucket correspondente
            dataset_fold[fold_atual].append(dataset[index])
            y_fold[fold_atual].append(y[index])
            fold_atual = fold_atual + 1
    for i in range(len(dataset_fold)):
        print("dataset_fold[" + str(i) + "]: " + str(len(dataset_fold[i])))
    return dataset_fold, y_fold

In [4]:
names = ['class','alcohol','malic_acid','ash','alcalinity_of_ash','magnesium','total_phenols','flavanoids','nonflavanoid_phenols'
          ,'proanthocyanins','color_intensity','hue','OD280_OD315_of_diluted_wines','proline']
data = pd.read_csv('../Data/wine.data', names=names)

In [5]:
data.head()

Unnamed: 0,class,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,OD280_OD315_of_diluted_wines,proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [6]:
data_binario = data.loc[data["class"] != 3,:]
data_binario["class"].unique()

array([1, 2])

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, make_scorer

In [8]:
X = data.drop('class', axis=1)
y = data.loc[:,'class']

### Testing without Standarization

In [9]:
svm_not_scalled = LinearSVC(C=1, loss="hinge")

In [10]:
print(cross_val_score(svm_not_scalled, X, y, cv=10))

[0.84210526 0.83333333 0.77777778 0.88888889 0.44444444 1.
 1.         0.88888889 1.         1.        ]


### Testing with Standarization

In [11]:
svm_scalled = Pipeline((
                    ("scaler", StandardScaler()),
                    ("linear_svc", LinearSVC(C=1, loss="hinge"))
                ))

In [12]:
print(cross_val_score(svm_scalled, X, y, cv=10))

[0.94736842 0.94444444 1.         0.94444444 1.         0.94444444
 1.         1.         1.         1.        ]


### Creating methods to test

In [13]:
def metricas(classificador, X, y, folds):
    weighted_recall_scorer = make_scorer(recall_score, average='weighted')
    recall = cross_val_score(classificador, X, y, cv=folds, scoring=weighted_recall_scorer)
#     print('Revocação: ', np.mean(recall), recall)
    weighted_precision_scorer = make_scorer(precision_score, average='weighted')
    precision = cross_val_score(classificador, X, y, cv=folds, scoring=weighted_precision_scorer)
#     print('Precisão: ', np.mean(precision), precision)
    accuracy = cross_val_score(classificador, X, y, cv=folds, scoring='accuracy')
#     print('Acurácia', np.mean(accuracy), accuracy)
    return (np.mean(recall), np.mean(precision), np.mean(accuracy))

def scalling_and_svc(X, y, kernel=["linear"], C=[1], gamma=[1], degree=[3], folds=10):
    result = dict() # cria um dicionário dos resultados, com o índice os kernels
    for k in kernel:
        paramC = dict()
        for i in C:
            if k == "rbf":
                print("rbf")
                paramG = dict()
                for g in gamma:
                    svm_scalled = Pipeline((
                                ("scaler", StandardScaler()),
                                ("svc", SVC(kernel=k, gamma=g, C=i))
                            ))
                    svm_scalled.fit(X, y)
                    paramG["gamma-"+str(g)] = metricas(svm_scalled, X, y, folds)
                paramC["c-"+str(i)] = paramG 
            elif k == "poly":
                print("poly")
                paramD = dict()
                for d in degree:
                    svm_scalled = Pipeline((
                                ("scaler", StandardScaler()),
                                ("svc", SVC(kernel=k, C=i, coef0=1, degree=d))
                            ))
                    svm_scalled.fit(X,y)
                    paramD["k-"+str(d)] = metricas(svm_scalled, X, y, folds)
                paramC["c-"+str(i)] = paramD
            elif k == "linear": #case linear
                print("linear")
                svm_scalled = Pipeline((
                                ("scaler", StandardScaler()),
                                ("svc", SVC(kernel=k, C=i))
                            ))
                svm_scalled.fit(X,y)
                paramC["c-"+str(i)] = metricas(svm_scalled, X, y, folds)
        result[k] = paramC
    return result

In [14]:
kernel = ["linear", "poly", "rbf"]
pC = [0.01, 0.1, 1, 10, 100]
pGamma = [0.1, 1, 10]
degree=[3]
folds = 10

# scalling_and_svc(X=X, y=y, kernel = kernel, C=C )
# warnings.filterwarnings(action='once')
warnings.filterwarnings('ignore')
resultados = scalling_and_svc(X, y, kernel=kernel, C=pC, degree=degree, gamma= pGamma, folds = folds)
print(resultados)

linear
linear
linear
linear
linear
poly
poly
poly
poly
poly
rbf
rbf
rbf
rbf
rbf
{'linear': {'c-0.01': (0.9780701754385965, 0.9814605959342803, 0.9780701754385965), 'c-0.1': (0.9891812865497076, 0.9906920077972711, 0.9891812865497076), 'c-1': (0.9551943584451325, 0.962061833505332, 0.9551943584451325), 'c-10': (0.9610767113863089, 0.9671038503120547, 0.9610767113863089), 'c-100': (0.9610767113863089, 0.9671038503120547, 0.9610767113863089)}, 'poly': {'c-0.01': {'k-3': (0.9230994152046783, 0.9431375576989612, 0.9230994152046783)}, 'c-0.1': {'k-3': (0.983625730994152, 0.9859301030353663, 0.983625730994152)}, 'c-1': {'k-3': (0.9728070175438596, 0.9775577833472571, 0.9728070175438596)}, 'c-10': {'k-3': (0.9728070175438596, 0.9775577833472571, 0.9728070175438596)}, 'c-100': {'k-3': (0.9728070175438596, 0.9775577833472571, 0.9728070175438596)}}, 'rbf': {'c-0.01': {'gamma-0.1': (0.3992539559683522, 0.15968837168785793, 0.3992539559683522), 'gamma-1': (0.3992539559683522, 0.15968837168785793, 0

In [15]:
from sklearn.decomposition import PCA

pca = PCA(n_components=1)
X_pca = pca.fit(X)

In [17]:
kernel = ["linear", "poly", "rbf"]
pC = [0.01, 0.1, 1, 10, 100]
pGamma = [0.1, 1, 10]
degree=[3]
folds = 10

# scalling_and_svc(X=X, y=y, kernel = kernel, C=C )
# warnings.filterwarnings(action='once')
warnings.filterwarnings('ignore')
resultados = scalling_and_svc(X, y, kernel=kernel, C=pC, degree=degree, gamma= pGamma, folds = folds)
print(resultados)

linear
linear
linear
linear
linear
poly
poly
poly
poly
poly
rbf
rbf
rbf
rbf
rbf
{'linear': {'c-0.01': (0.9780701754385965, 0.9814605959342803, 0.9780701754385965), 'c-0.1': (0.9891812865497076, 0.9906920077972711, 0.9891812865497076), 'c-1': (0.9551943584451325, 0.962061833505332, 0.9551943584451325), 'c-10': (0.9610767113863089, 0.9671038503120547, 0.9610767113863089), 'c-100': (0.9610767113863089, 0.9671038503120547, 0.9610767113863089)}, 'poly': {'c-0.01': {'k-3': (0.9230994152046783, 0.9431375576989612, 0.9230994152046783)}, 'c-0.1': {'k-3': (0.983625730994152, 0.9859301030353663, 0.983625730994152)}, 'c-1': {'k-3': (0.9728070175438596, 0.9775577833472571, 0.9728070175438596)}, 'c-10': {'k-3': (0.9728070175438596, 0.9775577833472571, 0.9728070175438596)}, 'c-100': {'k-3': (0.9728070175438596, 0.9775577833472571, 0.9728070175438596)}}, 'rbf': {'c-0.01': {'gamma-0.1': (0.3992539559683522, 0.15968837168785793, 0.3992539559683522), 'gamma-1': (0.3992539559683522, 0.15968837168785793, 0