In [135]:
# http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/ ----> exemplo seleção de atributos
# https://medium.com/data-hackers/como-usar-pipelines-no-scikit-learn-1398a4cc6ae9 ----> exemplo de pipeline do sklearn

In [136]:
import numpy as np
import pandas as pd

### Visualização do dataset

In [153]:
# Reading data files
df_wine = pd.read_csv("wine.data", header=None)
df_wine.columns = ['Class', 'Alcohol', 'Malic Acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline']

df_wine.info()
df_wine.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Class                         178 non-null    int64  
 1   Alcohol                       178 non-null    float64
 2   Malic Acid                    178 non-null    float64
 3   Ash                           178 non-null    float64
 4   Alcalinity of ash             178 non-null    float64
 5   Magnesium                     178 non-null    int64  
 6   Total phenols                 178 non-null    float64
 7   Flavanoids                    178 non-null    float64
 8   Nonflavanoid phenols          178 non-null    float64
 9   Proanthocyanins               178 non-null    float64
 10  Color intensity               178 non-null    float64
 11  Hue                           178 non-null    float64
 12  OD280/OD315 of diluted wines  178 non-null    float64
 13  Proli

Unnamed: 0,Class,Alcohol,Malic Acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [154]:
features_names = (df_wine.columns)[1:]
print(features_names)

Index(['Alcohol', 'Malic Acid', 'Ash', 'Alcalinity of ash', 'Magnesium',
       'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
       'Proanthocyanins', 'Color intensity', 'Hue',
       'OD280/OD315 of diluted wines', 'Proline'],
      dtype='object')


Dividindo o dataset em conjuntos de treinamento e teste

In [139]:
from sklearn.model_selection import train_test_split

X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)

Como o uso de pipelines era opcional, resolvi usar uma outra abordagem. Primeiramente crio uma lista com os modelos SVM com kernel linear, SVM com kernel gaussiano e Random Forest.

In [140]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

models = {
    "SVM linear": svm.SVC(kernel='linear'),
    "SVM gaussiano": svm.SVC(kernel ='rbf', random_state = 0),
    "Random Forest": RandomForestClassifier(n_estimators=500, random_state=1)
}

### Forward Feature Selection

Aqui uso o modulo python SequentialFeatureSelector para fazer a seleção de atributos com os métodos forward e backward. Para o modo forward, usando a propriedade k_features configurada com o valor 5, esse módulo retorna a seleção de features em conjuntos de 1 a 5 features, logo só é necessário executar uma vez para cada modelo da lista criada anteriormente.

Os resultados são salvos em uma lista para melhorar a visualização posteriormente.

In [141]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from time import sleep
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

sfsList = {}


for model in models.keys():

    sfs = SFS(models[model], 
           k_features=5, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=0)

    sfsList["{model}-forward".format(model=model, number=i)] = sfs.fit(X_train, y_train, custom_feature_names=features_names)
        


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:    0.1s finished

[2022-04-25 22:32:45] Features: 1/5 -- score: 0.8145161290322581[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.0s finished

[2022-04-25 22:32:45] Features: 2/5 -- score: 0.9193548387096774[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    0.1s finished

[2022-04-25 22:32:45] Features: 3/5 -- score: 0.9435483870967742[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 

Aqui são impressos conjuntos de features para cada modelo. Para todos eles, há conjuntos com as 1,2,3,4 e 5 melhores features determinadas pela seleção.

É possível ver na saída da execução os valores de score para cada modelo, de acordo com o número de features: 

* **Para SVM linear, o maior score obtido foi 0.98387 com 5 features.**
* **Para SVM gaussiano, o maior valor é obtido com apenas 3 features, sendo ele 0.935484.**
* **Já para Random Forest, o maior valor obtido foi 1.0, com apenas 2 features.**

In [142]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

for sfs in sfsList.keys():
    print(sfs)
    pp.pprint(sfsList[sfs].subsets_) 
    print("\n\n")

SVC linear-forward
{   1: {   'avg_score': 0.8145161290322581,
           'cv_scores': array([0.81451613]),
           'feature_idx': (6,),
           'feature_names': ('Flavanoids',)},
    2: {   'avg_score': 0.9193548387096774,
           'cv_scores': array([0.91935484]),
           'feature_idx': (0, 6),
           'feature_names': ('Alcohol', 'Flavanoids')},
    3: {   'avg_score': 0.9435483870967742,
           'cv_scores': array([0.94354839]),
           'feature_idx': (0, 6, 12),
           'feature_names': ('Alcohol', 'Flavanoids', 'Proline')},
    4: {   'avg_score': 0.967741935483871,
           'cv_scores': array([0.96774194]),
           'feature_idx': (0, 1, 6, 12),
           'feature_names': ('Alcohol', 'Malic Acid', 'Flavanoids', 'Proline')},
    5: {   'avg_score': 0.9838709677419355,
           'cv_scores': array([0.98387097]),
           'feature_idx': (0, 1, 6, 10, 12),
           'feature_names': (   'Alcohol',
                                'Malic Acid',
        

### Backward Feature Selection

Para o modo backward, usando a propriedade k_features configurada com o valor 2, esse módulo retorna a seleção de features em conjuntos de 2 a 13 features, logo só é necessário executar uma vez para cada modelo da lista criada anteriormente, para obter os conjuntos de 2, 3, 4 e 5 features.

Os resultados são salvos em uma lista para melhorar a visualização posteriormente.

In [143]:
sfsListBack = {}


for model in models.keys():

    sfsBack = SFS(models[model], 
           k_features=2, 
           forward=False, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=0)

    sfsListBack["{model}-backward".format(model=model, number=i)] = sfsBack.fit(X_train, y_train, custom_feature_names=features_names)
        

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:    0.8s finished

[2022-04-25 22:33:08] Features: 12/2 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.7s finished

[2022-04-25 22:33:09] Features: 11/2 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    0.8s finished

[2022-04-25 22:33:10] Features: 10/2 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:  

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    2.5s finished

[2022-04-25 22:33:39] Features: 6/2 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    2.2s finished

[2022-04-25 22:33:42] Features: 5/2 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.8s finished

[2022-04-25 22:33:43] Features: 4/2 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.5s finished

[2022-

Aqui são impressos conjuntos de features para cada modelo.

Olhando apenas os conjuntos com 2, 3, 4 e 5 features, é possível ver na saída da execução os valores de score para cada modelo, de acordo com o número de features: 

* **Para SVM linear, o maior score obtido foi 0.98387 com 5 features.**
* **Para SVM gaussiano, o maior valor é obtido com apenas 2 features, sendo ele 0.766129.**
* **Já para Random Forest, o maior valor obtido foi 1.0, com apenas 2 features (mas também para 3, 4 ou 5 features).**

In [144]:
for sfs in sfsListBack.keys():
    print(sfs)
    pp.pprint(sfsListBack[sfs].subsets_)
    print("\n\n")

SVC linear-backward
{   2: {   'avg_score': 0.9112903225806451,
           'cv_scores': array([0.91129032]),
           'feature_idx': (6, 9),
           'feature_names': ('Flavanoids', 'Color intensity')},
    3: {   'avg_score': 0.9516129032258065,
           'cv_scores': array([0.9516129]),
           'feature_idx': (6, 9, 12),
           'feature_names': ('Flavanoids', 'Color intensity', 'Proline')},
    4: {   'avg_score': 0.9758064516129032,
           'cv_scores': array([0.97580645]),
           'feature_idx': (1, 6, 9, 12),
           'feature_names': (   'Malic Acid',
                                'Flavanoids',
                                'Color intensity',
                                'Proline')},
    5: {   'avg_score': 0.9838709677419355,
           'cv_scores': array([0.98387097]),
           'feature_idx': (1, 2, 6, 9, 12),
           'feature_names': (   'Malic Acid',
                                'Ash',
                                'Flavanoids',
          

### PCA

Para fazer a análise com PCA, primeiramente se utiliza a normalização standard scalar.

In [145]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

Nesta parte, é utilizado um loop for para obter os conjuntos com 2 a 5 principal components. Os valores de train e test são salvos em listas para serem utilizados posteriormente. Os valores de taxa de variação são impressos para cada conjunto.

In [146]:
from sklearn.decomposition import PCA

pcaXtrain = []
pcaXtest = []

for i in range(2,6):
    pca = PCA(n_components=i)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    
    pcaXtrain.append(X_train_pca)
    pcaXtest.append(X_test_pca)
    print(pca.explained_variance_ratio_)

[0.99829536 0.00154355]
[9.98295363e-01 1.54354722e-03 8.19537593e-05]
[9.98295363e-01 1.54354722e-03 8.19537593e-05 5.20670597e-05]
[9.98295363e-01 1.54354722e-03 8.19537593e-05 5.20670597e-05
 1.23944813e-05]


Agora os modelos são treinados novamente utilizando a saída do PCA

In [159]:
for model in models.keys():
    for i in range(len(pcaXtrain)):
        
        print("Model: {model}".format(model=model))
        
        classifier = models[model]
        classifier.fit(pcaXtrain[i], y_train)
        y_pred = classifier.predict(pcaXtest[i])
        
        print('Accuracy: {accuracy}\n'.format(accuracy=accuracy_score(y_test, y_pred)))
    print("------------------------------------------")

Model: SVC linear
Accuracy: 0.6481481481481481

Model: SVC linear
Accuracy: 0.7777777777777778

Model: SVC linear
Accuracy: 0.9814814814814815

Model: SVC linear
Accuracy: 0.9444444444444444

------------------------------------------
Model: SVC gaussiano
Accuracy: 0.6666666666666666

Model: SVC gaussiano
Accuracy: 0.6666666666666666

Model: SVC gaussiano
Accuracy: 0.6666666666666666

Model: SVC gaussiano
Accuracy: 0.6666666666666666

------------------------------------------
Model: Random Forest
Accuracy: 0.7222222222222222

Model: Random Forest
Accuracy: 0.7592592592592593

Model: Random Forest
Accuracy: 0.9629629629629629

Model: Random Forest
Accuracy: 0.9444444444444444

------------------------------------------


Olhando os resultados de treinamento com PCA observa-se que: 

* **Para SVM linear, o maior score obtido foi 0.98148 com 4 components.**
* **Para SVM gaussiano, o maior score é de 0.6666 e se repete para todos os conjuntos de componentes**
* **Já para Random Forest, o maior score obtido foi 0.96296, com 4 components**