In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import StandardScaler

In [2]:
def load_wine(normalizar):
    # função que carrega os dados do dataset wine, e remove a classe '3' para se tornar um dataset binário
    names = ['class','alcohol','malic_acid','ash','alcalinity_of_ash','magnesium','total_phenols','flavanoids','nonflavanoid_phenols'
              ,'proanthocyanins','color_intensity','hue','OD280_OD315_of_diluted_wines','proline']
    data = pd.read_csv('../Data/wine.data', names=names)
    data_binario = data.loc[data["class"] != 3,:]
    
    if normalizar:
        scaler = StandardScaler()
        scaler.fit(data_binario.drop('class', axis=1))
        X = pd.DataFrame(scaler.transform(data_binario.drop('class', axis=1)), columns=names[1:])
    else:
        X = data_binario.drop('class', axis=1)
    y = data_binario.loc[:,'class']
    return X, y

In [3]:
X, y = load_wine(normalizar=False)
#Encontrar as 5 melhores características do dataset
bestfeatures=SelectKBest(score_func=chi2,k=5)
fit = bestfeatures.fit(X,y)
dfscores=pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

#Concatenando 2 dataframes para melhor visualização

featureScores=pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns=['Specs','Score']
print(featureScores.nlargest(7,'Score'))


                Specs         Score
12            proline  14497.066903
9     color_intensity     45.797138
4           magnesium     44.833856
3   alcalinity_of_ash     17.573073
6          flavanoids     10.517824
0             alcohol      5.350222
5       total_phenols      4.316162


#### LAFON-LAFOFRCADE and PEYNAUD*
measured the amino acid content of Bordeaux wines and found the white varieties to average
higher in proline than the red varieties. Cabernet Sauvignon and Merlot were noted
to be particularily high. Seasonal changes were described.

## Utilizando o método relief

In [4]:
def Relief(D, C, S, NoSample, Threshold, Seed):
    '''D: Training set
    C: class vector
    S: Original feature set
    NoSample: Number of instances
    Threshold: Threshold to select the features
    Seed: Seed to random selections
    T: Array of selected features
    W: Weight of each feature'''
    
    T = list()
    W = np.zeros(len(S))
#     import pdb; pdb.set_trace()
    np.random.seed(Seed)
    random_choice = np.random.randint(low=0, high=len(D), size=NoSample)
    
    for i in range(NoSample):
        
        # Randomly choosing an instance x in D
        x = D.iloc[random_choice[i]]
        # class of the instance x
        x_class = C[random_choice[i]]
        
        # Calculating the euclidean distance vector
        ## subsetting by class
        same_class = D[C == x_class]
        diff_class = D[C != x_class]
        
        ## broadcasting the instance x in a matrix with the right shape
        x_matrix_same = np.array(x) + np.zeros((len(same_class), len(x)))
        x_matrix_diff = np.array(x) + np.zeros((len(diff_class), len(x)))
        
        euclidean_distance_same = np.sqrt(np.sum(np.power(np.add(x_matrix_same, - same_class), 2), axis=1))
        euclidean_distance_diff = np.sqrt(np.sum(np.power(np.add(x_matrix_diff, - diff_class), 2), axis=1))
        
        ## Near Hit: The instance with the minimum Euclidean distance among all the instances of the same class.
        ### removing the distance to x
        near_hit = np.argmax(euclidean_distance_same == np.min(euclidean_distance_same.drop(random_choice[i])))
        ## Near Miss: The instance with the minimum Euclidean distance among all the instances of the different class.
        near_miss = np.argmax(euclidean_distance_diff == np.min(euclidean_distance_diff))
        ### when A is continuous diff(A,I1,I2) is defined as |value(A,I1) - value(A,I2)| / ( max(A) - min(A) )
        # diff near hit
        divisor = np.add(D.max(0), - D.min(0))
        Diff = np.divide(np.abs(np.add(x, - D.iloc[near_hit])), divisor)
        W = np.add(W, np.multiply(Diff, 1/NoSample))
    print(W.sort_values(ascending=False))
    # return all features which surpass the threshold
    return W[W >= Threshold].index

In [5]:
help(Relief)

Help on function Relief in module __main__:

Relief(D, C, S, NoSample, Threshold, Seed)
    D: Training set
    C: class vector
    S: Original feature set
    NoSample: Number of instances
    Threshold: Threshold to select the features
    Seed: Seed to random selections
    T: Array of selected features
    W: Weight of each feature



In [6]:
X, y = load_wine(True)

In [7]:
S = X.columns
NoSample = 30
Threshold = 0.1
Seed = 42
T = Relief(X, y, S, NoSample, Threshold, Seed)
print("\nCaracterísticas selecionadas:\n")
print(T)

OD280_OD315_of_diluted_wines    0.123513
proline                         0.107228
nonflavanoid_phenols            0.106289
alcohol                         0.104737
alcalinity_of_ash               0.101375
hue                             0.089542
color_intensity                 0.084646
total_phenols                   0.082614
magnesium                       0.079348
proanthocyanins                 0.075079
flavanoids                      0.069771
ash                             0.065597
malic_acid                      0.061594
Name: 102, dtype: float64

Características selecionadas:

Index(['alcohol', 'alcalinity_of_ash', 'nonflavanoid_phenols',
       'OD280_OD315_of_diluted_wines', 'proline'],
      dtype='object')


will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.
  return getattr(obj, method)(*args, **kwds)
