# Seleção de Features com ANOVA

## Importando as bibliotecas

In [28]:
from pandas import read_csv
from numpy import nan

## Carregando os dados

In [29]:
# Source: https://www.kaggle.com/uciml/pima-indians-diabetes-database
# Read with header=None, so the columns get integer labels.
data = read_csv(
    'pima-indians-diabetes.csv',
    header=None,
)

In [30]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(768, 9)

Attributes: (all numeric-valued)
   1. Number of times pregnant
   2. Plasma glucose concentration at 2 hours in an oral glucose tolerance test
   3. Diastolic blood pressure (mm Hg)
   4. Triceps skin fold thickness (mm)
   5. 2-Hour serum insulin (mu U/ml)
   6. Body mass index (weight in kg/(height in m)^2)
   7. Diabetes pedigree function
   8. Age (years)
   9. Class variable (0 or 1)

In [32]:
# Preview the first rows of the loaded data.
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [38]:
# Count the missing (NaN) entries in each column.
# NOTE(review): the non-zero counts displayed for this cell look like they were
# produced after the zero-to-NaN replacement in the following cell had already
# been executed (out-of-order run) — confirm with Restart Kernel -> Run All.
data.isnull().sum()

0    111
1      5
2     35
3    227
4    374
5     11
6      0
7      0
8      0
dtype: int64

In [39]:
# Treat zeros in the feature columns (every column except the last, which is
# the class label) as missing values.
# The assignment is done by column label so the affected columns are replaced
# outright and integer columns may become float to hold NaN; a positional
# `.iloc[...] = ...` write is performed in place and triggers a dtype-upcast
# warning (or error) on recent pandas versions.
# NOTE(review): column 0 is listed as "Number of times pregnant", where 0 looks
# like a legitimate value rather than a missing one — confirm whether it should
# be excluded from this replacement.
feature_cols = data.columns[:-1]
data[feature_cols] = data[feature_cols].replace(0, nan)

In [40]:
# Display the frame after the zero-to-NaN replacement.
data

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6.0,148.0,72.0,35.0,,33.6,0.627,50,1
1,1.0,85.0,66.0,29.0,,26.6,0.351,31,0
2,8.0,183.0,64.0,,,23.3,0.672,32,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,,137.0,40.0,35.0,168.0,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63,0
764,2.0,122.0,70.0,27.0,,36.8,0.340,27,0
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30,0
766,1.0,126.0,60.0,,,30.1,0.349,47,1


In [41]:
# Per-column summary statistics.
data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,657.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,4.494673,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.217291,30.535641,12.382158,10.476982,118.775855,6.924988,0.331329,11.760232,0.476951
min,1.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,2.0,99.0,64.0,22.0,76.25,27.5,0.24375,24.0,0.0
50%,4.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
75%,7.0,141.0,80.0,36.0,190.0,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [42]:
# Discard, in place, every row that has at least one missing value.
data.dropna(axis=0, how='any', inplace=True)

In [43]:
# Dimensions of the frame after dropping the rows with missing values.
data.shape

(336, 9)

In [44]:
# Features are all columns but the last; the target is the last column.
x, y = data.iloc[:, :-1], data.iloc[:, -1]

In [45]:
# Number of samples per class in the target.
y.value_counts()

0    225
1    111
Name: 8, dtype: int64

## Selecionando as features e os parâmetros do modelo

In [47]:
def select_features(x_train, y_train, x_test, k=4):
    """Keep the k best features according to the ANOVA F-test.

    The selector is fitted on the training split only and then applied to
    both splits, so the test split never influences the feature ranking.

    Parameters
    ----------
    x_train : array-like
        Training feature matrix used to fit the selector.
    y_train : array-like
        Class labels for ``x_train``.
    x_test : array-like
        Test feature matrix, reduced to the same selected columns.
    k : int or "all", default=4
        Number of top-scoring features to keep, forwarded to ``SelectKBest``.
        ``"all"`` keeps every feature (useful to inspect all scores).

    Returns
    -------
    tuple
        ``(x_train_fs, x_test_fs, fs)``: the reduced training and test
        matrices and the fitted ``SelectKBest`` instance.
    """
    fs = SelectKBest(score_func=f_classif, k=k)

    # Fit on the training data and reduce it in one step.
    x_train_fs = fs.fit_transform(x_train, y_train)
    # Apply the already-fitted selection to the test data.
    x_test_fs = fs.transform(x_test)

    return x_train_fs, x_test_fs, fs

In [48]:
def get_best_model(model_name, x, y):
    """Grid-search the hyperparameters of the requested classifier.

    Parameters
    ----------
    model_name : str
        ``'d_tree'`` for a decision tree or ``'r_forest'`` for a random forest.
    x : array-like
        Feature matrix used for the cross-validated search.
    y : array-like
        Class labels for ``x``.

    Returns
    -------
    GridSearchCV
        The search object after fitting on ``x`` and ``y``.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    if model_name == 'd_tree':
        params = {'max_depth': [None, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]}
        model = DecisionTreeClassifier()
    elif model_name == 'r_forest':
        params = {'n_estimators': [10, 50, 100, 200], 'max_depth': [None, 1, 3, 5, 7]}
        model = RandomForestClassifier()
    else:
        # Fail fast with a clear error instead of printing and then crashing
        # later on the undefined `model` / `params` names.
        raise ValueError(
            "Oops! That was no valid model. Try again... "
            "Got %r; expected 'd_tree' or 'r_forest'." % (model_name,)
        )

    # error_score=0. is forwarded unchanged to GridSearchCV.
    grid = GridSearchCV(model,
                        params,
                        error_score=0.)
    grid.fit(x, y)

    return grid

## Realizando a classificação

In [49]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score

In [54]:
# Train/test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

# Feature selection (the selector is fitted on the training split only)
x_train_fs, x_test_fs, fs = select_features(x_train, y_train, x_test)

# Hyperparameter search on the selected training features
grid = get_best_model("d_tree", x_train_fs, y_train)

# GridSearchCV refits the best configuration on the whole training split by
# default, so reuse that estimator instead of re-instantiating a hard-coded
# classifier class that must be kept in sync with the model name above.
model = grid.best_estimator_

# Evaluate the model on the held-out test split
yhat = model.predict(x_test_fs)
accuracy = accuracy_score(y_test, yhat)
kappa = cohen_kappa_score(y_test, yhat)
print('Accuracy: %.2f ' % (accuracy*100))
print('Kappa: ' + str(kappa))

Accuracy: 64.71 
Kappa: 0.1807228915662651


Atividade (1.25): realizar a seleção de features com a ANOVA atendendo aos seguintes itens:
<ol>
        <li> Modifique a função select_features para receber como parâmetro de entrada o valor de k e altere o código para garantir que todas as features do dataset serão avaliadas; </li> 
        <li> Criar uma função para avaliar os modelos de classificação em termos de acurácia, utilizar o repeated stratified k-fold cross-validation. Você define o número de folds e repetições;</li> 
        <li> Plotar um boxplot da acurácia obtida para cada número de features testado. Um boxplot distinto deve ser plotado para cada classificador. Atenção: utilizar a versão da função get_best_model() que vocês desenvolveram na atividade anterior, contemplando os classificadores svm e xgboost;</li>    
</ol>