# Atividade 02: Atributos Categóricos e Valores Faltantes
### Aluno: Alisson da Silva Vieira

# Bibliotecas utilizadas
- Numpy: É uma biblioteca fundamental para computação científica em Python, que fornece um objeto de matriz multidimensional, vários objetos derivados (como matrizes e matrizes mascaradas) e uma variedade de rotinas para operações rápidas em matrizes.
- Pandas: É uma biblioteca que fornece estruturas de dados rápidas, flexíveis e expressivas projetadas para tornar o trabalho com dados "relacionais" ou "rotulados" fácil e intuitivo. Tem como objetivo ser o bloco de construção fundamental de alto nível para fazer análises de dados.

In [55]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold

%matplotlib inline

In [56]:
df = pd.read_csv('data/agaricus_lepiota_small_c.csv')
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,e,x,s,y,t,a,f,w,b,g,...,s,w,w,p,w,o,p,n,v,d
1,e,f,s,y,f,n,f,c,b,p,...,s,w,w,p,w,o,f,n,y,g
2,e,k,s,w,f,c,f,w,b,g,...,s,w,n,p,w,t,e,w,n,g
3,e,f,f,n,t,n,f,c,b,w,...,s,g,w,p,w,o,p,k,v,d
4,p,x,s,w,t,p,f,c,n,w,...,s,w,w,p,w,o,p,n,s,u


In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     1000 non-null   object
 1   cap-shape                 1000 non-null   object
 2   cap-surface               1000 non-null   object
 3   cap-color                 1000 non-null   object
 4   bruises                   1000 non-null   object
 5   odor                      1000 non-null   object
 6   gill-attachment           1000 non-null   object
 7   gill-spacing              1000 non-null   object
 8   gill-size                 1000 non-null   object
 9   gill-color                1000 non-null   object
 10  stalk-shape               1000 non-null   object
 11  stalk-root                690 non-null    object
 12  stalk-surface-above-ring  1000 non-null   object
 13  stalk-surface-below-ring  1000 non-null   object
 14  stalk-color-above-ring   

In [58]:
df.groupby('class').describe()

Unnamed: 0_level_0,cap-shape,cap-shape,cap-shape,cap-shape,cap-surface,cap-surface,cap-surface,cap-surface,cap-color,cap-color,...,spore-print-color,spore-print-color,population,population,population,population,habitat,habitat,habitat,habitat
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,...,top,freq,count,unique,top,freq,count,unique,top,freq
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
e,518,5,x,239,518,3,y,186,518,10,...,n,208,518,6,v,153,518,7,d,221
p,482,4,x,222,482,3,y,225,482,8,...,w,209,482,6,v,301,482,7,d,171


In [59]:
df.loc[df['class'] == 'e', 'class'] = 0
df.loc[df['class'] == 'p', 'class'] = 1

In [60]:
atributos_categoricos = ['stalk-root']

transformers = [
    ('imp_cat', SimpleImputer(strategy='constant', fill_value='desconhecido'), atributos_categoricos)
]

ct_imp = ColumnTransformer(
    transformers, remainder='drop'
)

X_imp_vals = ct_imp.fit_transform(df)

X_imputed = pd.DataFrame(X_imp_vals, columns=[*atributos_categoricos])

In [61]:
df['stalk-root'] = X_imputed['stalk-root']

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     1000 non-null   object
 1   cap-shape                 1000 non-null   object
 2   cap-surface               1000 non-null   object
 3   cap-color                 1000 non-null   object
 4   bruises                   1000 non-null   object
 5   odor                      1000 non-null   object
 6   gill-attachment           1000 non-null   object
 7   gill-spacing              1000 non-null   object
 8   gill-size                 1000 non-null   object
 9   gill-color                1000 non-null   object
 10  stalk-shape               1000 non-null   object
 11  stalk-root                1000 non-null   object
 12  stalk-surface-above-ring  1000 non-null   object
 13  stalk-surface-below-ring  1000 non-null   object
 14  stalk-color-above-ring   

In [62]:
def numeric(df, columns=['bruises', 'gill-size', 'stalk-shape', 'veil-type'], values=[['f', 't'], ['b', 'n'], ['e', 't'], ['p', 'u']]):
    for idx, column in enumerate(columns):
        df.loc[df[column] == values[idx][0], column] = 0
        df.loc[df[column] == values[idx][1], column] = 1
        
        df[column] = pd.to_numeric(df[column], errors='coerce')

numeric(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     1000 non-null   object
 1   cap-shape                 1000 non-null   object
 2   cap-surface               1000 non-null   object
 3   cap-color                 1000 non-null   object
 4   bruises                   1000 non-null   int64 
 5   odor                      1000 non-null   object
 6   gill-attachment           1000 non-null   object
 7   gill-spacing              1000 non-null   object
 8   gill-size                 1000 non-null   int64 
 9   gill-color                1000 non-null   object
 10  stalk-shape               1000 non-null   int64 
 11  stalk-root                1000 non-null   object
 12  stalk-surface-above-ring  1000 non-null   object
 13  stalk-surface-below-ring  1000 non-null   object
 14  stalk-color-above-ring   

In [63]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

columns = ['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']
labels = ['ai_cap-shape', 'ai_cap-surface', 'ai_cap-color', 'ai_bruises', 'ai_odor', 'ai_gill-attachment', 'ai_gill-spacing', 'ai_gill-size', 'ai_gill-color', 'ai_stalk-shape', 'ai_stalk-root', 'ai_stalk-surface-above-ring', 'ai_stalk-surface-below-ring', 'ai_stalk-color-above-ring', 'ai_stalk-color-below-ring', 'ai_veil-type', 'ai_veil-color', 'ai_ring-number', 'ai_ring-type', 'ai_spore-print-color', 'ai_population', 'ai_habitat']

transformers = []
for idx, column in enumerate(columns):
    transformers.append((labels[idx], OrdinalEncoder(), [column]))
# transformers = [
#     ('oe_department', OrdinalEncoder(), ['department']),
#     ('oe_salary', OrdinalEncoder(categories=[['low', 'medium', 'high']]), ['salary']),
# ]

ct = ColumnTransformer(
    transformers, remainder='passthrough'
)

X_oe = ct.fit_transform(df)

In [64]:
X = X_oe
y = df['class']

In [71]:
'''
    ### Funções referente ao classificador KNN ####
'''

def selecionarMelhoresK(ks, X_treino, X_val, y_treino, y_val):

    # lista de acuracias
    acuracias_val = []

    # para cada um dos k's
    for k in ks:

        # crirar o classificador
        knn = KNeighborsClassifier(n_neighbors=k)

        # treinar o classificador
        knn.fit(X_treino, y_treino)

        # predizer
        pred = knn.predict(X_val)

        # calcular a acuracia
        acuracias_val.append(accuracy_score(y_val, pred))
    
    # melhor acuracia
    melhor_val = max(acuracias_val)

    # melhor k
    melhor_k = ks[np.argmax(acuracias_val)]        
    
    # criar o classificador com o melhor k
    knn = KNeighborsClassifier(n_neighbors=melhor_k)

    # treinar o classificador
    knn.fit(np.vstack((X_treino, X_val)), [*y_treino, *y_val])
    
    # retorna o classificador, o melhor k e a melhor acuracia
    return knn, melhor_k, melhor_val

def validacaoCruzadaKnn(kVias = 10):

    # acuracias
    acuracias = []
    print('a')
    # usar o protocolo de validação cruzada estratificada
    skf = StratifiedKFold(n_splits=kVias, shuffle=True, random_state=1)

    for a in skf.split(X, y):
        print(a)
        break
    for idx_treino, idx_teste in skf.split(X, y):
        print('b')

        # extrair as instâncias de treinamento de acordo com os índices fornecidos pelo skf.split
        X_treino = X[idx_treino]
        y_treino = y[idx_treino]
        
        # extrair as instâncias de teste de acordo com os índices fornecidos pelo skf.split
        X_teste = X[idx_teste]
        y_teste = y[idx_teste]

        # separar as instâncias de treinamento entre treinamento e validação para a otimização do hiperparâmetro k
        X_treino, X_val, y_treino, y_val = train_test_split(X_treino, y_treino, test_size=0.2, stratify=y_treino, shuffle=True, random_state=1)

        # escolher o k com o melhor resultado no conjunto de validação e treinar o KNN com o melhor k.
        knn = selecionarMelhoresK(range(1,30,2), X_treino, X_val, y_treino, y_val)[0]
        
        # calcular a acurácia no conjunto de testes desta iteração e salvar na lista.
        acuracias.append(accuracy_score(y_teste, knn.predict(X_teste)))
    
    return acuracias


'''
    ### Funções auxiliares ####
'''

def showResult(acc, legend):
    print('Resultado ', legend, ':\n    >> Acc mínima: ', round(min(acc), 3), '%\n    >> Acc máxima: ', round(max(acc), 3), '%')
    print('    >> Média: ', round(np.mean(acc), 3), '\n    >> Desvio padrão: ', round(np.std(acc), 3), '\n')

In [72]:
acc = validacaoCruzadaKnn()

showResult(acc, 'KNN')

a


ValueError: Supported target types are: ('binary', 'multiclass'). Got 'unknown' instead.