### COLETA DE DADOS

In [3]:
# CARREGANDO DADOS
from ucimlrepo import fetch_ucirepo 

# importando dataset
dataset = fetch_ucirepo(id=848) # 61068 registros e 21 atributos
  
# coletando as informações
data_frame = dataset.data.original

### PRÉ-PROCESSAMENTO DE DADOS

In [4]:
# TRATANDO DADOS
import pandas
from sklearn.preprocessing import LabelEncoder

print(f"PRÉ TRATAMENTO: {len(data_frame)} registros e {data_frame.shape} colunas")

# removendo colunas com muitos nulos
tolerancia = len(data_frame) * 0.7
data_frame = data_frame.dropna(axis=1, thresh=tolerancia)


# removendo duplicados
print("VALORES DUPLICADOS: ", data_frame.duplicated().sum())
data_frame = data_frame.drop_duplicates()

# convertendo colunas categóricas em valores inteiros
conversores = {}
for coluna in data_frame.columns:
    if (data_frame[coluna].dtype == type(object)):
        conversor = LabelEncoder()
        data_frame[coluna] = conversor.fit_transform(data_frame[coluna])
        conversores[coluna] = conversor

print(f"PÓS TRATAMENTO: {len(data_frame)} registros e {data_frame.shape} colunas")

PRÉ TRATAMENTO: 61069 registros e (61069, 21) colunas
VALORES DUPLICADOS:  146


PÓS TRATAMENTO: 60923 registros e (60923, 15) colunas


In [5]:
print(data_frame.columns)

Index(['class', 'cap-diameter', 'cap-shape', 'cap-surface', 'cap-color',
       'does-bruise-or-bleed', 'gill-attachment', 'gill-color', 'stem-height',
       'stem-width', 'stem-color', 'has-ring', 'ring-type', 'habitat',
       'season'],
      dtype='object')


### DIVISÃO DE DADOS

In [6]:
# DIVIDINDO O DATASET TRATADO
import numpy
from sklearn.model_selection import train_test_split

atributos = data_frame.drop(["class"], axis=1)
respostas = data_frame[["class"]]

a_treino, a_teste, r_treino, r_teste = train_test_split(atributos, respostas, test_size=0.3, random_state=42)

# convertendo de (N, 1) para (N,)
r_treino = numpy.ravel(r_treino)
r_teste = numpy.ravel(r_teste)

In [7]:
print("TAMANHO DO DATASET TRATADO: ", len(data_frame))
print("CONJUNTO DE TREINO: ", a_treino.shape, " e ", r_treino.shape)
print("CONJUNTO DE TESTE: ", a_teste.shape, " e ", r_teste.shape)

TAMANHO DO DATASET TRATADO:  60923
CONJUNTO DE TREINO:  (42646, 14)  e  (42646,)
CONJUNTO DE TESTE:  (18277, 14)  e  (18277,)


### TREINAMENTO E AVALIAÇÃO DO MODELO

#### ÁRVORE DE DECISÃO

In [8]:
# CRIANDO PARÂMETROS PARA O MODELO
lista_parametros_ad = [
    {"id": "Var_AD_1", "criterion": "gini", "max_depth": 10, "min_samples_split": 10, "min_samples_leaf": 5},
    {"id": "Var_AD_2", "criterion": "gini", "max_depth": 10, "min_samples_split": 10, "min_samples_leaf": 10},
    {"id": "Var_AD_3", "criterion": "gini", "max_depth": 10, "min_samples_split": 20, "min_samples_leaf": 5},
    {"id": "Var_AD_4", "criterion": "gini", "max_depth": 10, "min_samples_split": 20, "min_samples_leaf": 10},
    {"id": "Var_AD_5", "criterion": "gini", "max_depth": 20, "min_samples_split": 10, "min_samples_leaf": 5},
    {"id": "Var_AD_6", "criterion": "gini", "max_depth": 20, "min_samples_split": 10, "min_samples_leaf": 10},
    {"id": "Var_AD_7", "criterion": "gini", "max_depth": 20, "min_samples_split": 20, "min_samples_leaf": 5},
    {"id": "Var_AD_8", "criterion": "gini", "max_depth": 20, "min_samples_split": 20, "min_samples_leaf": 10},
    {"id": "Var_AD_9", "criterion": "entropy", "max_depth": 10, "min_samples_split": 10, "min_samples_leaf": 5},
    {"id": "Var_AD_10", "criterion": "entropy", "max_depth": 10, "min_samples_split": 10, "min_samples_leaf": 10},
    {"id": "Var_AD_11", "criterion": "entropy", "max_depth": 10, "min_samples_split": 20, "min_samples_leaf": 5},
    {"id": "Var_AD_12", "criterion": "entropy", "max_depth": 10, "min_samples_split": 20, "min_samples_leaf": 10},
    {"id": "Var_AD_13", "criterion": "entropy", "max_depth": 20, "min_samples_split": 10, "min_samples_leaf": 5},
    {"id": "Var_AD_14", "criterion": "entropy", "max_depth": 20, "min_samples_split": 10, "min_samples_leaf": 10},
    {"id": "Var_AD_15", "criterion": "entropy", "max_depth": 20, "min_samples_split": 20, "min_samples_leaf": 5},
    {"id": "Var_AD_16", "criterion": "entropy", "max_depth": 20, "min_samples_split": 20, "min_samples_leaf": 10},
]

In [9]:
# APLICANDO MODELO
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score

# criando a lista de resultados finais
lista_resultados_ad = []

for parametros in lista_parametros_ad:
    # criando classificador
    classificador = DecisionTreeClassifier(
        criterion=parametros["criterion"],
        max_depth=parametros["max_depth"],
        min_samples_split=parametros["min_samples_split"],
        min_samples_leaf=parametros["min_samples_leaf"],
    )

    # treinando o modelo
    classificador.fit(a_treino, r_treino)

    # prevendo respostas
    r_previsao_teste = classificador.predict(a_teste)

    # calculando métricas
    acuracia = accuracy_score(r_teste, r_previsao_teste)
    f1 = f1_score(r_teste, r_previsao_teste, average="weighted")

    # salvando resultados
    lista_resultados_ad.append([parametros["id"], acuracia, f1])


In [10]:
# EXIBINDO RESULTADOS
tabela_resultados_ad = pandas.DataFrame(lista_resultados_ad, columns=["Identificador", "Acurácia", "F1-score"])
print(tabela_resultados_ad)

   Identificador  Acurácia  F1-score
0       Var_AD_1  0.918313  0.918539
1       Var_AD_2  0.917656  0.917886
2       Var_AD_3  0.918203  0.918431
3       Var_AD_4  0.917765  0.917995
4       Var_AD_5  0.991191  0.991191
5       Var_AD_6  0.988565  0.988568
6       Var_AD_7  0.990152  0.990152
7       Var_AD_8  0.988401  0.988403
8       Var_AD_9  0.871368  0.871697
9      Var_AD_10  0.870548  0.870878
10     Var_AD_11  0.871368  0.871697
11     Var_AD_12  0.870548  0.870878
12     Var_AD_13  0.994693  0.994693
13     Var_AD_14  0.991519  0.991520
14     Var_AD_15  0.993817  0.993818
15     Var_AD_16  0.991793  0.991793


#### RANDOM FOREST

In [11]:
# CRIANDO PARÂMETROS PARA OS MODELOS
parametros_rf = [
    {"id": "Var_RF_1", "criterion": "gini", "max_depth": 10, "min_samples_split": 10, "min_samples_leaf": 5},
    {"id": "Var_RF_2", "criterion": "gini", "max_depth": 10, "min_samples_split": 10, "min_samples_leaf": 10},
    {"id": "Var_RF_3", "criterion": "gini", "max_depth": 10, "min_samples_split": 20, "min_samples_leaf": 5},
    {"id": "Var_RF_4", "criterion": "gini", "max_depth": 10, "min_samples_split": 20, "min_samples_leaf": 10},
    {"id": "Var_RF_5", "criterion": "gini", "max_depth": 20, "min_samples_split": 10, "min_samples_leaf": 5},
    {"id": "Var_RF_6", "criterion": "gini", "max_depth": 20, "min_samples_split": 10, "min_samples_leaf": 10},
    {"id": "Var_RF_7", "criterion": "gini", "max_depth": 20, "min_samples_split": 20, "min_samples_leaf": 5},
    {"id": "Var_RF_8", "criterion": "gini", "max_depth": 20, "min_samples_split": 20, "min_samples_leaf": 10},
    {"id": "Var_RF_9", "criterion": "entropy", "max_depth": 10, "min_samples_split": 10, "min_samples_leaf": 5},
    {"id": "Var_RF_10", "criterion": "entropy", "max_depth": 10, "min_samples_split": 10, "min_samples_leaf": 10},
    {"id": "Var_RF_11", "criterion": "entropy", "max_depth": 10, "min_samples_split": 20, "min_samples_leaf": 5},
    {"id": "Var_RF_12", "criterion": "entropy", "max_depth": 10, "min_samples_split": 20, "min_samples_leaf": 10},
    {"id": "Var_RF_13", "criterion": "entropy", "max_depth": 20, "min_samples_split": 10, "min_samples_leaf": 5},
    {"id": "Var_RF_14", "criterion": "entropy", "max_depth": 20, "min_samples_split": 10, "min_samples_leaf": 10},
    {"id": "Var_RF_15", "criterion": "entropy", "max_depth": 20, "min_samples_split": 20, "min_samples_leaf": 5},
    {"id": "Var_RF_16", "criterion": "entropy", "max_depth": 20, "min_samples_split": 20, "min_samples_leaf": 10}
]


In [12]:
# APLICANDO MODELO
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# criando a lista de resultados finais
lista_resultados_rf = []

for parametros in parametros_rf:
    # criando classificador
    classificador = RandomForestClassifier(
        criterion=parametros["criterion"], 
        max_depth=parametros["max_depth"], 
        min_samples_split=parametros["min_samples_split"], 
        min_samples_leaf=parametros["min_samples_leaf"], 
        n_estimators=100, 
        random_state=42
    )

    # treinando o modelo
    classificador.fit(a_treino, r_treino)

    # prevendo respostas
    r_previsao = classificador.predict(a_teste)

    # calculando métricas
    acuracia = accuracy_score(r_teste, r_previsao)
    f1 = f1_score(r_teste, r_previsao, average="weighted")

    # salvando resultados
    lista_resultados_rf.append([parametros["id"], acuracia, f1])

In [13]:
# EXIBINDO RESULTADOS
tabela_resultados_rf = pandas.DataFrame(lista_resultados_rf, columns=["Identificador", "Acurácia", "F1-score"])
print(tabela_resultados_rf)

   Identificador  Acurácia  F1-score
0       Var_RF_1  0.990425  0.990424
1       Var_RF_2  0.990918  0.990916
2       Var_RF_3  0.990699  0.990699
3       Var_RF_4  0.990918  0.990916
4       Var_RF_5  0.998906  0.998906
5       Var_RF_6  0.998523  0.998523
6       Var_RF_7  0.998960  0.998960
7       Var_RF_8  0.998523  0.998523
8       Var_RF_9  0.988401  0.988400
9      Var_RF_10  0.985282  0.985285
10     Var_RF_11  0.990042  0.990044
11     Var_RF_12  0.985282  0.985285
12     Var_RF_13  0.998960  0.998960
13     Var_RF_14  0.998304  0.998304
14     Var_RF_15  0.998851  0.998851
15     Var_RF_16  0.998304  0.998304


#### Rede Neural Multilayer Perceptron

In [14]:
# CRIANDO PARÂMETROS PARA OS MODELOS

lista_parametros_mlp = [
    {"id": "Var_MLP_1", "hidden_layer_sizes": (50,), "activation": "relu"},
    {"id": "Var_MLP_2", "hidden_layer_sizes": (50,), "activation": "tanh"},
    {"id": "Var_MLP_3", "hidden_layer_sizes": (50,), "activation": "logistic"},
    {"id": "Var_MLP_4", "hidden_layer_sizes": (100,), "activation": "relu"},
    {"id": "Var_MLP_5", "hidden_layer_sizes": (100,), "activation": "tanh"},
    {"id": "Var_MLP_6", "hidden_layer_sizes": (100,), "activation": "logistic"},
    {"id": "Var_MLP_7", "hidden_layer_sizes": (150,), "activation": "relu"},
    {"id": "Var_MLP_8", "hidden_layer_sizes": (150,), "activation": "tanh"},
    {"id": "Var_MLP_9", "hidden_layer_sizes": (150,), "activation": "logistic"},
    {"id": "Var_MLP_10", "hidden_layer_sizes": (50, 50), "activation": "relu"},
    {"id": "Var_MLP_11", "hidden_layer_sizes": (50, 50), "activation": "tanh"},
    {"id": "Var_MLP_12", "hidden_layer_sizes": (50, 50), "activation": "logistic"},
    {"id": "Var_MLP_13", "hidden_layer_sizes": (100, 100), "activation": "relu"},
    {"id": "Var_MLP_14", "hidden_layer_sizes": (100, 100), "activation": "tanh"},
    {"id": "Var_MLP_15", "hidden_layer_sizes": (100, 100), "activation": "logistic"},
    {"id": "Var_MLP_16", "hidden_layer_sizes": (150, 150), "activation": "relu"},
    {"id": "Var_MLP_17", "hidden_layer_sizes": (150, 150), "activation": "tanh"},
    {"id": "Var_MLP_18", "hidden_layer_sizes": (150, 150), "activation": "logistic"}
]

In [15]:
# APLICANDO MODELO
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score

# criando a lista de resultados finais
lista_resultados_mlp = []

for parametros in lista_parametros_mlp:
    # criando classificador
    classificador = MLPClassifier(
        hidden_layer_sizes=parametros["hidden_layer_sizes"],
        activation=parametros["activation"],
        max_iter=500,
    )

    # treinando o modelo
    classificador.fit(a_treino, r_treino)

    # prevendo respostas
    r_previsao = classificador.predict(a_teste)

    # calculando métricas
    acuracia = accuracy_score(r_teste, r_previsao)
    f1 = f1_score(r_teste, r_previsao, average="weighted")

    # salvando resultados
    lista_resultados_mlp.append([parametros["id"], acuracia, f1])

In [18]:
# EXIBINDO RESULTADOS
tabela_resultados_mlp = pandas.DataFrame(lista_resultados_mlp, columns=["Identificador", "Acurácia", "F1-score"])
print(tabela_resultados_mlp)

   Identificador  Acurácia  F1-score
0      Var_MLP_1  0.990808  0.990806
1      Var_MLP_2  0.996881  0.996882
2      Var_MLP_3  0.997757  0.997757
3      Var_MLP_4  0.997538  0.997538
4      Var_MLP_5  0.998796  0.998796
5      Var_MLP_6  0.998851  0.998851
6      Var_MLP_7  0.998030  0.998031
7      Var_MLP_8  0.998632  0.998632
8      Var_MLP_9  0.999070  0.999070
9     Var_MLP_10  0.997811  0.997812
10    Var_MLP_11  0.997647  0.997647
11    Var_MLP_12  0.994091  0.994093
12    Var_MLP_13  0.998960  0.998960
13    Var_MLP_14  0.997428  0.997428
14    Var_MLP_15  0.996334  0.996333
15    Var_MLP_16  0.999070  0.999070
16    Var_MLP_17  0.999289  0.999289
17    Var_MLP_18  0.997374  0.997374
