**Table of contents**<a id='toc0_'></a>    
- [Imports](#toc1_)    
- [Read Data](#toc2_)    
- [Data Preprocessing](#toc3_)
  - [Drop rows with missing values](#toc3_2_)    
  - [Removing Categorical Columns](#toc4_)    
  - [Split Train and Test Data](#toc5_)    
  - [Data Cleaning](#toc6_)    
    - [Impute missing numeric data](#toc6_1_)    
  - [Data Normalization](#toc7_)    
- [Model training](#toc8_)    
  - [KNN](#toc8_1_)  
  - [LVQ](#toc8_2_)
  - [Decision Tree](#toc8_3_)  
  - [MLP](#toc8_4_)
  - [SVM](#toc8_5_)  
  - [Stacking](#toc8_6_)  
  - [Random Forest](#toc8_7_)  

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

----

# <a id='toc1_'></a>[Imports](#toc0_)

In [26]:
pip install -U git+https://github.com/rickvanveen/sklvq.git
pip install ucimlrepo

Defaulting to user installation because normal site-packages is not writeable
Collecting git+https://github.com/rickvanveen/sklvq.git
  Cloning https://github.com/rickvanveen/sklvq.git to c:\users\pichau\appdata\local\temp\pip-req-build-8fam6gx1
Note: you may need to restart the kernel to use updated packages.


  Running command git clone --filter=blob:none --quiet https://github.com/rickvanveen/sklvq.git 'C:\Users\pichau\AppData\Local\Temp\pip-req-build-8fam6gx1'
  fatal: unable to access 'https://github.com/rickvanveen/sklvq.git/': Could not resolve host: github.com
  error: subprocess-exited-with-error
  
  × git clone --filter=blob:none --quiet https://github.com/rickvanveen/sklvq.git 'C:\Users\pichau\AppData\Local\Temp\pip-req-build-8fam6gx1' did not run successfully.
  │ exit code: 128
  ╰─> See above for output.
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
error: subprocess-exited-with-error

× git clone --filter=blob:none --quiet https://github.com/rickvanveen/sklvq.git 'C:\Users\pichau\AppData\Local\Temp\pip-req-build-8fam6gx1' did not run successfully.
│ exit code: 128
╰─> See above for output.

note: This error originates from a subprocess, and is likely not a problem with pip.


In [25]:
import numpy as np
from matplotlib import pyplot as plt
from math import sqrt
from tqdm import tqdm

import pandas as pd
pd.options.display.max_colwidth = 1000
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 200

import sklearn
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_validate,train_test_split, GridSearchCV,RandomizedSearchCV
from sklvq import GLVQ
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder,MinMaxScaler,StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, make_scorer
import random
from random import seed,randrange
import requests
import io
import pickle

# <a id='toc2_'></a>[Read Data](#toc0_)

In [34]:
# Download dataset id 45 from the UCI repository and expose its features/targets.
# NOTE(review): the preprocessing cells below operate on a DataFrame named `df`
# with columns such as "track_genre"; no visible cell creates `df`, and it is not
# derived from `X`/`y` here — confirm where `df` is loaded.
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
heart_disease = fetch_ucirepo(id=45) 
  
# data (as pandas dataframes) 
X = heart_disease.data.features 
y = heart_disease.data.targets 
  
# metadata 
print(heart_disease.metadata) 
  
# variable information 
print(heart_disease.variables)

## <a id='toc2_1_'></a>[Visualize Data](#toc0_)

In [None]:
# Display the feature table fetched in the previous cell.
X

In [None]:
# Display the target table fetched in the "Read Data" cell.
y

# <a id='toc3_'></a>[Data Preprocessing](#toc0_)

## <a id='toc3_2_'></a>[Drop rows with missing values](#toc0_)

In [36]:
# Remove, in place, every row of `df` that contains at least one missing value.
# NOTE(review): `df` is not defined in any earlier visible cell — confirm where it is loaded.
df.dropna(inplace=True, axis=0, how='any')

## <a id='toc4_'></a>[Removing Categorical Columns](#toc0_)

In [37]:
# Columns that are removed before modelling; the remaining schema is printed afterwards.
categorical_columns = [
    "Unnamed: 0",
    "track_id",
    "track_name",
    "album_name",
    "artists",
]
df = df.drop(columns=categorical_columns)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 113999 entries, 0 to 113999
Data columns (total 16 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   popularity        113999 non-null  int64  
 1   duration_ms       113999 non-null  int64  
 2   explicit          113999 non-null  bool   
 3   danceability      113999 non-null  float64
 4   energy            113999 non-null  float64
 5   key               113999 non-null  int64  
 6   loudness          113999 non-null  float64
 7   mode              113999 non-null  int64  
 8   speechiness       113999 non-null  float64
 9   acousticness      113999 non-null  float64
 10  instrumentalness  113999 non-null  float64
 11  liveness          113999 non-null  float64
 12  valence           113999 non-null  float64
 13  tempo             113999 non-null  float64
 14  time_signature    113999 non-null  int64  
 15  track_genre       113999 non-null  object 
dtypes: bool(1), float64(9), i

## <a id='toc5_'></a>[Split Train and Test Data](#toc0_)

In [38]:
def train_validation_test_split(df, target_column, validation_size=0.1, test_size=0.1, random_state=42):
    """Split ``df`` into stratified train, validation and test DataFrames.

    The test split is carved out first; the validation split is then taken from
    the remainder, with its fraction rescaled so that ``validation_size`` is
    expressed relative to the full ``df``.

    Args:
        df: DataFrame to split.
        target_column: Name of the column used for stratification.
        validation_size: Fraction of ``df`` assigned to validation, in (0, 1).
        test_size: Fraction of ``df`` assigned to test, in (0, 1).
        random_state: Seed forwarded to both ``train_test_split`` calls.

    Returns:
        Tuple ``(df_train, df_validation, df_test)`` of independent DataFrames.

    Raises:
        ValueError: If a size is outside (0, 1) or the two sizes leave no
            rows for training.
    """
    # Fail early with a clear message instead of relying on the error raised
    # from the second split when the rescaled fraction leaves (0, 1).
    if not (0 < validation_size < 1) or not (0 < test_size < 1):
        raise ValueError("validation_size and test_size must both be fractions in (0, 1)")
    if validation_size + test_size >= 1:
        raise ValueError("validation_size + test_size must be smaller than 1")

    df_train, df_test = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df[target_column],
    )

    # Rescale: the remaining frame holds only (1 - test_size) of the original rows.
    df_train, df_validation = train_test_split(
        df_train,
        test_size=validation_size / (1 - test_size),
        random_state=random_state,
        stratify=df_train[target_column],
    )

    # Return explicit copies so later column assignments on the splits do not
    # act on (or warn about) slices of the original frame.
    return df_train.copy(), df_validation.copy(), df_test.copy()

In [39]:
df_train, df_validation, df_test = train_validation_test_split(
    df, "track_genre", validation_size=0.2, test_size=0.2
)
df.info()

# Check that the requested proportions were actually obtained (train, test, validation).
total_rows = float(len(df))
print(
    '\n',
    len(df_train.values) / total_rows,
    len(df_test.values) / total_rows,
    len(df_validation.values) / total_rows,
)

<class 'pandas.core.frame.DataFrame'>
Index: 113999 entries, 0 to 113999
Data columns (total 16 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   popularity        113999 non-null  int64  
 1   duration_ms       113999 non-null  int64  
 2   explicit          113999 non-null  bool   
 3   danceability      113999 non-null  float64
 4   energy            113999 non-null  float64
 5   key               113999 non-null  int64  
 6   loudness          113999 non-null  float64
 7   mode              113999 non-null  int64  
 8   speechiness       113999 non-null  float64
 9   acousticness      113999 non-null  float64
 10  instrumentalness  113999 non-null  float64
 11  liveness          113999 non-null  float64
 12  valence           113999 non-null  float64
 13  tempo             113999 non-null  float64
 14  time_signature    113999 non-null  int64  
 15  track_genre       113999 non-null  object 
dtypes: bool(1), float64(9), i

## <a id='toc6_'></a>[Data Cleaning](#toc0_)

### <a id='toc6_1_'></a>[Impute missing numeric data](#toc0_)

In [40]:
numeric_columns = df_train.select_dtypes(include=['number']).columns

# Median imputer learned from the training split only, then applied to every split.
numeric_imputer = SimpleImputer(strategy='median').fit(df_train[numeric_columns])

for split_frame in (df_train, df_validation, df_test):
    split_frame[numeric_columns] = numeric_imputer.transform(split_frame[numeric_columns])

## <a id='toc7_'></a>[Data Normalization](#toc0_)

In [41]:
# Min-max scaler learned from the training split only, then applied to every split.
normalizer = MinMaxScaler().fit(df_train[numeric_columns])

for split_frame in (df_train, df_validation, df_test):
    split_frame[numeric_columns] = normalizer.transform(split_frame[numeric_columns])

In [42]:
# Separate each split into a feature matrix (numeric columns) and a label vector.
df_cara_train, df_clas_train = df_train[numeric_columns].values, df_train['track_genre'].values

df_cara_validation, df_clas_validation = (
    df_validation[numeric_columns].values,
    df_validation['track_genre'].values,
)

df_cara_test, df_clas_test = df_test[numeric_columns].values, df_test['track_genre'].values

# <a id='toc8_'></a>[Model training](#toc0_)

## <a id='toc8_1_'></a>[KNN](#toc0_)

In [43]:
# Aliases for the feature matrices produced by the normalization step.
df_cara_train_scaled, df_cara_valid_scaled, df_cara_test_scaled = (
    df_cara_train,
    df_cara_validation,
    df_cara_test,
)

In [44]:
# The estimator is passed unfitted: GridSearchCV fits its own clones for every
# candidate, so fitting it beforehand was redundant work.
knn = KNeighborsClassifier()
param_grid = {
    'n_neighbors': np.arange(1,81,2),
    'metric': ['euclidean', 'manhattan']
}
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='f1_weighted')
grid_search.fit(df_cara_train,df_clas_train)

# Persist the fitted search so it can be reloaded later without retraining.
with open('KNN_model_searcher.pkl', 'wb') as f:
    pickle.dump(grid_search,f)

print(grid_search.cv_results_['mean_test_score'],'\n\n')
print(f'Melhor parametro: {grid_search.best_params_}')
print(f'Melhor resultado: {grid_search.best_score_}','\n\n')


[0.1748636  0.15861048 0.16880577 0.17154974 0.17500064 0.17567899
 0.17537008 0.17594873 0.17530662 0.17495474 0.17563676 0.17429404
 0.17423267 0.17333656 0.17353709 0.1732115  0.17287979 0.17233861
 0.17125046 0.17021634 0.1702484  0.16896469 0.16769784 0.16724919
 0.16641829 0.16637438 0.16600326 0.16487446 0.16456805 0.16393318
 0.16359721 0.16302103 0.16188695 0.16125885 0.16108099 0.16073382
 0.16078705 0.15982873 0.1588715  0.15828664 0.18945929 0.17724955
 0.18825781 0.19278721 0.19557491 0.19736664 0.19715771 0.19794149
 0.19734906 0.1991353  0.19896836 0.19766001 0.19836991 0.19772185
 0.19737806 0.19758553 0.19734314 0.19711047 0.19644256 0.19637877
 0.19560007 0.19604323 0.19556347 0.19502116 0.1942498  0.19444474
 0.19418936 0.19393728 0.19386993 0.19266551 0.19213404 0.19144797
 0.19135651 0.19105361 0.19006619 0.18963809 0.19024574 0.19016178
 0.18950826 0.1891294 ] 


Melhor parametro: {'metric': 'manhattan', 'n_neighbors': 19}
Melhor resultado: 0.1991353044549033 


D

In [None]:
# Score on the numeric feature matrix the search was fitted on; `df_train.iloc[:, :-1]`
# would also pass the boolean `explicit` column and cause a feature-count mismatch.
print(grid_search.score(df_cara_train, df_clas_train))

In [None]:
# Score on the numeric feature matrix layout used for fitting; `df_validation.iloc[:, :-1]`
# would also pass the boolean `explicit` column and cause a feature-count mismatch.
print(grid_search.score(df_cara_validation, df_clas_validation))

In [None]:
# Score on the numeric feature matrix layout used for fitting; `df_test.iloc[:, :-1]`
# would also pass the boolean `explicit` column and cause a feature-count mismatch.
print(grid_search.score(df_cara_test, df_test['track_genre'].values))

In [None]:
# Hyper-parameter combination selected by the search.
grid_search.best_params_

In [None]:
# Full cross-validation results recorded by the search.
grid_search.cv_results_

In [None]:
# Per-fold test scores: one row per CV split, one column per candidate.
fold_score_keys = [f"split{i}_test_score" for i in range(grid_search.cv)]
np.array([grid_search.cv_results_[key] for key in fold_score_keys])


In [None]:
# Per-fold test scores of the selected candidate. The column is taken from
# best_index_ instead of a hard-coded position, so it always refers to the winner.
np.array([grid_search.cv_results_[f"split{i}_test_score"] for i in range(grid_search.cv)])[:, grid_search.best_index_]


In [None]:
# Evaluate the best KNN model on the test split.
# Predictions use `df_cara_test` (numeric columns only), matching the matrix used for
# fitting; dropping just "track_genre" from `df_test` would leave the boolean
# `explicit` column in and cause a feature-count mismatch.
df_clas_test = df_test["track_genre"]
df_clas_pred = grid_search.best_estimator_.predict(df_cara_test)
df_clas_pred_proba = grid_search.best_estimator_.predict_proba(df_cara_test)

evaluation = {
    "accuracy": accuracy_score(df_clas_test, df_clas_pred),
    "precision": precision_score(
        df_clas_test, df_clas_pred, average="weighted"
    ),
    "recall": recall_score(df_clas_test, df_clas_pred, average="weighted"),
    "f1": f1_score(df_clas_test, df_clas_pred, average="weighted")
}
print(evaluation)

## <a id='toc8_2_'></a>[LVQ](#toc0_)

In [45]:
# Aliases for the feature matrices produced by the normalization step.
df_cara_train_scaled, df_cara_valid_scaled, df_cara_test_scaled = (
    df_cara_train,
    df_cara_validation,
    df_cara_test,
)

In [46]:
# LVQ classifier to be tuned.
lvq = GLVQ()

# Hyper-parameter grid: prototypes per class, distance, and one solver
# configuration per candidate step size.
step_sizes = [0.1, 0.5]
param_grid = {
    "prototype_n_per_class": [1,3],
    "distance_type": ["euclidean"],
    "solver_params": [{"max_runs": 5, "step_size": step} for step in step_sizes],
}

# Custom scorers evaluated for every candidate.
scorers = {
    "accuracy": make_scorer(accuracy_score),
    "precision_macro": make_scorer(precision_score, average='macro', zero_division = 0),
    "recall_macro": make_scorer(recall_score, average='macro'),
    "f1_macro": make_scorer(f1_score, average='macro'),
}

# Multi-metric search; the final model is refitted using the accuracy ranking.
grid_search = GridSearchCV(
    estimator=lvq,
    param_grid=param_grid,
    cv=5,
    scoring=scorers,
    refit="accuracy",
)

# Fit the search on the training feature matrix.
grid_search.fit(df_cara_train_scaled, df_clas_train)

with open('LVQ_model_searcher.pkl', 'wb') as model_file:
    pickle.dump(grid_search, model_file)

print(grid_search.best_params_)

In [None]:
# Score on the numeric feature matrix the search was fitted on; `df_train.iloc[:, :-1]`
# would also pass the boolean `explicit` column and cause a feature-count mismatch.
print(grid_search.score(df_cara_train, df_clas_train))

In [None]:
# Score on the numeric feature matrix layout used for fitting; `df_validation.iloc[:, :-1]`
# would also pass the boolean `explicit` column and cause a feature-count mismatch.
print(grid_search.score(df_cara_validation, df_clas_validation))

In [47]:
# Score on the numeric feature matrix layout used for fitting; `df_test.iloc[:, :-1]`
# would also pass the boolean `explicit` column and cause a feature-count mismatch.
print(grid_search.score(df_cara_test, df_test['track_genre'].values))

Melhores parâmetros:  {'distance_type': 'euclidean', 'prototype_n_per_class': 3, 'solver_params': {'max_runs': 5, 'step_size': 0.1}}
accuracy per fold:  [0.12982456 0.12938596 0.13377193 0.13333333 0.13881579]
recall_macro per fold:  [0.12982456 0.12938596 0.13377193 0.13333333 0.13881579]
f1_macro per fold:  [0.1140344  0.11959892 0.11408463 0.11836989 0.12084056]
precision_macro per fold:  [0.12466036 0.13964932 0.14156033 0.1312936  0.13110597]
13.302631578947368
13.302631578947368
11.738567900442213
13.365391757392203
Dados de Teste
{'accuracy': make_scorer(accuracy_score), 'recall_macro': make_scorer(recall_score, average=macro), 'f1_macro': make_scorer(f1_score, average=macro), 'precision_macro': make_scorer(precision_score, average=macro, zero_division=0)}
{'accuracy': make_scorer(accuracy_score), 'recall_macro': make_scorer(recall_score, average=macro), 'f1_macro': make_scorer(f1_score, average=macro), 'precision_macro': make_scorer(precision_score, average=macro, zero_division

In [None]:
# Evaluate the best LVQ model on the test split.
# Predictions use `df_cara_test` (numeric columns only), matching the matrix used for
# fitting; dropping just "track_genre" from `df_test` would leave the boolean
# `explicit` column in and cause a feature-count mismatch.
df_clas_test = df_test["track_genre"]
df_clas_pred = grid_search.best_estimator_.predict(df_cara_test)
df_clas_pred_proba = grid_search.best_estimator_.predict_proba(df_cara_test)

evaluation = {
    "accuracy": accuracy_score(df_clas_test, df_clas_pred),
    "precision": precision_score(
        df_clas_test, df_clas_pred, average="weighted"
    ),
    "recall": recall_score(df_clas_test, df_clas_pred, average="weighted"),
    "f1": f1_score(df_clas_test, df_clas_pred, average="weighted")
}
print(evaluation)

## <a id='toc8_3_'></a>[Decision Tree](#toc0_)

In [None]:
random_state=42
decision_tree = DecisionTreeClassifier(random_state=random_state)

# Distributions sampled by the randomized search.
parameters = {
    "criterion": ["gini", "entropy", "log_loss"],
    "splitter": ["best", "random"],
    "max_depth": np.arange(5, 10000, 50),
    "min_samples_leaf": np.arange(1, 10000, 50),
    "max_features": ["sqrt", "log2"],
    "max_leaf_nodes": np.arange(2, 10000, 50),
}

grid_search = RandomizedSearchCV(
    cv=5,
    random_state=random_state,
    n_iter=500,
    n_jobs=-1,
    estimator=decision_tree,
    param_distributions=parameters,
)

# `best_params_` only exists once the search has been fitted, so the fit runs
# in this cell before the parameters are printed (features = every column but
# the last, label = last column).
grid_search.fit(df_train.iloc[:, :-1], df_train.iloc[:, -1])

print(grid_search.best_params_)

In [None]:
# Score of the fitted search on the training split (label is the last column).
train_features, train_labels = df_train.iloc[:, :-1], df_train.iloc[:, -1]
print(grid_search.score(train_features, train_labels))

In [None]:
# Score of the fitted search on the validation split (label is the last column).
validation_features, validation_labels = df_validation.iloc[:, :-1], df_validation.iloc[:, -1]
print(grid_search.score(validation_features, validation_labels))

In [None]:
# Score of the fitted search on the test split (label is the last column).
test_features, test_labels = df_test.iloc[:, :-1], df_test.iloc[:, -1]
print(grid_search.score(test_features, test_labels))

In [None]:
from sklearn.metrics import (
    f1_score,
    recall_score,
    precision_score,
    accuracy_score,
    confusion_matrix,
    classification_report,
    ConfusionMatrixDisplay,
)


def display_metrics(model, x_test, y_test):
    """Print a classification report and accuracy, then plot the confusion matrix.

    Args:
        model: Fitted classifier exposing ``predict`` and ``classes_``.
        x_test: Feature data passed to ``model.predict``.
        y_test: True labels matching ``x_test``.
    """
    predicted = model.predict(x_test)
    report = classification_report(y_test, predicted)
    accuracy = accuracy_score(y_test, predicted)
    print(report)
    print(f"accuracy: {accuracy}")

    # Fix the label order to model.classes_ so matrix rows/columns always line
    # up with the tick labels, even if some class is absent from this split.
    cm = confusion_matrix(y_test, predicted, labels=model.classes_)

    # Plotted with scikit-learn/matplotlib: the previous seaborn call referenced
    # `sns`, which is never imported in this notebook.
    fig, ax = plt.subplots(figsize=(12, 10))
    ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_).plot(
        ax=ax, xticks_rotation="vertical"
    )
    ax.set_title("Confusion matrix")

In [None]:
# Report the metrics of the best estimator on the validation split
# (features = every column but the last, label = last column).
display_metrics(
    grid_search.best_estimator_, df_validation.iloc[:, :-1], df_validation.iloc[:, -1]
)

In [None]:
# Evaluate the best estimator on the test split with weighted-average metrics.
test_feature_frame = df_test.drop(columns=["track_genre"])
best_model = grid_search.best_estimator_

df_clas_test = df_test["track_genre"]
df_clas_pred = best_model.predict(test_feature_frame)
df_clas_pred_proba = best_model.predict_proba(test_feature_frame)

evaluation = {"accuracy": accuracy_score(df_clas_test, df_clas_pred)}
for metric_name, metric_fn in (
    ("precision", precision_score),
    ("recall", recall_score),
    ("f1", f1_score),
):
    evaluation[metric_name] = metric_fn(df_clas_test, df_clas_pred, average="weighted")
print(evaluation)

## <a id='toc8_4_'></a>[MLP](#toc0_)

In [48]:
# Aliases for the feature matrices produced by the normalization step.
df_cara_train_scaled, df_cara_valid_scaled, df_cara_test_scaled = (
    df_cara_train,
    df_cara_validation,
    df_cara_test,
)

In [None]:
# Seeded so weight initialisation, and therefore the CV scores, are reproducible;
# the tree-based models in this notebook already use random_state=42.
mlp = MLPClassifier(random_state=42)

# Single-candidate grid; the trailing comments list alternative values.
param_grid = {
    "hidden_layer_sizes": [(100,)],
    "activation": ["relu"],  # alternative: "tanh"
    "solver": ["adam"],  # alternative: 'sgd'
    "alpha": [0.0001],  # alternative: 0.01
    "learning_rate": ["adaptive"],  # alternative: "constant"
    "max_iter": [700],  # alternatives: 300, 500
}

grid_search = GridSearchCV(mlp, param_grid, cv=5, scoring='accuracy', verbose=3)
grid_search.fit(df_cara_train, df_clas_train)

In [50]:
# Persist the fitted search, then report the mean CV scores and the winner.
with open('MLP_model_searcher.pkl', 'wb') as model_file:
    pickle.dump(grid_search, model_file)

mean_cv_scores = grid_search.cv_results_['mean_test_score']
print(mean_cv_scores)
print(f'Melhor parametro: {grid_search.best_params_}')
print(f'Melhor resultado: {grid_search.best_score_}')

[0.3010278  0.30273837]
Melhor parametro: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'learning_rate': 'adaptive', 'max_iter': 700, 'solver': 'adam'}
Melhor resultado: 0.30273837281631594


In [51]:
# Score on the numeric feature matrix the search was fitted on; `df_train.iloc[:, :-1]`
# would also pass the boolean `explicit` column and cause a feature-count mismatch.
print(grid_search.score(df_cara_train, df_clas_train))

Dados de Teste
{'accuracy': make_scorer(accuracy_score), 'recall_macro': make_scorer(recall_score, average=macro), 'f1_macro': make_scorer(f1_score, average=macro), 'precision_macro': make_scorer(precision_score, average=macro, zero_division=0)}
{'accuracy': make_scorer(accuracy_score), 'recall_macro': make_scorer(recall_score, average=macro), 'f1_macro': make_scorer(f1_score, average=macro), 'precision_macro': make_scorer(precision_score, average=macro, zero_division=0)}
{'accuracy': make_scorer(accuracy_score), 'recall_macro': make_scorer(recall_score, average=macro), 'f1_macro': make_scorer(f1_score, average=macro), 'precision_macro': make_scorer(precision_score, average=macro, zero_division=0)}


In [None]:
# Score on the numeric feature matrix layout used for fitting; `df_validation.iloc[:, :-1]`
# would also pass the boolean `explicit` column and cause a feature-count mismatch.
print(grid_search.score(df_cara_validation, df_clas_validation))

In [None]:
# Score on the numeric feature matrix layout used for fitting; `df_test.iloc[:, :-1]`
# would also pass the boolean `explicit` column and cause a feature-count mismatch.
print(grid_search.score(df_cara_test, df_test['track_genre'].values))

In [None]:
# Hyper-parameter combination selected by the search.
grid_search.best_params_

In [None]:
# Full cross-validation results recorded by the search.
grid_search.cv_results_

In [None]:
# Per-fold test scores: one row per CV split, one column per candidate.
fold_score_keys = [f"split{i}_test_score" for i in range(grid_search.cv)]
np.array([grid_search.cv_results_[key] for key in fold_score_keys])

In [None]:
# Per-fold test scores of the selected candidate. The MLP grid has a single
# candidate, so a hard-coded column 8 is out of range; best_index_ is always valid.
np.array([grid_search.cv_results_[f"split{i}_test_score"] for i in range(grid_search.cv)])[:, grid_search.best_index_]

In [None]:
# Evaluate the best MLP model on the test split.
# Predictions use `df_cara_test` (numeric columns only), matching the matrix used for
# fitting; dropping just "track_genre" from `df_test` would leave the boolean
# `explicit` column in and cause a feature-count mismatch.
df_clas_test = df_test["track_genre"]
df_clas_pred = grid_search.best_estimator_.predict(df_cara_test)
df_clas_pred_proba = grid_search.best_estimator_.predict_proba(df_cara_test)

evaluation = {
    "accuracy": accuracy_score(df_clas_test, df_clas_pred),
    "precision": precision_score(
        df_clas_test, df_clas_pred, average="weighted"
    ),
    "recall": recall_score(df_clas_test, df_clas_pred, average="weighted"),
    "f1": f1_score(df_clas_test, df_clas_pred, average="weighted")
}
print(evaluation)

## <a id='toc8_5_'></a>[SVM](#toc0_)

In [52]:
# Left unfitted on purpose: the grid search below fits its own clones of this
# estimator, so training it here only repeated an expensive fit.
class_svm = SVC()
### BE CAREFUL WHEN RUNNING THE CELLS BELOW

In [53]:
# Candidate values for each SVC hyper-parameter.
lista_kernels = ['rbf']
lista_c = [100]
lista_gamma = [2]

# Grid of hyper-parameters and the values to be tested.
param_grid = dict(kernel=lista_kernels, C=lista_c, gamma=lista_gamma)

In [54]:
grid_search = GridSearchCV(
    estimator=class_svm,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(df_cara_train, df_clas_train)

# Persist the fitted search, then report the mean CV scores and the winner.
with open('SVM_model_searcher.pkl', 'wb') as model_file:
    pickle.dump(grid_search, model_file)

mean_cv_scores = grid_search.cv_results_['mean_test_score']
print(mean_cv_scores)
print(f'Melhor parametro: {grid_search.best_params_}')
print(f'Melhor resultado: {grid_search.best_score_}')
# Kept for reference (presumably from an earlier, wider search — confirm):
#   lista_kernels=['linear','rbf']
#   lista_c =[2,3,4,5,7,10,100]
#   lista_gamma = [2,3,4,5,7,10,100]
#   best parameters: {'C': 100, 'gamma': 2, 'kernel': 'rbf'}
#   best score: 0.26281478175137607

[0.2449159]
Melhor parametro: {'C': 100, 'gamma': 2, 'kernel': 'rbf'}
Melhor resultado: 0.24491589853230442


In [55]:
# Score on the numeric feature matrix the search was fitted on; `df_train.iloc[:, :-1]`
# would also pass the boolean `explicit` column and cause a feature-count mismatch.
print(grid_search.score(df_cara_train, df_clas_train))

Dados de Teste
{'accuracy': make_scorer(accuracy_score), 'recall_macro': make_scorer(recall_score, average=macro), 'f1_macro': make_scorer(f1_score, average=macro), 'precision_macro': make_scorer(precision_score, average=macro, zero_division=0)}
{'accuracy': make_scorer(accuracy_score), 'recall_macro': make_scorer(recall_score, average=macro), 'f1_macro': make_scorer(f1_score, average=macro), 'precision_macro': make_scorer(precision_score, average=macro, zero_division=0)}
{'accuracy': make_scorer(accuracy_score), 'recall_macro': make_scorer(recall_score, average=macro), 'f1_macro': make_scorer(f1_score, average=macro), 'precision_macro': make_scorer(precision_score, average=macro, zero_division=0)}


In [None]:
# Score on the numeric feature matrix layout used for fitting; `df_validation.iloc[:, :-1]`
# would also pass the boolean `explicit` column and cause a feature-count mismatch.
print(grid_search.score(df_cara_validation, df_clas_validation))

In [None]:
# Score on the numeric feature matrix layout used for fitting; `df_test.iloc[:, :-1]`
# would also pass the boolean `explicit` column and cause a feature-count mismatch.
print(grid_search.score(df_cara_test, df_test['track_genre'].values))

In [None]:
# Hyper-parameter combination selected by the search.
grid_search.best_params_

In [None]:
# Full cross-validation results recorded by the search.
grid_search.cv_results_

In [None]:
# Per-fold test scores: one row per CV split, one column per candidate.
fold_score_keys = [f"split{i}_test_score" for i in range(grid_search.cv)]
np.array([grid_search.cv_results_[key] for key in fold_score_keys])

In [None]:
# Per-fold test scores of the selected candidate. The SVM grid has a single
# candidate, so a hard-coded column 8 is out of range; best_index_ is always valid.
np.array([grid_search.cv_results_[f"split{i}_test_score"] for i in range(grid_search.cv)])[:, grid_search.best_index_]

In [None]:
# Evaluate the best SVM model on the test split.
# Predictions use `df_cara_test` (numeric columns only), matching the matrix used for
# fitting; dropping just "track_genre" from `df_test` would leave the boolean
# `explicit` column in and cause a feature-count mismatch.
# No predict_proba call: the SVC is created without probability=True, so
# probability estimates are unavailable, and they were not used by the metrics below.
df_clas_test = df_test["track_genre"]
df_clas_pred = grid_search.best_estimator_.predict(df_cara_test)

evaluation = {
    "accuracy": accuracy_score(df_clas_test, df_clas_pred),
    "precision": precision_score(
        df_clas_test, df_clas_pred, average="weighted"
    ),
    "recall": recall_score(df_clas_test, df_clas_pred, average="weighted"),
    "f1": f1_score(df_clas_test, df_clas_pred, average="weighted")
}
print(evaluation)

## <a id='toc8_6_'></a>[Stacking (Pickle Import)](#toc0_)

In [None]:
def _weighted_metrics(y_true, y_pred):
    """Return accuracy plus weighted precision, recall and F1 for one prediction set."""
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average='weighted'),
        'recall': recall_score(y_true, y_pred, average='weighted'),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }


def testCurrentConfiguration(stacking_clf, estimator, final_estimator):
    """Print test, train and validation metrics for one stacking configuration.

    Reads the notebook-level feature matrices (``df_cara_*``) and label
    vectors (``df_clas_*``) for the three splits.

    Args:
        stacking_clf: Fitted stacking classifier used for the predictions.
        estimator: List of ``(name, model)`` pairs; only the names are printed.
        final_estimator: Meta-learner, printed as part of the header.
    """
    print("---------------------")
    print("running for ", final_estimator)
    print([i[0] for i in estimator])

    # Same order as before: predict on test, train, then validation.
    df_clas_pred_test = stacking_clf.predict(df_cara_test)
    df_clas_pred_train = stacking_clf.predict(df_cara_train)
    df_clas_pred_val = stacking_clf.predict(df_cara_validation)

    evaluation_test = _weighted_metrics(df_clas_test, df_clas_pred_test)
    evaluation_train = _weighted_metrics(df_clas_train, df_clas_pred_train)
    evaluation_val = _weighted_metrics(df_clas_validation, df_clas_pred_val)

    print('Dados de Teste')
    print(evaluation_test)
    print('Dados de Treino')
    print(evaluation_train)
    print('Dados de Validaçao')
    print(evaluation_val)

In [None]:
from pathlib import Path

# Load previously trained base classifiers (pickled search objects).
# NOTE: unpickling can execute arbitrary code — only load files produced by this project.
# NOTE(review): the training cells write their pickles to the working directory, while
# they are read from `modelos/` here — confirm the files are moved there beforehand.
models_dir = Path("modelos")  # pathlib keeps the relative path valid outside Windows too

with open(models_dir / "LVQ_model_searcher.pkl", "rb") as f:
    lvq = pickle.load(f)
with open(models_dir / "MLP_model_searcher.pkl", "rb") as f:
    mlp = pickle.load(f)
with open(models_dir / "KNN_model_searcher.pkl", "rb") as f:
    knn = pickle.load(f)

# Create possible combinations of classifiers
estimators_combination = [
    [
        ('mlp', mlp),
        ('knn', knn),
        ('lvq', lvq),
    ],
    [
        ('mlp', mlp),
        ('knn', knn),
    ],
    [
        ('mlp', mlp),
        ('lvq', lvq),
    ],
    [
        ('knn', knn),
        ('lvq', lvq),
    ],
]
# List of final classifiers
final_estimator_combination = [LogisticRegression(), DecisionTreeClassifier()]

# Test each pair of <estimators, final_estimator> to check their metrics
for estimator in estimators_combination:
    for final_estimator in final_estimator_combination:
        # Setting prefit since we'll use the pretrained models above.
        # NOTE(review): the meta-learner is fitted on predictions for the same rows the
        # base models were trained on; the recorded DecisionTree results (train accuracy
        # ~0.92 vs. test ~0.22) suggest overfitting — consider fitting on held-out data.
        stacking_clf = StackingClassifier(estimators=estimator, cv="prefit", final_estimator = final_estimator)
        # Train the stacking classifier
        stacking_clf.fit(df_cara_train, df_clas_train)
        testCurrentConfiguration(stacking_clf, estimator, final_estimator)

#### Results

---------------------

Dados os parametros:

LogisticRegression() and ['mlp', 'knn', 'lvq']

Dados de Teste

{'accuracy': 0.33973684210526317, 'precision': 0.34139945210280004, 'recall': 0.33973684210526317, 'f1': 0.3336826841814082}
Dados de Treino

{'accuracy': 0.349093567251462, 'precision': 0.35105385289525554, 'recall': 0.349093567251462, 'f1': 0.34309435758690215}
Dados de Validaçao

{'accuracy': 0.3333333333333333, 'precision': 0.33351549805734143, 'recall': 0.3333333333333333, 'f1': 0.32617102272847603}

---------------------

Dados os parametros:

DecisionTreeClassifier() and ['mlp', 'knn', 'lvq']

Dados de Teste

{'accuracy': 0.2162280701754386, 'precision': 0.22048676762124503, 'recall': 0.2162280701754386, 'f1': 0.21740034305197142}
Dados de Treino

{'accuracy': 0.9150877192982456, 'precision': 0.9219857008466373, 'recall': 0.9150877192982456, 'f1': 0.9146622179174141}
Dados de Validaçao

{'accuracy': 0.21070175438596492, 'precision': 0.21447817161106542, 'recall': 0.21070175438596492, 'f1': 0.21172910144200224}

---------------------

Dados os parametros:

LogisticRegression() and ['mlp', 'knn']

Dados de Teste

{'accuracy': 0.33964912280701753, 'precision': 0.34128038961699103, 'recall': 0.33964912280701753, 'f1': 0.3335919922101997}
Dados de Treino

{'accuracy': 0.349093567251462, 'precision': 0.35107340335988624, 'recall': 0.349093567251462, 'f1': 0.34309757660798107}
Dados de Validaçao

{'accuracy': 0.3335087719298246, 'precision': 0.33372436732779615, 'recall': 0.3335087719298246, 'f1': 0.32635466564214843}

---------------------

Dados os parametros:

DecisionTreeClassifier() and ['mlp', 'knn']

Dados de Teste

{'accuracy': 0.218859649122807, 'precision': 0.2256198364329918, 'recall': 0.218859649122807, 'f1': 0.2211579625357569}
Dados de Treino

{'accuracy': 0.9150877192982456, 'precision': 0.9219857008466373, 'recall': 0.9150877192982456, 'f1': 0.9146622179174141}
Dados de Validaçao

{'accuracy': 0.2119298245614035, 'precision': 0.21741307507188387, 'recall': 0.2119298245614035, 'f1': 0.2137076792286073}

---------------------

Dados os parametros:

LogisticRegression() and ['mlp', 'lvq']

Dados de Teste

{'accuracy': 0.3175438596491228, 'precision': 0.31734137211142344, 'recall': 0.3175438596491228, 'f1': 0.3064514645508661}
Dados de Treino

{'accuracy': 0.3166374269005848, 'precision': 0.31398891231022374, 'recall': 0.3166374269005848, 'f1': 0.3062036094875135}
Dados de Validaçao

{'accuracy': 0.30929824561403507, 'precision': 0.3044883153326813, 'recall': 0.30929824561403507, 'f1': 0.2987353639361971}

---------------------

Dados os parametros:

DecisionTreeClassifier() and ['mlp', 'lvq']

Dados de Teste

{'accuracy': 0.21210526315789474, 'precision': 0.21589840017411255, 'recall': 0.21210526315789474, 'f1': 0.21287484022042927}
Dados de Treino

{'accuracy': 0.9150877192982456, 'precision': 0.9219857008466373, 'recall': 0.9150877192982456, 'f1': 0.9146622179174141}
Dados de Validaçao

{'accuracy': 0.20649122807017545, 'precision': 0.21075318095663353, 'recall': 0.20649122807017545, 'f1': 0.20774644656404842}

---------------------

Dados os parametros:

LogisticRegression() and ['knn', 'lvq']

Dados de Teste

{'accuracy': 0.2893859649122807, 'precision': 0.28885355731186063, 'recall': 0.2893859649122807, 'f1': 0.28051175029905395}
Dados de Treino

{'accuracy': 0.29403508771929826, 'precision': 0.29360649609651057, 'recall': 0.29403508771929826, 'f1': 0.28516542279260515}
Dados de Validaçao

{'accuracy': 0.28192982456140353, 'precision': 0.27751455749852805, 'recall': 0.28192982456140353, 'f1': 0.2710909125079709}

---------------------

Dados os parametros:

DecisionTreeClassifier() and ['knn', 'lvq']

Dados de Teste

{'accuracy': 0.17587719298245613, 'precision': 0.1814274155830884, 'recall': 0.17587719298245613, 'f1': 0.17769330577152145}
Dados de Treino

{'accuracy': 0.9150877192982456, 'precision': 0.9219857008466373, 'recall': 0.9150877192982456, 'f1': 0.9146622179174141}
Dados de Validaçao

{'accuracy': 0.17394736842105263, 'precision': 0.17779027035188788, 'recall': 0.17394736842105263, 'f1': 0.17520835874535154}

#### Best Parameters
After running the comparison above we noticed that the best configuration for our model was:
- final_estimator: LogisticRegression()
- estimators: ['mlp', 'knn', 'lvq']

In [None]:
# Best configuration from the comparison: all three base models with a
# logistic-regression meta-learner.
estimators_combination = [
    ('mlp', mlp),
    ('knn', knn),
    ('lvq', lvq),
]
final_estimator = LogisticRegression()
# Pass the list defined in this cell. The previous code passed `estimator`, the loop
# variable left over from the comparison cell, whose last value is the knn + lvq pair,
# so the saved model was not the configuration chosen above.
stacking_clf = StackingClassifier(
    estimators=estimators_combination, cv="prefit", final_estimator=final_estimator
)
# Train the stacking classifier
stacking_clf.fit(df_cara_train, df_clas_train)

with open('stacking_model_searcher.pkl', 'wb') as f:
    pickle.dump(stacking_clf, f)

## <a id='toc8_7_'></a>[Random Forest](#toc0_)

In [None]:
random_state = 42

# Random forest to be tuned (kept under the name `decision_tree` used by this notebook).
decision_tree = RandomForestClassifier(random_state=random_state)

# Distributions sampled by the randomized search.
parameters = dict(
    n_estimators=np.arange(5, 1000, 10),
    criterion=["gini", "entropy", "log_loss"],
    max_depth=np.arange(1, 50),
    min_samples_split=np.arange(2, 100),
    min_samples_leaf=np.arange(1, 100),
)

grid_search = RandomizedSearchCV(
    estimator=decision_tree,
    param_distributions=parameters,
    n_iter=15,
    cv=5,
    n_jobs=7,
    random_state=random_state,
    verbose=2,
)

# Features are every column but the last; the label is the last column.
rf_train_features, rf_train_labels = df_train.iloc[:, :-1], df_train.iloc[:, -1]
grid_search.fit(rf_train_features, rf_train_labels)

In [None]:
# Score of the fitted search on the training split (label is the last column).
train_features, train_labels = df_train.iloc[:, :-1], df_train.iloc[:, -1]
print(grid_search.score(train_features, train_labels))

In [None]:
# Score of the fitted search on the validation split (label is the last column).
validation_features, validation_labels = df_validation.iloc[:, :-1], df_validation.iloc[:, -1]
print(grid_search.score(validation_features, validation_labels))

In [None]:
# Score of the fitted search on the test split (label is the last column).
test_features, test_labels = df_test.iloc[:, :-1], df_test.iloc[:, -1]
print(grid_search.score(test_features, test_labels))

In [None]:
# Hyper-parameter combination selected by the search.
grid_search.best_params_

In [None]:
# Full cross-validation results recorded by the search.
grid_search.cv_results_

In [None]:
# Per-fold test scores: one row per CV split, one column per candidate.
fold_score_keys = [f"split{i}_test_score" for i in range(grid_search.cv)]
np.array([grid_search.cv_results_[key] for key in fold_score_keys])

In [None]:
# Per-fold test scores of the selected candidate. The column is taken from
# best_index_ instead of a hard-coded position, so it always refers to the winner.
np.array([grid_search.cv_results_[f"split{i}_test_score"] for i in range(grid_search.cv)])[:, grid_search.best_index_]

In [None]:
# Evaluate the best estimator on the test split with weighted-average metrics.
test_feature_frame = df_test.drop(columns=["track_genre"])
best_model = grid_search.best_estimator_

df_clas_test = df_test["track_genre"]
df_clas_pred = best_model.predict(test_feature_frame)
df_clas_pred_proba = best_model.predict_proba(test_feature_frame)

evaluation = {"accuracy": accuracy_score(df_clas_test, df_clas_pred)}
for metric_name, metric_fn in (
    ("precision", precision_score),
    ("recall", recall_score),
    ("f1", f1_score),
):
    evaluation[metric_name] = metric_fn(df_clas_test, df_clas_pred, average="weighted")
print(evaluation)

In [None]:
import pickle

# Persist the fitted random-forest search to disk.
with open('random_forest.pkl', 'wb') as model_file:
    pickle.dump(grid_search, model_file)

In [None]:
# Restore the saved search object (only unpickle files produced by this project).
with open('random_forest.pkl', 'rb') as model_file:
    grid_search_load = pickle.load(model_file)

In [None]:
# Display the search object restored from disk.
grid_search_load