**Table of contents**<a id='toc0_'></a>    
- [Imports](#toc1_)    
- [Read Data](#toc2_)    
- [Data Preprocessing](#toc3_)
  - [Drop rows with missing values](#toc3_2_)    
  - [Removing Categorical Columns](#toc4_)    
  - [Split Train and Test Data](#toc5_)    
  - [Data Cleaning](#toc6_)    
    - [Impute missing numeric data](#toc6_1_)    
  - [Data Normalization](#toc7_)    
- [Model training](#toc8_)    
  - [KNN](#toc8_1_)  
  - [LVQ](#toc8_2_)
  - [Decision Tree](#toc8_3_)  
  - [MLP](#toc8_4_)
  - [SVM](#toc8_5_)  
  - [Stacking](#toc8_6_)  
  - [Random Forest](#toc8_7_)  

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

----

# <a id='toc1_'></a>[Imports](#toc0_)

In [1]:
import numpy as np
from matplotlib import pyplot as plt
from math import sqrt
from tqdm import tqdm

import pandas as pd
pd.options.display.max_colwidth = 1000
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 200

import sklearn
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_validate,train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier,NearestCentroid
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, classification_report
import random
from random import seed,randrange
import requests
import io
import pickle

# <a id='toc2_'></a>[Read Data](#toc0_)

In [2]:
# Download the CSV from GitHub and load it into a pandas DataFrame.
# The URL must point at the *raw* version of the file.
url = "https://raw.githubusercontent.com/Zuluke/Projetos-AM/main/spotify_activity/dataset.csv"

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of parsing an error page as CSV.
response.raise_for_status()
download = response.content

# Decode the downloaded bytes and parse them as CSV.
df = pd.read_csv(io.StringIO(download.decode('utf-8')))

## <a id='toc2_1_'></a>[Visualize Data](#toc0_)

In [32]:
# Quick look at the raw frame: dimensions, schema, then the first rows.
print(df.shape, end='\n\n\n')  # shape followed by two blank lines
df.info()
print('\n')
df.head()

(114000, 21)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        114000 non-null  int64  
 1   track_id          114000 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        114000 non-null  int64  
 6   duration_ms       114000 non-null  int64  
 7   explicit          114000 non-null  bool   
 8   danceability      114000 non-null  float64
 9   energy            114000 non-null  float64
 10  key               114000 non-null  int64  
 11  loudness          114000 non-null  float64
 12  mode              114000 non-null  int64  
 13  speechiness       114000 non-null  float64
 14  acousticness      114000 non-null  float64
 15  instrumentalness  114000 non-null  float64
 16  liven

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Soundtrack),Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


# <a id='toc3_'></a>[Data Preprocessing](#toc0_)

## <a id='toc3_2_'></a>[Drop rows with missing values](#toc0_)

In [3]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how='any')

## <a id='toc4_'></a>[Removing Categorical Columns](#toc0_)

In [4]:
# Columns excluded from the feature set (identifiers, free text and coded attributes).
categorical_columns = [
    'Unnamed: 0',
    'track_id',
    'artists',
    'album_name',
    'track_name',
    'explicit',
    'key',
    'mode',
    'time_signature',
]
df = df.drop(columns=categorical_columns)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 113999 entries, 0 to 113999
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   popularity        113999 non-null  int64  
 1   duration_ms       113999 non-null  int64  
 2   danceability      113999 non-null  float64
 3   energy            113999 non-null  float64
 4   loudness          113999 non-null  float64
 5   speechiness       113999 non-null  float64
 6   acousticness      113999 non-null  float64
 7   instrumentalness  113999 non-null  float64
 8   liveness          113999 non-null  float64
 9   valence           113999 non-null  float64
 10  tempo             113999 non-null  float64
 11  track_genre       113999 non-null  object 
dtypes: float64(9), int64(2), object(1)
memory usage: 11.3+ MB


## <a id='toc5_'></a>[Split Train and Test Data](#toc0_)

In [5]:
def train_validation_test_split(df, target_column, validation_size=0.1, test_size=0.1, random_state=42):
    """Split ``df`` into stratified train / validation / test frames.

    ``validation_size`` and ``test_size`` are fractions of the *full* frame.
    The test part is carved out first; the validation fraction is then rescaled
    relative to what remains, so it still amounts to ``validation_size`` of the
    original frame. Both splits are stratified on ``target_column`` and use the
    same ``random_state``.

    Returns:
        tuple: ``(df_train, df_validation, df_test)``.
    """
    remaining, held_out_test = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df[target_column],
    )

    # Share of `remaining` that corresponds to `validation_size` of the whole frame.
    relative_validation = validation_size / (1 - test_size)
    fit_part, validation_part = train_test_split(
        remaining,
        test_size=relative_validation,
        random_state=random_state,
        stratify=remaining[target_column],
    )
    return fit_part, validation_part, held_out_test

In [6]:
# 60 % train / 20 % validation / 20 % test, stratified by genre.
df_train, df_validation, df_test = train_validation_test_split(
    df, "track_genre", validation_size=0.2, test_size=0.2
)
df.info()

# Sanity check: share of the full frame held by train, test and validation (in that order).
print('\n', *(len(part.values) / float(len(df)) for part in (df_train, df_test, df_validation)))

<class 'pandas.core.frame.DataFrame'>
Index: 113999 entries, 0 to 113999
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   popularity        113999 non-null  int64  
 1   duration_ms       113999 non-null  int64  
 2   danceability      113999 non-null  float64
 3   energy            113999 non-null  float64
 4   loudness          113999 non-null  float64
 5   speechiness       113999 non-null  float64
 6   acousticness      113999 non-null  float64
 7   instrumentalness  113999 non-null  float64
 8   liveness          113999 non-null  float64
 9   valence           113999 non-null  float64
 10  tempo             113999 non-null  float64
 11  track_genre       113999 non-null  object 
dtypes: float64(9), int64(2), object(1)
memory usage: 11.3+ MB

 0.5999964911972913 0.2000017544013544 0.2000017544013544


## <a id='toc6_'></a>[Data Cleaning](#toc0_)

### <a id='toc6_1_'></a>[Impute missing numeric data](#toc0_)

In [7]:
# Numeric feature columns (the string target column is excluded by dtype).
numeric_columns = df_train.select_dtypes(include=['number']).columns

# Medians are learned on the training split only and then applied to all three splits.
numeric_imputer = SimpleImputer(strategy='median')
numeric_imputer.fit(df_train[numeric_columns])

for split_frame in (df_train, df_validation, df_test):
    split_frame[numeric_columns] = numeric_imputer.transform(split_frame[numeric_columns])

## <a id='toc7_'></a>[Data Normalization](#toc0_)

In [8]:
# Min-max scaling: the ranges come from the training split only and are reused for the others.
normalizer = MinMaxScaler()
normalizer.fit(df_train[numeric_columns])

for split_frame in (df_train, df_validation, df_test):
    split_frame[numeric_columns] = normalizer.transform(split_frame[numeric_columns])

In [9]:
# Separate each split into a feature matrix ("cara") and a class-label vector ("clas").
df_cara_train, df_clas_train = df_train[numeric_columns].values, df_train['track_genre'].values

df_cara_validation, df_clas_validation = (
    df_validation[numeric_columns].values,
    df_validation['track_genre'].values,
)

df_cara_test, df_clas_test = df_test[numeric_columns].values, df_test['track_genre'].values

# <a id='toc8_'></a>[Model training](#toc0_)

## <a id='toc8_1_'></a>[KNN](#toc0_)

In [15]:
# GridSearchCV clones the estimator for every candidate and refits the best one,
# so the classifier is handed over unfitted (a fit here would just be thrown away).
knn = KNeighborsClassifier()
param_grid = {
    'n_neighbors': np.arange(1, 81, 2),  # odd k from 1 to 79
    'metric': ['euclidean', 'manhattan']
}
grid = GridSearchCV(knn, param_grid, cv=5, scoring='f1_weighted')
grid.fit(df_cara_train, df_clas_train)

print(grid.cv_results_['mean_test_score'], '\n\n')
print(f'Melhor parametro: {grid.best_params_}')
print(f'Melhor resultado: {grid.best_score_}', '\n\n')

# Evaluate the refitted best estimator on the held-out test split.
df_clas_pred = grid.best_estimator_.predict(df_cara_test)

evaluation = {
    'accuracy': accuracy_score(df_clas_test, df_clas_pred),
    'precision': precision_score(df_clas_test, df_clas_pred, average='weighted'),
    'recall': recall_score(df_clas_test, df_clas_pred, average='weighted'),
    'f1': f1_score(df_clas_test, df_clas_pred, average='weighted')
}

print('Dados de Teste')
print(evaluation)

[0.17642146 0.16329389 0.17569539 0.18255353 0.18487841 0.18497905
 0.18571474 0.18558509 0.18439735 0.18390946 0.18471261 0.18363075
 0.18432642 0.18249941 0.18229252 0.18081921 0.18064847 0.17945825
 0.18035265 0.18077085 0.18020759 0.17949938 0.17908914 0.17729188
 0.17638633 0.1755122  0.17498116 0.17538158 0.17400104 0.1734301
 0.17364342 0.17314272 0.17297175 0.17271418 0.17284106 0.17179817
 0.17102728 0.17071442 0.17032072 0.1699916  0.19087878 0.17709372
 0.19216631 0.20037558 0.20299684 0.20438646 0.20579672 0.2060207
 0.20554318 0.20704705 0.20546339 0.20397883 0.20322883 0.20269859
 0.20266339 0.20243221 0.20196117 0.2032894  0.20320699 0.20277698
 0.20199777 0.20158935 0.2010892  0.20000683 0.2008053  0.20033155
 0.20090226 0.20082566 0.19990506 0.19897568 0.19675984 0.19692632
 0.19693009 0.19699069 0.19636382 0.19595918 0.19710388 0.19526259
 0.19516007 0.19519686] 




## <a id='toc8_2_'></a>[LVQ](#toc0_)

In [66]:
def cross_validation_split(dataset, n_folds):
    """Randomly partition ``dataset`` into ``n_folds`` equally sized folds.

    Each fold holds ``int(len(dataset) / n_folds)`` rows drawn without
    replacement via ``randrange``; leftover rows are not placed in any fold.
    ``dataset`` itself is not modified.

    Returns:
        list: ``n_folds`` lists of rows.
    """
    pool = list(dataset)
    rows_per_fold = int(len(dataset) / n_folds)
    folds = []
    for _ in range(n_folds):
        # Draw `rows_per_fold` rows at random, removing each from the pool.
        folds.append([pool.pop(randrange(len(pool))) for _ in range(rows_per_fold)])
    return folds

def accuracy_metric(actual, predicted):
    """Return the percentage (0-100) of positions where ``predicted`` equals ``actual``."""
    hits = sum(1 for idx in range(len(actual)) if actual[idx] == predicted[idx])
    return hits / float(len(actual)) * 100.0

def calculate_metrics_per_class(actual, predicted, class_label):
    """Count the one-vs-rest confusion entries for ``class_label``.

    Returns:
        tuple: ``(TP, FP, FN)`` — true positives, false positives, false negatives.
    """
    true_pos = false_pos = false_neg = 0
    for idx in range(len(actual)):
        is_actual = actual[idx] == class_label
        is_predicted = predicted[idx] == class_label
        if is_actual and is_predicted:
            true_pos += 1
        elif is_predicted:
            # Predicted as this class but actually another one.
            false_pos += 1
        elif is_actual:
            # Belongs to this class but predicted as another one.
            false_neg += 1
    return true_pos, false_pos, false_neg

def macro_recall(actual, predicted):
    """Return the unweighted mean of per-class recall, as a percentage.

    The classes considered are the distinct values of ``actual``; a class with
    no positives contributes a recall of 0.
    """
    per_class = []
    for label in set(actual):
        hits, _, misses = calculate_metrics_per_class(actual, predicted, label)
        support = hits + misses
        per_class.append(hits / support if support else 0)
    return sum(per_class) / len(per_class) * 100.0

def macro_precision(actual, predicted):
    """Return the unweighted mean of per-class precision, as a percentage.

    The classes considered are the distinct values of ``actual``; a class that
    was never predicted contributes a precision of 0.
    """
    per_class = []
    for label in set(actual):
        hits, false_alarms, _ = calculate_metrics_per_class(actual, predicted, label)
        predicted_count = hits + false_alarms
        per_class.append(hits / predicted_count if predicted_count else 0)
    return sum(per_class) / len(per_class) * 100.0

def macro_f1_score(actual, predicted):
    """Return the harmonic mean of macro precision and macro recall, as a percentage.

    Note this combines the two macro averages; it is not the mean of per-class
    F1 scores. Returns 0 when both averages are 0.
    """
    precision = macro_precision(actual, predicted) / 100.0
    recall = macro_recall(actual, predicted) / 100.0
    denominator = precision + recall
    if not denominator:
        return 0
    return 2 * (precision * recall) / denominator * 100.0


def evaluate_algorithm(dataset, dataset_validation, algorithm, n_folds, *args):
    """Evaluate ``algorithm`` with k-fold cross-validation plus a fixed validation set.

    For each fold, the remaining folds form the training set and the fold
    itself (with labels blanked to ``None``) is the CV test set.
    ``algorithm(train, test, validation, *args)`` must return
    ``(predicted_test, predicted_validation)``.

    Parameters:
        dataset: list of rows; the class label is the last element of each row.
        dataset_validation: list of rows in the same layout, scored on every fold.
        algorithm: callable as described above.
        n_folds: number of cross-validation folds.
        *args: forwarded unchanged to ``algorithm``.

    Returns:
        tuple of eight lists with one entry per fold, in this order:
        ``(cv_accuracy, val_accuracy, cv_recall, val_recall,
        cv_precision, val_precision, cv_f1, val_f1)``.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores_cv, scores_val = [], []
    scores_cv_recall, scores_cv_precision, scores_cv_f1 = [], [], []
    scores_val_recall, scores_val_precision, scores_val_f1 = [], [], []

    actual_validation = [row[-1] for row in dataset_validation]
    for fold in folds:
        # Training rows are all rows from the *other* folds. Folds are excluded by
        # identity, so two folds with equal contents are never confused, and the
        # flattening is linear instead of repeated list concatenation.
        train_set = [row for other in folds if other is not fold for row in other]

        # Copy the held-out rows and blank their label so the algorithm cannot see it.
        test_set = []
        for row in fold:
            row_copy = list(row)
            row_copy[-1] = None
            test_set.append(row_copy)

        predicted_test, predicted_validation = algorithm(train_set, test_set, dataset_validation, *args)
        actual = [row[-1] for row in fold]

        scores_cv.append(accuracy_metric(actual, predicted_test))
        scores_cv_recall.append(macro_recall(actual, predicted_test))
        scores_cv_precision.append(macro_precision(actual, predicted_test))
        scores_cv_f1.append(macro_f1_score(actual, predicted_test))

        # Validation predictions must be scored against the validation labels
        # (previously they were compared with the CV fold's labels).
        scores_val.append(accuracy_metric(actual_validation, predicted_validation))
        scores_val_recall.append(macro_recall(actual_validation, predicted_validation))
        scores_val_precision.append(macro_precision(actual_validation, predicted_validation))
        scores_val_f1.append(macro_f1_score(actual_validation, predicted_validation))

    return scores_cv, scores_val, scores_cv_recall, scores_val_recall, scores_cv_precision, scores_val_precision, scores_cv_f1, scores_val_f1

def euclidean_distance(row1, row2):
    """Return the Euclidean distance between the feature parts of two rows.

    Rows handled by the LVQ helpers carry the class label as their last
    element, so that element is excluded from the distance. Including it would
    leak the label into the nearest-codebook search and would fail on test
    rows whose label has been blanked to ``None``.

    Parameters:
        row1, row2: sequences of equal length: features followed by one label slot.

    Returns:
        The distance rounded to 8 decimal places.
    """
    features1 = np.asarray(row1[:-1], dtype=float)
    features2 = np.asarray(row2[:-1], dtype=float)
    return round(np.linalg.norm(features1 - features2), 8)

def get_best_matching_unit(codebooks, test_row):
    """Return the codebook vector closest to ``test_row``.

    Closeness is measured with ``euclidean_distance``; on ties the codebook
    that appears first in ``codebooks`` wins.
    """
    ranked = sorted(
        ((candidate, euclidean_distance(candidate, test_row)) for candidate in codebooks),
        key=lambda pair: pair[1],
    )
    return ranked[0][0]

def predict(codebooks, test_row):
    """Predict the class of ``test_row``: the last element of its best matching codebook."""
    return get_best_matching_unit(codebooks, test_row)[-1]

def random_codebook(train):
    """Build one codebook vector by sampling each position from a random training row.

    Every position of the codebook (the last one included) is copied from an
    independently chosen row of ``train``.
    """
    row_count = len(train)
    column_count = len(train[0])
    codebook = []
    for column in range(column_count):
        donor = train[randrange(row_count)]
        codebook.append(donor[column])
    return codebook

def train_codebooks(train, n_codebooks, lrate, epochs):
    """Train ``n_codebooks`` LVQ codebook vectors on ``train``.

    Codebooks start as random mixtures of training rows. In every epoch each
    training row pulls its best matching codebook towards itself when their
    labels (last element) agree, and pushes it away otherwise. The step size
    decays linearly from ``lrate`` at the first epoch towards zero.

    Returns:
        list: the trained codebook vectors.
    """
    codebooks = [random_codebook(train) for _ in range(n_codebooks)]
    for epoch in range(epochs):
        rate = lrate * (1.0 - (epoch / float(epochs)))
        for row in train:
            bmu = get_best_matching_unit(codebooks, row)
            # The label slot is never touched below, so this is constant for the row.
            same_class = bmu[-1] == row[-1]
            for i in range(len(row) - 1):
                step = rate * (row[i] - bmu[i])
                if same_class:
                    bmu[i] += step
                else:
                    bmu[i] -= step
    return codebooks

def learning_vector_quantization(train, test, validation, n_codebooks, lrate, epochs):
    """Train LVQ codebooks on ``train`` and predict labels for ``test`` and ``validation``.

    Returns:
        tuple: ``(predictions_test, predictions_validation)``, each a list with
        one predicted label per input row.
    """
    codebooks = train_codebooks(train, n_codebooks, lrate, epochs)
    predictions_test = [predict(codebooks, row) for row in test]
    predictions_validation = [predict(codebooks, row) for row in validation]
    return predictions_test, predictions_validation



In [55]:
# NOTE(review): df_train_2 is only assigned in later cells, so on a fresh kernel
# (Restart & Run All) this line raises NameError; this cell should run after
# the cell that builds df_train_2.
df_train_2

Unnamed: 0,popularity,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_genre
86894,0.393939,0.037292,0.585117,0.682,0.818561,0.032850,0.100402,0.000000,0.1290,0.597586,0.592745,85
19536,0.000000,0.042588,0.514781,0.627,0.756159,0.028290,0.424699,0.000074,0.0755,0.297787,0.637367,19
47115,0.010101,0.047729,0.597350,0.906,0.748597,0.058446,0.000840,0.019000,0.0958,0.752515,0.548653,47
42313,0.141414,0.020240,0.376147,0.937,0.786700,0.288083,0.001406,0.000000,0.7740,0.161972,0.436558,42
92452,0.565657,0.041328,0.300714,0.374,0.685151,0.030363,0.839357,0.385000,0.1370,0.294769,0.652735,92
...,...,...,...,...,...,...,...,...,...,...,...,...
32902,0.303030,0.041336,0.587156,0.937,0.811138,0.048083,0.000245,0.871000,0.0903,0.336016,0.453624,32
11465,0.545455,0.045073,0.662589,0.195,0.663253,0.049948,0.697791,0.000013,0.0851,0.365191,0.552135,11
57339,0.787879,0.037743,0.570846,0.872,0.826694,0.145078,0.023996,0.000007,0.3710,0.686117,0.566250,56
17301,0.474747,0.040389,0.597350,0.419,0.706457,0.052124,0.297189,0.379000,0.0911,0.217304,0.552683,17


In [67]:
# Integer-encode the genre labels. The encoder is fitted on the training split
# and reused, unchanged, for the validation and test splits.
le = LabelEncoder()
df_train_2 = df_train.assign(track_genre=le.fit_transform(df_train['track_genre']))
df_validation_2 = df_validation.assign(track_genre=le.transform(df_validation['track_genre']))
df_test_2 = df_test.assign(track_genre=le.transform(df_test['track_genre']))

# LVQ hyper-parameters.
n_folds, n_epochs, learn_rate, n_codebooks = 3, 20, 0.3, 15

# Plain list-of-rows form consumed by the pure-Python LVQ helpers.
dataset_train = df_train_2.values.tolist()
dataset_validation = df_validation_2.values.tolist()

In [None]:
# Fold assignment and codebook initialisation both draw from `random`;
# seeding makes the reported scores reproducible across runs.
seed(42)

(scores_cv, scores_vali,
 scores_cv_recall, scores_val_recall,
 scores_cv_precision, scores_val_precision,
 scores_cv_f1, scores_val_f1) = evaluate_algorithm(
    dataset_train, dataset_validation, learning_vector_quantization, n_folds, n_codebooks, learn_rate, n_epochs)

# Average every per-fold score list into a single number.
evaluation = {
    name: sum(scores) / float(len(scores))
    for name, scores in (
        ('cv_accuracy', scores_cv),
        ('validation_accuracy', scores_vali),
        ('cv_recall', scores_cv_recall),
        ('validation_recall', scores_val_recall),
        ('cv_precision', scores_cv_precision),
        ('validation_precision', scores_val_precision),
        ('cv_f1', scores_cv_f1),
        ('validation_f1', scores_val_f1),
    )
}

# Show the summary (it was previously computed but never displayed).
evaluation

## LVQ 2

In [16]:
class LVQ:
    """
    Learning Vector Quantization (LVQ) with one prototype per class.

    Prototypes start as the class centroids computed by ``NearestCentroid``
    and are then refined by ``train``.

    Parameters:
        learning_rate: Step size used when moving a prototype.
        epocas: Number of passes over the training data.
    """
    def __init__(self, learning_rate=0.01, epocas=100):
        self.learning_rate = learning_rate
        self.epocas = epocas

    # Backward-compatible alias: existing code configures an instance by calling
    # `lvq._init_(learning_rate, epocas)` explicitly.
    _init_ = __init__

    def fit(self, X, y):
        """
        Store the training data and initialise the prototypes as class centroids.

        Parameters:
            X: Training feature matrix.
            y: Training label vector.
        """
        self.X_train = X
        self.y_train = y
        self.model = NearestCentroid()
        self.model.fit(X, y)

    def _update_weights(self, x, y, learning_rate):
        """
        Move the prototype nearest to ``x`` towards it (labels agree) or away from it.

        Parameters:
            x: Feature vector of one training sample.
            y: Label of that sample.
            learning_rate: Step size of the update.
        """
        closest_class = self.model.predict([x])[0]
        # Rows of `centroids_` follow the order of `classes_`, so the label must be
        # translated to its row position; indexing by the label itself is only
        # correct when labels happen to be exactly 0..n_classes-1.
        row = np.flatnonzero(self.model.classes_ == closest_class)[0]
        delta = learning_rate * (x - self.model.centroids_[row])
        if closest_class == y:
            self.model.centroids_[row] += delta
        else:
            self.model.centroids_[row] -= delta

    def train(self):
        """Run ``epocas`` passes of prototype updates over the stored training data."""
        for _ in range(self.epocas):
            for x, y in zip(self.X_train, self.y_train):
                self._update_weights(x, y, self.learning_rate)

    def predict(self, X):
        """
        Predict labels for the given samples.

        Parameters:
            X: Feature matrix of the samples to classify.

        Returns:
            array: Vector of predicted labels.
        """
        return self.model.predict(X)

# Integer-encode the genre labels; the encoder is fitted on the training split
# and applied to the test split so both use the same label space.
le = LabelEncoder()
df_train_2 = df_train.copy()
df_train_2['track_genre'] = le.fit_transform(df_train_2['track_genre'])
df_test_2 = df_test.copy()
df_test_2['track_genre'] = le.transform(df_test_2['track_genre'])

# NOTE: these names are shared with other cells; from here on the label
# vectors hold integer-encoded genres.
df_cara_train = df_train_2[numeric_columns].values  # features
df_clas_train = df_train_2['track_genre'].values    # class labels
df_cara_test = df_test_2[numeric_columns].values    # features
df_clas_test = df_test_2['track_genre'].values      # class labels

lvq = LVQ()
lvq._init_(0.01, 30)  # learning rate, number of epochs
lvq.fit(df_cara_train, df_clas_train)
lvq.train()
# Predict from the test *features* (previously the label vector was passed in).
df_clas_pred = lvq.predict(df_cara_test)

evaluation = {
    'accuracy': accuracy_score(df_clas_test, df_clas_pred),
    'precision': precision_score(df_clas_test, df_clas_pred, average='weighted'),
    'recall': recall_score(df_clas_test, df_clas_pred, average='weighted'),
    'f1': f1_score(df_clas_test, df_clas_pred, average='weighted')
}
acuracia = evaluation['accuracy']

print(acuracia, '\n')
print(evaluation)

## <a id='toc8_3_'></a>[Decision Tree](#toc0_)

## <a id='toc8_4_'></a>[MLP](#toc0_)

In [12]:
# Weight initialisation and mini-batch shuffling are random; a fixed seed makes
# the grid-search scores reproducible (42 matches the seed used for the data split).
mlp = MLPClassifier(random_state=42)

In [18]:
# Hyper-parameter grid for the MLP: 2 activations x 2 alphas, everything else fixed.
param_grid = dict(
    hidden_layer_sizes=[(100,)],
    activation=['relu', 'tanh'],
    solver=['adam'],
    alpha=[0.0001, 0.001],
    learning_rate=['adaptive'],
    max_iter=[700],
)

grid_search = GridSearchCV(estimator=mlp, param_grid=param_grid, scoring='accuracy', cv=5, verbose=3)
grid_search.fit(df_cara_train, df_clas_train)

Fitting 2 folds for each of 8 candidates, totalling 16 fits




[CV 1/2] END activation=relu, alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=adaptive, max_iter=500, solver=adam;, score=0.245 total time= 3.5min




[CV 2/2] END activation=relu, alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=adaptive, max_iter=500, solver=adam;, score=0.248 total time= 1.2min




[CV 1/2] END activation=relu, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, max_iter=500, solver=adam;, score=0.255 total time= 3.0min




[CV 2/2] END activation=relu, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, max_iter=500, solver=adam;, score=0.262 total time= 6.0min




[CV 1/2] END activation=relu, alpha=0.001, hidden_layer_sizes=(50,), learning_rate=adaptive, max_iter=500, solver=adam;, score=0.245 total time= 2.3min




[CV 2/2] END activation=relu, alpha=0.001, hidden_layer_sizes=(50,), learning_rate=adaptive, max_iter=500, solver=adam;, score=0.250 total time= 1.7min




[CV 1/2] END activation=relu, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=adaptive, max_iter=500, solver=adam;, score=0.258 total time= 1.6min




[CV 2/2] END activation=relu, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=adaptive, max_iter=500, solver=adam;, score=0.262 total time= 1.3min




[CV 1/2] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=adaptive, max_iter=500, solver=adam;, score=0.253 total time=  32.4s




[CV 2/2] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=adaptive, max_iter=500, solver=adam;, score=0.257 total time=  31.9s




[CV 1/2] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, max_iter=500, solver=adam;, score=0.258 total time=  38.1s




[CV 2/2] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, max_iter=500, solver=adam;, score=0.264 total time=  37.7s




[CV 1/2] END activation=tanh, alpha=0.001, hidden_layer_sizes=(50,), learning_rate=adaptive, max_iter=500, solver=adam;, score=0.253 total time=  31.6s




[CV 2/2] END activation=tanh, alpha=0.001, hidden_layer_sizes=(50,), learning_rate=adaptive, max_iter=500, solver=adam;, score=0.261 total time=  31.8s




[CV 1/2] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=adaptive, max_iter=500, solver=adam;, score=0.258 total time=  37.9s




[CV 2/2] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=adaptive, max_iter=500, solver=adam;, score=0.263 total time=  38.1s




In [19]:
# Mean cross-validated score of every hyper-parameter candidate, in grid order.
grid_search.cv_results_['mean_test_score']

array([0.24640351, 0.25877193, 0.24745614, 0.25964912, 0.25467836,
       0.26146199, 0.25660819, 0.26046784])

In [20]:
# Best hyper-parameter combination found by the MLP grid search and its mean CV score.
print(f'Melhor parametro: {grid_search.best_params_}')
print(f'Melhor resultado: {grid_search.best_score_}')

Melhor parametro: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'learning_rate': 'adaptive', 'max_iter': 500, 'solver': 'adam'}
Melhor resultado: 0.26146198830409356


In [21]:
# Predict the test-split labels with the best MLP found by the grid search.
df_clas_pred = grid_search.best_estimator_.predict(df_cara_test)

# Accuracy plus support-weighted precision / recall / F1.
evaluation = {'accuracy': accuracy_score(df_clas_test, df_clas_pred)}
for metric_name, metric_fn in (('precision', precision_score), ('recall', recall_score), ('f1', f1_score)):
    evaluation[metric_name] = metric_fn(df_clas_test, df_clas_pred, average='weighted')

print('Dados de Teste')
print(evaluation)

Dados de Teste
{'accuracy': 0.2833333333333333, 'precision': 0.273592647284825, 'recall': 0.2833333333333333, 'f1': 0.26570096883360567}


## <a id='toc8_5_'></a>[SVM](#toc0_)

In [16]:
# GridSearchCV clones the estimator for every candidate, so a fit here would be
# discarded; the SVC is therefore created unfitted, which avoids one full
# (and expensive) training run over the whole training split.
class_svm = SVC()

### CUIDADO AO RODAR A CÉLULA ABAIXO

In [17]:
lista_kernels = ['linear', 'rbf', 'poly']
lista_c = [1, 2, 3, 4, 5, 7, 10, 100]
lista_gamma = [1, 2, 3, 4, 5, 7, 10, 100]

# Hyper-parameter grid for the SVM. `gamma` is a kernel coefficient for the
# non-linear kernels only; the linear kernel ignores it, so crossing it with
# every gamma value would just repeat identical (and expensive) fits.
gamma_free_kernels = [kernel for kernel in lista_kernels if kernel == 'linear']
gamma_kernels = [kernel for kernel in lista_kernels if kernel != 'linear']
param_grid = [
    {'kernel': gamma_free_kernels, 'C': lista_c},
    {'kernel': gamma_kernels, 'C': lista_c, 'gamma': lista_gamma},
]

In [18]:
# 5-fold grid search over the SVM hyper-parameters, scored by accuracy.
grid = GridSearchCV(estimator=class_svm, param_grid=param_grid, scoring='accuracy', cv=5)
grid.fit(df_cara_train, df_clas_train)

In [36]:
# Mean cross-validated score of every hyper-parameter candidate, in grid order.
grid.cv_results_['mean_test_score']

array([0.23959782, 0.24529967, 0.24655703, 0.24743428, 0.24910085,
       0.2461184 , 0.17243205, 0.24614763, 0.2496857 , 0.25003663,
       0.25120619, 0.24933483, 0.24588455, 0.17181803, 0.24822371,
       0.25141093, 0.25392558, 0.25322383, 0.24965652, 0.24518281,
       0.1715256 , 0.25117696, 0.25418874, 0.25462738, 0.25339931,
       0.25018284, 0.24287283, 0.17140864, 0.25448113, 0.2567034 ,
       0.25576777, 0.25237586, 0.24731738, 0.23977326, 0.17120395,
       0.2579023 , 0.25857481, 0.25576782, 0.25217119, 0.24605999,
       0.23573809, 0.17088231, 0.26281478, 0.25585556, 0.24801923,
       0.24097225, 0.23056255, 0.22348627, 0.17041447])

In [37]:
# Best hyper-parameter combination found by the SVM grid search and its mean CV score.
print(f'Melhor parametro: {grid.best_params_}')
print(f'Melhor resultado: {grid.best_score_}')
# Result noted down by hand, presumably from an earlier run with this smaller grid:
#   lista_kernels = ['linear', 'rbf']
#   lista_c       = [2, 3, 4, 5, 7, 10, 100]
#   lista_gamma   = [2, 3, 4, 5, 7, 10, 100]
#   best params:  {'C': 100, 'gamma': 2, 'kernel': 'rbf'}
#   best score:   0.26281478175137607

Melhor parametro: {'C': 100, 'gamma': 2}
Melhor resultado: 0.26281478175137607


In [39]:
# Predict the test-split labels with the best SVM found by the grid search.
df_clas_pred = grid.best_estimator_.predict(df_cara_test)

# Accuracy plus support-weighted precision / recall / F1.
evaluation = {'accuracy': accuracy_score(df_clas_test, df_clas_pred)}
for metric_name, metric_fn in (('precision', precision_score), ('recall', recall_score), ('f1', f1_score)):
    evaluation[metric_name] = metric_fn(df_clas_test, df_clas_pred, average='weighted')

print('Dados de Teste')
print(evaluation)

Dados de Teste
{'accuracy': 0.26570175438596494, 'precision': 0.2596195645386902, 'recall': 0.26570175438596494, 'f1': 0.2578082413357941}


## <a id='toc8_6_'></a>[Stacking](#toc0_)

## <a id='toc8_7_'></a>[Random Forest](#toc0_)