**Table of contents**<a id='toc0_'></a>    
- [Imports](#toc1_)    
- [Read Data](#toc2_)    
- [Data Preprocessing](#toc3_)
  - [Drop rows with missing values](#toc3_2_)    
  - [Removing Categorical Columns](#toc4_)    
  - [Split Train and Test Data](#toc5_)    
  - [Data Cleaning](#toc6_)    
    - [Impute missing numeric data](#toc6_1_)    
  - [Data Normalization](#toc7_)    
- [Model training](#toc8_)    
  - [KNN](#toc8_1_)  
  - [LVQ](#toc8_2_)
  - [Decision Tree](#toc8_3_)  
  - [MLP](#toc8_4_)
  - [SVM](#toc8_5_)  
  - [Stacking](#toc8_6_)  
  - [Random Forest](#toc8_7_)  

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

----

# <a id='toc1_'></a>[Imports](#toc0_)

In [3]:
import numpy as np
from matplotlib import pyplot as plt
from math import sqrt
from tqdm import tqdm

import pandas as pd
pd.options.display.max_colwidth = 1000
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 200

import sklearn
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_validate,train_test_split, GridSearchCV
from sklvq import GLVQ
from sklearn.preprocessing import LabelEncoder,MinMaxScaler,StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, make_scorer
import random
from random import seed,randrange
import requests
import io
import pickle

In [4]:
pip install -U git+https://github.com/rickvanveen/sklvq.git

Defaulting to user installation because normal site-packages is not writeable
Collecting git+https://github.com/rickvanveen/sklvq.git
  Cloning https://github.com/rickvanveen/sklvq.git to c:\users\pichau\appdata\local\temp\pip-req-build-ffx81b9c
  Resolved https://github.com/rickvanveen/sklvq.git to commit 4fba52a14ed37b0444becb96ef09c40d38d263ff
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Note: you may need to restart the kernel to use updated packages.


  Running command git clone --filter=blob:none --quiet https://github.com/rickvanveen/sklvq.git 'C:\Users\pichau\AppData\Local\Temp\pip-req-build-ffx81b9c'


# <a id='toc2_'></a>[Read Data](#toc0_)

In [23]:
# Download the CSV file from GitHub (the URL must point at the *raw* file).
url = "https://raw.githubusercontent.com/Zuluke/Projetos-AM/main/spotify_activity/dataset.csv"

# A timeout prevents the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of trying to parse an error page as CSV.
response.raise_for_status()
download = response.content

# Decode the downloaded bytes and parse them into a pandas DataFrame.
df = pd.read_csv(io.StringIO(download.decode('utf-8')))

## <a id='toc2_1_'></a>[Visualize Data](#toc0_)

In [24]:
# Quick overview of the raw data: dimensions, per-column dtypes / non-null
# counts, and the first rows (rendered as the cell's output).
print(df.shape)
print('\n')
df.info()
print('\n')
df.head()

(114000, 21)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        114000 non-null  int64  
 1   track_id          114000 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        114000 non-null  int64  
 6   duration_ms       114000 non-null  int64  
 7   explicit          114000 non-null  bool   
 8   danceability      114000 non-null  float64
 9   energy            114000 non-null  float64
 10  key               114000 non-null  int64  
 11  loudness          114000 non-null  float64
 12  mode              114000 non-null  int64  
 13  speechiness       114000 non-null  float64
 14  acousticness      114000 non-null  float64
 15  instrumentalness  114000 non-null  float64
 16  liven

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Soundtrack),Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


# <a id='toc3_'></a>[Data Preprocessing](#toc0_)

## <a id='toc3_2_'></a>[Drop rows with missing values](#toc0_)

In [25]:
# Keep only the rows that have no missing value in any column.
df = df.dropna(axis=0, how='any')

## <a id='toc4_'></a>[Removing Categorical Columns](#toc0_)

In [26]:
# Columns left out of the models: the exported row index, identifiers / free
# text, and the attributes this notebook treats as categorical.
categorical_columns = [
    'Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name',
    'explicit', 'key', 'mode', 'time_signature',
]
df = df.drop(columns=categorical_columns)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 113999 entries, 0 to 113999
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   popularity        113999 non-null  int64  
 1   duration_ms       113999 non-null  int64  
 2   danceability      113999 non-null  float64
 3   energy            113999 non-null  float64
 4   loudness          113999 non-null  float64
 5   speechiness       113999 non-null  float64
 6   acousticness      113999 non-null  float64
 7   instrumentalness  113999 non-null  float64
 8   liveness          113999 non-null  float64
 9   valence           113999 non-null  float64
 10  tempo             113999 non-null  float64
 11  track_genre       113999 non-null  object 
dtypes: float64(9), int64(2), object(1)
memory usage: 11.3+ MB


## <a id='toc5_'></a>[Split Train and Test Data](#toc0_)

In [27]:
def train_validation_test_split(df, target_column, validation_size=0.1, test_size=0.1, random_state=42):
    """Split ``df`` into stratified train, validation and test DataFrames.

    The test set is carved out first; the validation set is then taken from
    the remainder, with its fraction rescaled so that ``validation_size`` is
    expressed relative to the *full* data set.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split.
    target_column : str
        Column used to stratify both splits.
    validation_size, test_size : float
        Fractions of the full data set, each strictly between 0 and 1, leaving
        a non-empty remainder for training.
    random_state : int
        Seed passed to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(df_train, df_validation, df_test)``.

    Raises
    ------
    ValueError
        If the requested fractions cannot produce three non-empty subsets.
    """
    # Validate up front: otherwise bad fractions only surface as an unrelated
    # looking error from the second train_test_split call.
    if not (0 < test_size < 1) or not (0 < validation_size < 1):
        raise ValueError(
            "validation_size and test_size must both be fractions strictly between 0 and 1, "
            f"got validation_size={validation_size!r}, test_size={test_size!r}"
        )

    # Validation fraction relative to what is left after removing the test set.
    relative_validation_size = validation_size / (1 - test_size)
    if relative_validation_size >= 1:
        raise ValueError(
            "validation_size + test_size must leave data for training, "
            f"got validation_size={validation_size!r}, test_size={test_size!r}"
        )

    df_train, df_test = train_test_split(
        df, test_size=test_size, random_state=random_state, stratify=df[target_column]
    )

    df_train, df_validation = train_test_split(
        df_train,
        test_size=relative_validation_size,
        random_state=random_state,
        stratify=df_train[target_column],
    )
    return df_train, df_validation, df_test

In [28]:
df_train, df_validation, df_test = train_validation_test_split(
    df, "track_genre", validation_size=0.2, test_size=0.2
)
df.info()

# Sanity check: share of the full data set held by train, test and validation.
total_rows = float(len(df))
print('\n', len(df_train) / total_rows, len(df_test) / total_rows, len(df_validation) / total_rows)

<class 'pandas.core.frame.DataFrame'>
Index: 113999 entries, 0 to 113999
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   popularity        113999 non-null  int64  
 1   duration_ms       113999 non-null  int64  
 2   danceability      113999 non-null  float64
 3   energy            113999 non-null  float64
 4   loudness          113999 non-null  float64
 5   speechiness       113999 non-null  float64
 6   acousticness      113999 non-null  float64
 7   instrumentalness  113999 non-null  float64
 8   liveness          113999 non-null  float64
 9   valence           113999 non-null  float64
 10  tempo             113999 non-null  float64
 11  track_genre       113999 non-null  object 
dtypes: float64(9), int64(2), object(1)
memory usage: 11.3+ MB

 0.5999964911972913 0.2000017544013544 0.2000017544013544


## <a id='toc6_'></a>[Data Cleaning](#toc0_)

### <a id='toc6_1_'></a>[Impute missing numeric data](#toc0_)

In [29]:
# Median imputer learned from the training split only, then applied to all
# three splits so no validation/test statistics leak into the fit.
numeric_columns = df_train.select_dtypes(include=['number']).columns
numeric_imputer = SimpleImputer(strategy='median').fit(df_train[numeric_columns])

for split_frame in (df_train, df_validation, df_test):
    split_frame[numeric_columns] = numeric_imputer.transform(split_frame[numeric_columns])

## <a id='toc7_'></a>[Data Normalization](#toc0_)

In [30]:
# Min-max scaler whose ranges come from the training split only; the same
# fitted scaler is then applied to every split.
normalizer = MinMaxScaler().fit(df_train[numeric_columns])

for split_frame in (df_train, df_validation, df_test):
    split_frame[numeric_columns] = normalizer.transform(split_frame[numeric_columns])

In [31]:
# Separate every split into a feature matrix ("cara") and a class vector ("clas").
df_cara_train, df_clas_train = df_train[numeric_columns].values, df_train['track_genre'].values

df_cara_validation, df_clas_validation = (
    df_validation[numeric_columns].values,
    df_validation['track_genre'].values,
)

df_cara_test, df_clas_test = df_test[numeric_columns].values, df_test['track_genre'].values

# <a id='toc8_'></a>[Model training](#toc0_)

## <a id='toc8_1_'></a>[KNN](#toc0_)

In [28]:
# GridSearchCV clones the estimator and fits the clones itself, so an unfitted
# classifier is all that is needed here (fitting it first is wasted work).
knn = KNeighborsClassifier()
param_grid = {
    'n_neighbors': np.arange(1,81,2),  # odd k from 1 to 79
    'metric': ['euclidean', 'manhattan']
}
grid = GridSearchCV(knn, param_grid, cv=5, scoring='f1_weighted')
grid.fit(df_cara_train,df_clas_train)

# Persist the whole fitted search object so results can be inspected later
# without repeating the search.
with open('KNN_model_searcher.pkl', 'wb') as f:
    pickle.dump(grid,f)

print(grid.cv_results_['mean_test_score'],'\n\n')
print(f'Melhor parametro: {grid.best_params_}')
print(f'Melhor resultado: {grid.best_score_}','\n\n')

# Evaluate the refitted best estimator on the held-out test split.
df_clas_pred = grid.best_estimator_.predict(df_cara_test)

evaluation={
'accuracy': accuracy_score(df_clas_test, df_clas_pred),
# zero_division=0 matches the LVQ section: a class that is never predicted
# contributes 0 without emitting an UndefinedMetricWarning.
'precision': precision_score(df_clas_test, df_clas_pred, average='weighted', zero_division=0),
'recall': recall_score(df_clas_test, df_clas_pred, average='weighted'),
'f1': f1_score(df_clas_test, df_clas_pred, average='weighted')
}

print('Dados de Teste')
print(evaluation)

[0.19271113 0.17981638 0.19097023 0.19725932 0.20040176 0.20113115
 0.20136294 0.20156006 0.20189206 0.20220118 0.20195172 0.2014731
 0.20076795 0.20051412 0.20003    0.19940234 0.19883093 0.19865899
 0.19789806 0.19729613 0.19769722 0.19671547 0.19640499 0.19497864
 0.19443508 0.19392372 0.19358433 0.1924345  0.19211111 0.19212783
 0.19173116 0.19203969 0.19049933 0.18997615 0.18952271 0.18933138
 0.18881198 0.18835951 0.18699562 0.18676004 0.20888562 0.19609269
 0.20843932 0.21383991 0.21765953 0.22100124 0.2209014  0.22256828
 0.22332125 0.22330809 0.22372879 0.22404477 0.22400909 0.22443181
 0.22480136 0.2234478  0.22331913 0.22432702 0.2232941  0.22294893
 0.22275483 0.22209562 0.22132017 0.21978464 0.21881537 0.21773707
 0.21828    0.21700103 0.21672093 0.21673155 0.21646879 0.21606207
 0.21474505 0.21442645 0.21436507 0.2141266  0.21331732 0.21299634
 0.21243477 0.2120711 ] 


Melhor parametro: {'metric': 'manhattan', 'n_neighbors': 29}
Melhor resultado: 0.22480135939698612 


D

## <a id='toc8_2_'></a>[LVQ](#toc0_)

In [18]:
#divisao de dados atributos e classe
df_cara = df.values[:, 0:-1] #caracteristicas
df_clas = df.values[:, -1] #classe

#usando o metodo para criar os conjuntos de treinamento, validacao e teste
df_cara_train, df_cara_test, df_clas_train, df_clas_test = train_test_split(df_cara, df_clas, test_size = 0.40, random_state = 10)

df_cara_train, df_cara_valid, df_clas_train, df_clas_valid = train_test_split(df_cara_train, df_clas_train, test_size = 0.50, random_state = 10)

scaler = StandardScaler()
scaler.fit(df_cara_train)

df_cara_train_scaled = scaler.transform(df_cara_train)
df_cara_valid_scaled = scaler.transform(df_cara_valid)
df_cara_test_scaled = scaler.transform(df_cara_test)

In [None]:
def _macro_scores(model, features, targets):
    """Return ``(accuracy, recall, f1, precision)`` of ``model`` on one split.

    Accuracy comes from ``model.score``; the other three are macro-averaged
    and share a single ``predict`` call instead of predicting once per metric.
    """
    predictions = model.predict(features)
    return (
        model.score(features, targets),
        recall_score(targets, predictions, average='macro'),
        f1_score(targets, predictions, average='macro'),
        precision_score(targets, predictions, average='macro', zero_division=0),
    )


# LVQ classifier to be tuned.
lvq = GLVQ()

# Hyper-parameter grid for the search.
param_grid = {
    "prototype_n_per_class": [1,3],  # number of prototypes per class
    "distance_type": ["euclidean"],
    "solver_params": [{"max_runs": 5, "step_size": step} for step in [0.1, 0.5]]  # one solver config per step size
}

# Scorers recorded for every candidate; "accuracy" selects the winner (see refit).
scorers = {
    "accuracy": make_scorer(accuracy_score),
    "precision_macro": make_scorer(precision_score, average='macro', zero_division = 0),
    "recall_macro": make_scorer(recall_score, average='macro'),
    "f1_macro": make_scorer(f1_score, average='macro')
}

grid_search = GridSearchCV(lvq, param_grid, cv=5, scoring=scorers, refit="accuracy")

# Run the search on the standardised training data.
grid_search.fit(df_cara_train_scaled, df_clas_train)

# Persist the fitted search object so it can be inspected without re-running.
with open('LVQ_model_searcher.pkl', 'wb') as f:
    pickle.dump(grid_search,f)

print("Melhores parâmetros: ", grid_search.best_params_)

# Best classifier, refitted by the grid search on the whole training split.
best_lvq = grid_search.best_estimator_

# Metrics on the training split.
accuracy, recall, f1, precision = _macro_scores(best_lvq, df_cara_train_scaled, df_clas_train)

print("Accuracy on the train set: ", accuracy)
print("Recall on the train set: ", recall)
print("F1 score on the train set: ", f1)
print("Precision on the train set: ", precision)

# Metrics on the test split.
accuracy, recall, f1, precision = _macro_scores(best_lvq, df_cara_test_scaled, df_clas_test)

print("Accuracy on the test set: ", accuracy)
print("Recall on the test set: ", recall)
print("F1 score on the test set: ", f1)
print("Precision on the test set: ", precision)

# Metrics on the validation split. The first label previously said "Precisão"
# while printing accuracy, and the last one said "train set" while printing
# the validation precision.
accuracy, recall, f1, precision = _macro_scores(best_lvq, df_cara_valid_scaled, df_clas_valid)

print("Acurácia no conjunto de validação: ", accuracy)
print("Recall no conjunto de validação: ", recall)
print("F1 score no conjunto de validação: ", f1)
print("Precisão no conjunto de validação: ", precision)

# Fresh model built from the parameters the search selected, so this stays in
# sync with the search instead of relying on hand-copied values.
LVQ_best = GLVQ(**grid_search.best_params_)

LVQ_best.fit(df_cara_train_scaled, df_clas_train)

evaluation = {
    'accuracy': make_scorer(accuracy_score),
    'recall_macro': make_scorer(recall_score, average='macro'),
    'f1_macro': make_scorer(f1_score, average='macro'),
    'precision_macro': make_scorer(precision_score, average='macro', zero_division = 0)
}

# 5-fold cross-validation on the validation split. cross_validate fits clones
# of LVQ_best, so the fit above does not influence these scores.
cv_results = cross_validate(LVQ_best, df_cara_valid_scaled, df_clas_valid, cv=5, scoring=evaluation)

results_df = pd.DataFrame()
# Per-fold scores for each metric.
for metric in evaluation:
    print(f"{metric} per fold: ", cv_results[f'test_{metric}'])
    results_df[f'{metric}_per_fold'] = cv_results[f'test_{metric}']

# Mean of each metric across folds, as a percentage (same order as `evaluation`).
for column in results_df.columns:
    print(np.mean(results_df[column].values)*100)

In [31]:
# Display the hyper-parameter combination selected by the most recently run grid search.
grid_search.best_params_

{'distance_type': 'euclidean',
 'prototype_n_per_class': 3,
 'solver_params': {'max_runs': 5, 'step_size': 0.1}}

## <a id='toc8_3_'></a>[Decision Tree](#toc0_)

## <a id='toc8_4_'></a>[MLP](#toc0_)

In [12]:
# Fixed seed: weight initialisation and batch shuffling are random, so without
# it every run of this cell gives different scores.
mlp = MLPClassifier(random_state=42)
param_grid = {
    'hidden_layer_sizes': [(100,)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam'],  # 'sgd' is the alternative left out of the search
    'alpha': [0.0001],  # 0.01 is the alternative left out of the search
    # NOTE: scikit-learn documents `learning_rate` as used only by the 'sgd'
    # solver, so it presumably has no effect with 'adam' — confirm before tuning it.
    'learning_rate': ['adaptive'],
    'max_iter': [700]  # 300 and 500 were the smaller options; try more iterations
}

# verbose=3 logs the score and fit time of every fold (fits take minutes each).
grid_search = GridSearchCV(mlp, param_grid, cv=5, scoring='accuracy', verbose=3)
grid_search.fit(df_cara_train, df_clas_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits




[CV 1/5] END activation=relu, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, max_iter=700, solver=adam;, score=0.300 total time=18.2min




[CV 2/5] END activation=relu, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, max_iter=700, solver=adam;, score=0.297 total time= 8.3min




[CV 3/5] END activation=relu, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, max_iter=700, solver=adam;, score=0.303 total time= 9.8min




[CV 4/5] END activation=relu, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, max_iter=700, solver=adam;, score=0.304 total time= 9.4min




[CV 5/5] END activation=relu, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, max_iter=700, solver=adam;, score=0.298 total time= 8.9min




[CV 1/5] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, max_iter=700, solver=adam;, score=0.300 total time= 3.6min




[CV 2/5] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, max_iter=700, solver=adam;, score=0.295 total time= 3.9min




[CV 3/5] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, max_iter=700, solver=adam;, score=0.304 total time= 3.9min




[CV 4/5] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, max_iter=700, solver=adam;, score=0.301 total time= 3.1min




[CV 5/5] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, max_iter=700, solver=adam;, score=0.299 total time= 2.9min




In [13]:
# Persist the fitted search object, then report the mean CV score of every
# candidate followed by the winning parameters and their score.
with open('MLP_model_searcher.pkl', 'wb') as model_file:
    pickle.dump(grid_search, model_file)

mean_scores = grid_search.cv_results_['mean_test_score']
print(mean_scores)
print(f'Melhor parametro: {grid_search.best_params_}')
print(f'Melhor resultado: {grid_search.best_score_}')

[0.30055992 0.29971197]
Melhor parametro: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'learning_rate': 'adaptive', 'max_iter': 700, 'solver': 'adam'}
Melhor resultado: 0.30055991939665916


In [14]:
# Predict the test-set labels with the refitted best estimator.
df_clas_pred = grid_search.best_estimator_.predict(df_cara_test)

# Accuracy plus the weighted-average precision, recall and F1.
evaluation = {'accuracy': accuracy_score(df_clas_test, df_clas_pred)}
for metric_name, metric_fn in (('precision', precision_score),
                               ('recall', recall_score),
                               ('f1', f1_score)):
    evaluation[metric_name] = metric_fn(df_clas_test, df_clas_pred, average='weighted')

print('Dados de Teste')
print(evaluation)

Dados de Teste
{'accuracy': 0.3073245614035088, 'precision': 0.29637937670600994, 'recall': 0.3073245614035088, 'f1': 0.2909215595871962}


## <a id='toc8_5_'></a>[SVM](#toc0_)

In [32]:
# Unfitted base estimator for the grid search below. GridSearchCV clones it and
# fits the clones itself, so fitting here would only add a long training run
# whose result is discarded.
class_svm = SVC()
### CAUTION WHEN RUNNING THE CELLS BELOW (presumably because they take a long time — confirm)

In [None]:
# Candidate values for each SVM hyper-parameter (a single value each here).
lista_kernels = ['rbf']
lista_c = [100]
lista_gamma = [2]

# Grid handed to the search: hyper-parameter name -> values to try.
param_grid = dict(kernel=lista_kernels, C=lista_c, gamma=lista_gamma)

In [None]:
# 5-fold grid search scored by accuracy; fit() returns the fitted search object.
grid = GridSearchCV(class_svm, param_grid, cv=5, scoring='accuracy').fit(df_cara_train, df_clas_train)

# Persist the fitted search object for later inspection.
with open('SVM_model_searcher.pkl', 'wb') as model_file:
    pickle.dump(grid, model_file)

print(grid.cv_results_['mean_test_score'])
print(f'Melhor parametro: {grid.best_params_}')
print(f'Melhor resultado: {grid.best_score_}')
# Record of an earlier, wider search:
#   lista_kernels = ['linear', 'rbf']
#   lista_c = [2, 3, 4, 5, 7, 10, 100]
#   lista_gamma = [2, 3, 4, 5, 7, 10, 100]
#   Best parameters: {'C': 100, 'gamma': 2, 'kernel': 'rbf'}
#   Best score: 0.26281478175137607

In [None]:
# Predict the test-set labels with the refitted best estimator.
df_clas_pred = grid.best_estimator_.predict(df_cara_test)

# Accuracy plus the weighted-average precision, recall and F1.
evaluation = {'accuracy': accuracy_score(df_clas_test, df_clas_pred)}
for metric_name, metric_fn in (('precision', precision_score),
                               ('recall', recall_score),
                               ('f1', f1_score)):
    evaluation[metric_name] = metric_fn(df_clas_test, df_clas_pred, average='weighted')

print('Dados de Teste')
print(evaluation)

Dados de Teste
{'accuracy': 0.26570175438596494, 'precision': 0.2596195645386902, 'recall': 0.26570175438596494, 'f1': 0.2578082413357941}


## <a id='toc8_6_'></a>[Stacking](#toc0_)

## <a id='toc8_7_'></a>[Random Forest](#toc0_)