# Base de dados de músicas do Spotify

In [1]:
import numpy as np
import pandas as pd

from sklearn import metrics, svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

### Removendo campos desnecessários

In [2]:
def remove_features(lista_features):
    for i in lista_features:
        dataset.drop(i, axis=1, inplace=True)
    return 0

In [3]:
# Informações das Features da Base de dados.
# https://developer.spotify.com/web-api/get-audio-features/
# carrega o dataset - Classe target indica quem gostou ou nao da musica 0- Não 1-Sim

dataset = pd.read_csv('datasets/spotify.csv')
remove_features(['id','song_title', 'artist'])

#dataset.head(20)

0

### Total de registros

In [4]:
# total de linhas e colunas
dataset.shape

(2017, 14)

# 1. KNN

### Processo de treino-teste

In [5]:
X = dataset.drop(columns=['target'])
y = dataset['target'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2, stratify=y)

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

knn.predict(X_test)

array([1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,

## Acurácia do modelo KNN

In [180]:
print("Acuracia: {0}".format(knn.score(X_test, y_test))) 

Acuracia: 0.5470297029702971


## Acurácia do modelo KNN com Cross Validation CV = 10

In [181]:
cv_scores_knn = cross_val_score(knn, X, y, cv=10)
print("Acuracias do KNN de cada CV: {0}".format(cv_scores_knn))
print("Acuracia media do KNN: {0}".format(np.mean(cv_scores_knn)))

Acuracias do KNN de cada CV: [0.58910891 0.59405941 0.53465347 0.56930693 0.57425743 0.46534653
 0.53465347 0.5920398  0.58706468 0.58208955]
Acuracia media do KNN: 0.5622580168464607


## Tuning de parametros com KNN

In [149]:
grid_params = {'n_neighbors' : [1,2,3,4,5,6,7,8,9,10], 'weights': ['uniform', 'distance'], 'metric' : ['euclidean', 'minkowski']}
grid = GridSearchCV(KNeighborsClassifier(), grid_params, cv=10, verbose=1, n_jobs=-1)
grid_result_knn = grid.fit(X_train, y_train)

print("Melhor parametro: {0}".format(grid_result_knn.best_params_))
print("Melhor score: {0}".format(grid_result_knn.best_score_))
print("Melhor estimativa: {0}".format(grid_result_knn.best_estimator_))

Fitting 10 folds for each of 40 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    8.9s


Melhor parametro: {'metric': 'euclidean', 'n_neighbors': 10, 'weights': 'uniform'}
Melhor score: 0.6001239925604464
Melhor estimativa: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=None, n_neighbors=10, p=2,
           weights='uniform')


[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:   15.2s finished


## KNN com melhor parametro / estimativa

In [182]:
knn_best_case = KNeighborsClassifier(n_neighbors=10, weights='uniform', metric='euclidean')
knn_best_case.fit(X_train, y_train)

knn_best_case.predict(X_test)
print("Acuracia: {0}".format(knn_best_case.score(X_test, y_test))) 

Acuracia: 0.5544554455445545


# 2. Naive Bayes

In [192]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2, stratify=y)

model_nb = GaussianNB()
model_nb.fit(X_train, y_train)
predicted_nb = model_nb.predict(X_test)

## Acurácia do modelo Naive Bayes

In [193]:
print("Acuracia: {0}".format(metrics.accuracy_score(y_test, predicted_nb)))

Acuracia: 0.5792079207920792


## Acurácia do modelo Naive Bayes com Cross Validation CV = 10

In [189]:
cv_scores_nb = cross_val_score(model_nb, X, y, cv=10)
print("Acuracias do NB de cada CV: {0}".format(cv_scores_nb))
print("Acuracia media do NB: {0}".format(np.mean(cv_scores_nb)))

Acuracias do NB de cada CV: [0.68811881 0.63861386 0.56930693 0.57920792 0.63861386 0.31188119
 0.5990099  0.51741294 0.62686567 0.58208955]
Acuracia media do NB: 0.5751120634451505


# 3. Decision Trees

In [25]:
model_dt = DecisionTreeClassifier()
model_dt = model_dt.fit(X_train,y_train)
predicted_dt = model_dt.predict(X_test)

## Acurácia do modelo Decision Trees

In [26]:
print("Acuracia: {0}".format(metrics.accuracy_score(y_test, predicted_dt)))

Acuracia: 0.6856435643564357


## Acurácia do modelo Decision Trees com Cross Validation CV = 10

In [27]:
cv_scores_dt = cross_val_score(model_dt, X, y, cv=10)
print("Acuracias do DT de cada CV: {0}".format(cv_scores_dt))
print("Acuracia media do DT: {0}".format(np.mean(cv_scores_dt)))

Acuracias do DT de cada CV: [0.77227723 0.68316832 0.72772277 0.66831683 0.62871287 0.7029703
 0.64356436 0.58706468 0.68159204 0.65174129]
Acuracia media do DT: 0.6747130683217575


## Tuning de parametros com Decision Trees

In [80]:
grid_params = { 'max_depth' : [1,2,3,4,5,6,7,8,9,10], 'criterion':['gini','entropy']}
grid = GridSearchCV(DecisionTreeClassifier(), grid_params, cv=10, verbose=1, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

print("Melhor parametro: {0}".format(grid_result.best_params_))
print("Melhor score: {0}".format(grid_result.best_score_))
print("Melhor estimativa: {0}".format(grid_result.best_estimator_))

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Melhor parametro: {'criterion': 'gini', 'max_depth': 5}
Melhor score: 0.7327960322380657
Melhor estimativa: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    1.3s finished


## Decision Trees com melhor parametro / estimativa

In [133]:
dt_best_case = DecisionTreeClassifier(max_depth=5, criterion='gini')
dt_best_case = dt_best_case.fit(X_train,y_train)
predicted_best_case = dt_best_case.predict(X_test)
print("Acuracia: {0}".format(metrics.accuracy_score(y_test, predicted_best_case)))

Acuracia: 0.698019801980198


# 4. Regression Linear

In [201]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

model_rl = LogisticRegression()
model_rl.fit(X_train, y_train)
y_pred = model_rl.predict(X_test)
print('Acurácia:', accuracy_score(y_test, y_pred))

Acurácia: 0.5693069306930693




## Acurácia do modelo Decision Trees com Cross Validation CV = 10

In [208]:
cv_scores_rl = cross_val_score(model_rl, X, y, cv=10)
print("Acuracias do RL de cada CV: {0}".format(cv_scores_rl))
print("Acuracia media do RL: {0}".format(np.mean(cv_scores_rl)))



Acuracias do RL de cada CV: [0.72277228 0.55445545 0.66336634 0.65841584 0.57920792 0.41089109
 0.71782178 0.5721393  0.67164179 0.55223881]
Acuracia media do RL: 0.6102950593566819


## Tuning de parametros com Regression Linear

In [204]:
grid_params = {'C':[1,2,3,4,5,6,7,8,9,10], "penalty":["l1","l2"]}
grid_result = GridSearchCV(model_rl, grid_params, cv=10)
grid_result.fit(X,y)

print("Melhor parametro: {0}".format(grid_result.best_params_))
print("Melhor score: {0}".format(grid_result.best_score_))
print("Melhor estimativa: {0}".format(grid_result.best_estimator_))











Melhor parametro: {'C': 2, 'penalty': 'l1'}
Melhor score: 0.615765989092712
Melhor estimativa: LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)




## Regression Linear com melhor parametro / estimativa

In [9]:
model_rl = LogisticRegression(C=2, penalty='l1')
model_rl.fit(X_train, y_train)
y_pred = model_rl.predict(X_test)
print('Acurácia:', accuracy_score(y_test, y_pred))

Acurácia: 0.6262376237623762




# 5. Support Vector Machine (SVM)

In [7]:
X = dataset.drop(columns=['target'])
y = dataset['target'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2, stratify=y)

model_svm = svm.SVC().fit(X_train,y_train)
predicted_svm = model_svm.predict(X_test)



## Acurácia do modelo SVM

In [11]:
print("Acuracia: {0}".format(metrics.accuracy_score(y_test, predicted_svm)))

Acuracia: 0.5099009900990099


## Acurácia do modelo SVM com Cross Validation CV = 10

In [12]:
cv_scores_svm = cross_val_score(model_svm, X, y, cv=10)
print("Acuracias do SVM de cada CV: {0}".format(cv_scores_svm))
print("Acuracia media do SVM: {0}".format(np.mean(cv_scores_svm)))



Acuracias do SVM de cada CV: [0.5049505  0.51485149 0.5049505  0.5049505  0.51485149 0.51485149
 0.50990099 0.51741294 0.51243781 0.52238806]
Acuracia media do SVM: 0.5121545736663219


# Tuning de parametros com SVM

In [None]:
#grid_params = {'C': [0.001, 0.01, 0.1, 1, 10], 'gamma' : [0.001, 0.01, 0.1, 1]}
grid_params = {'C':[1,2,3,4,5,6,7,8,9,10], 'gamma' : [1,2,3,4,5,6,7,8,9,10]}
grid_result = GridSearchCV(model_svm, grid_params, cv=10)
grid_result.fit(X,y)

print("Melhor parametro: {0}".format(grid_result.best_params_))
print("Melhor score: {0}".format(grid_result.best_score_))
print("Melhor estimativa: {0}".format(grid_result.best_estimator_))

# SVM com melhor parametro / estimativa

In [None]:
model_svm = svm.SVC().fit(X_train,y_train)
predicted_svm = model_svm.predict(X_test)