# Base de dados de músicas do Spotify

In [191]:
import numpy as np
import pandas as pd

from sklearn import metrics, svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [163]:
# Informações das Features da Base de dados.
# https://developer.spotify.com/web-api/get-audio-features/
# carrega o dataset - Classe target indica quem gostou ou nao da musica 0- Não 1-Sim

dataset = pd.read_csv('datasets/spotify.csv')
remove_features(['id','song_title', 'artist'])

#dataset.head(20)

0

# Total de registros

In [164]:
# total de linhas e colunas
dataset.shape

(2017, 14)

# Removendo campos desnecessários

In [165]:
def remove_features(lista_features):
    for i in lista_features:
        dataset.drop(i, axis=1, inplace=True)
    return 0

# 1. KNN

# Processo de treino-teste

In [166]:
X = dataset.drop(columns=['target'])
y = dataset['target'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2)

knn = KNeighborsClassifier(n_neighbors=12)
knn.fit(X_train, y_train)

knn.predict(X_test)

array([0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,

# Acurácia do modelo KNN

In [167]:
print("Acuracia: {0}".format(knn.score(X_test, y_test))) 

Acuracia: 0.5495049504950495


# Acurácia do modelo KNN com Cross Validation CV = 10

In [168]:
cv_scores_knn = cross_val_score(knn, X, y, cv=10)
print("Acuracias do KNN de cada CV: {0}".format(cv_scores_knn))
print("Acuracia media do KNN: {0}".format(np.mean(cv_scores_knn)))

Acuracias do KNN de cada CV: [0.63861386 0.62871287 0.53465347 0.57920792 0.61881188 0.45049505
 0.54455446 0.5721393  0.60696517 0.58208955]
Acuracia media do KNN: 0.5756243534801241


# 2. Naive Bayes

In [169]:
model_nb = GaussianNB()
model_nb.fit(X_train, y_train)
predicted_nb = model_nb.predict(X_test)

# Acurácia do modelo Naive Bayes

In [170]:
print("Acuracia: {0}".format(metrics.accuracy_score(y_test, predicted_nb)))

Acuracia: 0.6262376237623762


# Acurácia do modelo Naive Bayes com Cross Validation CV = 10

In [171]:
cv_scores_nb = cross_val_score(model_nb, X, y, cv=10)
print("Acuracias do NB de cada CV: {0}".format(cv_scores_nb))
print("Acuracia media do NB: {0}".format(np.mean(cv_scores_nb)))

Acuracias do NB de cada CV: [0.68811881 0.63861386 0.56930693 0.57920792 0.63861386 0.31188119
 0.5990099  0.51741294 0.62686567 0.58208955]
Acuracia media do NB: 0.5751120634451505


# 3. Decision Trees

In [172]:
model_dt = DecisionTreeClassifier()
model_dt = model_dt.fit(X_train,y_train)
predicted_dt = model_dt.predict(X_test)

# Acurácia do modelo Decision Trees

In [145]:
print("Acuracia: {0}".format(metrics.accuracy_score(y_test, predicted_dt)))

Acuracia: 0.7227722772277227


# Acurácia do modelo Decision Trees com Cross Validation CV = 10

In [148]:
cv_scores_dt = cross_val_score(model_dt, X, y, cv=10)
print("Acuracias do DT de cada CV: {0}".format(cv_scores_dt))
print("Acuracia media do DT: {0}".format(np.mean(cv_scores_dt)))

Acuracias do DT de cada CV: [0.78217822 0.65841584 0.72772277 0.63861386 0.61881188 0.72277228
 0.63366337 0.59701493 0.68159204 0.61691542]
Acuracia media do DT: 0.6677700605881484


# 4. Regression Linear

In [185]:
X = dataset.drop(columns=['target'])
y = dataset['target'].values

model_rl = LogisticRegression()

grid_params = {'C':[1,2,3,4,5,6,7,8,9,10]}
grid = GridSearchCV(model_rl, grid_params, cv=10, scoring="accuracy")
grid.fit(X,y)

GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [188]:
print("Melhor parametro: {0}".format(grid.best_params_))

Melhor parametro: {'C': 1}


In [189]:
print("Melhor estimativa: {0}".format(grid.best_estimator_))

Melhor estimativa: LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [186]:
print("Melhor Score: {0}".format(grid.best_score_))

Melhor Score: 0.5349529003470501


# 5. Support Vector Machine (SVM)

In [194]:
X = dataset.drop(columns=['target'])
y = dataset['target'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2)

model_svm = svm.SVC().fit(X_train,y_train)
predicted_svm = model_svm.predict(X_test)

Acuracia: 0.49257425742574257


# Acurácia do modelo SVM

In [196]:
print("Acuracia: {0}".format(metrics.accuracy_score(y_test, predicted_svm)))

Acuracia: 0.49257425742574257


# Acurácia do modelo SVM com Cross Validation CV = 10

In [197]:
cv_scores_svm = cross_val_score(model_svm, X, y, cv=10)
print("Acuracias do SVM de cada CV: {0}".format(cv_scores_svm))
print("Acuracia media do SVM: {0}".format(np.mean(cv_scores_svm)))

Acuracias do SVM de cada CV: [0.5049505  0.51485149 0.5049505  0.5049505  0.51485149 0.51485149
 0.50990099 0.51741294 0.51243781 0.52238806]
Acuracia media do SVM: 0.5121545736663219
