In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

In [2]:
import pandas as pd

In [3]:
songs = pd.read_csv('..//data/my_songs_with_labels.csv', index_col = [0])

## Model development

In [63]:
songs_features = ['danceability', 'energy',
                  'mode', 'acousticness', 'instrumentalness',
                  'liveness', 'valence', 
                  'loudness_norm', 'tempo_norm', 'key_norm', 
                  'time_signature_norm']

In [64]:
models = []

models.append(("LogisticRegression",LogisticRegression(solver = 'lbfgs',multi_class='auto', max_iter = 42000, random_state = 42)))
models.append(("LinearSVC",LinearSVC(max_iter = 42000, random_state = 42)))
models.append(("DecisionTree",DecisionTreeClassifier()))
models.append(("RandomForest",RandomForestClassifier(n_estimators = 100)))
models.append(("MLPClassifier",MLPClassifier(solver='lbfgs', max_iter = 9000, random_state=42)))

In [65]:
X = songs[songs_features]
y = songs.iloc[:,-1]    #labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)

In [66]:
results = []
names = []

for name, model in models:
    result = cross_val_score(model, X_train, y_train,  cv = 5)
    names.append(name)
    results.append(result)

for i in range(len(names)):
    print(names[i],results[i].mean())

LogisticRegression 0.9652173913043478
LinearSVC 0.9855072463768115
DecisionTree 0.9391304347826088
RandomForest 0.9594202898550724
MLPClassifier 0.9855072463768115


In [67]:
model = LogisticRegression(solver = 'lbfgs',multi_class='auto', max_iter = 10000, random_state = 99)
model.fit(X_train, y_train)

LogisticRegression(max_iter=10000, random_state=99)

In [68]:
y_pred = model.predict(X_test)

In [69]:
confusion_matrix(y_test, y_pred)

array([[33,  0,  0,  0,  2],
       [ 0, 22,  0,  1,  0],
       [ 0,  0, 22,  0,  1],
       [ 0,  0,  0, 28,  0],
       [ 0,  0,  0,  0, 40]])

In [70]:
model2 = MLPClassifier(solver='lbfgs', max_iter = 9000, random_state=42)
model2.fit(X_train, y_train)

MLPClassifier(max_iter=9000, random_state=42, solver='lbfgs')

In [71]:
y_pred2 = model2.predict(X_test)
confusion_matrix(y_test, y_pred2)

array([[35,  0,  0,  0,  0],
       [ 0, 21,  0,  2,  0],
       [ 0,  0, 23,  0,  0],
       [ 0,  0,  0, 28,  0],
       [ 1,  0,  0,  0, 39]])

Test on realout of sample data

In [26]:
import config

In [27]:
client_id = config.client_id
client_secret = config.client_secret

client_credentials_manager = SpotifyClientCredentials(client_id, client_secret)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

In [28]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.util as util

import sys
sys.path.insert(0, '/Users/alanlau/Documents/Projects/music_analyzer/data')

import scraper

In [31]:
df_test = scraper.datascrape('spotify', '37i9dQZF1DX3lmnsqvYpgD', sp)

In [32]:
df_test

Unnamed: 0,title,first_artist,all_artists,id,danceability,energy,key,loudness,mode,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,(一個男人) 一個女人 和浴室,Louis Koo,"[Louis Koo, Kay Tse]",5NvsRbIoMFMqBAVFHn1f3X,0.53,0.341,2,-11.377,1,0.836,0.0,0.139,0.467,111.81,300000,4
1,彌敦道,Ken Hung,[Ken Hung],2HRMHz6NgX2p2BehbjAxN4,0.625,0.466,2,-7.242,1,0.805,0.0,0.117,0.416,112.916,245523,4
2,"但願人長久 - 劇集 ""跳躍生命線"" 插曲",HANA,[HANA],2cZ78DPU09gFPZZdAjykGP,0.397,0.136,11,-12.295,0,0.854,0.0,0.107,0.232,183.357,240547,4
3,老少平安,Pakho Chau,[Pakho Chau],0BWHYaG1cdHp1uVDH0BiDp,0.558,0.384,1,-7.285,1,0.844,0.0,0.0861,0.567,144.109,208923,4
4,困獸‧28,Juno Mak,[Juno Mak],5xe1gdm6N93e3FUiXZ56dZ,0.506,0.443,9,-8.238,1,0.623,0.0,0.0837,0.366,151.762,267632,4
5,海枯石爛,Kary Ng,[Kary Ng],3sPjiPdvxAPqIqqMjwCThw,0.419,0.487,9,-6.311,1,0.645,0.0,0.104,0.19,141.936,257000,4
6,"Tracey (電影 ""翠絲"" 主題曲)",Panther Chan,[Panther Chan],4lmOXmwygGpCEwXqXryY6B,0.471,0.203,6,-8.19,1,0.807,0.0,0.111,0.412,144.077,268000,4
7,掉進海的眼淚,Kaho Hung,[Kaho Hung],14E27fUMTIrhQtOxMWP0py,0.52,0.569,8,-6.196,1,0.65,0.0,0.193,0.355,123.955,239248,4
8,永生花,曾樂彤,[曾樂彤],0GnO5L1HgmcQlFaRbmdfVX,0.691,0.268,8,-13.758,1,0.719,0.0,0.114,0.539,118.988,228692,4
9,狂迷,鄧思朗,[鄧思朗],6XO1iXHKTLkk231Q3RfYkT,0.669,0.68,5,-7.818,0,0.25,0.0,0.107,0.576,112.013,246828,4


In [73]:
music_features = ['danceability', 'energy', 'key', 'loudness', 'mode', 
                  'acousticness', 'instrumentalness', 'liveness', 
                  'valence', 'duration_ms', 'time_signature']

In [74]:
from sklearn import preprocessing 

In [75]:
# Feature: loudness
loudness = df_test[['loudness']]
min_max_scalar = preprocessing.MinMaxScaler()

loudness = min_max_scalar.fit_transform(loudness)
df_test['loudness_norm'] = loudness

# Feature: tempo
tempo = df_test[['tempo']]
tempo = min_max_scalar.fit_transform(tempo)
df_test['tempo_norm'] = tempo

# # Feature: duration_ms
# duration_ms = my_songs[['duration_ms']]
# duration_ms = min_max_scalar.fit_transform(duration_ms)
# my_songs['duration_norm'] = duration_ms

# Feature: key
key = df_test[['key']]
key = min_max_scalar.fit_transform(key)
df_test['key_norm'] = key

# Feature: time_signature
time_signature = df_test[['time_signature']]
time_signature = min_max_scalar.fit_transform(time_signature)
df_test['time_signature_norm'] = time_signature

In [76]:
music_features = ['danceability', 'energy', 'key', 'loudness', 'mode', 
                  'acousticness', 'instrumentalness', 'liveness', 
                  'valence', 'tempo', 'duration_ms', 'time_signature']

songs_features = df_test[new_music_features]

In [77]:
X_train.head(1)

Unnamed: 0,danceability,energy,mode,acousticness,instrumentalness,liveness,valence,loudness_norm,tempo_norm,key_norm,time_signature_norm
129,0.776,0.377,1,0.567,0.00233,0.108,0.772,0.463343,0.08339,0.090909,1.0


In [78]:
X_train.shape

(345, 11)

In [79]:
songs_features.head(1)

Unnamed: 0,danceability,energy,key_norm,mode,acousticness,instrumentalness,liveness,valence,tempo_norm,time_signature_norm,loudness_norm
0,0.53,0.341,0.181818,1,0.836,0.0,0.139,0.467,0.345702,1.0,0.468112


In [80]:
songs_features.shape

(50, 11)

In [86]:
model2.predict(songs_features)

y_pred2 = model2.predict(songs_features)
#confusion_matrix(y_test, y_pred2)

In [91]:
df_test['Predicted_label'] = y_pred2

In [94]:
df_test['Predicted_label'].value_counts()

2    42
4     4
3     4
Name: Predicted_label, dtype: int64

In [None]:
https://open.spotify.com/playlist/37i9dQZF1DWZd79rJ6a7lp?si=EcNa9oIIRoi6gu_02X7yBg

In [162]:
df_test_2 = scraper.datascrape('spotify', '37i9dQZF1DWZd79rJ6a7lp', sp)

In [163]:
# Feature: loudness
loudness = df_test_2[['loudness']]
min_max_scalar = preprocessing.MinMaxScaler()

loudness = min_max_scalar.fit_transform(loudness)
df_test_2['loudness_norm'] = loudness

# Feature: tempo
tempo = df_test_2[['tempo']]
tempo = min_max_scalar.fit_transform(tempo)
df_test_2['tempo_norm'] = tempo

# # Feature: duration_ms
# duration_ms = my_songs[['duration_ms']]
# duration_ms = min_max_scalar.fit_transform(duration_ms)
# my_songs['duration_norm'] = duration_ms

# Feature: key
key = df_test_2[['key']]
key = min_max_scalar.fit_transform(key)
df_test_2['key_norm'] = key

# Feature: time_signature
time_signature = df_test_2[['time_signature']]
time_signature = min_max_scalar.fit_transform(time_signature)
df_test_2['time_signature_norm'] = time_signature

In [164]:
music_features = ['danceability', 'energy', 'key', 'loudness', 'mode', 
                  'acousticness', 'instrumentalness', 'liveness', 
                  'valence', 'tempo', 'duration_ms', 'time_signature']

songs_features = df_test_2[new_music_features]

In [165]:
model2.predict(songs_features)

y_pred2 = model2.predict(songs_features)
#confusion_matrix(y_test, y_pred2)
df_test_2['Predicted_label'] = y_pred2

In [166]:
df_test_2.Predicted_label.value_counts()

2    82
4    11
3     6
1     1
Name: Predicted_label, dtype: int64