In [44]:
import pandas as pd
import numpy as np

In [23]:
df = pd.read_csv('data/final_spotify_data.csv')

In [24]:
print(df.head())

                       id                               name  \
0  2HZzRcA4c9XbGDpt7U1ZBf  I Wait for You - Ben Pearce Remix   
1  29YCC9Vad3O2jI36akkltE                        Communicate   
2  7wZnt8sHhTMO8PA4o5Ah3n                            Tempest   
3  6j4zFIzEe3NMzF6oC8XVZ0                         Jungle Dub   
4  1FXlhqcYaiw6KuUYtXdtAi                              Crash   

                                  uri_x  acousticness  danceability  \
0  spotify:track:2HZzRcA4c9XbGDpt7U1ZBf      0.000011         0.573   
1  spotify:track:29YCC9Vad3O2jI36akkltE      0.006640         0.587   
2  spotify:track:7wZnt8sHhTMO8PA4o5Ah3n      0.000960         0.798   
3  spotify:track:6j4zFIzEe3NMzF6oC8XVZ0      0.002080         0.798   
4  spotify:track:1FXlhqcYaiw6KuUYtXdtAi      0.000455         0.689   

   duration_ms  energy  instrumentalness  key  liveness  loudness  mode  \
0       404520   0.770             0.685   11    0.0975    -9.517     0   
1       285600   0.925             0.6

In [32]:
## FIRST NEED TO PUT GENRES INTO CLASSES (1,2,3,4)
g = ['workout', 'chill', 'party', 'focus']

df['labels'] = df['playlist']

d = dict(zip(g, range(0,4)))

df['labels'] = df['labels'].map(d, na_action='ignore')


In [41]:
from sklearn.model_selection import train_test_split

## SPLIT INTO TRAINING AND TEST

X = df.drop(['labels', 'playlist','id','name','uri_x'], axis = 1)
y = df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [63]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (classification_report,confusion_matrix, accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, roc_curve)

rfclf = RandomForestClassifier()
rfclf.fit(X, y)

y_pred = rfclf.predict(X_test)

print('\n Clasification Report:\n',classification_report(y_test,y_pred))
print('\n Confusion Matrix:\n',confusion_matrix(y_test,y_pred))
print("Accuracy:", (accuracy_score(y_test,y_pred)))



 Clasification Report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.95       180
           1       1.00      0.98      0.99       173
           2       0.96      0.95      0.95       182
           3       1.00      1.00      1.00       175

   micro avg       0.97      0.97      0.97       710
   macro avg       0.97      0.97      0.97       710
weighted avg       0.97      0.97      0.97       710


 Confusion Matrix:
 [[173   0   7   0]
 [  3 170   0   0]
 [ 10   0 172   0]
 [  0   0   0 175]]
Accuracy: 0.971830985915493




In [64]:
## OPTIMIZE PARAMETERS FOR RANDOM FOREST ###
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': np.arange(20,200,10), # Number of trees in random forest
              #'min_samples_leaf': np.arange(1,100,10), # Minimum number of samples required at each leaf node
              'max_features': ['auto', 'sqrt']} # Method of selecting samples for training each tree


rfclf = RandomForestClassifier()

cv = GridSearchCV(rfclf, param_grid, cv=5)
cv.fit(X_train, y_train)
print(cv.best_params_)

{'max_features': 'auto', 'n_estimators': 120}
0.7241139680333565


In [70]:
ne = cv.best_params_['n_estimators']
mf = cv.best_params_['max_features']

rfclf_tuned = RandomForestClassifier(n_estimators=ne, max_features=mf)

rfclf_tuned.fit(X, y)

y_pred = rfclf_tuned.predict(X_test)

print('\n Clasification Report:\n',classification_report(y_test,y_pred))
print('\n Confusion Matrix:\n',confusion_matrix(y_test,y_pred))
print("Accuracy:", (accuracy_score(y_test,y_pred)))


 Clasification Report:
               precision    recall  f1-score   support

           0       0.96      0.97      0.97       180
           1       0.99      1.00      1.00       173
           2       0.98      0.96      0.97       182
           3       1.00      1.00      1.00       175

   micro avg       0.98      0.98      0.98       710
   macro avg       0.98      0.98      0.98       710
weighted avg       0.98      0.98      0.98       710


 Confusion Matrix:
 [[175   1   4   0]
 [  0 173   0   0]
 [  7   0 175   0]
 [  0   0   0 175]]
Accuracy: 0.9830985915492958


In [76]:
feature_importances = pd.DataFrame(rfclf_tuned.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance',ascending=False)
print(feature_importances)

                  importance
instrumentalness    0.183875
acousticness        0.123998
loudness            0.120965
energy              0.119094
valence             0.093822
danceability        0.080587
speechiness         0.077449
duration_ms         0.065999
tempo               0.050838
liveness            0.044210
key                 0.027048
mode                0.008709
time_signature      0.003406


In [78]:
## REMOVE LOW IMPORTANCE VARIABLES TO SEE IF MAKES MODEL BETTER ###
X = df.drop(['labels', 'playlist','id','name','uri_x','mode'], axis = 1)
y = df['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
rfclf = RandomForestClassifier()
rfclf.fit(X, y)

y_pred = rfclf.predict(X_test)

print('\n Clasification Report:\n',classification_report(y_test,y_pred))
print('\n Confusion Matrix:\n',confusion_matrix(y_test,y_pred))
print("Accuracy:", (accuracy_score(y_test,y_pred)))


 Clasification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.95       180
           1       0.99      0.99      0.99       173
           2       0.97      0.95      0.96       182
           3       1.00      0.99      0.99       175

   micro avg       0.97      0.97      0.97       710
   macro avg       0.97      0.97      0.97       710
weighted avg       0.97      0.97      0.97       710


 Confusion Matrix:
 [[174   1   5   0]
 [  0 172   1   0]
 [ 10   0 172   0]
 [  1   1   0 173]]
Accuracy: 0.9732394366197183




In [79]:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': np.arange(20,200,10), # Number of trees in random forest
              #'min_samples_leaf': np.arange(1,100,10), # Minimum number of samples required at each leaf node
              'max_features': ['auto', 'sqrt']} # Method of selecting samples for training each tree


rfclf = RandomForestClassifier()

cv = GridSearchCV(rfclf, param_grid, cv=5)
cv.fit(X_train, y_train)
print(cv.best_params_)

{'max_features': 'auto', 'n_estimators': 90}


In [80]:
ne = cv.best_params_['n_estimators']
mf = cv.best_params_['max_features']

rfclf_tuned = RandomForestClassifier(n_estimators=ne, max_features=mf)

rfclf_tuned.fit(X, y)

y_pred = rfclf_tuned.predict(X_test)

print('\n Clasification Report:\n',classification_report(y_test,y_pred))
print('\n Confusion Matrix:\n',confusion_matrix(y_test,y_pred))
print("Accuracy:", (accuracy_score(y_test,y_pred)))


 Clasification Report:
               precision    recall  f1-score   support

           0       0.97      0.96      0.96       180
           1       0.99      1.00      1.00       173
           2       0.96      0.97      0.96       182
           3       1.00      1.00      1.00       175

   micro avg       0.98      0.98      0.98       710
   macro avg       0.98      0.98      0.98       710
weighted avg       0.98      0.98      0.98       710


 Confusion Matrix:
 [[172   1   7   0]
 [  0 173   0   0]
 [  6   0 176   0]
 [  0   0   0 175]]
Accuracy: 0.9802816901408451


Removing mode makes inital model more accurate, but the tuned model accuracy is lower. 

In [None]:
## TRY REMOVING OUTLIERS THEN REMODELING ##

In [None]:
## KNN ? USE GRID SEARCH ###