In [None]:
import numpy as np
import pandas as pd
import time

from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA


data = pd.read_csv('./datasets/relax+banger+classical+sleep+study_dataset_cleaned.csv')

#x_data = data.loc[:, data.columns != "isBanger"]
#x_data = data[['acousticness','danceability','duration_ms','energy','instrumentalness','key','liveness','loudness','mode','popularity','speechiness','tempo','time_signature','valence','explicit']]
x_data = data[['danceability', 'duration_ms', 'energy', 'key', 'loudness', 'mode', 'popularity', 'tempo', 'time_signature']]
y_data = data.loc[:, "isBanger"]

       acousticness  danceability  duration_ms  energy  instrumentalness  key  \
0            0.2750         0.714       170200   0.817          0.000000    1   
1            0.8770         0.838       135597   0.549          0.000964    9   
2            0.1230         0.732       160096   0.746          0.000000    3   
3            0.2750         0.630       203067   0.692          0.000000    7   
4            0.1460         0.592       209449   0.572          0.000172    5   
5            0.0519         0.828       253346   0.686          0.000002    2   
6            0.3870         0.730       172321   0.725          0.000000   10   
7            0.0664         0.675       216533   0.848          0.000001    1   
8            0.5190         0.732       178000   0.583          0.000020    7   
9            0.0498         0.758       140587   0.762          0.000004    0   
10           0.1620         0.764       182925   0.744          0.000000   11   
11           0.3910         

In [15]:
random_state = 10

X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=random_state)

In [16]:
print('########################## Linear Regression ##########################\n')
reg = LinearRegression().fit(X_train, y_train)

reg_train_accuracy = accuracy_score(y_train, reg.predict(X_train).round())
reg_test_accuracy = accuracy_score(y_test,reg.predict(X_test).round())
print(reg_train_accuracy,reg_test_accuracy)

########################## Linear Regression ##########################

0.9055948910176316 0.9002590673575129


In [None]:
print('\n\n########################## Random Forest ##########################\n')
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
clf.fit(X_train, y_train)
clf_train_accuracy = accuracy_score(y_train, clf.predict(X_train).round())
clf_test_accuracy = accuracy_score(y_test, clf.predict(X_test).round())
print(clf_train_accuracy, clf_test_accuracy)


print(pd.Series(clf.feature_importances_, index=x_data.columns).sort_values(ascending=False))

param_grid = {
    'max_depth': [1, 3, 10, 30, 100, 300],
    'n_estimators': [1, 3, 10, 30, 100, 300, 1000]
}
rf = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)



########################## Random Forest ##########################

0.9014299597389976 0.9034974093264249
explicit            0.346028
loudness            0.182599
instrumentalness    0.134246
energy              0.083212
speechiness         0.071448
acousticness        0.067080
danceability        0.065127
popularity          0.026011
valence             0.009938
time_signature      0.006634
tempo               0.006518
duration_ms         0.001158
mode                0.000000
liveness            0.000000
key                 0.000000
dtype: float64
Fitting 10 folds for each of 42 candidates, totalling 420 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done 420 out of 420 | elapsed:  1.3min finished


{'max_depth': 100, 'n_estimators': 100}
0.9307233097320561


In [20]:
print('\n\n########################## Support Vector Machine ##########################\n')

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
clf = SVC(gamma='auto')
clf.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)
clf_train_accuracy = accuracy_score(y_train, clf.predict(X_train_scaled).round())
clf_test_accuracy = accuracy_score(y_test, clf.predict(X_test_scaled).round())
print(clf_train_accuracy, clf_test_accuracy)

param_grid = {
    'kernel': ['linear', 'rbf'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100]
}
rf = SVC(gamma='auto')
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1, verbose=1)
grid_search.fit(X_train_scaled, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

print(grid_search.cv_results_)



########################## Support Vector Machine ##########################



  return self.partial_fit(X, y)
  
  app.launch_new_instance()


0.9362765514369012 0.9229274611398963
Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   26.5s finished


{'C': 1, 'kernel': 'rbf'}
0.9257253921976955
{'mean_fit_time': array([ 0.50843754,  2.69657161,  0.35503478,  1.08603098,  0.30984581,
        0.56118984,  0.5435147 ,  0.59133668,  2.46578703,  0.81687722,
       13.45910792,  1.47196095]), 'std_fit_time': array([0.03173589, 0.03788017, 0.05400447, 0.06094466, 0.01897483,
       0.01407405, 0.02395511, 0.04193463, 0.0912971 , 0.07986721,
       0.33243924, 0.07318635]), 'mean_score_time': array([0.02808297, 0.1612709 , 0.02137563, 0.08071251, 0.01843805,
       0.04484797, 0.01851132, 0.04263055, 0.02204781, 0.03724051,
       0.01312656, 0.03363314]), 'std_score_time': array([0.0039058 , 0.01901546, 0.00379577, 0.00175376, 0.00062121,
       0.00059989, 0.00073742, 0.00496656, 0.00237497, 0.00300215,
       0.00281202, 0.00080861]), 'param_C': masked_array(data=[0.001, 0.001, 0.01, 0.01, 0.1, 0.1, 1, 1, 10, 10, 100,
                   100],
             mask=[False, False, False, False, False, False, False, False,
                   

In [21]:
print('\n\n########################## Principal Component Analysis ##########################\n')

pca = PCA(n_components=10, svd_solver='full')
pca.fit(x_data)
print(pca.explained_variance_ratio_)
print(pca.singular_values_)



########################## Principal Component Analysis ##########################

[9.99999884e-01 9.36448309e-08 1.81788038e-08 3.29489794e-09
 1.22518811e-09 2.33297998e-11 2.07640534e-11 1.87453660e-11
 7.42930047e-12 4.72745665e-12]
[1.04812866e+07 3.20742514e+03 1.41317932e+03 6.01638482e+02
 3.66873219e+02 5.06255937e+01 4.77607009e+01 4.53796963e+01
 2.85685754e+01 2.27891644e+01]
