In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import cross_val_score

In [13]:
# Load the genre dataset and keep only the rows whose genre is under study.
df = pd.read_csv('data/genre_data_2.csv')
genre_list = ['acoustic','rock', 'classical', 'techno', 'metal', 'jazz']
df = df[df['genre'].isin(genre_list)]

# Based on ANOVA scores, keeping only 5 features
X = df[['energy','loudness', 'acousticness', 'instrumentalness', 'danceability']]

# Alternative feature set (disabled):
#X = df[['danceability', 'speechiness', 'liveness', 'tempo','valence']]

# Keep the fitted encoder instead of discarding it, so integer predictions can
# be mapped back to genre names with label_encoder.inverse_transform(...) or
# label_encoder.classes_.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['genre'])

# Train-Test split (80/20, fixed seed for a repeatable partition).
# NOTE(review): the split is not stratified; consider stratify=y if the genre
# counts turn out to be imbalanced -- confirm against the data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=43)

# Pipeline

In [15]:
def pipeline(model):
    """Wrap ``model`` in a two-step scikit-learn Pipeline.

    The first step ('Scaling') is a StandardScaler; the second step
    ('Training') is the estimator passed in.

    Parameters
    ----------
    model : estimator
        The estimator placed as the final pipeline step.

    Returns
    -------
    Pipeline
        An unfitted pipeline with steps 'Scaling' and 'Training'.
    """
    steps = [
        ('Scaling', StandardScaler()),
        ('Training', model),
    ]
    return Pipeline(steps)

In [16]:
# Candidate classifiers, keyed by the display name used when reporting scores.
# The tree-based models draw random numbers while fitting, so they are seeded
# (same seed as the train/test split) to make the reported scores repeatable
# across kernel restarts.
models = {
    'Logistic Regression' : LogisticRegression(),
    #'SVC' : SVC(),
    'Decision Tree' : DecisionTreeClassifier(random_state=43),
    'Random Forest' : RandomForestClassifier(random_state=43),
    'K Nearest Neighbors' : KNeighborsClassifier(),
    'Naive Bayes' : GaussianNB()
}

In [17]:
# For each candidate model: fit a scaled pipeline on the training split,
# report 5-fold cross-validation scores on the training data, then report the
# score on the held-out test split.
for model_name, model in models.items():
    fitted_pipeline = pipeline(model)
    fitted_pipeline.fit(X_train, y_train)

    # cross_val_score is given the pipeline and the training data only.
    cv_scores = cross_val_score(fitted_pipeline, X_train, y_train, cv=5)
    cv_report = 'cross_val_score for {} : {}'.format(model_name, cv_scores)
    print(cv_report)

    # Score of the pipeline fitted above, measured on the test split.
    test_score = fitted_pipeline.score(X_test, y_test)
    test_report = 'test_set score for {} : {}'.format(model_name, test_score)
    print(test_report)

cross_val_score for Logistic Regression : [0.5812567  0.58139643 0.58451721 0.58274722 0.58079091]
test_set score for Logistic Regression : 0.5851250139732459
cross_val_score for Decision Tree : [0.631096   0.62858074 0.63244678 0.63104942 0.63612651]
test_set score for Decision Tree : 0.6380743004061556
cross_val_score for Random Forest : [0.69285947 0.69276631 0.69262658 0.69155527 0.69886814]
test_set score for Random Forest : 0.7011215858702537
cross_val_score for K Nearest Neighbors : [0.62396944 0.6181471  0.62066235 0.61940472 0.6231776 ]
test_set score for K Nearest Neighbors : 0.6407944256064388
cross_val_score for Naive Bayes : [0.55102706 0.55079417 0.54976943 0.55009549 0.54967628]
test_set score for Naive Bayes : 0.5528188694712524
