# Classifiers

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the raw Spotify track table directly from the project repository.
spotify = pd.read_csv(
    "https://raw.githubusercontent.com/rmratliffbrown/ml-genre-assignment/main/00_raw_data/spotify_songs.csv"
)
spotify.head()  # not the last expression in the cell, so nothing is displayed

# Audio-descriptor columns used as model inputs.
music_covariates = [
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "duration_ms",
]

# Text metadata columns; not referenced by the cells shown in this section.
other_covariates = ["track_album_name", "track_name", "track_artist"]

# Feature matrix (two independent selections of the same columns) and the
# original playlist-genre column.
music = spotify[music_covariates]
music_X = spotify[music_covariates]
music_Y = spotify["playlist_genre"]

In [2]:
# Load the cluster-derived labels written by the clustering step; these are
# read from a path relative to the notebook's working directory.
new_Y = pd.read_csv("../20_classification/labels_kmeans.csv")

In [3]:
# Split train and test
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. The target is column '0' of the
# cluster-label table; the fixed seed makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    music_X,
    new_Y['0'],
    test_size=0.20,
    random_state=24,
)

In [4]:
# Carve a validation set out of the training split: 20% of the remaining 80%,
# i.e. roughly 64% train / 16% validation / 20% test overall.
# NOTE: X_train / y_train are rebound here, so re-running this cell without
# re-running the train/test split first shrinks the training set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.20,
    random_state=42,
)

### Model #1: knn (simple)

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np

# Standardise the features using statistics learned from the training split
# ONLY, then apply that same transformation to the validation and test splits.
# Fitting a separate scaler on each split (the previous behaviour) centres and
# scales every split with its own mean/variance, so the three matrices are not
# on a common scale and held-out data influences its own preprocessing.
scaler = StandardScaler().fit(X_train)

X_norm_train = scaler.transform(X_train)
X_norm_test = scaler.transform(X_test)
X_norm_val = scaler.transform(X_val)

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with default hyperparameters, trained on the
# unscaled training features using all available cores.
clf = RandomForestClassifier(random_state=47, n_jobs=-1)
clf.fit(X_train, y_train)

# Mean accuracy on the validation split (displayed as the cell output).
clf.score(X_val, y_val)

0.9453749524172059

In [7]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.fixes import loguniform
from sklearn.neural_network import MLPClassifier
import warnings
from sklearn.exceptions import ConvergenceWarning

In [8]:
# Candidate hyperparameter values for the random-forest randomized search.
# Every entry is a plain list, so the search samples from a discrete grid.
params = {
    # Whether each tree is trained on a bootstrap sample of the rows.
    'bootstrap': [True, False],
    # Maximum tree depth; None lets trees grow until leaves are pure/minimal.
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    # Number of features considered at each split. 'auto' used to be listed
    # here, but for classifiers it was only an alias of 'sqrt' and it is
    # rejected by scikit-learn >= 1.3, so it is replaced by None (all
    # features) to keep two distinct, valid options.
    'max_features': ['sqrt', None],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
}

In [None]:
# Install a blanket "ignore" filter. This is a process-wide setting and stays
# active for every cell executed after this one.
warnings.filterwarnings('ignore')

# Randomized search seeded from the baseline forest: 200 sampled
# configurations drawn from `params`, run in parallel on all cores, using the
# search's default cross-validation and scoring.
rand_search = RandomizedSearchCV(
    clf,
    param_distributions=params,
    n_iter=200,
    n_jobs=-1,
    random_state=37,
)

# The extra ConvergenceWarning filter only lives inside this context; on exit
# the filter list reverts to its state at entry (blanket ignore still set).
with warnings.catch_warnings():
    warnings.filterwarnings('ignore', category=ConvergenceWarning, module='sklearn')
    rand_search.fit(X_train, y_train)

In [None]:
# Find the best parameter values
# `best_params_` is read from the fitted randomized search; the bare name on
# the last line makes the notebook display it as the cell's output.
best_params = rand_search.best_params_
best_params