In [1]:
import os

import pandas as pd
import numpy as np
import csv

import dateutil.parser
import datetime
import sys

import spotipy
import spotipy.util

import sklearn

In [2]:
def get_newest_dirpath(data_dirpath):
    dir_contents = [d for d in os.listdir(data_dirpath) if d.startswith('D-')]
    dir_contents.sort(reverse = True)
    dir_name = dir_contents[0]
    return os.path.join(data_dirpath, dir_name)

def get_newest_dataset():
    pull_dirpath = get_newest_dirpath('./data')
    data_filepath = [d for d in os.listdir(pull_dirpath) if d.startswith('ml_dataset')][0]
    spotify_df = pd.read_csv(os.path.join(pull_dirpath, data_filepath), index_col=0)
    return spotify_df

In [9]:
# 3.1. Data Wrangling
spotify_df = get_newest_dataset()
spotify_df.drop(columns=['track_number', 'total_tracks'], inplace=True)
spotify_df.reset_index(inplace=True)

# 3.1.1. Features and Targets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

targets = spotify_df.iloc[:, -1]
features = spotify_df.iloc[:, 4:-1]

def gen_one_hot_encodings(col_names_arr, features):
    for col_name in col_names_arr:
        rename_cols = features.columns
        oh_encoder = OneHotEncoder(sparse=False, categories='auto')
        oh_arr = oh_encoder.fit_transform(X=features[col_name].values.reshape(-1, 1))
        oh_df = pd.DataFrame(oh_arr)
        features = pd.concat([features, oh_df], ignore_index=True, sort=False, axis=1)
        features.columns = list(rename_cols) + list(oh_encoder.get_feature_names())
    return features

one_hot_vars = ['explicit', 'album_type','time_signature', 'mode', 'key']
features = gen_one_hot_encodings(one_hot_vars, features)
features = features.drop(columns=one_hot_vars)

duration_scaler = MinMaxScaler().fit(features['duration_ms'].values.reshape(-1, 1))
features['duration_ms'] = duration_scaler.transform(features['duration_ms'].values.reshape(-1, 1))
features.head()

Unnamed: 0,duration_ms,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,x0_2,x0_3,x0_4,x0_5,x0_6,x0_7,x0_8,x0_9,x0_10,x0_11
0,0.342576,0.927,0.665,-5.313,0.244,0.061,0.0,0.123,0.175,127.076,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.197801,0.928,0.481,-9.35,0.287,0.105,0.0,0.176,0.613,134.007,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.219171,0.681,0.594,-7.028,0.282,0.165,3e-06,0.134,0.535,186.054,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.235056,0.748,0.524,-5.599,0.0338,0.414,0.0,0.111,0.661,95.01,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.286449,0.794,0.632,-6.163,0.0649,0.142,0.0,0.128,0.355,145.926,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# 3.1.2. Feature Selection
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = train_test_split(features, targets, random_state=3000)

selector = SelectFromModel(DecisionTreeClassifier(random_state = 3000), threshold = 'median')
selector.fit(X_train, y_train)

X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

In [5]:
# 3.3-4. Model Instantiation, Fitting and Evaluation
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression

classifiers = {
    'k-Nearest Neighbor' : KNeighborsClassifier(),
    'Support Vector Machine' : SVC(gamma='auto'), 
    'Gaussian Naive Bayes' : GaussianNB(),
    'Ridge Classifier' : RidgeClassifier(),
    'Logistic Regression' : LogisticRegression()
}

for estimator_name, estimator_obj in classifiers.items():
    X_trn_data = X_train_selected
    X_tst_data = X_test_selected
    
    estimator_obj.fit(X=X_trn_data, y=y_train)
    
    train_score = estimator_obj.score(X_trn_data, y_train)
    test_score = estimator_obj.score(X_tst_data, y_test)
    
    print(estimator_name + ':')
    print('\t', 'Training score: ', '{:.4%}'.format(train_score))
    print('\t', 'Testing score: ', '{:.4%}'.format(test_score), '\n')

k-Nearest Neighbor:
	 Training score:  20.5506%
	 Testing score:  0.8850% 

Support Vector Machine:
	 Training score:  9.1200%
	 Testing score:  1.1799% 

Gaussian Naive Bayes:
	 Training score:  1.1799%
	 Testing score:  0.3687% 

Ridge Classifier:
	 Training score:  2.8269%
	 Testing score:  1.4012% 





Logistic Regression:
	 Training score:  2.6303%
	 Testing score:  1.1062% 



In [7]:
# 3.5-6. Model Optimization and Testing
from sklearn.model_selection import GridSearchCV

param_grids = {
    'k-Nearest Neighbor' : {'n_neighbors':range(1, 20), 
                            'weights':['uniform', 'distance'], 
                            'algorithm':['ball_tree', 'kd_tree', 'brute']},
    'Support Vector Machine' : {'C':[10**(e) for e in range(-10, 3)]}, 
    'Gaussian Naive Bayes' : {'var_smoothing':[10**(e) for e in range(-10, 3)]},
    'Ridge Classifier' : {'alpha':[10**(e) for e in range(0, 5)]},
    'Logistic Regression' : {'penalty':['l1', 'l2'], 
                             'C':[10**(e) for e in range(-10, 3)]}
}

for estimator_name, param_grid in param_grids.items():
    search = GridSearchCV(classifiers[estimator_name], param_grid, cv=5)
    search.fit(X=X_train_selected, y=y_train)
    
    train_score = search.score(X_train_selected, y_train)
    test_score = search.score(X_test_selected, y_test)
    
    print(estimator_name + ':')
    print('\t', 'Best parameters: ', search.best_params_)
    print('\t', 'Training score: ', '{:.4%}'.format(train_score))
    print('\t', 'Testing score: ', '{:.4%}'.format(test_score))



k-Nearest Neighbor:
	 Best parameters:  {'algorithm': 'kd_tree', 'n_neighbors': 10, 'weights': 'uniform'}
	 Training score:  12.2911%
	 Testing score:  0.9587%
Support Vector Machine:
	 Best parameters:  {'C': 1e-10}
	 Training score:  1.4995%
	 Testing score:  1.6962%




Gaussian Naive Bayes:
	 Best parameters:  {'var_smoothing': 1}
	 Training score:  1.6962%
	 Testing score:  1.4012%
Ridge Classifier:
	 Best parameters:  {'alpha': 10000}
	 Training score:  1.8437%
	 Testing score:  1.5487%


















Logistic Regression:
	 Best parameters:  {'C': 0.01, 'penalty': 'l2'}
	 Training score:  1.6962%
	 Testing score:  1.7699%
