# 
# Tracks' Genre Classification - [ *Team Emer* ]
# 

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# 
# Feature Selection

In [2]:
#################    ALL     #######################
# genre_names = ['Acoustic', 'R&B', 'Classical', 'Country', 'Electronic', 'Hiphop', 'Jazz', 'Pop', 'Rock', 'Reggae', "Rap"]
# feature_cols = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness','liveness', 'valence', 'tempo']


#############     TYRONE    ####################
# genre_names = ['Rock', 'R&B', 'Acoustic']
# feature_cols =  ['danceability', 'energy', 'acousticness', 'valence', 'tempo']  #'loudness','speechiness', 'instrumentalness', 'liveness'


# ###############    RODS    #######################
# genre_names = ['Classical', 'Reggae', 'Acoustic', 'Hiphop', 'Rock']
# feature_cols = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo'] #'liveness'


# ##############     G-AR    #######################
# Genres to classify; list order determines the numeric genre_id assigned
# when the playlist CSVs are loaded.
genre_names = [
    'Rock',
    'Hiphop',
    'R&B',
    'Folk',
    'Indie',
    'Acoustic',
]
# Columns of the track data used as model inputs.
feature_cols = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]


##############     KAYE    ####################
# genre_names = ['Rock', 'R&B', 'Acoustic', 'Country']
# feature_cols = ['popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness','liveness', 'valence', 'tempo']




In [3]:
tracks = []
for i, KEYWORD in enumerate(genre_names):
    csv_prefix = 'data/' + KEYWORD

    # Playlist-level data for this genre keyword (not referenced again in this cell)
    playlist_df = pd.read_csv(csv_prefix + '_playlist_data.csv')

    # Per-track data, left-joined with the playlist each track was found in.
    # Both CSVs carry playlist_id / playlist_name, so the merge suffixes them
    # with _x (track data) and _y (playlist-tracks file).
    track_data = pd.read_csv(csv_prefix + '_playlist_tracks_data.csv')
    track_playlists = pd.read_csv(csv_prefix + '_playlist_tracks.csv')[['track_id', 'playlist_id', 'playlist_name']]
    merged = track_data.merge(track_playlists, on='track_id', how='left')

    # Convert duration from milliseconds to minutes and tag every row with
    # the genre name plus its 1-based numeric id
    tdf = merged.assign(
        duration_mins=merged['duration'] / 60000,
        genre=KEYWORD,
        genre_id=i + 1,
    )
    tracks.append(tdf)

# Stack the per-genre frames and discard every column that contains a missing value
tracks_df = pd.concat(tracks).dropna(axis=1)

# Keep the playlist columns from the track data under their plain names
tracks_df = tracks_df.assign(
    playlist_id=tracks_df['playlist_id_x'],
    playlist_name=tracks_df['playlist_name_x'],
).drop(columns=['playlist_id_x', 'playlist_name_x', 'playlist_id_y', 'playlist_name_y'])

# Sanity check: every remaining column should report False
tracks_df.isnull().any()

track_id            False
track_name          False
artist_id           False
artist_name         False
album_id            False
duration            False
release_date        False
popularity          False
danceability        False
energy              False
key                 False
loudness            False
mode                False
speechiness         False
acousticness        False
instrumentalness    False
liveness            False
valence             False
tempo               False
duration_mins       False
genre               False
genre_id            False
playlist_id         False
playlist_name       False
dtype: bool

# 
# Dataset *(Training and Test Sets)*

In [4]:
from sklearn.model_selection import train_test_split

# Target vector (y) and feature matrix (X)
y = tracks_df['genre_id']
X = tracks_df[feature_cols]

# Hold out 20% of the tracks as the test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each split
print(f"Shape of X_Train: {X_train.shape}")
print(f"Shape of y_Train: {y_train.shape}")
print(f"Shape of X_Test: {X_test.shape}")
print(f"Shape of y_Test: {y_test.shape}")

Shape of X_Train: (9293, 11)
Shape of y_Train: (9293,)
Shape of X_Test: (2324, 11)
Shape of y_Test: (2324,)


# 
# Feature Scaling

In [5]:
# Normalize numeric columns
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Summary statistics of the original (unscaled) feature columns
tracks_df[feature_cols].describe()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
count,11617.0,11617.0,11617.0,11617.0,11617.0,11617.0,11617.0,11617.0,11617.0,11617.0,11617.0
mean,0.589706,0.519276,5.199449,-9.215049,0.683137,0.07837,0.406117,0.126395,0.164809,0.462995,118.879511
std,0.154988,0.234693,3.557682,3.979487,0.465274,0.089979,0.344336,0.276091,0.130097,0.235595,29.904804
min,0.0,0.000879,0.0,-38.243,0.0,0.0,1e-06,0.0,0.0177,0.0,0.0
25%,0.486,0.335,2.0,-11.398,0.0,0.0324,0.0661,0.0,0.097,0.273,94.967
50%,0.592,0.527,5.0,-8.559,1.0,0.0415,0.326,0.000138,0.115,0.442,116.907
75%,0.696,0.7,8.0,-6.296,1.0,0.0735,0.753,0.0282,0.176,0.645,139.841
max,0.974,0.998,11.0,0.175,1.0,0.907,0.996,0.996,0.986,0.999,214.527


### Classification and Accuracy Result Function

In [6]:
from sklearn.metrics import accuracy_score, roc_curve, auc, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score

# Collected (classifier, name, mean cross-validation accuracy in %) tuples,
# one per classify() call; used later to pick the best model.
classifiers = []

def classify(classifier, name, X_train, y_train, y_test, y_pred, cv=10):
    """Print evaluation metrics for a classifier and record it in ``classifiers``.

    Prints the test-set accuracy and the per-genre classification report for
    ``y_pred`` against ``y_test``, then cross-validates ``classifier`` on the
    training data and appends ``(classifier, name, mean CV accuracy in %)`` to
    the module-level ``classifiers`` list.

    Parameters
    ----------
    classifier : estimator handed to ``cross_val_score`` and stored in ``classifiers``.
    name : str
        Label used in the printed messages and in the stored tuple.
    X_train, y_train : training features / targets used for cross-validation.
    y_test : true genre ids of the test set.
    y_pred : predicted genre ids for the test set.
    cv : int, default 10
        Value passed to ``cross_val_score`` as ``cv``.

    Returns
    -------
    None
    """
    # evaluate accuracy
    acc = accuracy_score(y_test, y_pred) * 100
    print(f'\nThe accuracy of the {name} classifier is {acc} %')

    # show classification report
    # Genre ids are assigned as (position in genre_names) + 1 when the tracks
    # are loaded. Passing them explicitly keeps the report aligned with
    # target_names even if a genre is absent from both y_test and y_pred,
    # which would otherwise raise a ValueError about mismatched class counts.
    genre_ids = list(range(1, len(genre_names) + 1))
    print('\n\nClassification Report\n')
    print(classification_report(y_test, y_pred, labels=genre_ids, target_names=genre_names))

    # show cross-validation accuracy
    accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=cv)
    cv_acc = accuracies.mean() * 100
    print(f'\nCross-Validation Accuracy at cv={cv} is: {cv_acc} %\n')

    # To rank models by test accuracy instead (and skip the slower
    # cross-validation step), store `acc` here and drop the block above.
    classifiers.append((classifier, name, cv_acc))

    # TODO: add code here to save the model in pickle format


# 
# KNN Classifier

In [7]:
from sklearn.neighbors import KNeighborsClassifier 

# kNN model tuning: score every neighbour count from 2 to 50 with
# 10-fold cross-validated accuracy on the training set
neighbors = np.arange(2, 51)
cv_scores = []

for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
    mean_score = scores.mean()
    cv_scores.append(mean_score)
    print(f'Fitting for k={k} \t with score={mean_score}')

# Misclassification error is the complement of the accuracy
mse = [1 - score for score in cv_scores]

# The best 'k' is the first one that reaches the lowest error
lowest_error = min(mse)
optimal_k = neighbors[mse.index(lowest_error)]
print(f'\nThe optimal number of neighbors for kNN classifier is k={optimal_k}.')

Fitting for k=2 	 with score=0.4302147065291619
Fitting for k=3 	 with score=0.44065511533965296
Fitting for k=4 	 with score=0.45195481324583026
Fitting for k=5 	 with score=0.4591655960276399
Fitting for k=6 	 with score=0.46152956699885406
Fitting for k=7 	 with score=0.45916281815340804
Fitting for k=8 	 with score=0.4590561014850053
Fitting for k=9 	 with score=0.4601333379631237
Fitting for k=10 	 with score=0.45937902936444547
Fitting for k=11 	 with score=0.45884220516916097
Fitting for k=12 	 with score=0.4615317661492876
Fitting for k=13 	 with score=0.4579817586258782
Fitting for k=14 	 with score=0.4606709723717259
Fitting for k=15 	 with score=0.4649756357280924
Fitting for k=16 	 with score=0.4633619222889684
Fitting for k=17 	 with score=0.46863224417514504
Fitting for k=18 	 with score=0.4646513189115363
Fitting for k=19 	 with score=0.4642201696818177
Fitting for k=20 	 with score=0.46260437283701983
Fitting for k=21 	 with score=0.46314351192749753
Fitting for k=22 	 

## Optimal kNN Model

In [8]:
# Fit kNN using the neighbour count chosen during tuning
knn_optimal = KNeighborsClassifier(n_neighbors=optimal_k)
classifier = knn_optimal.fit(X_train, y_train)  # fit() returns the estimator itself

# Predict genres for the held-out tracks
y_pred = classifier.predict(X_test)

# Print the metrics and register the model for later comparison
classify(
    classifier=classifier,
    name='kNN',
    X_train=X_train,
    y_train=y_train,
    y_test=y_test,
    y_pred=y_pred,
)


The accuracy of the kNN classifier is 46.77280550774527 %


Classification Report

              precision    recall  f1-score   support

        Rock       0.48      0.60      0.53       428
      Hiphop       0.67      0.75      0.71       443
         R&B       0.40      0.22      0.28       241
        Folk       0.32      0.31      0.31       392
       Indie       0.35      0.22      0.27       447
    Acoustic       0.46      0.60      0.52       373

    accuracy                           0.47      2324
   macro avg       0.44      0.45      0.44      2324
weighted avg       0.45      0.47      0.45      2324


Cross-Validation Accuracy at cv=10 is: 46.8632244175145 %



# 
# Support Vector Machine Classifiers

In [9]:
from sklearn.svm import SVC

## SVM (Linear Kernel) Classifier

In [10]:
# Fit a linear-kernel SVM; probability=True enables predict_proba later on
classifier = SVC(kernel='linear', probability=True).fit(X_train, y_train)

# Predict genres for the held-out tracks
y_pred = classifier.predict(X_test)

# Print the metrics and register the model for later comparison
classify(
    classifier=classifier,
    name='SVM (Linear Kernel)',
    X_train=X_train,
    y_train=y_train,
    y_test=y_test,
    y_pred=y_pred,
)


The accuracy of the SVM (Linear Kernel) classifier is 47.37521514629948 %


Classification Report

              precision    recall  f1-score   support

        Rock       0.48      0.68      0.56       428
      Hiphop       0.63      0.75      0.69       443
         R&B       0.39      0.23      0.29       241
        Folk       0.35      0.24      0.28       392
       Indie       0.33      0.16      0.22       447
    Acoustic       0.46      0.69      0.55       373

    accuracy                           0.47      2324
   macro avg       0.44      0.46      0.43      2324
weighted avg       0.44      0.47      0.44      2324


Cross-Validation Accuracy at cv=10 is: 47.77814044469137 %



## SVM (Polynomial Kernel) Classifier

In [None]:
# Fit a degree-3 polynomial-kernel SVM; probability=True enables predict_proba later on
classifier = SVC(kernel='poly', degree=3, gamma=0.9, probability=True).fit(X_train, y_train)

# Predict genres for the held-out tracks
y_pred = classifier.predict(X_test)

# Print the metrics and register the model for later comparison
classify(
    classifier=classifier,
    name='SVM (Polynomial Kernel)',
    X_train=X_train,
    y_train=y_train,
    y_test=y_test,
    y_pred=y_pred,
)


The accuracy of the SVM (Polynomial Kernel) classifier is 49.26850258175559 %


Classification Report

              precision    recall  f1-score   support

        Rock       0.49      0.61      0.54       428
      Hiphop       0.68      0.72      0.70       443
         R&B       0.43      0.34      0.38       241
        Folk       0.38      0.34      0.36       392
       Indie       0.40      0.27      0.32       447
    Acoustic       0.49      0.62      0.54       373

    accuracy                           0.49      2324
   macro avg       0.48      0.48      0.47      2324
weighted avg       0.48      0.49      0.48      2324



## SVM (RBF Kernel) Classifier

In [None]:
# Fit an RBF-kernel SVM; probability=True enables predict_proba later on
classifier = SVC(kernel='rbf', gamma=0.8, probability=True).fit(X_train, y_train)

# Predict genres for the held-out tracks
y_pred = classifier.predict(X_test)

# Print the metrics and register the model for later comparison
classify(
    classifier=classifier,
    name='SVM (RBF Kernel)',
    X_train=X_train,
    y_train=y_train,
    y_test=y_test,
    y_pred=y_pred,
)

# 
# Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes model on the scaled training data
classifier = GaussianNB().fit(X_train, y_train)

# Predict genres for the held-out tracks
y_pred = classifier.predict(X_test)

# Print the metrics and register the model for later comparison
classify(
    classifier=classifier,
    name='Naive Bayes',
    X_train=X_train,
    y_train=y_train,
    y_test=y_test,
    y_pred=y_pred,
)

# 
# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier as DT

# Fit an entropy-criterion decision tree; the fixed seed keeps the tree repeatable
classifier = DT(criterion='entropy', random_state=42).fit(X_train, y_train)

# Predict genres for the held-out tracks
y_pred = classifier.predict(X_test)

# Print the metrics and register the model for later comparison
classify(
    classifier=classifier,
    name='Decision Tree',
    X_train=X_train,
    y_train=y_train,
    y_test=y_test,
    y_pred=y_pred,
)

# 
# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier as RF

# Fit a 10-tree, entropy-criterion random forest; the fixed seed keeps it repeatable
classifier = RF(n_estimators=10, criterion='entropy', random_state=42).fit(X_train, y_train)

# Predict genres for the held-out tracks
y_pred = classifier.predict(X_test)

# Print the metrics and register the model for later comparison
classify(
    classifier=classifier,
    name='Random Forest',
    X_train=X_train,
    y_train=y_train,
    y_test=y_test,
    y_pred=y_pred,
)

# 
# XGBoost Classifier

In [None]:
from xgboost import XGBClassifier

# Fit XGBoost with library defaults (verbosity=0 silences its log output).
# NOTE(review): y_train holds genre ids starting at 1; some xgboost releases
# appear to require class labels starting at 0 - confirm against the
# installed version before relying on this cell.
classifier = XGBClassifier(verbosity=0).fit(X_train, y_train)

# Predict genres for the held-out tracks
y_pred = classifier.predict(X_test)

# Print the metrics and register the model for later comparison
classify(
    classifier=classifier,
    name='XGBoost',
    X_train=X_train,
    y_train=y_train,
    y_test=y_test,
    y_pred=y_pred,
)

# 
# Best Model Selection

Which of the trained classifiers performed best? 
Add columns with the predicted genre and the prediction probability from the best model to each of the tracks.

In [None]:
# Show the (classifier, name, mean cross-validation accuracy in %) tuples recorded by classify()
classifiers

In [None]:
from operator import itemgetter

# Each entry is (classifier, name, score); take the entry with the highest
# score (the first one wins on ties) and unpack it in a single step
best_model, best_name, best_score = max(classifiers, key=itemgetter(2))
print(f"\nBest Model is '{best_name}' with accuracy score of {best_score}\n")

### Prediction DataFrame Function

In [None]:
def create_prediction_df(df, model):
    """Add the model's predicted genre id and its probability to ``df``.

    The ``feature_cols`` columns of ``df`` are scaled with the module-level
    ``scaler`` and passed to ``model`` in one batch, rather than once per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Track data containing every column listed in ``feature_cols``.
        It is modified in place.
    model : classifier exposing ``predict`` and ``predict_proba``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with two added columns:
        ``predicted_genre_id`` (output of ``model.predict``) and
        ``predicted_genre_prob`` (largest class probability per row from
        ``model.predict_proba``).
    """
    if len(df) == 0:
        # Nothing to predict; scikit-learn transformers reject zero-row input,
        # so just add the (empty) result columns.
        df['predicted_genre_id'] = pd.Series(index=df.index, dtype='int64')
        df['predicted_genre_prob'] = pd.Series(index=df.index, dtype='float64')
        return df

    # Scale all rows at once with the scaler fitted on the training features
    features = scaler.transform(df[feature_cols].values)

    df['predicted_genre_id'] = model.predict(features)
    # Highest class probability per row
    df['predicted_genre_prob'] = model.predict_proba(features).max(axis=1)
    return df

# 
# Classify *Spotify Daily Charts'* Tracks

In [None]:
# Load the daily-chart tracks and attach the best model's predictions
chart_tracks_raw = pd.read_csv("data/spotify_daily_charts_tracks.csv")
chart_tracks_df = create_prediction_df(chart_tracks_raw, best_model)
chart_tracks_df

In [None]:
# Spotify Daily Charts tracks whose predicted genre has a low (< 50%) probability
low_confidence = chart_tracks_df['predicted_genre_prob'] < .5
chart_tracks_df[low_confidence]

In [None]:
# Histogram of the prediction probabilities for the Spotify Daily Charts tracks
fig, ax = plt.subplots(figsize=(12, 8))
chart_tracks_df['predicted_genre_prob'].hist(ax=ax)
plt.show()

# 
# Classify *Nyoy Volante's* Tracks

In [None]:
artist_name = 'Nyoy Volante'

# The album-tracks CSV is named after the lower-cased artist name
artist_csv = 'data/' + artist_name.lower() + '_album_tracks_data.csv'
album_tracks = pd.read_csv(artist_csv)

# Keep only rows credited to this artist, one row per track name.
# reset_index() keeps the previous index as an 'index' column.
by_artist = album_tracks['artist_name'] == artist_name
artist_tracks_df = (
    album_tracks[by_artist]
    .drop_duplicates(subset=['track_name'])
    .reset_index()
)

# Attach the best model's predictions
artist_tracks_df = create_prediction_df(artist_tracks_df, best_model)
artist_tracks_df

In [None]:
# Artist tracks whose predicted genre has a low (< 50%) probability
low_confidence = artist_tracks_df['predicted_genre_prob'] < .5
artist_tracks_df[low_confidence]

In [None]:
# Histogram of the prediction probabilities for the artist's tracks
fig, ax = plt.subplots(figsize=(12, 8))
artist_tracks_df['predicted_genre_prob'].hist(ax=ax)
plt.show()

In [None]:
# Map each 1-based genre id back to its name
genre_lookup = {
    genre_id: genre
    for genre_id, genre in zip(np.arange(1, len(genre_names) + 1), genre_names)
}
cols = ['track_name', 'release_date', 'popularity', 'predicted_genre', 'classification_probability']

# Most confident predictions first, then most popular, then most recent
ranked = artist_tracks_df.sort_values(
    by=['predicted_genre_prob', 'popularity', 'release_date'],
    ascending=False,
)
ranked = ranked.assign(
    predicted_genre=ranked['predicted_genre_id'].map(lambda genre_id: genre_lookup[genre_id]),
    classification_probability=ranked['predicted_genre_prob'].apply(lambda prob: f'{prob * 100.00}%'),
)

print(f'\nArtist: {artist_name}')

# NOTE: artist_tracks_df is overwritten with only the display columns, so the
# prediction columns are gone afterwards and this cell cannot be re-run on its
# own without first re-running the cell that builds artist_tracks_df.
artist_tracks_df = ranked[cols].reset_index()
artist_tracks_df