In [2]:
import pandas as pd
# Let wide frames render with up to 200 columns instead of being truncated.
pd.set_option("display.max_columns", 200)

# The first CSV column is used as the row index.
songs = pd.read_csv('../data/spotify_simplified.csv', index_col=[0])
songs.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,0,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,pop
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,0,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,rock
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,0,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,rock
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,0,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,rock
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,0,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,rock


### Fit and evaluate the model

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score


# Drop the identifier/text columns and the target; what remains is the feature matrix.
songs_data = songs.drop(columns = ["track_id", "artists", "album_name", "track_name", "track_genre"])
genres = songs["track_genre"]
# Numerically encode the labels
label_encoder = LabelEncoder()
encoded_genres = label_encoder.fit_transform(genres)
# Stratified 70/30 split with a fixed seed so both partitions are reproducible.
X_train, X_test, y_train, y_test = train_test_split(songs_data, encoded_genres, test_size=0.3,
                                                    stratify=encoded_genres, shuffle=True, random_state=100)
# Train model
# random_state is fixed because the tree permutes the features at every split, so ties
# between equally good splits would otherwise be resolved differently on each run.
decision_tree = DecisionTreeClassifier(max_depth=20, min_samples_leaf=20, max_features=None,
                                       min_samples_split=2, random_state=100)
decision_tree.fit(X_train, y_train)

# Evaluate model
predictions = decision_tree.predict(X_test)
base_accuracy = accuracy_score(y_test, predictions)
base_f1_weighted = f1_score(y_test, predictions, average='weighted')
print("Baseline performance using a Decision Tree")
print(f"Accuracy: {base_accuracy}")
print(f"F1-score: {base_f1_weighted}")

Baseline performance using a Decision Tree
Accuracy: 0.44814649728846295
F1-score: 0.44150886950981943


### Experiment by adding artists as features

In [4]:
from sklearnex import patch_sklearn
patch_sklearn()
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from data_prep import add_artists_as_features
import nltk
# Need to download stopwords and punkt to use the add_artists_as_features function
nltk.download('stopwords')
nltk.download('punkt')

# Extend the frame with artist-derived feature columns.
# NOTE(review): 2000 is presumably the number of artist features to keep -- confirm
# against data_prep.add_artists_as_features.
songs_with_artists = add_artists_as_features(songs, 2000)
# Take the labels from the same frame the features come from, so rows and labels stay
# aligned even if the helper reorders or filters rows.
genres = songs_with_artists["track_genre"]
songs_data = songs_with_artists.drop(columns = ["track_id", "artists", "album_name", "track_name", "track_genre"])
# Numerically encode the labels
label_encoder = LabelEncoder()
encoded_genres = label_encoder.fit_transform(genres)
# Using stratify might help because we have an imbalanced dataset
X_train, X_test, y_train, y_test = train_test_split(songs_data, encoded_genres, test_size=0.3,
                                                    stratify=encoded_genres, shuffle=True, random_state=100)
# Train model
# random_state is fixed because the tree permutes the features at every split, so ties
# between equally good splits would otherwise be resolved differently on each run.
decision_tree = DecisionTreeClassifier(max_depth=20, min_samples_leaf=20, max_features=None,
                                       min_samples_split=2, random_state=100)
decision_tree.fit(X_train, y_train)
# Evaluate model
predictions = decision_tree.predict(X_test)
# NOTE: base_accuracy / base_f1_weighted are overwritten here; from this cell on they
# hold the scores of the model trained with the artist features.
base_accuracy = accuracy_score(y_test, predictions)
base_f1_weighted = f1_score(y_test, predictions, average='weighted')
print("Baseline performance using a Decision Tree")
print(f"Accuracy: {base_accuracy}")
print(f"F1-score: {base_f1_weighted}")

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\masam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\masam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Baseline performance using a Decision Tree
Accuracy: 0.45654111878760867
F1-score: 0.45155868026031326


### Get more insights in the performance of each class

In [5]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support on the test split; the encoded labels
# are shown under their genre names via the fitted encoder's classes_.
report = classification_report(y_test, predictions, target_names=list(label_encoder.classes_))
print(report)

              precision    recall  f1-score   support

     ambient       0.54      0.58      0.56      1820
    children       0.48      0.41      0.44       873
   classical       0.66      0.62      0.64       797
      comedy       0.96      0.81      0.88       297
     country       0.52      0.42      0.46       871
         edm       0.39      0.44      0.41      3268
    european       0.25      0.22      0.23      1157
        folk       0.38      0.34      0.36      2122
     hip-hop       0.39      0.13      0.20       481
       latin       0.49      0.57      0.53      2672
       metal       0.58      0.65      0.61      1800
         pop       0.35      0.35      0.35      1891
      reggae       0.26      0.18      0.21       738
        rock       0.34      0.38      0.35      3045
  show-tunes       0.22      0.15      0.18       521
       sleep       0.79      0.76      0.78       299
      techno       0.67      0.70      0.68      2401
       world       0.41    

In [None]:
def generate_predicitons_df(y_test, predictions, class_names):
    """Build a per-genre table of F1-scores, best-scoring genre first.

    Parameters
    ----------
    y_test : true labels passed to ``f1_score``.
    predictions : predicted labels passed to ``f1_score``.
    class_names : genre names, paired positionally with the per-class F1 values.

    Returns
    -------
    pandas.DataFrame with the columns ``genre``, ``f1-score`` and ``count``.
    ``count`` is the number of rows of that genre in the module-level ``songs``
    frame (the whole dataset, not only the test split).
    """
    per_class_f1 = f1_score(y_test, predictions, average=None)
    # Pair every genre with its score, then rank by score from highest to lowest.
    ranked = sorted(dict(zip(class_names, per_class_f1)).items(),
                    key=lambda pair: pair[1], reverse=True)
    # Rows per genre; looked up by label below, so no ordering is needed here.
    rows_per_genre = songs.groupby(['track_genre']).size()
    records = [(genre, score, rows_per_genre[genre]) for genre, score in ranked]
    return pd.DataFrame(records, columns=['genre', 'f1-score', 'count'])

# Build the per-genre F1 table for the current predictions and render it.
predictions_df = generate_predicitons_df(y_test, predictions, label_encoder.classes_)
display(predictions_df)

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

def plot_confusion_matrix(conf_matrix, class_names):
    """Draw an annotated seaborn heatmap of a confusion matrix.

    Parameters
    ----------
    conf_matrix : square matrix of counts.
    class_names : labels used for both the rows and the columns.

    Returns
    -------
    The object returned by ``sns.heatmap``, with the x axis labelled
    'Predicted class' and the y axis labelled 'True class'.
    """
    labelled_matrix = pd.DataFrame(conf_matrix, index=class_names, columns=class_names)
    # Integer cell annotations ('d') at font size 12.
    axes = sns.heatmap(labelled_matrix, annot=True, fmt='d', annot_kws={"size": 12})
    axes.set_xlabel('Predicted class')
    axes.set_ylabel('True class')
    return axes

# Set the default figure size BEFORE drawing: rcParams only applies to figures created
# afterwards, so setting it after the heatmap left this plot at the old size on a fresh kernel.
plt.rcParams['figure.figsize'] = [15, 15]
cm = confusion_matrix(y_test, predictions)
heatmap = plot_confusion_matrix(cm, label_encoder.classes_)
plt.show()

## Experiment #1: Balancing
* Try to overcome the class imbalance problem by using under or over sampling

### Undersampling

In [None]:
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import StratifiedKFold, cross_val_predict

from sklearn.base import clone

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
# Seed the sampler so the randomly discarded rows, and therefore the scores, are reproducible.
sampler = RandomUnderSampler(random_state=100)
# Fit an unfitted copy of the tree: Pipeline.fit trains its final step in place, which
# would otherwise overwrite the model fitted in the earlier cell.
pipeline = Pipeline([('balancing', sampler), ('classifier', clone(decision_tree))])
# predictions = cross_val_predict(pipeline, songs_data, encoded_genres, cv=cv, n_jobs=-1)
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
f1_weighted = f1_score(y_test, predictions, average='weighted')
print(f"Accuracy: {accuracy:.4f}")
print(f"F1: {f1_weighted:.4f}")

### Oversampling

In [None]:
# Replace the sampling step through set_params. `named_steps` builds a new mapping on
# every access, so assigning into it leaves the pipeline untouched and the under-sampler
# would silently be used again. The sampler is seeded for reproducible scores.
pipeline.set_params(balancing=RandomOverSampler(random_state=100))
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
f1_weighted = f1_score(y_test, predictions, average='weighted')
# predictions = cross_val_predict(pipeline, songs_data_modified, encoded_genres, cv=cv, n_jobs=-1)
# accuracy = accuracy_score(encoded_genres, predictions)
# f1_weighted = f1_score(encoded_genres, predictions, average='weighted')
print(f"Accuracy: {accuracy:.4f}")
print(f"F1: {f1_weighted:.4f}")

### Experiment \#1 conclusion:
_Under- and oversampling hinder the performance of the decision tree_