In [3]:
import pandas as pd
# from sklearnex import patch_sklearn
# patch_sklearn()
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from data_prep import add_artists_as_features
import nltk
# Need to download stopwords and punkt to use the add_artists_as_features function
nltk.download('stopwords')
nltk.download('punkt')

pd.options.display.max_columns = 200

songs = pd.read_csv('../data/spotify_simplified.csv', index_col=[0])
songs_data = add_artists_as_features(songs, 2000)
songs_data = songs_data.drop(columns = ["track_id", "artists", "album_name", "track_name", "track_genre"])
genres = songs["track_genre"]
# Numerically encode the labels
label_encoder = LabelEncoder()
encoded_genres = label_encoder.fit_transform(genres)
# Using stratify might help because we have an imbalanced dataset
X_train, X_test, y_train, y_test = train_test_split(songs_data, encoded_genres, test_size=0.3, 
                                                    stratify=encoded_genres, shuffle=True, random_state=100)
# Train model
model = RandomForestClassifier(max_depth=50, min_samples_leaf=1, min_samples_split=2, n_estimators=300)
model.fit(X_train, y_train)
# Evaluate model
predictions = model.predict(X_test)
base_accuracy = accuracy_score(y_test, predictions)
base_f1_weighted = f1_score(y_test, predictions, average='weighted')
print("Peformance using Random Forest")
print(f"Accuracy: {base_accuracy}")
print(f"F1-score: {base_f1_weighted}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\masam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\masam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Peformance using Random Forest
Accuracy: 0.6359111507317435
F1-score: 0.6277655228348183


In [14]:
from sklearn.metrics import classification_report

report = classification_report(y_test, predictions, target_names=list(label_encoder.classes_))
print(report)

              precision    recall  f1-score   support

     ambient       0.67      0.66      0.66      1820
    children       0.65      0.73      0.69       873
   classical       0.59      0.83      0.69       797
      comedy       0.74      0.91      0.81       297
     country       0.52      0.76      0.62       871
         edm       0.65      0.32      0.43      3268
    european       0.31      0.45      0.37      1157
        folk       0.71      0.28      0.40      2122
     hip-hop       0.35      0.63      0.45       481
       latin       0.63      0.59      0.61      2672
       metal       0.53      0.86      0.65      1800
         pop       0.58      0.45      0.51      1891
      reggae       0.22      0.62      0.33       738
        rock       0.60      0.24      0.34      3045
  show-tunes       0.26      0.66      0.38       521
       sleep       0.76      0.99      0.86       299
      techno       0.68      0.79      0.73      2401
       world       0.54    

In [13]:
def generate_predicitons_df(y_test, predictions, class_names):
    # Get more detailed performance information for each class
    f1 = f1_score(y_test, predictions, average=None)
    class_to_f1 = dict(zip(class_names, f1))
    # Sort f1-score in descending order
    class_to_f1 = dict(sorted(class_to_f1.items(), key=lambda item: item[1], reverse=True))
    grouped_by_genre = songs.groupby(['track_genre']).size()
    genre_count = grouped_by_genre.sort_values(ascending=False)
    count = list()
    for genre in class_to_f1.keys():
        count.append(genre_count[genre])
    predictions_df = pd.DataFrame([], columns=['genre', 'f1-score', 'count'])
    predictions_df['genre'] = class_to_f1.keys()
    predictions_df['f1-score'] = class_to_f1.values()
    predictions_df['count'] = count
    return predictions_df

predictions_df = generate_predicitons_df(y_test, predictions, label_encoder.classes_)
display(predictions_df)

Unnamed: 0,genre,f1-score,count
0,sleep,0.856729,998
1,comedy,0.81448,990
2,techno,0.729015,8002
3,classical,0.688645,2655
4,children,0.687163,2911
5,ambient,0.660942,6066
6,metal,0.654515,5999
7,country,0.617041,2902
8,latin,0.606154,8908
9,world,0.510376,6230


In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

def plot_confusion_matrix(conf_matrix, class_names):
    cm_df = pd.DataFrame(conf_matrix, index=class_names, columns=class_names)
    heatmap = sns.heatmap(cm_df, annot=True, fmt='d', annot_kws={"size": 14})
    heatmap.set(xlabel='Predicted class', ylabel='True class')
    return heatmap

cm = confusion_matrix(y_test, predictions)
heatmap = plot_confusion_matrix(cm, label_encoder.classes_)
plt.figure(figsize=[10, 30])
plt.show()

## Balancing

In [4]:
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
sampler = RandomUnderSampler()
pipeline = Pipeline([('balancing', sampler), ('classifier', model)])
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
f1_weighted = f1_score(y_test, predictions, average='weighted')
print("Peformance using undersampling")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1: {f1_weighted:.4f}")

Peformance using undersampling
Accuracy: 0.5401
F1: 0.5294


In [5]:
pipeline.named_steps['balancing'] = RandomOverSampler()
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
f1_weighted = f1_score(y_test, predictions, average='weighted')
# predictions = cross_val_predict(pipeline, songs_data_modified, encoded_genres, cv=cv, n_jobs=-1)
# accuracy = accuracy_score(encoded_genres, predictions)
# f1_weighted = f1_score(encoded_genres, predictions, average='weighted')
print("Performance using oversampling")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1: {f1_weighted:.4f}")

Performance using oversampling
Accuracy: 0.5384
F1: 0.5285


### Experiment conclusion:
_Under and over sampling hinder the performace of the random forest_