In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

In [2]:
# Spectral descriptors and the per-track summary statistics available for each
# of them; every (descriptor, statistic) pair is one input column.
spectral_descriptors = ('bandwidth', 'centroid', 'rolloff')
summary_stats = ('kurtosis', 'max', 'mean', 'median', 'min', 'skew', 'std')

# Column names follow the pattern spectral_<descriptor>_<statistic>_01,
# ordered descriptor by descriptor.
spectral_features = [
    f'spectral_{descriptor}_{stat}_01'
    for descriptor in spectral_descriptors
    for stat in summary_stats
]

# Track-level descriptors used alongside the spectral statistics.
track_features = [
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'speechiness',
    'tempo',
    'valence',
]

# Full list of model input columns, in the order they are fed to the models.
features = ['duration', *spectral_features, *track_features]

# Name of the column holding the classification target.
target_col = 'main_genre'


In [3]:
# Load the pre-split train and test tables; the paths are relative to the
# notebook's working directory.
train_data = pd.read_csv('../DataBase/processed/train_data.csv')
test_data = pd.read_csv('../DataBase/processed/test_data.csv')

Genre grouping used to build the `main_genre` target:

- Electronic + Experimental + Pop + Hip-Hop --> Energique
- Rock + Blues + Instrumental --> Rock
- Classical + Jazz --> Acoustic
- Old-Time / Historic + Folk + Country + International --> Traditionnal

In [7]:
# Coarse genre group -> fine-grained genre titles that belong to it.
genre_mapping = {
    "Energique": ["Electronic", "Experimental", "Pop", "Hip-Hop"],
    "Rock": ["Rock", "Blues", "Instrumental"],
    "Acoustic": ["Classical", "Jazz"],
    "Traditionnal": ["Old-Time / Historic", "Folk", "Country", "International"],
}


def map_main_genre(genre):
    """Return the coarse genre group that contains ``genre``.

    Groups are searched in the order they are declared in ``genre_mapping``;
    the first group whose member list contains ``genre`` wins.

    Parameters
    ----------
    genre : object
        Fine-grained genre title to look up.

    Returns
    -------
    str
        The matching key of ``genre_mapping``, or ``"Other"`` when no group
        lists ``genre``.
    """
    matching_groups = (
        group for group, members in genre_mapping.items() if genre in members
    )
    return next(matching_groups, "Other")


# Derive the coarse genre label for both splits from the fine-grained title.
for split_frame in (train_data, test_data):
    split_frame['main_genre'] = split_frame['genre_title'].apply(map_main_genre)

In [8]:
# Learn the label -> integer encoding from the training labels, then replace
# the string labels in the training frame with their integer codes.
le = LabelEncoder().fit(train_data[target_col])
train_data[target_col] = le.transform(train_data[target_col])

# Show which integer code each class name received.
class_codes = np.arange(len(le.classes_))
mapping = {label: code for label, code in zip(le.classes_, class_codes)}
print(mapping)

{'Acoustic': np.int64(0), 'Energique': np.int64(1), 'Rock': np.int64(2), 'Traditionnal': np.int64(3)}


In [9]:
# Encode the test labels with the encoder that was fitted on the training
# labels. Re-fitting a fresh LabelEncoder on the test split only produces the
# same integer codes when both splits contain exactly the same set of classes;
# if a class is missing from (or extra in) the test split, the codes shift and
# the evaluation silently compares mismatched labels. `transform` raises a
# ValueError on labels it has not seen during fitting, so such a mismatch is
# reported instead of hidden.
test_data[target_col] = le.transform(test_data[target_col])

# Show the (shared) class name -> integer code mapping.
mapping = dict(zip(le.classes_, np.arange(len(le.classes_))))
print(mapping)

{'Acoustic': np.int64(0), 'Energique': np.int64(1), 'Rock': np.int64(2), 'Traditionnal': np.int64(3)}


In [10]:
# Model inputs and encoded targets for each split.
X_train = train_data[features]
y_train = train_data[target_col]
X_test = test_data[features]
y_test = test_data[target_col]
# Per-sample class weighting is currently disabled:
# sample_weight = y_train.map(weights_encoded)

In [11]:
# Random forest baseline. The seed is fixed because the forest is randomised
# (bootstrap sampling and per-split feature sub-sampling); without it the
# reported metrics change on every Restart & Run All.
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)

# Evaluate on the held-out test split; rows of the report are the integer
# class codes produced by the label encoder.
predictions = random_forest.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.89      0.53      0.67        77
           1       0.78      0.83      0.80       579
           2       0.75      0.80      0.77       531
           3       0.78      0.59      0.67       187

    accuracy                           0.77      1374
   macro avg       0.80      0.69      0.73      1374
weighted avg       0.77      0.77      0.77      1374



In [12]:
# Gradient-boosted trees on the same split, evaluated with multi-class log
# loss as the training metric.
xgb = XGBClassifier(eval_metric="mlogloss").fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out test split.
predictions = xgb.predict(X_test)
xgb_report = classification_report(y_test, predictions)
print(xgb_report)

              precision    recall  f1-score   support

           0       0.85      0.51      0.63        77
           1       0.82      0.84      0.83       579
           2       0.77      0.82      0.80       531
           3       0.76      0.70      0.73       187

    accuracy                           0.79      1374
   macro avg       0.80      0.72      0.75      1374
weighted avg       0.80      0.79      0.79      1374



Label encoding of `main_genre` used in the reports above: 'Acoustic': 0, 'Energique': 1, 'Rock': 2, 'Traditionnal': 3