# XGBoost Model

In [16]:
import json
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

def spotify(client_id, client_secret):
    # Authenticate
    client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
    return sp

# Set up the credentials
with open('config.json') as config_file:
    config = json.load(config_file)

client_id = config['client_id']['0']
client_secret = config['client_secret']['0']

sp = spotify(client_id, client_secret)


## Import training data and drop irrelevant columns/features

In [17]:
df = pd.read_csv("my_songs.csv")
df

Unnamed: 0.1,Unnamed: 0,playlist,id,name,genre,danceability,energy,key,loudness,mode,...,instrumentalness,liveness,valence,tempo,type,uri,track_href,analysis_url,duration_ms,time_signature
0,0,Its time,2FoahzOSxJnalPA8aBUme3,all of me,Rap,0.652,0.80600,5,-5.707,0,...,0.000011,0.8420,0.779,159.947,audio_features,spotify:track:2FoahzOSxJnalPA8aBUme3,https://api.spotify.com/v1/tracks/2FoahzOSxJna...,https://api.spotify.com/v1/audio-analysis/2Foa...,198293,4
1,1,Its time,6x9pCndnXEoea0CMcfjs9W,n.h.i.e.,Rap,0.818,0.51200,5,-9.056,0,...,0.037100,0.1100,0.153,131.974,audio_features,spotify:track:6x9pCndnXEoea0CMcfjs9W,https://api.spotify.com/v1/tracks/6x9pCndnXEoe...,https://api.spotify.com/v1/audio-analysis/6x9p...,143719,4
2,2,Its time,2FDTHlrBguDzQkp7PVj16Q,Sprinter,Rap,0.918,0.68100,1,-4.705,1,...,0.000000,0.0615,0.706,139.057,audio_features,spotify:track:2FDTHlrBguDzQkp7PVj16Q,https://api.spotify.com/v1/tracks/2FDTHlrBguDz...,https://api.spotify.com/v1/audio-analysis/2FDT...,229133,4
3,3,Its time,5KI7I4mEtulXcv5VQJaV35,just like me,Rap,0.701,0.65300,1,-4.695,1,...,0.000028,0.3060,0.505,82.984,audio_features,spotify:track:5KI7I4mEtulXcv5VQJaV35,https://api.spotify.com/v1/tracks/5KI7I4mEtulX...,https://api.spotify.com/v1/audio-analysis/5KI7...,231338,4
4,4,Its time,4yLyVdEqV790aIXyGif85v,red sky,Rap,0.474,0.62100,7,-8.253,0,...,0.000000,0.1600,0.396,114.314,audio_features,spotify:track:4yLyVdEqV790aIXyGif85v,https://api.spotify.com/v1/tracks/4yLyVdEqV790...,https://api.spotify.com/v1/audio-analysis/4yLy...,176835,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,284,Old,6crfO56bDm0RjpctUuGs5X,I'm In The Mood For Love,Old,0.580,0.05450,9,-14.594,0,...,0.000265,0.0967,0.177,123.341,audio_features,spotify:track:6crfO56bDm0RjpctUuGs5X,https://api.spotify.com/v1/tracks/6crfO56bDm0R...,https://api.spotify.com/v1/audio-analysis/6crf...,149827,4
285,285,Old,4l9hml2UCnxoNI3yCdL1BW,My Funny Valentine,Old,0.438,0.00756,0,-26.440,0,...,0.006430,0.1090,0.277,133.320,audio_features,spotify:track:4l9hml2UCnxoNI3yCdL1BW,https://api.spotify.com/v1/tracks/4l9hml2UCnxo...,https://api.spotify.com/v1/audio-analysis/4l9h...,141133,4
286,286,Old,7Kqk5EpwqiukPkDutSFFZk,Did I Remember,Old,0.677,0.26400,8,-9.111,1,...,0.006010,0.0825,0.438,120.974,audio_features,spotify:track:7Kqk5EpwqiukPkDutSFFZk,https://api.spotify.com/v1/tracks/7Kqk5Epwqiuk...,https://api.spotify.com/v1/audio-analysis/7Kqk...,206653,4
287,287,Old,1sJnKCMOkGvcRPes8Tln2x,My Kind of Night,Old,0.840,0.17800,10,-11.133,1,...,0.000000,0.1130,0.529,114.412,audio_features,spotify:track:1sJnKCMOkGvcRPes8Tln2x,https://api.spotify.com/v1/tracks/1sJnKCMOkGvc...,https://api.spotify.com/v1/audio-analysis/1sJn...,123711,4


In [18]:
drop_columns = ['Unnamed: 0', 'playlist', 'type', 'id', 'name', 'uri', 'track_href', 'analysis_url']

df_knn = df.drop(columns=drop_columns)

df_knn

Unnamed: 0,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Rap,0.652,0.80600,5,-5.707,0,0.3020,0.1220,0.000011,0.8420,0.779,159.947,198293,4
1,Rap,0.818,0.51200,5,-9.056,0,0.0884,0.0963,0.037100,0.1100,0.153,131.974,143719,4
2,Rap,0.918,0.68100,1,-4.705,1,0.2010,0.2630,0.000000,0.0615,0.706,139.057,229133,4
3,Rap,0.701,0.65300,1,-4.695,1,0.1050,0.4790,0.000028,0.3060,0.505,82.984,231338,4
4,Rap,0.474,0.62100,7,-8.253,0,0.1030,0.1870,0.000000,0.1600,0.396,114.314,176835,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,Old,0.580,0.05450,9,-14.594,0,0.0473,0.9630,0.000265,0.0967,0.177,123.341,149827,4
285,Old,0.438,0.00756,0,-26.440,0,0.0382,0.9410,0.006430,0.1090,0.277,133.320,141133,4
286,Old,0.677,0.26400,8,-9.111,1,0.0357,0.9040,0.006010,0.0825,0.438,120.974,206653,4
287,Old,0.840,0.17800,10,-11.133,1,0.0700,0.9730,0.000000,0.1130,0.529,114.412,123711,4


## Scale features to ensure fairness

In [19]:
from sklearn.preprocessing import StandardScaler

In [20]:
scaler = StandardScaler()

In [21]:
scaler.fit(df_knn.drop('genre', axis = 1))

In [22]:
scaled_features = scaler.transform(df_knn.drop('genre', axis = 1))

In [23]:
df_feat = pd.DataFrame(scaled_features, columns = df_knn.columns[1:])
df_feat

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0.298154,1.909276,0.014842,0.928430,-1.266335,1.970345,-1.320475,-0.409573,6.097448,2.244916,1.484358,-0.008676,0.32655
1,1.212476,0.673546,0.014842,0.446438,-1.266335,-0.122286,-1.392112,-0.277711,-0.472223,-1.014116,0.607365,-0.770294,0.32655
2,1.763272,1.383880,-1.128989,1.072639,0.789681,0.980852,-0.927450,-0.409612,-0.907508,1.864869,0.829427,0.421717,0.32655
3,0.568044,1.266192,-1.128989,1.074078,0.789681,0.040344,-0.325370,-0.409511,1.286869,0.818438,-0.928541,0.452489,0.32655
4,-0.682263,1.131691,0.586758,0.562007,-1.266335,0.020750,-1.139293,-0.409612,-0.023475,0.250971,0.053699,-0.308137,0.32655
...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,-0.098419,-1.249400,1.158674,-0.350597,-1.266335,-0.524941,1.023737,-0.408669,-0.591590,-0.889170,0.336708,-0.685052,0.32655
285,-0.880550,-1.446696,-1.414947,-2.055487,-1.266335,-0.614093,0.962414,-0.386751,-0.481198,-0.368557,0.649564,-0.806383,0.32655
286,0.435853,-0.368838,0.872716,0.438522,0.789681,-0.638586,0.859280,-0.388244,-0.719034,0.469628,0.262499,0.107993,0.32655
287,1.333651,-0.730309,1.444631,0.147514,0.789681,-0.302550,1.051611,-0.409612,-0.445298,0.943385,0.056771,-1.049519,0.32655


In [24]:
from sklearn.model_selection import train_test_split

## Split, train and test our training dataset using xgboost

In [72]:
from sklearn.model_selection import train_test_split

In [93]:
X = df_feat

genre_mapping = {
    'Rap': 0,
    'RnB': 1,
    'Classical': 2,
    'Covers': 3,
    'Old': 4,
    'Easy': 5
}

# Apply the mapping to the 'genre' column
y = df_knn['genre'].map(genre_mapping)

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2024)

In [75]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the XGBoost model
xgb_model = xgb.XGBClassifier(eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)

In [64]:
# Make predictions and calculate accuracy
pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 60.34%


## How did our model do?

In [65]:
from sklearn.metrics import classification_report, confusion_matrix

In [68]:
print(confusion_matrix(y_test, pred))

[[11  2  0  0  0  1]
 [ 2  5  0  0  0  7]
 [ 0  0  6  0  0  0]
 [ 0  0  0  2  0  3]
 [ 0  0  0  3  4  0]
 [ 3  1  0  0  1  7]]


In [69]:
print(classification_report(y_test, pred, zero_division=0))
correct_predictions = (y_test == pred).sum()
incorrect_predictions = (y_test != pred).sum()

print(f"Correct predictions: {correct_predictions}")
print(f"Incorrect predictions: {incorrect_predictions}")


              precision    recall  f1-score   support

           0       0.69      0.79      0.73        14
           1       0.62      0.36      0.45        14
           2       1.00      1.00      1.00         6
           3       0.40      0.40      0.40         5
           4       0.80      0.57      0.67         7
           5       0.39      0.58      0.47        12

    accuracy                           0.60        58
   macro avg       0.65      0.62      0.62        58
weighted avg       0.63      0.60      0.60        58

Correct predictions: 35
Incorrect predictions: 23


## Hyperparameter tuning

In [78]:
from sklearn.model_selection import RandomizedSearchCV

# Define the parameter distribution for RandomizedSearchCV
param_dist = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.5],
    'min_child_weight': [1, 5, 10]
}

# Initialize the XGBoost classifier without early stopping and eval set for now
xgb_model = xgb.XGBClassifier(eval_metric='mlogloss')

# Use RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist, 
                                   n_iter=50, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)

# Fit the model using RandomizedSearchCV
random_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", random_search.best_params_)
print("Best Accuracy:", random_search.best_score_)

# Evaluate on the test set
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Best Parameters: {'subsample': 0.8, 'n_estimators': 200, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 1.0}
Best Accuracy: 0.6017316017316018
Test Accuracy: 62.07%


In [79]:
## Now use these parameters to retrain the model

In [87]:
# Best parameters from the RandomizedSearchCV
best_params = {
    'subsample': 0.8,
    'n_estimators': 200,
    'min_child_weight': 1,
    'max_depth': 3,
    'learning_rate': 0.1,
    'gamma': 0.1,
    'colsample_bytree': 1.0
}

# Initialize the final model with the best parameters
final_model = xgb.XGBClassifier(**best_params, eval_metric='mlogloss')

# Train the model with early stopping by using evals parameter
final_model.fit(X_train, y_train, 
                eval_set=[(X_test, y_test)], 
                verbose=False)

# Make predictions on the test set
pred = final_model.predict(X_test)

# Calculate accuracy
final_accuracy = accuracy_score(y_test, pred)
print(f'Final Test Accuracy: {final_accuracy * 100:.2f}%')

Final Test Accuracy: 62.07%


## How did our model do?

In [89]:
print(confusion_matrix(y_test, pred))

[[11  2  0  0  0  1]
 [ 3  5  0  0  0  6]
 [ 0  0  6  0  0  0]
 [ 0  0  0  3  0  2]
 [ 0  0  0  3  4  0]
 [ 2  2  0  0  1  7]]


In [90]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.69      0.79      0.73        14
           1       0.56      0.36      0.43        14
           2       1.00      1.00      1.00         6
           3       0.50      0.60      0.55         5
           4       0.80      0.57      0.67         7
           5       0.44      0.58      0.50        12

    accuracy                           0.62        58
   macro avg       0.66      0.65      0.65        58
weighted avg       0.63      0.62      0.62        58



In [91]:
correct_predictions = (y_test == pred).sum()
incorrect_predictions = (y_test != pred).sum()

print(f"Correct predictions: {correct_predictions}")
print(f"Incorrect predictions: {incorrect_predictions}")


Correct predictions: 36
Incorrect predictions: 22


## Importing our test dataset and dropping irrelevant features/columns.
## Then test our model using this test dataset and see how it does

In [104]:
test = pd.read_csv('test.csv')
test
features = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms', 'time_signature', 'genre'
]

df_testing = test[~test['track_name'].isin(df.name)].copy()
df_test = df_testing[features].copy()

scaled_features_test = scaler.transform(df_test.drop('genre', axis = 1))
X = pd.DataFrame(scaled_features_test, columns = df_test.columns[:-1])

genre_mapping = {
    'Rap': 0,
    'RnB': 1,
    'Classical': 2,
    'Covers': 3,
    'Old': 4,
    'Easy': 5
}

# Apply the mapping to the 'genre' column
y = df_test['genre'].map(genre_mapping)

In [105]:
pred_test = final_model.predict(X)

In [106]:
print(confusion_matrix(y, pred_test))

[[ 74  10   0   0   0   7]
 [ 33  29   0   0   0  25]
 [  0   0 100   0   0   0]
 [  0   1   2  31   9  53]
 [  4   5   5  23  22  40]
 [  4   9   1   8   4  45]]


In [107]:
print(classification_report(y, pred_test))

              precision    recall  f1-score   support

           0       0.64      0.81      0.72        91
           1       0.54      0.33      0.41        87
           2       0.93      1.00      0.96       100
           3       0.50      0.32      0.39        96
           4       0.63      0.22      0.33        99
           5       0.26      0.63      0.37        71

    accuracy                           0.55       544
   macro avg       0.58      0.55      0.53       544
weighted avg       0.60      0.55      0.54       544



In [110]:
correct_predictions = (y == pred_test).sum()
incorrect_predictions = (y != pred_test).sum()

print(f"Correct predictions: {correct_predictions}")
print(f"Incorrect predictions: {incorrect_predictions}")


Correct predictions: 301
Incorrect predictions: 243


In [123]:
# Get the probabilities for the test set
y_proba = final_model.predict_proba(X)

# Create a DataFrame with the predicted probabilities
prob_df = pd.DataFrame(y_proba, columns=['Rap', 'RnB', 'Classical', 'Covers', 'Old', 'Easy'])

# Round the probabilities to 2 decimal places
prob_df = prob_df.round(2)

# Add the track name and real genre to the DataFrame
prob_df['track_name'] = df_testing['track_name'].values
prob_df['real_genre'] = df_testing['genre'].values

# Reorder the columns to have 'track_name' as the first column and 'real_genre' as the last column
prob_df = prob_df[['track_name'] + prob_df.columns[:-2].tolist() + ['real_genre']]

# Display the first few rows to make sure it looks good
prob_df

Unnamed: 0,track_name,Rap,RnB,Classical,Covers,Old,Easy,real_genre
0,Not Like Us,0.63,0.01,0.00,0.00,0.00,0.36,Rap
1,Breathe,0.66,0.28,0.02,0.01,0.00,0.02,Rap
2,BANDIT,0.94,0.05,0.00,0.00,0.00,0.01,Rap
3,CARNIVAL,0.99,0.00,0.00,0.00,0.00,0.01,Rap
4,Push Ups,0.99,0.00,0.00,0.00,0.00,0.01,Rap
...,...,...,...,...,...,...,...,...
539,Let My Baby Stay,0.00,0.53,0.07,0.03,0.13,0.24,Easy
540,Sky is the Limit,0.00,0.00,0.00,0.61,0.13,0.25,Easy
541,4EVER,0.09,0.20,0.00,0.02,0.01,0.68,Easy
542,Chinese Satellite,0.00,0.38,0.34,0.04,0.01,0.23,Easy


## Let's see which songs are being put in the wrong genre and see if it make sense

In [125]:
# Predict the genres for the test set
pred_test = final_model.predict(X)
prob_df['predicted_genre'] = pred_test

# Find mismatched predictions
mismatched_predictions = prob_df[prob_df['real_genre'] != prob_df['predicted_genre']]

# Display the mismatched predictions
mismatched_predictions


Unnamed: 0,track_name,Rap,RnB,Classical,Covers,Old,Easy,real_genre,predicted_genre
0,Not Like Us,0.63,0.01,0.00,0.00,0.00,0.36,Rap,0
1,Breathe,0.66,0.28,0.02,0.01,0.00,0.02,Rap,0
2,BANDIT,0.94,0.05,0.00,0.00,0.00,0.01,Rap,0
3,CARNIVAL,0.99,0.00,0.00,0.00,0.00,0.01,Rap,0
4,Push Ups,0.99,0.00,0.00,0.00,0.00,0.01,Rap,0
...,...,...,...,...,...,...,...,...,...
539,Let My Baby Stay,0.00,0.53,0.07,0.03,0.13,0.24,Easy,1
540,Sky is the Limit,0.00,0.00,0.00,0.61,0.13,0.25,Easy,3
541,4EVER,0.09,0.20,0.00,0.02,0.01,0.68,Easy,5
542,Chinese Satellite,0.00,0.38,0.34,0.04,0.01,0.23,Easy,1
