In [2]:
# working with data
import numpy as np
import pandas as pd
# data visualization
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors
# modelling
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import pickle

## Data Preparation

In [3]:
# Load the songs table; index_col=False stops pandas from using the first column as the index
songs = pd.read_csv("songs.csv", index_col=False)
# Drop the columns that are not used as model features. time_signature and mode
# are left out as well, mirroring what was done for the random forest model.
df_no_genres = songs.drop(
    columns=[
        'track_id',
        'track_name',
        'genres',
        'time_signature',
        'mode',
    ]
)

In [4]:
# Separate the feature matrix from the 'user_like' target column
X = df_no_genres.drop(columns='user_like')
y = df_no_genres['user_like']
# Shuffle features and target together with a fixed seed
X, y = shuffle(X, y, random_state=1234)
# Hold out 20% of the rows for testing, stratified on y so both parts keep
# the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=123,
)

In [5]:
# Baseline classifier: XGBoost with its default hyperparameters; only the seed is fixed
base_model = xgb.XGBClassifier(random_state=123)

In [6]:
# Train the untuned baseline on the training split
base_model.fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred = base_model.predict(X_test)

# Report baseline accuracy and per-class metrics on the test split
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Base Model Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))

Base Model Accuracy: 0.7628865979381443

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.81      0.80        57
           1       0.72      0.70      0.71        40

    accuracy                           0.76        97
   macro avg       0.76      0.75      0.75        97
weighted avg       0.76      0.76      0.76        97



In [7]:
# Candidate hyperparameter values explored by the first search
param_grid = dict(
    n_estimators=[400],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    subsample=[0.5, 0.7, 0.9],
    colsample_bytree=[0.5, 0.7, 0.9],
)

In [8]:
# Randomized search over param_grid with 5-fold cross-validation, scored by accuracy.
# Every entry of param_grid is a list, so the search space is finite. Requesting
# more iterations than there are combinations only makes scikit-learn emit a
# warning and fall back to the grid size, so n_iter is capped at that size here.
n_candidates = int(np.prod([len(values) for values in param_grid.values()]))
grid_search = RandomizedSearchCV(estimator=base_model,
                                 param_distributions=param_grid,
                                 n_iter=min(100, n_candidates),
                                 random_state=123,
                                 cv=5,
                                 scoring='accuracy')

In [9]:
# Run the cross-validated hyperparameter search on the training split
grid_search.fit(X=X_train, y=y_train)



In [10]:
# Pull the winning estimator and its hyperparameters out of the finished search
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Display the selected hyperparameters
best_params

{'subsample': 0.5,
 'n_estimators': 400,
 'max_depth': 5,
 'learning_rate': 0.01,
 'colsample_bytree': 0.5}

In [11]:
# Predict the test split with the estimator selected by the search
y_pred = best_model.predict(X_test)

# Report the selected hyperparameters together with the test-set metrics
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Best Hyperparameters:", best_params)
print("Best Model Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))

Best Hyperparameters: {'subsample': 0.5, 'n_estimators': 400, 'max_depth': 5, 'learning_rate': 0.01, 'colsample_bytree': 0.5}
Best Model Accuracy: 0.7628865979381443

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.81      0.80        57
           1       0.72      0.70      0.71        40

    accuracy                           0.76        97
   macro avg       0.76      0.75      0.75        97
weighted avg       0.76      0.76      0.76        97



In [12]:
# Narrower grid for the second search: max_depth is pinned to a single value and
# smaller learning_rate / subsample / colsample_bytree values are tried
param_grid2 = dict(
    n_estimators=[400],
    learning_rate=[0.001, 0.005, 0.01],
    max_depth=[5],
    subsample=[0.3, 0.4, 0.5],
    colsample_bytree=[0.3, 0.4, 0.5],
)

In [13]:
# Fresh default-configured XGBoost classifier (fixed seed) used as the estimator for the second search
base_model2 = xgb.XGBClassifier(random_state=123)

In [14]:
# Exhaustive search over param_grid2 with 10-fold cross-validation, scored by accuracy
grid_search2 = GridSearchCV(
    estimator=base_model2,
    param_grid=param_grid2,
    scoring='accuracy',
    cv=10,
)

In [15]:
# Run the exhaustive cross-validated search on the training split
grid_search2.fit(X=X_train, y=y_train)

# Keep the winning estimator and its hyperparameters
best_model2 = grid_search2.best_estimator_
best_params2 = grid_search2.best_params_

In [16]:
# Display the hyperparameters selected by the narrower grid search
best_params2

{'colsample_bytree': 0.5,
 'learning_rate': 0.01,
 'max_depth': 5,
 'n_estimators': 400,
 'subsample': 0.5}

In [17]:
# Predict the test split with the estimator selected by the narrower grid search
y_pred2 = best_model2.predict(X_test)

# Score the predictions of this model (y_pred2). Scoring y_pred here would
# silently re-report the metrics of the previously evaluated model.
accuracy2 = accuracy_score(y_test, y_pred2)
print("Best Hyperparameters:", best_params2)
print("Best Model Accuracy:", accuracy2)
print("\nClassification Report:\n", classification_report(y_test, y_pred2))

Best Hyperparameters: {'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 400, 'subsample': 0.5}
Best Model Accuracy: 0.7628865979381443

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.81      0.80        57
           1       0.72      0.70      0.71        40

    accuracy                           0.76        97
   macro avg       0.76      0.75      0.75        97
weighted avg       0.76      0.76      0.76        97



In [21]:
# Refit the selected estimator on the whole dataset (train and test rows).
# NOTE: this refits best_model2 in place, so any test-set metric computed from
# it after this point is no longer a held-out evaluation.
best_model2.fit(X, y)
# Write the fitted model to disk; the context manager closes the file handle
# even if pickling raises.
with open('xgb.sav', 'wb') as model_file:
    pickle.dump(best_model2, model_file)