In [19]:
# working with data
import numpy as np
import pandas as pd
# data visualization
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors
# modelling
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import pickle

## Data Preparation

In [2]:
# Read in dataset from csv
songs = pd.read_csv("songs.csv", index_col=False)
df_no_genres = songs.drop(columns=['track_id', 'track_name', 'genres', 'time_signature'])

In [3]:
# Split dataset into train and test
y = df_no_genres['user_like']
X = df_no_genres.drop(columns='user_like')
X, y = shuffle(X, y, random_state=1234)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [4]:
# Using default settings
base_model = xgb.XGBClassifier(random_state = 123)

In [5]:
# Fit the model with hyperparameter tuning
base_model.fit(X_train, y_train)

# Make predictions on the test set using the best model
y_pred = base_model.predict(X_test)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)
print("Base Model Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Base Model Accuracy: 0.7216494845360825

Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.80      0.71        41
           1       0.82      0.66      0.73        56

    accuracy                           0.72        97
   macro avg       0.73      0.73      0.72        97
weighted avg       0.74      0.72      0.72        97



In [6]:
# Define the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.5, 0.7, 0.9],
    'colsample_bytree': [0.5, 0.7, 0.9],
}

In [7]:
grid_search = RandomizedSearchCV(estimator = base_model,
                                 param_distributions = param_grid,
                                 n_iter = 100,                                 
                                 random_state = 123,
                                 cv = 3,
                                 scoring = 'accuracy')

In [8]:
# Fit the model with hyperparameter tuning
grid_search.fit(X_train, y_train)

In [9]:
# Get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

best_params

{'subsample': 0.9,
 'n_estimators': 50,
 'max_depth': 5,
 'learning_rate': 0.1,
 'colsample_bytree': 0.7}

In [10]:
# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)
print("Best Hyperparameters:", best_params)
print("Best Model Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Best Hyperparameters: {'subsample': 0.9, 'n_estimators': 50, 'max_depth': 5, 'learning_rate': 0.1, 'colsample_bytree': 0.7}
Best Model Accuracy: 0.7525773195876289

Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.78      0.73        41
           1       0.82      0.73      0.77        56

    accuracy                           0.75        97
   macro avg       0.75      0.76      0.75        97
weighted avg       0.76      0.75      0.75        97



In [13]:
# Narrower gridsearch
# Define the hyperparameter grid
param_grid2 = {
    'n_estimators': [100],
    'learning_rate': [0.1],
    'max_depth': [4, 5],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
}

In [14]:
# Using default settings
base_model2 = xgb.XGBClassifier(random_state = 123)

In [15]:
grid_search2 = GridSearchCV(estimator=base_model2,
                           param_grid=param_grid2,
                           cv=10,
                           scoring='accuracy')

In [16]:
# Fit the model with hyperparameter tuning
grid_search2.fit(X_train, y_train)

# Get the best parameters and model
best_params2 = grid_search2.best_params_
best_model2 = grid_search2.best_estimator_

In [17]:
best_params2

{'colsample_bytree': 0.8,
 'learning_rate': 0.1,
 'max_depth': 5,
 'n_estimators': 100,
 'subsample': 0.9}

In [18]:
# Make predictions on the test set using the best model
y_pred2 = best_model2.predict(X_test)

# Evaluate the best model
accuracy2 = accuracy_score(y_test, y_pred)
print("Best Hyperparameters:", best_params2)
print("Best Model Accuracy:", accuracy2)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Best Hyperparameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.9}
Best Model Accuracy: 0.7525773195876289

Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.78      0.73        41
           1       0.82      0.73      0.77        56

    accuracy                           0.75        97
   macro avg       0.75      0.76      0.75        97
weighted avg       0.76      0.75      0.75        97



In [21]:
# fit on whole dataset
best_model2.fit(X, y)
# write model to disk
pickle.dump(best_model2, open('xgb.sav', 'wb'))