This notebook trains a few models on the df2 dataset.  In particular, we consider a random forest classifier and a XGBoost classifier, wrapped in a MultiOutputClassifier for the purpose of multi-label classification.

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# We load up the three DataFrames constructed from the data analysis notebook.  However, as indicated in that notebook,
# we will only focus on df2.

df1 = pd.read_pickle('../electronic_df_EDAv1.pkl')
df2 = pd.read_pickle('../electronic_df_EDAv2.pkl')
df3 = pd.read_pickle('../electronic_df_EDAv3.pkl')

In [4]:
# preparing the data to be fed into our model
labels = ['mbdata.techno', 'mbdata.house', 'mbdata.trance', 'mbdata.dnb']
drop_cols = ['mbdata.id', 'mbdata.title', 'mbdata.artist-name', 'mbdata.artist-id',
             'mbdata.all-tags', 'mbdata.genre']

X1 = df1.drop(columns=drop_cols+labels)
y1 = df1[labels]
X2 = df2.drop(columns=drop_cols+labels)
y2 = df2[labels]
X3 = df3.drop(columns=drop_cols+labels)
y3 = df3[labels]

In [5]:
# train test split
from sklearn.model_selection import train_test_split

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=415)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=415)
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.2, random_state=415)

### Dummy Classifier

In [8]:
from sklearn.dummy import DummyClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report

In [7]:
# We train a dummy classifier as a baseline model.
# The stratified strategy predicts based on the distribution of labels in
# our dataset.

dummy = DummyClassifier(strategy="stratified", random_state=415)
dummy_multi = MultiOutputClassifier(dummy, n_jobs=-1)
dummy_multi.fit(X2_train, y2_train)

y2_pred_dummy = dummy_multi.predict(X2_test)


In [9]:
# Evaluation metrics
tot_acc = (y2_test == y2_pred_dummy).values.flatten().mean()
genre_acc = (y2_test == y2_pred_dummy).mean()
report = classification_report(y2_test, y2_pred_dummy, target_names=labels)

print(f'Total Accuracy: {tot_acc}')
print()
print(genre_acc)
print(report)

Total Accuracy: 0.5828129242465382

mbdata.techno    0.582949
mbdata.house     0.523894
mbdata.trance    0.591501
mbdata.dnb       0.632908
dtype: float64
               precision    recall  f1-score   support

mbdata.techno       0.30      0.30      0.30      2207
 mbdata.house       0.40      0.41      0.41      2943
mbdata.trance       0.28      0.28      0.28      2111
   mbdata.dnb       0.24      0.24      0.24      1769

    micro avg       0.32      0.32      0.32      9030
    macro avg       0.31      0.31      0.31      9030
 weighted avg       0.32      0.32      0.32      9030
  samples avg       0.13      0.31      0.17      9030



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report

In [7]:
# Performing a RandomSearchCV for hyperparameter tuning

from sklearn.model_selection import RandomizedSearchCV

base_estimator = RandomForestClassifier(random_state=415)

param_distributions = {
    'estimator__n_estimators': [100, 200, 500],
    'estimator__max_depth': [None, 10, 20],
    'estimator__min_samples_split': [2, 5, 10],
    'estimator__min_samples_leaf': [1, 2, 4],
    'estimator__max_features': ['auto', 'sqrt', 'log2']
}

multi_output_model = MultiOutputClassifier(base_estimator, n_jobs=-1)


random_search_rf = RandomizedSearchCV(estimator=multi_output_model, 
                                   param_distributions=param_distributions, 
                                   n_iter=10, cv=3, verbose=2, n_jobs=-1, random_state=42)
random_search_rf.fit(X2_train, y2_train)

print(random_search_rf.best_params_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


12 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "c:\Users\kling\anaconda3\envs\erdos_may_2024\lib\site-packages\joblib\_utils.py", line 72, in __call__
    return self.func(**kwargs)
  File "c:\Users\kling\anaconda3\envs\erdos_may_2024\lib\site-packages\joblib\parallel.py", line 598, in __call__
    return [func(*args, **kwargs)
  File "c:\Users\kling\anaconda3\envs\erdos_may_2024\lib\site-packages\joblib\parallel.py", line 598, in <listcomp>
    return [func(*args, **kwargs)
  File "c:\Users\kling\anaconda3\envs\erdos_may_2024\lib\site-packages\sklearn\utils\

{'estimator__n_estimators': 500, 'estimator__min_samples_split': 10, 'estimator__min_samples_leaf': 4, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 20}


For posterity, the best parameters from the above RandomSearchCV are:
{'estimator__n_estimators': 500, 'estimator__min_samples_split': 10, 'estimator__min_samples_leaf': 4, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 20}

In [8]:
# Using the model with best_params
rf = random_search_rf.best_estimator_
y2_pred_rf = rf.predict(X2_test)

In [12]:
# Evaluation metrics
tot_acc = (y2_test == y2_pred_rf).values.flatten().mean()
genre_acc = (y2_test == y2_pred_rf).mean()
report = classification_report(y2_test, y2_pred_rf, target_names=labels)

print(f'Total Accuracy: {tot_acc}')
print()
print(genre_acc)
print(report)

Total Accuracy: 0.8206625033939723

mbdata.techno    0.784686
mbdata.house     0.757942
mbdata.trance    0.853516
mbdata.dnb       0.886506
dtype: float64
               precision    recall  f1-score   support

mbdata.techno       0.76      0.41      0.53      2207
 mbdata.house       0.79      0.54      0.64      2943
mbdata.trance       0.91      0.54      0.68      2111
   mbdata.dnb       0.91      0.59      0.71      1769

    micro avg       0.84      0.52      0.64      9030
    macro avg       0.84      0.52      0.64      9030
 weighted avg       0.83      0.52      0.64      9030
  samples avg       0.60      0.56      0.57      9030



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### XGBoost

In [5]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier

In [None]:
# Base estimator
xgb_model = XGBClassifier(objective='binary:logistic', 
                          eval_metric='logloss', 
                          use_label_encoder=False,
                          tree_method='gpu_hist', # using GPU
                          predictor='gpu_predictor')

# Parameter grid for RandomizedSearchCV
param_distributions = {
    'estimator__n_estimators': [100, 200, 500],
    'estimator__max_depth': [3, 5, 7, 10],
    'estimator__learning_rate': [0.01, 0.1, 0.2],
    'estimator__subsample': [0.6, 0.8, 1.0],
    'estimator__colsample_bytree': [0.6, 0.8, 1.0]
}

# Wrap XGBClassifier with MultiOutputClassifier
multi_output_model = MultiOutputClassifier(xgb_model, n_jobs=-1)

# Set up RandomizedSearchCV
xgb_random_search = RandomizedSearchCV(
    estimator=multi_output_model,
    param_distributions=param_distributions,
    n_iter=20,  # Number of parameter settings sampled
    cv=3,  # 3-fold cross-validation
    verbose=2,  # Verbosity level
    n_jobs=-1,  # Use all processors
    random_state=42
)

# Fit RandomizedSearchCV to the training data
xgb_random_search.fit(X2_train, y2_train)

After running the RandomizedSearchCV, we obtain parameters:
Best Parameters: {'estimator__subsample': 1.0, 'estimator__n_estimators': 500, 'estimator__max_depth': 10, 'estimator__learning_rate': 0.1, 'estimator__colsample_bytree': 0.8}

In [6]:
# Running XGBClassifier on the above best parameters.
# We will do this manually to not re-run the RandomizedSearchCV.

xgb_model = XGBClassifier(objective='binary:logistic', 
                          eval_metric='logloss', 
                          subsample=1.0,
                          n_estimators=500,
                          max_depth=10,
                          learning_rate=0.1,
                          colsample_bytree=0.8,
                          use_label_encoder=False,
                          random_state=415
                          )

xgb_model_multi = MultiOutputClassifier(xgb_model, n_jobs=-1)

xgb_model_multi.fit(X2_train, y2_train)

In [7]:
y2_pred_xgb = xgb_model_multi.predict(X2_test)

In [9]:
# Evaluation metrics
from sklearn.metrics import classification_report

tot_acc = (y2_test == y2_pred_xgb).values.flatten().mean()
genre_acc = (y2_test == y2_pred_xgb).mean()
report = classification_report(y2_test, y2_pred_xgb, target_names=labels)

print(f'Total Accuracy: {tot_acc}')
print()
print(genre_acc)
print(report)

Total Accuracy: 0.8458457778984524

mbdata.techno    0.802742
mbdata.house     0.789438
mbdata.trance    0.882297
mbdata.dnb       0.908906
dtype: float64
               precision    recall  f1-score   support

mbdata.techno       0.76      0.50      0.60      2207
 mbdata.house       0.77      0.67      0.72      2943
mbdata.trance       0.87      0.70      0.77      2111
   mbdata.dnb       0.87      0.73      0.79      1769

    micro avg       0.81      0.65      0.72      9030
    macro avg       0.82      0.65      0.72      9030
 weighted avg       0.81      0.65      0.72      9030
  samples avg       0.73      0.69      0.69      9030



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression
from scipy.stats import uniform


In [15]:
# Initialize the base logistic regression model
logistic = LogisticRegression(solver='liblinear')

# Wrap it with MultiOutputClassifier
multi_target_lr = MultiOutputClassifier(logistic, n_jobs=-1)

# Define the parameter distribution for RandomizedSearchCV
param_distributions = {
    'estimator__C': uniform(loc=0, scale=4),  # C parameter for Logistic Regression
    'estimator__penalty': ['l1', 'l2'],  # Penalty type
}

# Set up RandomizedSearchCV
random_search_lr = RandomizedSearchCV(
    multi_target_lr, 
    param_distributions=param_distributions, 
    n_iter=10,  # Number of iterations
    cv=3,  # 5-fold cross-validation
    scoring='accuracy',  # Evaluation metric
    n_jobs=-1,  # Use all available cores
    random_state=415
)

random_search_lr.fit(X2, y2)

print("Best parameters found:", random_search_lr.best_params_)


In [None]:
random_search_lr.fit(X2, y2)

In [98]:
y2_pred_lr = multi_output_logi.predict(X2_test.values)

In [99]:
(y2_test == y2_pred_logi).mean()

mbdata.techno    0.700380
mbdata.house     0.600462
mbdata.trance    0.713413
mbdata.dnb       0.759843
dtype: float64

In [105]:
(y2_test == y2_pred_logi).values.flatten().mean()

0.6935243008417051

In [96]:
classification_report(y2_test, y2_pred_logi, zero_division=0, target_names=labels)

'               precision    recall  f1-score   support\n\nmbdata.techno       0.00      0.00      0.00      2207\n mbdata.house       0.00      0.00      0.00      2943\nmbdata.trance       0.00      0.00      0.00      2111\n   mbdata.dnb       0.00      0.00      0.00      1769\n\n    micro avg       0.00      0.00      0.00      9030\n    macro avg       0.00      0.00      0.00      9030\n weighted avg       0.00      0.00      0.00      9030\n  samples avg       0.00      0.00      0.00      9030\n'

In [108]:
# trying a different threshold
y2_pred_logi_prob = multi_output_logi.predict_proba(X2_test.values)



In [115]:
thresholds = [0.5, 0.5, 0.5, 0.5]

y2_pred_logi_thres = np.array([
    (probs[:, 1] >= thresholds[i]).astype(int)
    for i, probs in enumerate(y2_pred_logi_prob)
]).T

In [116]:
print((y2_pred_logi_thres == y2_test).mean())

print(classification_report(y2_test, y2_pred_logi_thres))

mbdata.techno    0.700380
mbdata.house     0.600462
mbdata.trance    0.713413
mbdata.dnb       0.759843
dtype: float64
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2207
           1       0.00      0.00      0.00      2943
           2       0.00      0.00      0.00      2111
           3       0.00      0.00      0.00      1769

   micro avg       0.00      0.00      0.00      9030
   macro avg       0.00      0.00      0.00      9030
weighted avg       0.00      0.00      0.00      9030
 samples avg       0.00      0.00      0.00      9030



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Random Guessing

In [118]:
def generate_weighted_random_predictions(X, label_probabilities):
    """
    Generate random binary predictions for multi-label classification based on label probabilities.
    
    Parameters:
    - X: DataFrame or array-like, shape (n_samples, n_features)
    - label_probabilities: list or array, shape (num_labels,), probabilities of each label being assigned
    
    Returns:
    - y_pred_weighted: array, shape (n_samples, num_labels)
    """
    num_samples = X.shape[0]
    num_labels = len(label_probabilities)
    
    # Generate random binary predictions based on label probabilities
    y_pred_weighted = np.zeros((num_samples, num_labels))
    
    for i in range(num_samples):
        for j in range(num_labels):
            if np.random.rand() < label_probabilities[j]:
                y_pred_weighted[i, j] = 1
    
    return y_pred_weighted

In [126]:
techno_pro = (y2['mbdata.techno'].sum())/len(y2)
house_pro = (y2['mbdata.house'].sum())/len(y2)
trance_pro = (y2['mbdata.trance'].sum())/len(y2)
dnb_pro = (y2['mbdata.dnb'].sum())/len(y2)


In [127]:
label_probabilities = [techno_pro, house_pro, trance_pro, dnb_pro]
y2_pred_rand = generate_weighted_random_predictions(X2_test, label_probabilities)

In [128]:
(y2_pred_rand == y2_test).mean()

mbdata.techno    0.576025
mbdata.house     0.518463
mbdata.trance    0.593402
mbdata.dnb       0.638746
dtype: float64

In [129]:
classification_report(y2_test, y2_pred_rand, target_names=labels)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


'               precision    recall  f1-score   support\n\nmbdata.techno       0.29      0.29      0.29      2207\n mbdata.house       0.40      0.42      0.41      2943\nmbdata.trance       0.29      0.28      0.28      2111\n   mbdata.dnb       0.25      0.25      0.25      1769\n\n    micro avg       0.32      0.32      0.32      9030\n    macro avg       0.31      0.31      0.31      9030\n weighted avg       0.32      0.32      0.32      9030\n  samples avg       0.25      0.32      0.26      9030\n'