In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
df1 = pd.read_pickle('electronic_df_EDAv1.pkl')
df2 = pd.read_pickle('electronic_df_EDAv2.pkl')
df3 = pd.read_pickle('electronic_df_EDAv3.pkl')

In [146]:
electronic_df = pd.read_pickle('cleaned_electronic_df_new.pkl')

In [147]:
electronic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36829 entries, 0 to 36828
Columns: 2304 entries, tonal.tuning_nontempered_energy_ratio to mbdata.dnb
dtypes: bool(4), float64(2294), object(6)
memory usage: 646.4+ MB


In [4]:
df1.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36829 entries, 0 to 36828
Data columns (total 613 columns):
 #    Column                                          Non-Null Count  Dtype  
---   ------                                          --------------  -----  
 0    lowlevel.average_loudness                       36829 non-null  float64
 1    lowlevel.barkbands_crest.dvar                   36829 non-null  float64
 2    lowlevel.barkbands_crest.max                    36829 non-null  float64
 3    lowlevel.barkbands_crest.min                    36829 non-null  float64
 4    lowlevel.barkbands_crest.var                    36829 non-null  float64
 5    lowlevel.barkbands_flatness_db.min              36829 non-null  float64
 6    lowlevel.barkbands_kurtosis.max                 36829 non-null  float64
 7    lowlevel.barkbands_kurtosis.mean                36829 non-null  float64
 8    lowlevel.barkbands_kurtosis.median              36829 non-null  float64
 9    lowlevel.barkbands_kurtosi

In [3]:
# preparing the data to be fed into our model
labels = ['mbdata.techno', 'mbdata.house', 'mbdata.trance', 'mbdata.dnb']
drop_cols = ['mbdata.id', 'mbdata.title', 'mbdata.artist-name', 'mbdata.artist-id',
             'mbdata.all-tags', 'mbdata.genre']

X1 = df1.drop(columns=drop_cols+labels)
y1 = df1[labels]
X2 = df2.drop(columns=drop_cols+labels)
y2 = df2[labels]
X3 = df3.drop(columns=drop_cols+labels)
y3 = df3[labels]

In [38]:
# train test split
from sklearn.model_selection import train_test_split

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=415)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=415)
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.2, random_state=415)

List of Models to Consider:
- XGBClassifier
- RandomForestClassifier
- SVM
- Naive Bayes

Performance Metrics
- Accuracy (not exact match)
- Precision/ Recall/ F1-Score
- Hamming Loss


### Random Forest

In [66]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import jaccard_score


In [39]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
multi_rf_classifier = MultiOutputClassifier(rf_classifier, n_jobs=-1)
multi_rf_classifier.fit(X2_train, y2_train)

In [None]:
y2_pred = multi_rf_classifier.predict(X2_test)
for i, label in enumerate(y2.columns):
    print(f"Label: {label}")
    print(classification_report(y2_test.iloc[:, i], y2_pred[:, i]))

In [None]:
# Performing a RandomSearchCV for hyperparameter tuning

from sklearn.model_selection import RandomizedSearchCV

base_estimator = RandomForestClassifier(random_state=42)

param_distributions = {
    'estimator__n_estimators': [100, 200, 500],
    'estimator__max_depth': [None, 10, 20],
    'estimator__min_samples_split': [2, 5, 10],
    'estimator__min_samples_leaf': [1, 2, 4],
    'estimator__max_features': ['auto', 'sqrt', 'log2']
}

multi_output_model = MultiOutputClassifier(base_estimator, n_jobs=-1)


random_search = RandomizedSearchCV(estimator=multi_output_model, 
                                   param_distributions=param_distributions, 
                                   n_iter=10, cv=3, verbose=2, n_jobs=-1, random_state=42)
random_search.fit(X2_train, y2_train)

print(random_search.best_params_)

In [47]:
# Using the model with best_params

best_model = random_search.best_estimator_
y2_pred = best_model.predict(X2_test)

In [78]:
(y2_test == y2_pred).values.flatten().mean()

0.8196103719793646

In [65]:
hamming_loss(y2_test, y2_pred)

0.18038962802063535

In [62]:
accuracy = accuracy_score(y2_test, y2_pred, normalize=True)
print(f'Accuracy: {accuracy}')
print(classification_report(y2_test, y2_pred, target_names=labels))

Accuracy: 0.46592451805593266
               precision    recall  f1-score   support

mbdata.techno       0.76      0.41      0.53      2207
 mbdata.house       0.78      0.53      0.63      2943
mbdata.trance       0.92      0.53      0.67      2111
   mbdata.dnb       0.91      0.59      0.71      1769

    micro avg       0.83      0.51      0.64      9030
    macro avg       0.84      0.52      0.64      9030
 weighted avg       0.83      0.51      0.63      9030
  samples avg       0.60      0.55      0.56      9030



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### XGBoost

In [None]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier

#### EDA Version 2

In [None]:
# Base estimator
xgb_model = XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)

# Parameter grid for RandomizedSearchCV
param_distributions = {
    'estimator__n_estimators': [100, 200, 500],
    'estimator__max_depth': [3, 5, 7, 10],
    'estimator__learning_rate': [0.01, 0.1, 0.2],
    'estimator__subsample': [0.6, 0.8, 1.0],
    'estimator__colsample_bytree': [0.6, 0.8, 1.0]
}

# Wrap XGBClassifier with MultiOutputClassifier
multi_output_model = MultiOutputClassifier(xgb_model, n_jobs=-1)

# Set up RandomizedSearchCV
xgb_random_search = RandomizedSearchCV(
    estimator=multi_output_model,
    param_distributions=param_distributions,
    n_iter=10,  # Number of parameter settings sampled
    cv=3,  # 3-fold cross-validation
    verbose=2,  # Verbosity level
    n_jobs=-1,  # Use all processors
    random_state=42
)

# Fit RandomizedSearchCV to the training data
xgb_random_search.fit(X2_train, y2_train)

# Print the best parameters found by RandomizedSearchCV
print("Best Parameters:", random_search.best_params_)

In [13]:

xgb_model = XGBClassifier(
    booster = 'gbtree',
    learning_rate = 0.2,
    max_depth = 8,
    subsample = 0.8,
    objective = 'binary:logistic',
    num_class = 4
    )

clf = MultiOutputClassifier(xgb_model)

clf.fit(X2_train, y2_train)


In [None]:
# hyperparameter tuning
xgb_model = XGBClassifier()

param_dist = {
    'estimator__learning_rate': uniform(0.01, 0.2),
    'estimator__max_depth': randint(3, 10),
    'estimator__n_estimators': randint(50, 200),
    'estimator__subsample': uniform(0.8, 1.0),
    'estimator__colsample_bytree': uniform(0.8, 1.0)
}

random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist,
                                   n_iter=50, scoring='f1_weighted', cv=3, verbose=1, random_state=42)


random_search.fit(X2_train, y2_train)

best_params = random_search.best_params_
print("Best parameters found: ", best_params)



In [17]:
best_model = random_search.best_estimator_
y2_pred = best_model.predict(X2_test)

In [22]:
accuracy = (y2_pred == y2_test).mean()
accuracy

mbdata.techno    0.798670
mbdata.house     0.768395
mbdata.trance    0.876324
mbdata.dnb       0.908363
dtype: float64

In [21]:
from sklearn.metrics import classification_report
print(classification_report(y2_test, y2_pred, target_names=labels))

               precision    recall  f1-score   support

mbdata.techno       0.71      0.52      0.60      2147
 mbdata.house       0.74      0.67      0.70      2995
mbdata.trance       0.84      0.70      0.76      2117
   mbdata.dnb       0.85      0.75      0.80      1757

    micro avg       0.78      0.66      0.71      9016
    macro avg       0.78      0.66      0.72      9016
 weighted avg       0.78      0.66      0.71      9016
  samples avg       0.72      0.70      0.69      9016



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [26]:
# trying without OnevsRest
xgb_model = XGBClassifier()
xgb_model.fit(X2_train, y2_train)
y2_pred = xgb_model.predict(X2_test)

In [29]:
accuracy = (y2_pred == y2_test).mean()
accuracy

mbdata.techno    0.797719
mbdata.house     0.761472
mbdata.trance    0.873880
mbdata.dnb       0.906462
dtype: float64

In [30]:
report = classification_report(y2_test, y2_pred, target_names=['techno', 'house', 'trance', 'dnb'])
print(report)

              precision    recall  f1-score   support

      techno       0.70      0.54      0.61      2147
       house       0.73      0.66      0.69      2995
      trance       0.84      0.70      0.76      2117
         dnb       0.84      0.75      0.79      1757

   micro avg       0.77      0.66      0.71      9016
   macro avg       0.78      0.66      0.71      9016
weighted avg       0.77      0.66      0.71      9016
 samples avg       0.71      0.70      0.69      9016



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Logistic Regression

In [79]:
from sklearn.linear_model import LogisticRegression

In [97]:
logistic_model = LogisticRegression(solver='lbfgs', max_iter=1000)
multi_output_logi = MultiOutputClassifier(logistic_model, n_jobs=-1)
multi_output_logi.fit(X2_train, y2_train)

In [98]:
y2_pred_logi = multi_output_logi.predict(X2_test.values)

In [99]:
(y2_test == y2_pred_logi).mean()

mbdata.techno    0.700380
mbdata.house     0.600462
mbdata.trance    0.713413
mbdata.dnb       0.759843
dtype: float64

In [105]:
(y2_test == y2_pred_logi).values.flatten().mean()

0.6935243008417051

In [96]:
classification_report(y2_test, y2_pred_logi, zero_division=0, target_names=labels)

'               precision    recall  f1-score   support\n\nmbdata.techno       0.00      0.00      0.00      2207\n mbdata.house       0.00      0.00      0.00      2943\nmbdata.trance       0.00      0.00      0.00      2111\n   mbdata.dnb       0.00      0.00      0.00      1769\n\n    micro avg       0.00      0.00      0.00      9030\n    macro avg       0.00      0.00      0.00      9030\n weighted avg       0.00      0.00      0.00      9030\n  samples avg       0.00      0.00      0.00      9030\n'

In [108]:
# trying a different threshold
y2_pred_logi_prob = multi_output_logi.predict_proba(X2_test.values)



In [115]:
thresholds = [0.5, 0.5, 0.5, 0.5]

y2_pred_logi_thres = np.array([
    (probs[:, 1] >= thresholds[i]).astype(int)
    for i, probs in enumerate(y2_pred_logi_prob)
]).T

In [116]:
print((y2_pred_logi_thres == y2_test).mean())

print(classification_report(y2_test, y2_pred_logi_thres))

mbdata.techno    0.700380
mbdata.house     0.600462
mbdata.trance    0.713413
mbdata.dnb       0.759843
dtype: float64
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2207
           1       0.00      0.00      0.00      2943
           2       0.00      0.00      0.00      2111
           3       0.00      0.00      0.00      1769

   micro avg       0.00      0.00      0.00      9030
   macro avg       0.00      0.00      0.00      9030
weighted avg       0.00      0.00      0.00      9030
 samples avg       0.00      0.00      0.00      9030



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Random Guessing

In [118]:
def generate_weighted_random_predictions(X, label_probabilities):
    """
    Generate random binary predictions for multi-label classification based on label probabilities.
    
    Parameters:
    - X: DataFrame or array-like, shape (n_samples, n_features)
    - label_probabilities: list or array, shape (num_labels,), probabilities of each label being assigned
    
    Returns:
    - y_pred_weighted: array, shape (n_samples, num_labels)
    """
    num_samples = X.shape[0]
    num_labels = len(label_probabilities)
    
    # Generate random binary predictions based on label probabilities
    y_pred_weighted = np.zeros((num_samples, num_labels))
    
    for i in range(num_samples):
        for j in range(num_labels):
            if np.random.rand() < label_probabilities[j]:
                y_pred_weighted[i, j] = 1
    
    return y_pred_weighted

In [126]:
techno_pro = (y2['mbdata.techno'].sum())/len(y2)
house_pro = (y2['mbdata.house'].sum())/len(y2)
trance_pro = (y2['mbdata.trance'].sum())/len(y2)
dnb_pro = (y2['mbdata.dnb'].sum())/len(y2)


In [127]:
label_probabilities = [techno_pro, house_pro, trance_pro, dnb_pro]
y2_pred_rand = generate_weighted_random_predictions(X2_test, label_probabilities)

In [128]:
(y2_pred_rand == y2_test).mean()

mbdata.techno    0.576025
mbdata.house     0.518463
mbdata.trance    0.593402
mbdata.dnb       0.638746
dtype: float64

In [129]:
classification_report(y2_test, y2_pred_rand, target_names=labels)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


'               precision    recall  f1-score   support\n\nmbdata.techno       0.29      0.29      0.29      2207\n mbdata.house       0.40      0.42      0.41      2943\nmbdata.trance       0.29      0.28      0.28      2111\n   mbdata.dnb       0.25      0.25      0.25      1769\n\n    micro avg       0.32      0.32      0.32      9030\n    macro avg       0.31      0.31      0.31      9030\n weighted avg       0.32      0.32      0.32      9030\n  samples avg       0.25      0.32      0.26      9030\n'

In [135]:
# removing features with three or more genres
num_genres = df2[labels].sum(axis=1)
df2_filtered = df2[num_genres < 3]


In [138]:
X2_f = df2_filtered.drop(columns=drop_cols+labels)
y2_f = df2_filtered[labels]

In [139]:
X2f_train, X2f_test, y2f_train, y2f_test = train_test_split(X2_f, y2_f, test_size=0.2, random_state=415)

In [140]:
xgb_model_f = XGBClassifier(
    subsample=1.0,
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    colsample_bytree=1.0,
    objective='binary:logistic',  # For multi-label classification, use 'binary:logistic'
    use_label_encoder=False,      # This disables the label encoder warning
    eval_metric='logloss'         # Common evaluation metric for binary classification
)

multi_output_xgb_f = MultiOutputClassifier(xgb_model_f, n_jobs=-1)

multi_output_xgb_f.fit(X2f_train, y2f_train)

In [141]:
y2f_pred = multi_output_xgb_f.predict(X2f_test)

In [142]:
(y2f_pred == y2f_test).mean()

mbdata.techno    0.816443
mbdata.house     0.779149
mbdata.trance    0.882296
mbdata.dnb       0.908499
dtype: float64

In [145]:
print(classification_report(y2f_test, y2f_pred, zero_division=0, target_names=labels))

               precision    recall  f1-score   support

mbdata.techno       0.74      0.54      0.62      2038
 mbdata.house       0.75      0.67      0.71      2868
mbdata.trance       0.85      0.70      0.77      1986
   mbdata.dnb       0.84      0.74      0.79      1680

    micro avg       0.79      0.66      0.72      8572
    macro avg       0.80      0.66      0.72      8572
 weighted avg       0.79      0.66      0.72      8572
  samples avg       0.72      0.69      0.69      8572

