In [2]:
import pandas as pd
import numpy as np
import pickle

from collections import Counter

# Data Loading

In [3]:
## Save clustered tracks

CLUSTERED_FEATURES_PARQUET = '../../data/features/featureswithouttags-clustered.parquet'
clustered_tracks = pd.read_parquet(CLUSTERED_FEATURES_PARQUET)
clustered_tracks.cluster.value_counts()

cluster
3    6408
2     325
1     265
Name: count, dtype: int64

## Label Encoding and Train/Test Splits

In [4]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split

In [5]:
features = ['acousticness',	'danceability',	'energy',	'instrumentalness',	'liveness',	'speechiness',	'tempo',	'valence']


X = clustered_tracks[features]
y = clustered_tracks[['cluster']]

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

y = le.fit_transform(y)
print(Counter(y))

Counter({2: 6408, 1: 325, 0: 265})


  y = column_or_1d(y, warn=True)


 ## Resampling

Resample using combination of oversampling and undersampling to mitigate issues with loss of information and overffiting

[Reference](https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/)



In [7]:
from imblearn.under_sampling import RandomUnderSampler 
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# define oversampling strategy


over = RandomOverSampler(sampling_strategy = {0:3000, 1:3000, 2:6408})
# fit and apply the transform
X, y = over.fit_resample(X, y)

# define undersampling strategy
under = RandomUnderSampler(sampling_strategy = {0:3000, 1:3000, 2:3000})
# fit and apply the transform
X, y = under.fit_resample(X, y)

print(Counter(y))

Counter({0: 3000, 1: 3000, 2: 3000})


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.7 , random_state=1)


counter = Counter(y_train)
print(counter)
X

Counter({2: 2108, 0: 2107, 1: 2085})


Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence
851,0.992140,0.167797,0.010411,0.922597,0.132348,0.051878,89.896,0.068446
432,0.868906,0.158627,0.151605,0.379458,0.628070,0.034243,69.660,0.209385
2111,0.995735,0.366424,0.077054,0.933022,0.125037,0.038498,141.021,0.533368
2398,0.995548,0.449689,0.148947,0.950846,0.128296,0.050793,72.046,0.150903
2003,0.990834,0.228645,0.025946,0.946418,0.088346,0.050807,126.676,0.038601
...,...,...,...,...,...,...,...,...
8296,0.447676,0.922718,0.423639,0.784376,0.067840,0.377964,107.965,0.899969
11017,0.890645,0.532703,0.711675,0.911201,0.183856,0.029377,108.961,0.506513
9927,0.957875,0.277317,0.430877,0.019805,0.131935,0.036817,150.345,0.406333
6925,0.913720,0.154345,0.134965,0.943050,0.116647,0.035363,82.190,0.036819


# Supervised
---
## XGBoost

In [14]:

import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix,accuracy_score


# Define hyperparameters
params = {"objective": "multi:softmax", "num_class": 3}
#evals = [(dtest_reg, "validation"), (dtrain_reg, "train")]


print("="*30)
print(f"Performing XGBoost for selected features: {features}")

# Create regression matrices
dtrain_reg = xgb.DMatrix(X_train, y_train)
dtest_reg = xgb.DMatrix(X_test)

n = 10
model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=n,

)

y_train_pred = model.predict(dtrain_reg)
y_test_pred = model.predict(dtest_reg)

print(classification_report(y_train, y_train_pred))
print(classification_report(y_test, y_test_pred))

MODEL_FILE_NAME = 'xgboost.pkl'
pickle.dump(model, open(MODEL_FILE_NAME, 'wb'))

## TEST LOAD ##
# loaded_model = pickle.load(open(MODEL_FILE_NAME, 'rb'))
# y_test_pred = model.predict(dtest_reg)
# print(classification_report(y_test, y_test_pred))


Performing XGBoost for selected features: ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'speechiness', 'tempo', 'valence']
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2107
           1       0.91      0.92      0.92      2085
           2       0.93      0.90      0.92      2108

    accuracy                           0.94      6300
   macro avg       0.94      0.94      0.94      6300
weighted avg       0.94      0.94      0.94      6300

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       893
           1       0.88      0.92      0.90       915
           2       0.92      0.86      0.89       892

    accuracy                           0.93      2700
   macro avg       0.93      0.93      0.92      2700
weighted avg       0.93      0.93      0.92      2700



## Adaboost with Decision Trees

In [266]:
## Building the Descision Tree Model
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(
          max_depth=5,
          random_state=42)

tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
print(classification_report(y_train, y_train_pred))
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.91      0.91      0.91      2107
           1       0.69      0.75      0.72      2085
           2       0.81      0.74      0.77      2108

    accuracy                           0.80      6300
   macro avg       0.80      0.80      0.80      6300
weighted avg       0.80      0.80      0.80      6300

              precision    recall  f1-score   support

           0       0.89      0.89      0.89       893
           1       0.68      0.75      0.71       915
           2       0.80      0.73      0.76       892

    accuracy                           0.79      2700
   macro avg       0.79      0.79      0.79      2700
weighted avg       0.79      0.79      0.79      2700



## Neural Networks

In [68]:
import torch
import torch.nn as nn
import torch.optim as optim

In [69]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


In [216]:
#X_train['acousticness']

10512    0.995360
15983    0.942041
4581     0.492131
18506    0.806081
18282    0.984884
           ...   
10955    0.992877
17289    0.332367
5192     0.198826
12172    0.995119
235      0.018238
Name: acousticness, Length: 13456, dtype: float64

In [189]:

from sklearn.preprocessing import OneHotEncoder
y_f = y.reshape(-1,1)
y_f

ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False).fit(y_f)
print(ohe.categories_)

[array([0, 1, 2], dtype=int64)]


In [223]:
X_train_tensor = torch.tensor(X_train['acousticness'].to_numpy().reshape(-1,1), dtype=torch.float32)
y_train_tensor = torch.tensor(ohe.transform(y_train.reshape(-1,1)), dtype=torch.float32)

X_test_tensor = torch.tensor(X_test['acousticness'].to_numpy().reshape(-1,1), dtype=torch.float32)
y_test_tensor = torch.tensor(ohe.transform(y_test.reshape(-1,1)), dtype=torch.float32)

In [224]:
X_train_tensor.shape

torch.Size([13456, 1])

In [225]:
model = nn.Sequential(
    nn.Linear(1, 32),
    nn.Sigmoid(),
    nn.Linear(32, 32),
    nn.Sigmoid(),
    nn.Linear(32, 3),
)

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

n_epochs = 1000
batch_size = 8

for epoch in range(n_epochs):
    for i in range(0, len(X_train_tensor), batch_size):
        Xbatch = X_train_tensor[i:i+batch_size]
        y_pred = model(Xbatch)
        ybatch = y_train_tensor[i:i+batch_size]
        #print(y_pred,ybatch)
        training_loss = loss_fn(y_pred, ybatch)
        optimizer.zero_grad()
        training_loss.backward()
        optimizer.step()

    validation_loss = loss_fn(model(X_test_tensor),y_test_tensor)
    print(f'Finished epoch {epoch}, training loss: {training_loss}, validation loss: {validation_loss}')


Finished epoch 0, training loss: 0.47972866892814636, validation loss: 0.7490079402923584
Finished epoch 1, training loss: 0.4625144898891449, validation loss: 0.7208231687545776
Finished epoch 2, training loss: 0.4620116949081421, validation loss: 0.7128740549087524
Finished epoch 3, training loss: 0.45929384231567383, validation loss: 0.7053622603416443
Finished epoch 4, training loss: 0.45849013328552246, validation loss: 0.699413001537323
Finished epoch 5, training loss: 0.4589656591415405, validation loss: 0.6947147846221924
Finished epoch 6, training loss: 0.45999157428741455, validation loss: 0.6907551884651184
Finished epoch 7, training loss: 0.45992717146873474, validation loss: 0.6878984570503235
Finished epoch 8, training loss: 0.4585936665534973, validation loss: 0.6850202679634094
Finished epoch 9, training loss: 0.45819616317749023, validation loss: 0.6825110912322998
Finished epoch 10, training loss: 0.45674148201942444, validation loss: 0.6804521083831787
Finished epoch

KeyboardInterrupt: 

## KNN

In [268]:
## Standardize Data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

scaler.fit(X)
X_stand = scaler.transform(X)

In [271]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors = 1)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
print(classification_report(y_train, y_train_pred))
print(classification_report(y_test, y_test_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2107
           1       1.00      1.00      1.00      2085
           2       1.00      1.00      1.00      2108

    accuracy                           1.00      6300
   macro avg       1.00      1.00      1.00      6300
weighted avg       1.00      1.00      1.00      6300

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       893
           1       0.91      1.00      0.95       915
           2       1.00      0.86      0.92       892

    accuracy                           0.95      2700
   macro avg       0.96      0.95      0.95      2700
weighted avg       0.96      0.95      0.95      2700



In [242]:
predictions = model.predict(clustered_tracks[features])
print(classification_report(le.transform(clustered_tracks[['cluster']]), predictions))

              precision    recall  f1-score   support

           0       0.74      1.00      0.85       265
           1       0.56      0.99      0.72       325
           2       1.00      0.95      0.97      6408

    accuracy                           0.95      6998
   macro avg       0.77      0.98      0.85      6998
weighted avg       0.97      0.95      0.96      6998



  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([2, 1, 1, ..., 2, 2, 2], dtype=int64)