In [16]:
#Import required libraries
import torch.nn as nn
import numpy as np, pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score,roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
import torch
from sklearn.feature_extraction.text import TfidfVectorizer


from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn.utils.rnn import pad_sequence


import matplotlib.pyplot as plt

## Load Dataset

In [2]:
df = pd.read_csv("./spotify_millsongdata.csv")
print("DF Shape:",df.shape)
df.head()

DF Shape: (57650, 4)


Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
df.isna().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [12]:
# Preprocessing: Encode Labels
label_encoder = LabelEncoder()
df['artist'] = label_encoder.fit_transform(df['artist'])
df = df.drop(['link'], axis=1, errors='ignore')


In [18]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to 5000 features for efficiency
X = tfidf_vectorizer.fit_transform(df['text']).toarray()
y = df['artist'].values

In [19]:
# Split Data
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [20]:
# Custom Dataset
class SongDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]


In [21]:
# Create Dataset Objects
train_dataset = SongDataset(X_train, y_train)
val_dataset = SongDataset(X_val, y_val)
test_dataset = SongDataset(X_test, y_test)

# DataLoader
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Check DataLoader
for features, labels in train_loader:
    print("Features Batch Shape:", features.shape)
    print("Labels Batch Shape:", labels.shape)
    break

Features Batch Shape: torch.Size([32, 5000])
Labels Batch Shape: torch.Size([32])


### Define a PyTorch Model

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SongClassifier(nn.Module):
    def __init__(self, input_size, num_classes):
        super(SongClassifier, self).__init__()
        
        # Define layers
        self.fc1 = nn.Linear(input_size, 256)  # First fully connected layer
        self.fc2 = nn.Linear(256, 128)        # Second fully connected layer
        self.fc3 = nn.Linear(128, num_classes)  # Output layer
        
        self.dropout = nn.Dropout(0.3)  # Dropout for regularization
    
    def forward(self, x):
        # Forward pass through layers
        x = F.relu(self.fc1(x))         # First layer with ReLU activation
        x = self.dropout(x)             # Apply dropout
        x = F.relu(self.fc2(x))         # Second layer with ReLU activation
        x = self.dropout(x)             # Apply dropout
        x = self.fc3(x)                 # Output layer (no activation here)
        return x

# Model Instantiation
input_size = 5000  # Feature size from TF-IDF
num_classes = len(label_encoder.classes_)  # Number of unique artists

model = SongClassifier(input_size=input_size, num_classes=num_classes)

# Print Model Summary
print(model)


SongClassifier(
  (fc1): Linear(in_features=5000, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=643, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)


### Define Loss function

In [23]:
criterion = nn.CrossEntropyLoss()


### Define Optimizer

In [24]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


### Training loop

In [None]:
model = SongClassifier(input_size=5000, num_classes=643)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Training Loop Parameters
num_epochs = 10
train_losses, val_losses = [], []


In [27]:
for features, labels in train_loader:
    print("Features Shape:", features.shape)
    print("Labels Shape:", labels.shape)
    print("Features:", features)
    print("Labels:", labels)
    break  # Exit after inspecting the first batch

Features Shape: torch.Size([32, 5000])
Labels Shape: torch.Size([32])
Features: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
Labels: tensor([292, 348, 384, 286, 461, 123, 196,  33, 567, 180, 490, 536,  70, 412,
          6, 392,  99,  62,  11, 618,  64, 153, 238,  59,  77, 227, 120, 612,
        362, 275, 514, 198])


In [None]:
#Define function to train the network
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_epochs = 50
batch_size= 128
train_losses, val_losses = [], []

for epoch in range(num_epochs):
    train_loss= 0.0
    
    #Explicitly start model training
    model.train()
    
    for features, labels in train_loader:
        #Extract train batch from X and Y
        
        input_data, labels = features.to(device), labels.to(device)

        #set the gradients to zero before starting to do backpropragation
        
        optimizer.zero_grad()

        #Forward pass using model and the train data
        outputs = model(input_data)  

        #Caculate loss
        loss = criterion(outputs, labels)
        
        #Backpropogate
        loss.backward()
        
        #Update weights
        optimizer.step()
        
        train_loss += loss.item()
        
    avg_train_loss = train_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    
    if epoch%10 == 0:
        print("Epoch: {} - Loss:{:.4f}".format(epoch,train_loss / len(train_loader) ))
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_train_loss:.4f}")

# Training Complete
print("Training finished!")

Epoch: 0 - Loss:1.6171
Epoch: 10 - Loss:1.1255
Epoch: 20 - Loss:0.8839
Epoch: 30 - Loss:0.7497
Epoch: 40 - Loss:0.6621
Epoch: 50 - Loss:0.6186
Epoch: 60 - Loss:0.5726
Epoch: 70 - Loss:0.5369
Epoch: 80 - Loss:0.5030
Epoch: 90 - Loss:0.4771
Epoch: 100 - Loss:0.4577
Epoch: 110 - Loss:0.4423
Epoch: 120 - Loss:0.4323
Epoch: 130 - Loss:0.4081
Epoch: 140 - Loss:0.4041


## Validation check

In [None]:
#Define function for evaluating NN

model.eval() #Explicitly set to evaluate mode

#Predict on Train and Validation Datasets
test_prob = model(test_loader)
test_pred =np.where(test_prob>0.5,1,0)
train_prob = model(train_loader)
train_prob =np.where(train_prob>0.5,1,0)

#Compute Training and Validation Metrics
print("\n Model Performance -")
print("Training Accuracy-",round(accuracy_score(train_loader,train_pred),3))
print("Training Precision-",round(precision_score(train_loader,train_pred),3))
print("Training Recall-",round(recall_score(train_loader,train_pred),3))
print("Training ROCAUC", round(roc_auc_score(train_loader
                               ,train_pred.detach().numpy()),3))
print("Validation Accuracy-",round(accuracy_score(test_loader,test_pred),3))
print("Validation Precision-",round(precision_score(test_loader,test_pred),3))
print("Validation Recall-",round(recall_score(test_loader,test_pred),3))
print("Validation ROCAUC", round(roc_auc_score(test_loader
                                 ,test_prob.detach().numpy()),3))
print("\n")


#Plot the Loss curve and ROC Curve
plt.figure(figsize=(20,5))
plt.subplot(1, 2, 1)
plt.plot(loss_list)
plt.title('Loss across epochs')
plt.ylabel('Loss')
plt.xlabel('Epochs')
plt.subplot(1, 2, 2)


#Validation
fpr_v, tpr_v, _ = roc_curve(test_loader, test_prob.detach().numpy())
roc_auc_v = auc(fpr_v, tpr_v)


#Training
fpr_t, tpr_t, _ = roc_curve(train_loader, train_prob.detach().numpy())
roc_auc_t = auc(fpr_t, tpr_t)
plt.title('Receiver Operating Characteristic:Validation')
plt.plot(fpr_v, tpr_v, 'b', label = 'Validation AUC = %0.2f' % roc_auc_v)
plt.plot(fpr_t, tpr_t, 'r', label = 'Training AUC = %0.2f' % roc_auc_t)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

### Preparing data for pytorch and spliting

In [6]:
train_data, test_data = train_test_split(df, test_size=0.2)

In [7]:
# Custom Dataset
class LyricsDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        lyric = self.data.iloc[index]['text']
        label = self.data.iloc[index]['artist']
        encoding = self.tokenizer(lyric, truncation=True, padding='max_length', max_length=self.max_len, return_tensors="pt")
        return encoding['input_ids'].squeeze(0), encoding['attention_mask'].squeeze(0), torch.tensor(label)

# Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Datasets
train_dataset = LyricsDataset(train_data, tokenizer)
test_dataset = LyricsDataset(test_data, tokenizer)

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)




In [11]:
import torch.nn as nn

class NeuralNetworkClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        torch.manual_seed(2020)
        self.fc1 = nn.Linear(14, 30)
        self.relu1 = nn.ReLU()

        self.fc2 = nn.Linear(30, 30)
        self.relu2 = nn.ReLU()

        self.fc3 = nn.Linear(30, 5)
        self.relu3 = nn.ReLU()
        
        self.out = nn.Linear(5, 1)
        self.final = nn.Sigmoid()

    def forward(self, x):
        op = self.fc1(x)
        op = self.relu1(op)
        
        op = self.fc2(op)
        op = self.relu2(op)

        op = self.fc3(op)
        op = self.relu3(op)

        op = self.out(op)
        y = self.final(op)
        
        return y


In [25]:
num_epochs = 300
batch_size= 128
loss_function = nn.MSELoss()  #Binary Crosss Entropy Loss

#Hyperparameters
weight_decay=0.0 #set to 0; no L2 Regularizer; passed into the Optimizer
lambda_L1=0.0    #Set to 0; no L1 reg; manually added in loss (train_network)

#Create a model instance
model = NeuralNetworkClassifier()

adam_optimizer = torch.optim.Adam(model.parameters(), lr= 0.001,weight_decay=weight_decay)


# Model Parameters
vocab_size = tokenizer.vocab_size
embedding_dim = 128
hidden_dim = 256
output_dim = len(label_encoder.classes_)
pad_idx = tokenizer.pad_token_id

# Model
model = LyricsClassifier(vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx)
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


In [None]:
# Training Loop
device = "cuda" if torch.cuda.is_available() else "cpu"

for epoch in range(10):  # Number of epochs
    model.train()
    total_loss = 0

    for input_ids, attention_mask, labels in train_loader:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader):.4f}")


Epoch 1, Loss: 6.2857
Epoch 2, Loss: 6.1994
Epoch 3, Loss: 6.1124
Epoch 4, Loss: 6.0050
Epoch 5, Loss: 5.8683


In [None]:
# Evaluation
from sklearn.metrics import accuracy_score

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for input_ids, attention_mask, labels in test_loader:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = model(input_ids, attention_mask)
        preds = torch.argmax(outputs, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy:.4f}")