In [None]:
import torch
import torch.nn as nn #contains convolution layers and non linear activations
import torch.nn.functional as F #has non linear activations
import torch.optim as optim #functionalities to construct optimizer

# 1. RNN Model Definition
class RNNGenreClassifier(nn.Module):
    """LSTM-based sequence classifier that maps feature sequences to class logits.

    The sequence is summarised by the final hidden state(s) of the top LSTM
    layer, passed through dropout and a linear layer.

    Args:
        input_dim: Size of the feature vector at each time step.
        hidden_dim: Number of hidden units per LSTM direction.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output classes (size of the logit vector).
        bidirectional: If True, run the LSTM in both directions and
            concatenate the two final hidden states.
        dropout: Dropout probability applied to the sequence summary before
            the linear layer.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, num_classes,
                 bidirectional=True, dropout=0.3):
        super().__init__()
        self.rnn = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
        )
        self.dropout = nn.Dropout(dropout)
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, num_classes)

    def forward(self, x):
        """Return class logits for a batch of sequences.

        Args:
            x: Tensor of shape [batch_size, seq_len, input_dim]
                (the LSTM is built with ``batch_first=True``).

        Returns:
            Tensor of shape [batch_size, num_classes] with unnormalised logits.
        """
        # h_n: [num_layers * num_directions, batch_size, hidden_dim], ordered
        # layer by layer with (forward, backward) adjacent within each layer.
        _, (h_n, _) = self.rnn(x)

        if self.rnn.bidirectional:
            # Slicing the output at the last time step would pair the forward
            # state (has seen the whole sequence) with a backward state that
            # has only seen the final step. The final hidden states instead
            # give each direction's summary after it has read everything.
            summary = torch.cat((h_n[-2], h_n[-1]), dim=-1)
        else:
            summary = h_n[-1]

        summary = self.dropout(summary)
        return self.fc(summary)  # shape: [batch_size, num_classes]


# 2. Hyperparameters

input_dim = 128    # size of each per-step feature vector (CNN deep features)
hidden_dim = 64    # value passed as the classifier's hidden_dim
num_layers = 2     # value passed as the classifier's num_layers
num_classes = 10   # GTZAN has 10 genres
batch_size = 4     # number of sequences in the simulated batch
seq_len = 10       # number of time steps in each simulated sequence

# 3. Model, loss function and optimizer

model = RNNGenreClassifier(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    num_classes=num_classes,
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# 4. Simulated inputs and targets

# Random stand-in for CNN output: [batch_size, seq_len, input_dim]
fake_cnn_output = torch.randn((batch_size, seq_len, input_dim))

# Random integer class labels in [0, num_classes), one per sequence
true_labels = torch.randint(low=0, high=num_classes, size=(batch_size,))


# 5. One optimisation step on the simulated batch

model.train()          # training mode, so the model's dropout layer is active
optimizer.zero_grad()  # clear any previously accumulated gradients

# Forward pass and loss
logits = model(fake_cnn_output)
loss = criterion(logits, true_labels)

# Backward pass and parameter update
loss.backward()
optimizer.step()

# Batch accuracy from the logits computed above (before the update)
predicted = logits.argmax(dim=1)
correct = predicted.eq(true_labels).sum().item()
accuracy = correct / batch_size

print("Training loss:", loss.item())
print("Training accuracy: {:.2f}%".format(accuracy * 100))

# 6. Softmax + Prediction

# Switching to eval mode only affects forward passes made afterwards, so the
# model is run again here. Reusing the logits from the training step would
# report outputs produced with dropout active and pre-update weights.
model.eval()
with torch.no_grad():
    # Separate name so the training-step `logits` tensor is left untouched.
    eval_logits = model(fake_cnn_output)  # shape: [batch_size, num_classes]
    probs = F.softmax(eval_logits, dim=1)  # shape: [batch_size, num_classes]
    predicted_classes = torch.argmax(probs, dim=1)  # shape: [batch_size]

    print("\nPredicted probabilities (softmax):")
    print(probs)

    print("\nPredicted genres (class indices):")
    print(predicted_classes)

    print("\nTrue labels:")
    print(true_labels)


Training loss: 2.326669692993164
Training accuracy: 25.00%

Predicted probabilities (softmax):
tensor([[0.0895, 0.0979, 0.0894, 0.0925, 0.1110, 0.1093, 0.1125, 0.0954, 0.0974,
         0.1052],
        [0.0932, 0.0995, 0.0938, 0.0898, 0.1110, 0.1014, 0.1122, 0.0911, 0.1000,
         0.1077],
        [0.0911, 0.0964, 0.0945, 0.0928, 0.1105, 0.1034, 0.1034, 0.0982, 0.0997,
         0.1099],
        [0.0940, 0.1047, 0.0871, 0.0966, 0.1155, 0.1024, 0.1055, 0.0960, 0.0990,
         0.0992]])

Predicted genres (class indices):
tensor([6, 6, 4, 4])

True labels:
tensor([8, 3, 4, 0])
