In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# 1. CNN + RNN Model Definition
class CNNRNNGenreClassifier(nn.Module):
    def __init__(self, cnn_out_channels, rnn_hidden_dim, num_layers, num_classes, bidirectional=True):
        super(CNNRNNGenreClassifier, self).__init__()

        # CNN part (e.g., 2D conv over spectrograms)
        self.cnn = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=cnn_out_channels, kernel_size=(3, 3), padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2)),
            nn.Dropout(0.3)
        )

        # RNN part (each column of CNN output becomes a time step)
        self.rnn_input_dim = cnn_out_channels * (128 // 2)  # height becomes height//2 due to pooling
        self.rnn = nn.LSTM(
            input_size=self.rnn_input_dim,
            hidden_size=rnn_hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional
        )

        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(rnn_hidden_dim * (2 if bidirectional else 1), num_classes)

    def forward(self, x):
        # x: [batch_size, 1, height, width]
        cnn_out = self.cnn(x)  # → [B, C, H, W]

        # Flatten [C, H] into features, treat W as time steps
        batch_size, channels, height, width = cnn_out.size()
        cnn_out = cnn_out.view(batch_size, channels * height, width)  # → [B, F, T]
        rnn_input = cnn_out.permute(0, 2, 1)  # → [B, T, F]

        rnn_out, _ = self.rnn(rnn_input)
        final_output = self.dropout(rnn_out[:, -1, :])
        logits = self.fc(final_output)
        return logits

# 2. Hyperparameters
cnn_out_channels = 16
rnn_hidden_dim = 64
num_layers = 2
num_classes = 10
batch_size = 4

# 3. Instantiate Model, Loss, Optimizer
model = CNNRNNGenreClassifier(cnn_out_channels, rnn_hidden_dim, num_layers, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 4. Simulate Spectrogram Input (batch of spectrograms)
fake_spectrograms = torch.randn(batch_size, 1, 128, 20)  # [batch, channel=1, freq, time]
true_labels = torch.randint(0, num_classes, (batch_size,))

# 5. Training Step
model.train()
optimizer.zero_grad()
logits = model(fake_spectrograms)
loss = criterion(logits, true_labels)
predicted = torch.argmax(logits, dim=1)
accuracy = (predicted == true_labels).sum().item() / batch_size
loss.backward()
optimizer.step()

print("Training loss:", loss.item())
print("Training accuracy: {:.2f}%".format(accuracy * 100))

# 6. Evaluation
model.eval()
with torch.no_grad():
    probs = F.softmax(logits, dim=1)
    predicted_classes = torch.argmax(probs, dim=1)

    print("\nPredicted probabilities (softmax):")
    print(probs)
    print("\nPredicted genres (class indices):")
    print(predicted_classes)
    print("\nTrue labels:")
    print(true_labels)


Training loss: 2.339534282684326
Training accuracy: 0.00%

Predicted probabilities (softmax):
tensor([[0.0923, 0.0903, 0.1015, 0.0856, 0.1140, 0.1030, 0.1036, 0.0894, 0.1102,
         0.1100],
        [0.0951, 0.0907, 0.1029, 0.0912, 0.0996, 0.1084, 0.1042, 0.0952, 0.1060,
         0.1068],
        [0.0920, 0.0950, 0.0984, 0.0924, 0.1050, 0.0946, 0.1085, 0.0969, 0.1058,
         0.1115],
        [0.0951, 0.0894, 0.0944, 0.0930, 0.1126, 0.0956, 0.1116, 0.1066, 0.1038,
         0.0979]])

Predicted genres (class indices):
tensor([4, 5, 9, 4])

True labels:
tensor([5, 7, 5, 3])
