In [15]:
import os
import librosa
import pandas as pd
import numpy as np
import json
import torch
import torchaudio
import torch.nn as nn
from torchvision import datasets
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader, random_split
from IPython.display import Audio
from transformers import Wav2Vec2Processor, Wav2Vec2Model, Wav2Vec2ForSequenceClassification
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import random

import umap.umap_ as umap
import matplotlib.pyplot as plt

from functions import AudioMNISTDataset, create_dataloaders, collate_fn

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [16]:
# Reload the processor (optional but useful if needed for inference)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")

dataloaders = create_dataloaders("AudioMNIST/data/", processor, batch_size=16)

train_loader = dataloaders["train"]
val_loader = dataloaders["val"]
test_loader = dataloaders["test"]

In [19]:
# Load saved model
model1 = Wav2Vec2ForSequenceClassification.from_pretrained("facebook/wav2vec2-base", num_labels=10)
model1.load_state_dict(torch.load("seq-class-head.pth", weights_only=True))
model1.eval()

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)


In [None]:
# Load saved model
model2 = Wav2Vec2ForSequenceClassification.from_pretrained("facebook/wav2vec2-base", num_labels=10)
model2.load_state_dict(torch.load("seq-class-fine.pth", weights_only=True)))
model2.eval()

### UMAP of model1 (only class head)

In [20]:
# Hook storage
projector_features = []

# Hook definition
def hook(module, input, output):
    # Apply mean pooling over the sequence length dimension (dim=1)
    pooled_output = output.mean(dim=1)  # Shape: [batch_size, 768]
    projector_features.append(pooled_output.detach())

# Attach hook to the projector layer
hook_handle = model1.projector.register_forward_hook(hook)

# Validation loop
with torch.no_grad():
    all_labels = []  # To store labels for UMAP
    for batch in test_loader:
        input_values, labels = batch
        input_values = input_values.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model1(input_values)  # This will trigger the hook
        all_labels.append(labels.cpu())  # Collect labels

# Remove the hook after extraction
hook_handle.remove()

# Stack the collected features and labels
projector_features = torch.cat(projector_features, dim=0)  # Shape: [total_samples, 768]
all_labels = torch.cat(all_labels, dim=0)  # Shape: [total_samples]

# Convert to NumPy for UMAP
features_np = projector_features.cpu().numpy()
labels_np = all_labels.cpu().numpy()

# UMAP Projection
umap_reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
umap_embeddings = umap_reducer.fit_transform(features_np)

# Plot UMAP
plt.figure(figsize=(10, 7))
plt.scatter(umap_embeddings[:, 0], umap_embeddings[:, 1], c=labels_np, cmap='tab10', s=5)
plt.colorbar()
plt.title("UMAP Projection of Projector Features")
plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")
plt.show()

RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same