In [1]:
# Cell 1: Install dependencies
!pip install -q transformers torchaudio librosa matplotlib scikit-learn

In [None]:
# Cell 2: Imports
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torchaudio
from IPython.display import Audio, display
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from transformers import Wav2Vec2Model, Wav2Vec2Processor

In [None]:
# Cell 3: Load Wav2Vec2 pre-trained model
# A single checkpoint id feeds both the processor and the model so the two
# can never be loaded from different checkpoints by accident.
MODEL_NAME = "facebook/wav2vec2-base-960h"

processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
# .eval() switches the module to evaluation mode and returns the module itself;
# chaining it onto the assignment avoids printing the full model repr as cell output.
model = Wav2Vec2Model.from_pretrained(MODEL_NAME).eval()

In [None]:
# Cell 4: Upload audio files (or use samples)
from google.colab import files
uploaded = files.upload()

# Bring every uploaded file to a common format: one channel, 16 kHz.
# `waveforms[k]` and `file_names[k]` always describe the same upload.
waveforms = []
file_names = []

for path in uploaded:
    audio, native_sr = torchaudio.load(path)
    # Average over the channel axis (dim 0) but keep it as a size-1 axis.
    audio = audio.mean(dim=0, keepdim=True)
    if native_sr != 16000:
        to_16k = torchaudio.transforms.Resample(orig_freq=native_sr, new_freq=16000)
        audio = to_16k(audio)
    waveforms.append(audio)
    file_names.append(path)

In [None]:
# Cell 5: Listen to audio
# One inline player per clip, numbered from 1 in upload order.
for number, (name, clip) in enumerate(zip(file_names, waveforms), start=1):
    print(f"Audio {number}: {name}")
    samples = clip.squeeze().numpy()
    display(Audio(samples, rate=16000))


In [None]:
# Cell 6: Generate embeddings
# Each clip is reduced to one vector: the model's last hidden state averaged
# over the time axis (dim 1). The result `embeddings` is a 2-D array with one
# row per clip, in the same order as `waveforms`.
if not waveforms:
    # np.stack on an empty list fails with an unhelpful message; say what is wrong.
    raise ValueError(
        "No audio loaded: upload at least one file in Cell 4 before generating embeddings."
    )

clip_vectors = []
with torch.no_grad():  # inference only, no gradients needed
    for waveform in waveforms:
        # Hand the processor a plain 1-D NumPy array; squeeze(0) removes only
        # the leading channel axis instead of every size-1 axis.
        samples = waveform.squeeze(0).numpy()
        inputs = processor(samples, sampling_rate=16000, return_tensors="pt")
        outputs = model(**inputs)
        # Take mean of last hidden state as embedding
        emb = outputs.last_hidden_state.mean(dim=1).squeeze(0).numpy()
        clip_vectors.append(emb)

embeddings = np.stack(clip_vectors)


In [None]:
# Cell 7: Compute cosine similarity
# Pairwise cosine similarity between all clip embeddings, shown as a table
# whose rows and columns are labelled with the file names.
sim = cosine_similarity(embeddings)
df = pd.DataFrame(sim, index=file_names, columns=file_names)
# axis=None shades the whole matrix on one shared colour scale; the default
# (axis=0) would scale every column independently, so equal similarities in
# different columns could get different colours.
df.style.background_gradient(cmap='Blues', axis=None)


In [None]:
# Cell 8: Visualize in 2D
# Project the clip embeddings onto their first two principal components and
# draw one labelled point per clip.
if len(embeddings) < 2:
    # PCA with n_components=2 needs at least two samples; with a single clip
    # fit_transform would raise, so explain instead of crashing.
    print("Need at least two audio clips to project embeddings into 2D; skipping the plot.")
else:
    pca = PCA(n_components=2)
    reduced = pca.fit_transform(embeddings)

    fig, ax = plt.subplots(figsize=(8, 6))
    for fn, (x, y) in zip(file_names, reduced):
        ax.scatter(x, y)
        # Offset the label in screen points so the gap does not depend on the
        # numeric scale of the PCA coordinates.
        ax.annotate(fn, (x, y), xytext=(4, 4), textcoords="offset points", fontsize=9)
    ax.set_title("Audio Embeddings in 2D Space")
    ax.set_xlabel("Principal component 1")
    ax.set_ylabel("Principal component 2")
    ax.grid(True)
    plt.show()
