<a href="https://colab.research.google.com/github/besimorhino/ai-workshop/blob/main/audio_vectorize.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1: Install dependencies
!pip install -q transformers torchaudio librosa matplotlib scikit-learn

In [None]:
# Cell 2: Imports
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torchaudio
from IPython.display import Audio, display
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from transformers import Wav2Vec2Model, Wav2Vec2Processor

In [None]:
# Cell 3: Load Wav2Vec2 pre-trained model
# A single checkpoint id feeds both loaders so the processor and the model
# cannot drift apart.
MODEL_CHECKPOINT = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(MODEL_CHECKPOINT)
model = Wav2Vec2Model.from_pretrained(MODEL_CHECKPOINT)
# Inference only: eval() puts the model in evaluation mode (dropout off).
# The trailing semicolon stops the cell from echoing the full module tree;
# run `print(model)` in a separate cell to inspect the architecture.
model.eval();

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2GroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Wav2Vec2Encoder(
    (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
  

In [None]:
# Cell 4: Upload audio files (or use samples)
from google.colab import files

TARGET_SAMPLE_RATE = 16000  # every clip is resampled to this rate (Hz)

uploaded = files.upload()
if not uploaded:
    # Fail here with a clear message instead of letting later cells break on
    # empty lists.
    raise ValueError("No audio files were uploaded; run this cell again and select at least one file.")

# Load and resample to 16kHz mono
waveforms = []
file_names = []

for fn in uploaded:
    waveform, sr = torchaudio.load(fn)
    # Average the channel dimension but keep it, so each entry stays 2-D
    # with a single channel.
    waveform = waveform.mean(dim=0, keepdim=True)  # convert to mono
    if sr != TARGET_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=TARGET_SAMPLE_RATE)
        waveform = resampler(waveform)
    waveforms.append(waveform)
    file_names.append(fn)

KeyboardInterrupt: 

In [None]:
# Cell 5: Listen to audio
# Show one inline player per loaded clip, preceded by its 1-based position
# and file name.
for i, fn in enumerate(file_names):
    print(f"Audio {i+1}: {fn}")
    samples = waveforms[i].squeeze().numpy()
    player = Audio(samples, rate=16000)
    display(player)


In [None]:
# Cell 6: Generate embeddings
# Each clip is reduced to one fixed-size vector: the mean over the sequence
# dimension of the model's last hidden state.
if not waveforms:
    raise ValueError("No waveforms loaded; run the upload cell before generating embeddings.")

clip_embeddings = []
with torch.no_grad():  # inference only, no gradients needed
    for waveform in waveforms:
        # The processor's documented input is a NumPy array / list of floats,
        # so pass a 1-D array rather than a torch tensor.
        samples = waveform.squeeze(0).numpy()
        inputs = processor(samples, sampling_rate=16000, return_tensors="pt")
        outputs = model(**inputs)
        # Take mean of last hidden state as embedding, then drop the
        # size-1 batch dimension.
        emb = outputs.last_hidden_state.mean(dim=1).squeeze(0).numpy()
        clip_embeddings.append(emb)

# One row per clip, in the same order as `waveforms`.
embeddings = np.stack(clip_embeddings)


In [None]:
# Cell 7: Compute cosine similarity
# Pairwise cosine similarity between all clip embeddings, shown as a shaded
# table whose rows and columns are labelled with the file names.
# (pandas is imported as `pd` in the imports cell at the top of the notebook.)
sim = cosine_similarity(embeddings)
df = pd.DataFrame(sim, index=file_names, columns=file_names)
df.style.background_gradient(cmap='Blues')


In [None]:
# Cell 8: Visualize in 2D
# PCA cannot extract more components than there are samples, so a 2-D
# projection needs at least two clips. Check up front for a clear message.
if len(embeddings) < 2:
    raise ValueError("Need at least 2 audio files to project the embeddings into 2D.")

pca = PCA(n_components=2)
reduced = pca.fit_transform(embeddings)
pc1_share, pc2_share = pca.explained_variance_ratio_

fig, ax = plt.subplots(figsize=(8, 6))
for i, fn in enumerate(file_names):
    x, y = reduced[i]
    ax.scatter(x, y)
    # Offset the label in screen points so it sits next to the marker
    # regardless of the scale of the projected data.
    ax.annotate(fn, (x, y), xytext=(4, 4), textcoords="offset points", fontsize=9)
ax.set_title("Audio Embeddings in 2D Space")
ax.set_xlabel(f"PC1 ({pc1_share:.0%} of variance)")
ax.set_ylabel(f"PC2 ({pc2_share:.0%} of variance)")
ax.grid(True)
plt.show()
