In [1]:
import torch
from transformers import Wav2Vec2FeatureExtractor, HubertModel, Wav2Vec2Model, WavLMModel
from datasets import load_dataset
import numpy as np
from sklearn.cluster import KMeans

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# A smaller stand-in for the dataset we ultimately want to use.
# All candidate datasets are hosted on the Hugging Face Hub anyway, so swapping it is easy.
dataset = load_dataset(
    "hf-internal-testing/librispeech_asr_demo",
    "clean",
    split="validation",
    trust_remote_code=True,
).sort("id")
# Sampling rate declared by the dataset's "audio" feature.
sampling_rate = dataset.features["audio"].sampling_rate

## 1. HuBERT

### 1.1 Speech Encoder

In [3]:
# Instantiate the HuBERT feature extractor and encoder from the same pretrained checkpoint.
HUBERT_CHECKPOINT = "facebook/hubert-base-ls960"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(HUBERT_CHECKPOINT)
model = HubertModel.from_pretrained(HUBERT_CHECKPOINT)
# Kept as the cell's last expression: puts the model in eval mode and displays the module tree.
model.eval()

HubertModel(
  (feature_extractor): HubertFeatureEncoder(
    (conv_layers): ModuleList(
      (0): HubertGroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x HubertNoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x HubertNoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): HubertFeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): HubertEncoder(
    (pos_conv_embed): HubertPositionalConvEmbedding(
      (conv): Para

In [4]:
# Take the raw waveform of the first dataset item and run it through the feature extractor.
first_waveform = dataset[0]["audio"]["array"]
encoded_inputs = feature_extractor(
    first_waveform,
    sampling_rate=sampling_rate,
    return_tensors="pt",
)
input_values = encoded_inputs["input_values"]

In [5]:
# Inference only: run the forward pass with gradient tracking disabled.
with torch.no_grad():
    outputs = model(input_values=input_values)

In [6]:
# Final hidden states of the encoder (this includes feature extraction).
# Shape: (1, seq_len, feature_dim)
encoder_output = outputs.last_hidden_state
output_shape = encoder_output.shape
print("Encoder Output Shape:", output_shape)

Encoder Output Shape: torch.Size([1, 292, 768])


In [7]:
# Display the HuBERT encoder output tensor (the rendered repr is elided with "...").
encoder_output

tensor([[[ 0.0924, -0.0873,  0.2480,  ..., -0.0481,  0.1011, -0.3813],
         [ 0.1171, -0.0870,  0.2565,  ..., -0.0525,  0.0991, -0.4402],
         [ 0.1896, -0.0639,  0.2879,  ..., -0.0714,  0.0727, -0.5391],
         ...,
         [ 0.1721,  0.3426,  0.0415,  ..., -0.0303, -0.1977, -0.6863],
         [ 0.1121,  0.1157,  0.1866,  ..., -0.1068, -0.1563, -0.5571],
         [ 0.0897,  0.0344,  0.2302,  ..., -0.0846, -0.0011, -0.4501]]])

### 1.2 Quantizer

In [8]:
# K-Means inputs: a 2-D feature matrix and the number of clusters.
# Remove the leading batch axis and convert to a NumPy array for scikit-learn.
features = torch.squeeze(encoder_output, 0).numpy()
num_clusters = 50
features.shape  # Shape: (seq_len, feature_dim)

(292, 768)

In [9]:
# Fit K-Means on the frame-level features (fixed seed, 10 initialisations).
kmeans = KMeans(
    n_clusters=num_clusters,
    n_init=10,
    random_state=42,
)
kmeans.fit(features)

In [10]:
# Assign each frame to a cluster index to obtain the discrete representation.
quantized_ids = kmeans.predict(features)
leading_ids = quantized_ids[:20]
print("Discrete Representation (First 20 IDs):", leading_ids)

Discrete Representation (First 20 IDs): [33 33 33 33  9  9  9  9  9  9  9  9  9  4  4  4  4  4  4  4]


## 2. wav2vec2

### 2.1 Speech Encoder

In [11]:
# Instantiate the wav2vec2 feature extractor and encoder from the same pretrained checkpoint.
WAV2VEC2_CHECKPOINT = "facebook/wav2vec2-base"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(WAV2VEC2_CHECKPOINT)
model = Wav2Vec2Model.from_pretrained(WAV2VEC2_CHECKPOINT)
# Kept as the cell's last expression: puts the model in eval mode and displays the module tree.
model.eval()



Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2GroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Wav2Vec2Encoder(
    (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
  

In [12]:
# Take the raw waveform of the first dataset item and run it through the feature extractor.
first_waveform = dataset[0]["audio"]["array"]
encoded_inputs = feature_extractor(
    first_waveform,
    sampling_rate=sampling_rate,
    return_tensors="pt",
)
input_values = encoded_inputs["input_values"]

In [13]:
# Inference only: run the forward pass with gradient tracking disabled.
with torch.no_grad():
    outputs = model(input_values=input_values)

In [14]:
# Final hidden states of the encoder (this includes feature extraction).
# Shape: (1, seq_len, feature_dim)
encoder_output = outputs.last_hidden_state
output_shape = encoder_output.shape
print("Encoder Output Shape:", output_shape)

Encoder Output Shape: torch.Size([1, 292, 768])


In [15]:
# Display the wav2vec2 encoder output tensor (the rendered repr is elided with "...").
encoder_output

tensor([[[ 0.0252, -0.0161,  0.1962,  ...,  0.5132,  0.2121, -0.1114],
         [-0.3064, -0.0877,  0.0485,  ...,  0.2346,  0.6384, -0.3538],
         [ 0.2099,  0.1193,  0.5077,  ...,  0.0555,  0.3368,  0.2325],
         ...,
         [-0.3104, -0.0688,  0.0304,  ...,  0.1952,  0.6314, -0.3537],
         [-0.3162, -0.0806,  0.0095,  ...,  0.1865,  0.6372, -0.3541],
         [-0.0199, -0.0527,  0.0903,  ...,  0.3927,  0.2868, -0.3366]]])

### 2.2 Quantizer

In [16]:
# K-Means inputs: a 2-D feature matrix and the number of clusters.
# Remove the leading batch axis and convert to a NumPy array for scikit-learn.
features = torch.squeeze(encoder_output, 0).numpy()
num_clusters = 50
features.shape  # Shape: (seq_len, feature_dim)

(292, 768)

In [17]:
# Fit K-Means on the frame-level features (fixed seed, 10 initialisations).
kmeans = KMeans(
    n_clusters=num_clusters,
    n_init=10,
    random_state=42,
)
kmeans.fit(features)

In [18]:
# Assign each frame to a cluster index to obtain the discrete representation.
quantized_ids = kmeans.predict(features)
leading_ids = quantized_ids[:20]
print("Discrete Representation (First 20 IDs):", leading_ids)

Discrete Representation (First 20 IDs): [29  0  1  1  0  1  0  1  1  1  0  0  0  1 25 10 10 10 41  1]


## 3. WavLM

### 3.1 Speech Encoder

In [19]:
# Instantiate the WavLM feature extractor and encoder from the same pretrained checkpoint.
WAVLM_CHECKPOINT = "microsoft/wavlm-base"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(WAVLM_CHECKPOINT)
model = WavLMModel.from_pretrained(WAVLM_CHECKPOINT)
# Kept as the cell's last expression: puts the model in eval mode and displays the module tree.
model.eval()

WavLMModel(
  (feature_extractor): WavLMFeatureEncoder(
    (conv_layers): ModuleList(
      (0): WavLMGroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x WavLMNoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x WavLMNoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): WavLMFeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): WavLMEncoder(
    (pos_conv_embed): WavLMPositionalConvEmbedding(
      (conv): Parametrized

In [20]:
# Take the raw waveform of the first dataset item and run it through the feature extractor.
first_waveform = dataset[0]["audio"]["array"]
encoded_inputs = feature_extractor(
    first_waveform,
    sampling_rate=sampling_rate,
    return_tensors="pt",
)
input_values = encoded_inputs["input_values"]

In [21]:
# Inference only: run the forward pass with gradient tracking disabled.
with torch.no_grad():
    outputs = model(input_values=input_values)

In [22]:
# Final hidden states of the encoder (this includes feature extraction).
# Shape: (1, seq_len, feature_dim)
encoder_output = outputs.last_hidden_state
output_shape = encoder_output.shape
print("Encoder Output Shape:", output_shape)

Encoder Output Shape: torch.Size([1, 292, 768])


In [23]:
# Display the WavLM encoder output tensor (the rendered repr is elided with "...").
encoder_output

tensor([[[-0.1524, -0.2139, -0.1196,  ...,  1.2128,  0.2217, -0.3977],
         [-0.1470, -0.2864, -0.0996,  ...,  1.2637,  0.2217, -0.4654],
         [-0.1055, -0.3247, -0.1150,  ...,  1.3419,  0.2127, -0.4782],
         ...,
         [ 0.0136, -0.2798, -0.4029,  ...,  0.9122,  0.2058, -0.3439],
         [-0.0423, -0.2395, -0.4088,  ...,  0.9519,  0.1429, -0.4677],
         [-0.1248, -0.2294, -0.2764,  ...,  0.9044,  0.1780, -0.5477]]])

### 3.2 Quantizer

In [24]:
# K-Means inputs: a 2-D feature matrix and the number of clusters.
# Remove the leading batch axis and convert to a NumPy array for scikit-learn.
features = torch.squeeze(encoder_output, 0).numpy()
num_clusters = 50
features.shape  # Shape: (seq_len, feature_dim)

(292, 768)

In [25]:
# Fit K-Means on the frame-level features (fixed seed, 10 initialisations).
kmeans = KMeans(
    n_clusters=num_clusters,
    n_init=10,
    random_state=42,
)
kmeans.fit(features)

In [26]:
# Assign each frame to a cluster index to obtain the discrete representation.
quantized_ids = kmeans.predict(features)
leading_ids = quantized_ids[:20]
print("Discrete Representation (First 20 IDs):", leading_ids)

Discrete Representation (First 20 IDs): [12 12 12 12 12  6  6  6  6  6  6  6 29 29 29 29 29 29  3  3]
