In [4]:
#!pip install    datasets

In [5]:
from transformers import Wav2Vec2Processor, HubertModel
from datasets import load_dataset
import soundfile as sf

processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")

OSError: cannot load library '/Users/aziiz/miniforge3/lib/python3.9/site-packages/_soundfile_data/libsndfile.dylib': dlopen(/Users/aziiz/miniforge3/lib/python3.9/site-packages/_soundfile_data/libsndfile.dylib, 0x0002): tried: '/Users/aziiz/miniforge3/lib/python3.9/site-packages/_soundfile_data/libsndfile.dylib' (no such file)

In [5]:
def map_to_array(batch):
    speech, _ = sf.read(batch["file"])
    batch["speech"] = speech
    return batch


ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.map(map_to_array)



In [6]:
input_values = processor(ds["speech"][0], return_tensors="pt").input_values

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [7]:
input_values

tensor([[0.0386, 0.0337, 0.0322,  ..., 0.0070, 0.0095, 0.0169]])

In [8]:
import torch
from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features) :
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature } for feature in features]
     
        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )  
       
        batch["mask_time_indices"] = [ self.get_mask(i)  for i in batch["input_values"] ]

        return batch


    def get_mask (self , input_values):
     
      batch_size, raw_sequence_length = 1 , input_values.shape[0]
      sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length)
      mask_time_indices = _compute_mask_indices((batch_size, sequence_length), mask_prob=0.2, mask_length=2)
      mask_time_indices = torch.tensor(mask_time_indices, device=input_values.device, dtype=torch.long)

      return  mask_time_indices


In [9]:
dataCollactor = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [10]:
data =dataCollactor(ds["speech"])

In [11]:
data

{'input_values': tensor([[ 2.3804e-03,  2.0752e-03,  1.9836e-03,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-1.5259e-04, -9.1553e-05, -1.8311e-04,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-6.7139e-04,  6.1035e-05,  5.1880e-04,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        ...,
        [-3.0518e-05, -1.8311e-04, -2.1362e-04,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-3.9673e-04, -3.0518e-05,  8.8501e-04,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 1.8311e-03, -2.1362e-04, -6.1035e-05,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], dtype=torch.int32), 'mask_time_indices': [tensor([[0, 0, 0,  ..., 1, 1, 1]]), tensor([[0, 0, 0,  ..., 0, 0, 0]]), 

In [12]:
from torch.utils.data import DataLoader , Dataset

In [13]:
# Need to override __init__, __len__, __getitem__
# as per datasets requirement
class CustomDataset(Dataset):
    def __init__(self, data):
      self.data =data
      
    def __len__(self):
        return len(self.data["input_values"])

    def __getitem__(self, idx):

      input_values = self.data["input_values"][idx]
      attention_mask = self.data["attention_mask"][idx]
      mask_time_indices = self.data["mask_time_indices"][idx]

      
      return {'input_values':input_values ,
              'attention_mask': attention_mask,
              'mask_time_indices':mask_time_indices 
              }

In [14]:
costum_dataset = CustomDataset(data) 

In [15]:
train_loader =DataLoader(costum_dataset , batch_size = 4)

In [16]:
data = next(iter(train_loader))

In [17]:
data

{'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]], dtype=torch.int32),
 'input_values': tensor([[ 2.3804e-03,  2.0752e-03,  1.9836e-03,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [-1.5259e-04, -9.1553e-05, -1.8311e-04,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [-6.7139e-04,  6.1035e-05,  5.1880e-04,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [-4.5776e-04, -3.9673e-04, -4.8828e-04,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]]),
 'mask_time_indices': tensor([[[0, 0, 0,  ..., 1, 1, 1]],
 
         [[0, 0, 0,  ..., 0, 0, 0]],
 
         [[0, 0, 0,  ..., 1, 1, 1]],
 
         [[0, 1, 1,  ..., 0, 0, 0]]])}

In [18]:
data["input_values"][0].view(1,-1)

tensor([[0.0024, 0.0021, 0.0020,  ..., 0.0000, 0.0000, 0.0000]])

In [None]:
output = model (data["input_values"][0].view(1,-1) ,attention_mask=data["attention_mask"][0]  , mask_time_indices=data["mask_time_indices"][0]) 

In [2]:
data

NameError: ignored

In [15]:
import librosa 
import torch

In [148]:
from transformers import Wav2Vec2Processor, HubertModel
from datasets import load_dataset
import soundfile as sf

processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")


def map_to_array(batch):
    speech, _ = sf.read(batch["file"])
    batch["speech"] = speech
    return batch


ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.map(map_to_array)


Some weights of the model checkpoint at facebook/hubert-large-ls960-ft were not used when initializing HubertModel: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [151]:

input_values = processor(ds["speech"][0], return_tensors="pt")  # Batch size 1

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [155]:
input_values

{'input_values': tensor([[0.0386, 0.0337, 0.0322,  ..., 0.0070, 0.0095, 0.0169]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], dtype=torch.int32)}

In [153]:
with torch.no_grad():
  logits = model(**input_values)

In [154]:
logits

BaseModelOutput([('last_hidden_state',
                  tensor([[[ 1.0640e-01,  4.2005e-01,  4.7533e-01,  ...,  1.9920e-01,
                            -2.1367e-01, -6.6571e-02],
                           [ 7.9912e-02,  3.8839e-01,  5.8362e-01,  ...,  2.0243e-01,
                            -1.8895e-01,  1.2788e-01],
                           [ 8.1471e-02,  7.1193e-01, -3.9914e-02,  ..., -2.0461e-01,
                            -2.5948e-01, -2.1146e-01],
                           ...,
                           [ 1.1822e-01,  3.5617e-01,  4.9074e-01,  ...,  1.6533e-01,
                            -3.3429e-01,  2.4379e-01],
                           [-3.8053e-04,  4.1052e-01,  4.0782e-01,  ...,  1.7386e-01,
                            -3.5163e-01,  1.6484e-01],
                           [-4.2190e-02,  4.1631e-01,  2.8498e-01,  ...,  2.0109e-01,
                            -3.4026e-01,  1.4069e-03]]]))])

In [131]:
import  numpy.ma  as ma 

In [135]:
import numpy as np 

In [139]:
mask_time_indices[-1] 

tensor([1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
        1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0,
        0, 0, 1, 1])

In [137]:
arr =np.random.rand(1, 120000)

In [141]:
maskArr = ma.masked_array(arr, mask=mask_time_indices[-1])

MaskError: ignored

In [146]:
input_values

tensor([[0.0386, 0.0337, 0.0322,  ..., 0.0070, 0.0095, 0.0169]])

In [147]:
mask_time_indices

tensor([[1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
         0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
         1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
         1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0,
         0, 0, 1, 1]])

In [122]:
batch_size, raw_sequence_length = input_values.shape
sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length)
mask_time_indices = _compute_mask_indices((batch_size, sequence_length), mask_prob=0.2, mask_length=2)
mask_time_indices = torch.tensor(mask_time_indices, device=input_values.device, dtype=torch.long)

In [130]:

with torch.no_grad():
    outputs = model(input_values, mask_time_indices=mask_time_indices)

IndexError: ignored

In [119]:
output = model(input_values, mask_time_indices=mask_time_indices)

IndexError: ignored

In [124]:
import torch
from transformers import AutoFeatureExtractor, Wav2Vec2ForPreTraining
from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices
from datasets import load_dataset

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base")

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values  # Batch size 1


  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [125]:

# compute masked indices
batch_size, raw_sequence_length = input_values.shape
sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length)
mask_time_indices = _compute_mask_indices((batch_size, sequence_length), mask_prob=0.2, mask_length=2)
mask_time_indices = torch.tensor(mask_time_indices, device=input_values.device, dtype=torch.long)

with torch.no_grad():
    outputs = model(input_values, mask_time_indices=mask_time_indices)


In [126]:
outputs

Wav2Vec2ForPreTrainingOutput([('projected_states',
                               tensor([[[-0.2813,  2.6754, -0.6393,  ..., -0.0373,  0.0163,  0.0333],
                                        [ 1.5432,  2.1354, -0.3621,  ..., -0.7258,  0.5428, -1.0003],
                                        [-0.9096,  3.5332,  0.7381,  ..., -2.7142, -3.3135,  0.9322],
                                        ...,
                                        [-0.8724,  0.5275,  1.1799,  ..., -0.0055, -1.1114,  0.4923],
                                        [-1.0138, -1.4169,  0.9391,  ..., -0.6488, -1.2267, -0.9506],
                                        [-1.0640, -1.1501,  1.5004,  ..., -0.5585, -0.9427,  1.0709]]])),
                              ('projected_quantized_states',
                               tensor([[[ 0.3261,  0.9365,  0.0114,  ..., -0.1760, -0.4141,  0.4569],
                                        [ 0.4270,  0.7415,  0.0088,  ..., -0.1571, -0.2923,  0.3007],
                       

In [None]:

# compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states)
cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1)

# show that cosine similarity is much higher than random
cosine_sim[mask_time_indices.to(torch.bool)].mean() > 0.5

# for contrastive loss training model should be put into train mode
model = model.train()
loss = model(input_values, mask_time_indices=mask_time_indices).loss

In [86]:
from transformers import HubertModel, HubertConfig

# Initializing a Hubert facebook/hubert-base-ls960 style configuration
configuration = HubertConfig()

# Initializing a model from the facebook/hubert-base-ls960 style configuration
model = HubertModel(configuration)

# Accessing the model configuration
configuration = model.config