In [14]:
from transformers import AutoModel, Wav2Vec2FeatureExtractor
from auditory_cortex.dataloader import DataLoader

import numpy as np


  torchaudio.set_audio_backend("sox_io")


In [2]:
from auditory_cortex.utils import get_receptive_fields
print("Wav2vec...!")
kernels = [20, 5, 3, 3, 3, 2, 2]
strides = [10, 3, 2, 2, 2, 2, 2]
# the last entries are kernel size and strides of the 'convolution position encoding'

get_receptive_fields(kernels, strides, fs=48000)

Wav2vec...!
Calculating receptive fields for all layers...
Layer 0, RF:    20 samples, 0.42 ms, sampling_rate: 4800Hz, sampling_time: 0.208ms
Layer 1, RF:    60 samples, 1.25 ms, sampling_rate: 1600Hz, sampling_time: 0.625ms
Layer 2, RF:   120 samples, 2.50 ms, sampling_rate: 800Hz, sampling_time: 1.250ms
Layer 3, RF:   240 samples, 5.00 ms, sampling_rate: 400Hz, sampling_time: 2.500ms
Layer 4, RF:   480 samples, 10.00 ms, sampling_rate: 200Hz, sampling_time: 5.000ms
Layer 5, RF:   720 samples, 15.00 ms, sampling_rate: 100Hz, sampling_time: 10.000ms
Layer 6, RF:  1200 samples, 25.00 ms, sampling_rate: 50Hz, sampling_time: 20.000ms


In [18]:
dataloader = DataLoader()
aud = dataloader.metadata.stim_audio(sent=12)
fs = dataloader.metadata.get_sampling_rate()

Using default normalizer file...


In [19]:
model_name = 'w2v2_generic'
dnn_obj = dataloader.get_DNN_obj(model_name=model_name)

Model on device: cuda


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [21]:
audio_input = aud.astype(np.float64)
input_values = dnn_obj.extractor.processor(
	audio_input, sampling_rate=48000, return_tensors="pt", padding="longest"
	).input_values  # Batch size 1
dnn_obj.extractor.model.eval()
# with torch.no_grad():
input_values = input_values.to(dnn_obj.extractor.device)
out = dnn_obj.extractor.model(input_values)

In [22]:
out

Wav2Vec2BaseModelOutput(last_hidden_state=tensor([[[-0.7463,  0.3522, -0.1401,  ..., -0.6947,  1.1080, -1.1386],
         [-0.6872,  0.3424, -0.1933,  ..., -0.6574,  1.1438, -1.2890],
         [-0.9524,  0.7153,  0.1298,  ..., -0.4903,  0.9808, -1.2617],
         ...,
         [-1.1682,  0.6434,  0.1576,  ..., -0.7273,  1.1989, -1.2184],
         [-0.3292,  0.7253, -0.3920,  ..., -0.5218,  1.1232, -1.6271],
         [-0.7395,  0.3259, -0.0253,  ..., -0.7346,  1.1811, -1.1488]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>), extract_features=tensor([[[-0.7850, -0.8867, -0.9281,  ..., -0.4476, -0.9006, -0.5120],
         [-0.8131, -0.7833, -0.8405,  ..., -0.1075, -0.8504, -0.4853],
         [-0.1379,  2.6510,  2.7070,  ...,  3.1893,  0.9478, -0.6239],
         ...,
         [-0.4617,  2.4437,  3.5720,  ...,  4.8108,  1.3308, -0.7093],
         [-0.7013, -0.5920, -0.6433,  ...,  1.4962,  1.4987,  0.9948],
         [-0.7077, -0.8635, -0.9188,  ..., -0.4738, -0.8700, -0.4984]

In [2]:
model_id = "bilalhsp/wav2vec2-48KHz-audioset-natual-sounds-v1"
cache_dir='/scratch/gilbreth/ahmedb/cache/huggingface/models/'

model = AutoModel.from_pretrained(
	model_id,
	cache_dir=cache_dir,
	)

processor =	Wav2Vec2FeatureExtractor.from_pretrained(
	model_id,
	cache_dir=cache_dir,
	)

In [7]:
layer_names = dict([*model.named_modules()])
# ['wav2vec2.feature_extractor.conv_layers.0.layer_norm']

In [12]:
layer_names['feature_extractor']

Wav2Vec2FeatureEncoder(
  (conv_layers): ModuleList(
    (0): Wav2Vec2GroupNormConvLayer(
      (conv): Conv1d(1, 512, kernel_size=(20,), stride=(10,), bias=False)
      (activation): GELUActivation()
      (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
    )
    (1): Wav2Vec2NoLayerNormConvLayer(
      (conv): Conv1d(512, 512, kernel_size=(5,), stride=(3,), bias=False)
      (activation): GELUActivation()
    )
    (2-4): 3 x Wav2Vec2NoLayerNormConvLayer(
      (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
      (activation): GELUActivation()
    )
    (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
      (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
      (activation): GELUActivation()
    )
  )
)

In [13]:
layer_names['encoder']

Wav2Vec2Encoder(
  (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
    (conv): ParametrizedConv1d(
      768, 768, kernel_size=(128,), stride=(1,), padding=(64,), groups=16
      (parametrizations): ModuleDict(
        (weight): ParametrizationList(
          (0): _WeightNorm()
        )
      )
    )
    (padding): Wav2Vec2SamePadLayer()
    (activation): GELUActivation()
  )
  (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (layers): ModuleList(
    (0-11): 12 x Wav2Vec2EncoderLayer(
      (attention): Wav2Vec2SdpaAttention(
        (k_proj): Linear(in_features=768, out_features=768, bias=True)
        (v_proj): Linear(in_features=768, out_features=768, bias=True)
        (q_proj): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
      (dropout): Dropout(p=0.1, inplace=False)
      (layer_norm): LayerNorm((768,), eps=1e-05, elementwi