<a href="https://colab.research.google.com/github/ZahraDehghani99/AutoModel-vs-AutomodelForClassification-outputs/blob/main/AutoModel_vs_AutomodelForAudioClassification_outputs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Output parameters of `AutoModel` and `AutoModelForAudioClassification` Classes

## Prerequisites

In [None]:
!pip install -q transformers datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip uninstall -y transformers accelerate
!pip install -q transformers accelerate

Found existing installation: transformers 4.35.2
Uninstalling transformers-4.35.2:
  Successfully uninstalled transformers-4.35.2
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import numpy as np
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

In [None]:
import torch

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## Wav2vec2-base

Wav2Vec2Model => [source code](https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L1440)


Wav2Vec2ForSequenceClassification => [source code](https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L2021)

### Load feature extractor


In [None]:
from transformers import AutoFeatureExtractor

model_ckpt = "facebook/wav2vec2-base-960h"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_ckpt)

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

In [None]:
feature_extractor

Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}

In [None]:
feature_extractor.model_input_names

['input_values', 'attention_mask']

In [None]:
from datasets import load_dataset

dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
dataset = dataset.sort("id")
sampling_rate = dataset.features["audio"].sampling_rate

Downloading builder script:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [None]:
dataset[0]["audio"]["array"]

array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
       0.0010376 ])

In [None]:
inputs = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

In [None]:
inputs

{'input_values': tensor([[0.0386, 0.0337, 0.0322,  ..., 0.0070, 0.0095, 0.0169]])}

In [None]:
type(inputs)

transformers.feature_extraction_utils.BatchFeature

### AutoModel class (output_hidden_states = False)

In [None]:
from transformers import AutoModel

num_labels = 4
model = (AutoModel.from_pretrained(model_ckpt, num_labels=num_labels).to(device))

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model

Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2GroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Wav2Vec2Encoder(
    (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
  

In [None]:
model.encoder

Wav2Vec2Encoder(
  (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
    (conv): ParametrizedConv1d(
      768, 768, kernel_size=(128,), stride=(1,), padding=(64,), groups=16
      (parametrizations): ModuleDict(
        (weight): ParametrizationList(
          (0): _WeightNorm()
        )
      )
    )
    (padding): Wav2Vec2SamePadLayer()
    (activation): GELUActivation()
  )
  (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (layers): ModuleList(
    (0-11): 12 x Wav2Vec2EncoderLayer(
      (attention): Wav2Vec2Attention(
        (k_proj): Linear(in_features=768, out_features=768, bias=True)
        (v_proj): Linear(in_features=768, out_features=768, bias=True)
        (q_proj): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
      (dropout): Dropout(p=0.1, inplace=False)
      (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_a

In [None]:
# The tensors returned by the feature extractor are not on the model's device yet, so move them there.
# (If the model is on the GPU and the inputs are not, the forward pass fails because the tensors and the model live on different devices.)
inputs = {k:v.to(device) for k,v in inputs.items()}

In [None]:
inputs

{'input_values': tensor([[0.0386, 0.0337, 0.0322,  ..., 0.0070, 0.0095, 0.0169]],
        device='cuda:0')}

In [None]:
type(inputs)

dict

In [None]:
with torch.no_grad():
  outputs = model(**inputs)

In [None]:
outputs

Wav2Vec2BaseModelOutput(last_hidden_state=tensor([[[-0.0032, -0.0086,  0.0228,  ..., -0.1943,  0.0403, -0.1071],
         [ 0.0021, -0.0078,  0.0206,  ..., -0.2157,  0.0311, -0.1106],
         [-0.0056, -0.0331,  0.0698,  ..., -0.2463, -0.0076, -0.1500],
         ...,
         [-0.0061, -0.0336,  0.0705,  ..., -0.2466, -0.0082, -0.1501],
         [-0.0013, -0.0162,  0.0132,  ..., -0.1883,  0.0449, -0.0850],
         [-0.0029, -0.0174,  0.0129,  ..., -0.1893,  0.0441, -0.0822]]],
       device='cuda:0'), extract_features=tensor([[[-0.1333,  0.4963, -0.1661,  ...,  0.0779, -0.1221,  0.2210],
         [-0.0114,  0.2933, -0.1618,  ...,  0.0162,  0.1916,  0.1411],
         [ 0.2863,  0.2206,  0.0498,  ...,  0.3393,  0.0983,  0.1178],
         ...,
         [-0.1779,  0.1665, -0.0653,  ...,  0.7739,  0.0545,  0.2607],
         [-0.3813,  0.0677,  0.1115,  ...,  1.0431,  0.3075,  0.1133],
         [ 0.2753,  0.1415,  0.1551,  ...,  0.5013,  0.1535, -0.0632]]],
       device='cuda:0'), hidden_

In [None]:
# List the names of the fields that are populated in the model output.
for field_name in outputs.keys():
  print(field_name)

last_hidden_state
extract_features


As we can see, we didn't set `output_hidden_states = True`, so in the output the `hidden_states` parameter has no value; only the `last_hidden_state` and `extract_features` parameters are populated.

In [None]:
y = outputs["last_hidden_state"]

In [None]:
y

tensor([[[-0.0032, -0.0086,  0.0228,  ..., -0.1943,  0.0403, -0.1071],
         [ 0.0021, -0.0078,  0.0206,  ..., -0.2157,  0.0311, -0.1106],
         [-0.0056, -0.0331,  0.0698,  ..., -0.2463, -0.0076, -0.1500],
         ...,
         [-0.0061, -0.0336,  0.0705,  ..., -0.2466, -0.0082, -0.1501],
         [-0.0013, -0.0162,  0.0132,  ..., -0.1883,  0.0449, -0.0850],
         [-0.0029, -0.0174,  0.0129,  ..., -0.1893,  0.0441, -0.0822]]],
       device='cuda:0')

In [None]:
y.shape

torch.Size([1, 292, 768])

`y` is the last hidden state of our model (the hidden state that corresponds to the last, 12th, transformer layer). The shape of this object is (batch_size, number of frames, hidden size): the batch holds one audio clip, which the convolutional feature encoder turned into 292 frames, and each frame is represented by a 768-dimensional vector. Unlike BERT-style text models, Wav2Vec2 does not add `[CLS]` or `[SEP]` special tokens to the input, so position 0 is simply the first audio frame and not a summary of the whole **sequence**. In this example we want to extract the vector at position 0, so we select the first element along the sequence axis of this batch.

For this purpose, the following two lines of code are equivalent; both return the vector at the first sequence position.

```
outputs.last_hidden_state[:,0]
```
and
```
outputs.last_hidden_state[:,0,:]
```

In [None]:
zx = outputs.last_hidden_state[:,0,:]

In [None]:
zx.size()

torch.Size([1, 768])

In [None]:
zy = outputs.last_hidden_state[:,0]

In [None]:
zy.size()

torch.Size([1, 768])

The next cell verifies that these two expressions give identical results.

In [None]:
torch.all(zx == zy)

tensor(True, device='cuda:0')

### AutoModel class (output_hidden_states = True)

This time we set `output_hidden_states=True`. So, in the output we will have output of each hidden states in addition to `last_hidden_state` parameter.

In [None]:
from transformers import AutoModel

num_labels = 4
model = (AutoModel.from_pretrained(model_ckpt, num_labels=num_labels, output_hidden_states=True).to(device))

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model

Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2GroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Wav2Vec2Encoder(
    (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
  

In [None]:
inputs = {k:v.to(device) for k,v in inputs.items()}

In [None]:
inputs

{'input_values': tensor([[0.0386, 0.0337, 0.0322,  ..., 0.0070, 0.0095, 0.0169]],
        device='cuda:0')}

In [None]:
type(inputs)

dict

In [None]:
with torch.no_grad():
  outputs = model(**inputs)

In [None]:
outputs

Wav2Vec2BaseModelOutput(last_hidden_state=tensor([[[-0.0032, -0.0086,  0.0228,  ..., -0.1943,  0.0403, -0.1071],
         [ 0.0021, -0.0078,  0.0206,  ..., -0.2157,  0.0311, -0.1106],
         [-0.0056, -0.0331,  0.0698,  ..., -0.2463, -0.0076, -0.1500],
         ...,
         [-0.0061, -0.0336,  0.0705,  ..., -0.2466, -0.0082, -0.1501],
         [-0.0013, -0.0162,  0.0132,  ..., -0.1883,  0.0449, -0.0850],
         [-0.0029, -0.0174,  0.0129,  ..., -0.1893,  0.0441, -0.0822]]],
       device='cuda:0'), extract_features=tensor([[[-0.1333,  0.4963, -0.1661,  ...,  0.0779, -0.1221,  0.2210],
         [-0.0114,  0.2933, -0.1618,  ...,  0.0162,  0.1916,  0.1411],
         [ 0.2863,  0.2206,  0.0498,  ...,  0.3393,  0.0983,  0.1178],
         ...,
         [-0.1779,  0.1665, -0.0653,  ...,  0.7739,  0.0545,  0.2607],
         [-0.3813,  0.0677,  0.1115,  ...,  1.0431,  0.3075,  0.1133],
         [ 0.2753,  0.1415,  0.1551,  ...,  0.5013,  0.1535, -0.0632]]],
       device='cuda:0'), hidden_

In [None]:
# List the names of the fields that are populated in the model output.
for field_name in outputs.keys():
  print(field_name)

last_hidden_state
extract_features
hidden_states


As we can see, this time `hidden_states` parameter has value.

In [None]:
len(outputs["hidden_states"])

13

The length of the `hidden_states` tuple equals the number of hidden states produced at the output of each layer plus the initial embedding output. So, all in all, the length equals (12 layers => 12 hidden states + 1 (initial embedding output)) = 13.

In [None]:
outputs["hidden_states"][0].shape

torch.Size([1, 292, 768])

In [None]:
outputs["hidden_states"][1].shape

torch.Size([1, 292, 768])

We can see that the shape is the same for every hidden state (same number of frames, same hidden size); only the embeddings become richer in the higher layers (because of multi-head attention).

In [None]:
# extract last hidden state
x = outputs["hidden_states"][-1]

In [None]:
x.shape

torch.Size([1, 292, 768])

In [None]:
y = outputs["last_hidden_state"]

In [None]:
y.shape

torch.Size([1, 292, 768])

We want to see that x and y objects are the same and check their equality.

x => last hidden state from `hidden_states` parameter in output

y => `last_hidden_state` parameter in output

In [None]:
torch.all(x == y)

tensor(True, device='cuda:0')

In [None]:
# `extract_features` holds the per-frame features produced before the transformer encoder
# (512 channels per frame here). It is NOT a pooler output: the output of this model only
# exposes `last_hidden_state`, `extract_features` and `hidden_states`.
# Access the field by name instead of by position so the lookup does not depend on
# which optional output fields happen to be populated.
a = outputs.extract_features

In [None]:
a

tensor([[[-0.1333,  0.4963, -0.1661,  ...,  0.0779, -0.1221,  0.2210],
         [-0.0114,  0.2933, -0.1618,  ...,  0.0162,  0.1916,  0.1411],
         [ 0.2863,  0.2206,  0.0498,  ...,  0.3393,  0.0983,  0.1178],
         ...,
         [-0.1779,  0.1665, -0.0653,  ...,  0.7739,  0.0545,  0.2607],
         [-0.3813,  0.0677,  0.1115,  ...,  1.0431,  0.3075,  0.1133],
         [ 0.2753,  0.1415,  0.1551,  ...,  0.5013,  0.1535, -0.0632]]],
       device='cuda:0')

In [None]:
a.size()

torch.Size([1, 292, 512])

In [None]:
b = outputs.last_hidden_state

In [None]:
b

tensor([[[-0.0032, -0.0086,  0.0228,  ..., -0.1943,  0.0403, -0.1071],
         [ 0.0021, -0.0078,  0.0206,  ..., -0.2157,  0.0311, -0.1106],
         [-0.0056, -0.0331,  0.0698,  ..., -0.2463, -0.0076, -0.1500],
         ...,
         [-0.0061, -0.0336,  0.0705,  ..., -0.2466, -0.0082, -0.1501],
         [-0.0013, -0.0162,  0.0132,  ..., -0.1883,  0.0449, -0.0850],
         [-0.0029, -0.0174,  0.0129,  ..., -0.1893,  0.0441, -0.0822]]],
       device='cuda:0')

In [None]:
b.size()

torch.Size([1, 292, 768])

In [None]:
zx.size()

torch.Size([1, 768])

#### Concatenate 4 last hidden states

In [None]:
# Tuple with one tensor per layer (plus the initial embedding output).
hidden_states = outputs["hidden_states"]

# Pick the four topmost layers, lowest first, and join them along the feature axis,
# so every sequence position gets a vector four times the hidden size.
last_four_layers = [hidden_states[layer] for layer in (-4, -3, -2, -1)]
pooled_output = torch.cat(last_four_layers, dim=-1)

In [None]:
pooled_output.size()

torch.Size([1, 292, 3072])

In [None]:
# Keep only the vector at index 0 of the sequence axis (dim 1) for every item in the batch.
# NOTE(review): this rebinds `pooled_output` to a 2-D tensor, so the cell is not idempotent —
# running it a second time fails because the tensor no longer has three dimensions.
pooled_output = pooled_output[:, 0, :]

In [None]:
pooled_output.size()

torch.Size([1, 3072])

### AutoModelForAudioClassification class (output_hidden_states = False)

In [None]:
from transformers import AutoModelForAudioClassification

num_labels = 4
model = (AutoModelForAudioClassification.from_pretrained(model_ckpt, num_labels=num_labels).to(device))

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model

Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)


In [None]:
# Make sure the inputs are on the same device as the model; a mismatch between the device of the tensors and the model makes the forward pass fail.
# When the notebook is run top to bottom the inputs were already moved in an earlier cell, so this only re-applies the same placement.
inputs = {k:v.to(device) for k,v in inputs.items()}

In [None]:
inputs

{'input_values': tensor([[0.0386, 0.0337, 0.0322,  ..., 0.0070, 0.0095, 0.0169]],
        device='cuda:0')}

In [None]:
type(inputs)

dict

In [None]:
with torch.no_grad():
  outputs = model(**inputs)

In [None]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0177,  0.0291, -0.0044, -0.0487]], device='cuda:0'), hidden_states=None, attentions=None)

As we can see, since we use the `AutoModelForAudioClassification` class, the model has a (newly initialized) projector and classification head on top of the Wav2Vec2 encoder, so in the output we get logits, which are the scores for each class. (The pooled representation itself is not part of the output.)

In [None]:
# List the names of the fields that are populated in the model output.
for field_name in outputs.keys():
  print(field_name)

logits


### AutoModelForAudioClassification  class (output_hidden_states = True)

This time we set `output_hidden_states=True`.

In [None]:
from transformers import AutoModelForAudioClassification

num_labels = 4
model = (AutoModelForAudioClassification.from_pretrained(model_ckpt, num_labels=num_labels, output_hidden_states=True).to(device))

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model

Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)


In [None]:
inputs = {k:v.to(device) for k,v in inputs.items()}

In [None]:
inputs

{'input_values': tensor([[0.0386, 0.0337, 0.0322,  ..., 0.0070, 0.0095, 0.0169]],
        device='cuda:0')}

In [None]:
type(inputs)

dict

In [None]:
with torch.no_grad():
  outputs = model(**inputs)

In [None]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-0.0151,  0.0538,  0.0434, -0.0418]], device='cuda:0'), hidden_states=(tensor([[[ 0.1268, -0.6065,  0.1728,  ..., -0.1042,  0.0300,  0.0992],
         [ 0.2772, -0.3489, -0.0101,  ..., -0.1404, -0.0408,  0.1601],
         [ 0.0686, -0.2589, -0.2347,  ...,  0.1477, -0.2343,  0.2663],
         ...,
         [-0.0095, -0.2362,  0.2269,  ..., -0.2547, -0.1206,  0.2207],
         [ 0.2402, -0.2564,  0.1918,  ...,  0.0832,  0.0278,  0.2594],
         [-0.1356, -0.1467,  0.2775,  ..., -0.1656, -0.2945,  0.6156]]],
       device='cuda:0'), tensor([[[ 0.2410, -0.1939,  0.1821,  ...,  0.1164,  0.0805,  0.1752],
         [ 0.3539, -0.1995,  0.1245,  ...,  0.0254, -0.1509,  0.2396],
         [ 0.3606, -0.2190,  0.0775,  ...,  0.2022, -0.2204,  0.3670],
         ...,
         [ 0.2438, -0.2710,  0.1719,  ..., -0.1546, -0.2640,  0.3820],
         [ 0.1722, -0.2075, -0.0498,  ..., -0.1815, -0.2613,  0.2820],
         [ 0.0284, -0.0592,  0.0477,  ...

In [None]:
# List the names of the fields that are populated in the model output.
for field_name in outputs.keys():
  print(field_name)

logits
hidden_states


As we can see, this time we have `hidden_states` in addition to `logits` parameter.


In [None]:
len(outputs["hidden_states"])

13

The length of the `hidden_states` tuple equals the number of hidden states produced at the output of each layer plus the initial embedding output. So, all in all, the length equals (12 layers => 12 hidden states + 1 (initial embedding output)) = 13.

In [None]:
outputs["hidden_states"][0].shape

torch.Size([1, 292, 768])

In [None]:
outputs["hidden_states"][1].shape

torch.Size([1, 292, 768])

We can see that the shape is the same for every hidden state (same number of frames, same hidden size); only the embeddings become richer in the higher layers (because of multi-head attention).

In [None]:
# extract last hidden state
x = outputs["hidden_states"][-1]

In [None]:
x.shape

torch.Size([1, 292, 768])

## HuBERT-base

HubertModel => [source code](https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/hubert/modeling_hubert.py#L968)

HubertForSequenceClassification (the class that `AutoModelForAudioClassification` loads for HuBERT checkpoints) => [source code](https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/hubert/modeling_hubert.py)

### Load feature extractor


In [None]:
from transformers import AutoFeatureExtractor

model_ckpt = "facebook/hubert-base-ls960"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_ckpt)

preprocessor_config.json:   0%|          | 0.00/213 [00:00<?, ?B/s]

In [None]:
feature_extractor

Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}

In [None]:
feature_extractor.model_input_names

['input_values', 'attention_mask']

In [None]:
from datasets import load_dataset

dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
dataset = dataset.sort("id")
sampling_rate = dataset.features["audio"].sampling_rate

In [None]:
dataset[0]["audio"]["array"]

array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
       0.0010376 ])

In [None]:
inputs = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

In [None]:
inputs

{'input_values': tensor([[0.0386, 0.0337, 0.0322,  ..., 0.0070, 0.0095, 0.0169]])}

In [None]:
type(inputs)

transformers.feature_extraction_utils.BatchFeature

### AutoModel class (output_hidden_states = False)

In [None]:
from transformers import AutoModel

num_labels = 4
model = (AutoModel.from_pretrained(model_ckpt, num_labels=num_labels).to(device))

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

In [None]:
model

HubertModel(
  (feature_extractor): HubertFeatureEncoder(
    (conv_layers): ModuleList(
      (0): HubertGroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x HubertNoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x HubertNoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): HubertFeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): HubertEncoder(
    (pos_conv_embed): HubertPositionalConvEmbedding(
      (conv): Para

In [None]:
# The tensors returned by the feature extractor are not on the model's device yet, so move them there.
# (If the model is on the GPU and the inputs are not, the forward pass fails because the tensors and the model live on different devices.)
inputs = {k:v.to(device) for k,v in inputs.items()}

In [None]:
inputs

{'input_values': tensor([[0.0386, 0.0337, 0.0322,  ..., 0.0070, 0.0095, 0.0169]],
        device='cuda:0')}

In [None]:
type(inputs)

dict

In [None]:
with torch.no_grad():
  outputs = model(**inputs)

In [None]:
outputs

BaseModelOutput(last_hidden_state=tensor([[[ 0.0924, -0.0873,  0.2480,  ..., -0.0481,  0.1011, -0.3813],
         [ 0.1171, -0.0870,  0.2565,  ..., -0.0525,  0.0991, -0.4402],
         [ 0.1896, -0.0639,  0.2879,  ..., -0.0714,  0.0727, -0.5391],
         ...,
         [ 0.1721,  0.3426,  0.0415,  ..., -0.0303, -0.1977, -0.6863],
         [ 0.1121,  0.1157,  0.1866,  ..., -0.1068, -0.1563, -0.5571],
         [ 0.0897,  0.0344,  0.2302,  ..., -0.0846, -0.0011, -0.4501]]],
       device='cuda:0'), hidden_states=None, attentions=None)

In [None]:
# List the names of the fields that are populated in the model output.
for field_name in outputs.keys():
  print(field_name)

last_hidden_state


As we can see, we didn't set `output_hidden_states = True`, so in the output the `hidden_states` parameter has no value and only the `last_hidden_state` parameter is populated.

In [None]:
y = outputs["last_hidden_state"]

In [None]:
y

tensor([[[ 0.0924, -0.0873,  0.2480,  ..., -0.0481,  0.1011, -0.3813],
         [ 0.1171, -0.0870,  0.2565,  ..., -0.0525,  0.0991, -0.4402],
         [ 0.1896, -0.0639,  0.2879,  ..., -0.0714,  0.0727, -0.5391],
         ...,
         [ 0.1721,  0.3426,  0.0415,  ..., -0.0303, -0.1977, -0.6863],
         [ 0.1121,  0.1157,  0.1866,  ..., -0.1068, -0.1563, -0.5571],
         [ 0.0897,  0.0344,  0.2302,  ..., -0.0846, -0.0011, -0.4501]]],
       device='cuda:0')

In [None]:
y.shape

torch.Size([1, 292, 768])

`y` is the last hidden state of our model (the hidden state that corresponds to the last, 12th, transformer layer). The shape of this object is (batch_size, number of frames, hidden size): the batch holds one audio clip, which the convolutional feature encoder turned into 292 frames, and each frame is represented by a 768-dimensional vector. Unlike BERT-style text models, HuBERT does not add `[CLS]` or `[SEP]` special tokens to the input, so position 0 is simply the first audio frame and not a summary of the whole **sequence**. In this example we want to extract the vector at position 0, so we select the first element along the sequence axis of this batch.

For this purpose, the following two lines of code are equivalent; both return the vector at the first sequence position.

```
outputs.last_hidden_state[:,0]
```
and
```
outputs.last_hidden_state[:,0,:]
```

In [None]:
zx = outputs.last_hidden_state[:,0,:]

In [None]:
zx.size()

torch.Size([1, 768])

In [None]:
zy = outputs.last_hidden_state[:,0]

In [None]:
zy.size()

torch.Size([1, 768])

The next cell verifies that these two expressions give identical results.

In [None]:
torch.all(zx == zy)

tensor(True, device='cuda:0')

### AutoModel class (output_hidden_states = True)

This time we set `output_hidden_states=True`. So, in the output we will have output of each hidden states in addition to `last_hidden_state` parameter.

In [None]:
from transformers import AutoModel

# Reload the bare encoder, this time asking it to return the hidden states
# of every layer in addition to the last one.
num_labels = 4
model = AutoModel.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
    output_hidden_states=True,
)
model = model.to(device)

Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

In [None]:
model

HubertModel(
  (feature_extractor): HubertFeatureEncoder(
    (conv_layers): ModuleList(
      (0): HubertGroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x HubertNoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x HubertNoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): HubertFeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): HubertEncoder(
    (pos_conv_embed): HubertPositionalConvEmbedding(
      (conv): Para

In [None]:
# Keep every input tensor on the same device as the model.
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

In [None]:
inputs

{'input_values': tensor([[0.0386, 0.0337, 0.0322,  ..., 0.0070, 0.0095, 0.0169]],
        device='cuda:0')}

In [None]:
type(inputs)

dict

In [None]:
# Inference only: gradient tracking is switched off for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

In [None]:
outputs

BaseModelOutput(last_hidden_state=tensor([[[ 0.0924, -0.0873,  0.2480,  ..., -0.0481,  0.1011, -0.3813],
         [ 0.1171, -0.0870,  0.2565,  ..., -0.0525,  0.0991, -0.4402],
         [ 0.1896, -0.0639,  0.2879,  ..., -0.0714,  0.0727, -0.5391],
         ...,
         [ 0.1721,  0.3426,  0.0415,  ..., -0.0303, -0.1977, -0.6863],
         [ 0.1121,  0.1157,  0.1866,  ..., -0.1068, -0.1563, -0.5571],
         [ 0.0897,  0.0344,  0.2302,  ..., -0.0846, -0.0011, -0.4501]]],
       device='cuda:0'), hidden_states=(tensor([[[-2.7202e-01, -1.8356e-01,  5.7611e-02,  ..., -1.5418e-01,
           2.6787e-01, -3.2099e-01],
         [-3.1669e-01, -2.1807e-01, -8.8446e-04,  ...,  2.1743e-01,
           2.5074e-01, -3.4014e-01],
         [-1.8568e-01, -2.1013e-01, -7.7682e-02,  ..., -6.0388e-02,
           1.7608e-01, -1.6781e-01],
         ...,
         [-9.6796e-02, -5.4610e-02, -5.0362e-02,  ..., -1.1312e-01,
           3.1314e-01, -1.7221e-01],
         [-4.1814e-02, -8.8047e-02, -2.0904e-02,  

In [None]:
# List the names of the fields that are populated in the output.
for key in outputs.keys():
    print(key)

last_hidden_state
hidden_states


As we can see, this time `hidden_states` parameter has value.

In [None]:
# Number of hidden-state tensors returned by the model.
len(outputs.hidden_states)

13

The length of the `hidden_states` parameter equals the number of hidden states the model produces: one at the output of each layer plus the initial embedding output. So, all in all, the length equals (12 layers => 12 hidden states + 1 (initial embedding output)) = 13.

In [None]:
outputs["hidden_states"][0].shape

torch.Size([1, 292, 768])

In [None]:
outputs["hidden_states"][1].shape

torch.Size([1, 292, 768])

We can see that the shape of every hidden state is the same; only the embeddings become richer in the upper layers (because of multi-head attention).

In [None]:
# The final entry of the `hidden_states` tuple is the output of the last layer.
x = outputs.hidden_states[-1]

In [None]:
x.shape

torch.Size([1, 292, 768])

In [None]:
# The dedicated `last_hidden_state` field, to be compared against `x`.
y = outputs.last_hidden_state

In [None]:
y.shape

torch.Size([1, 292, 768])

We want to see that x and y objects are the same and check their equality.

x => last hidden state from `hidden_states` parameter in output

y => `last_hidden_state` parameter in output

In [None]:
# Element-wise comparison of the two tensors, reduced to a single boolean tensor.
(x == y).all()

tensor(True, device='cuda:0')

In [None]:
# Positional index 1 of this output is the `hidden_states` tuple (index 0 is
# `last_hidden_state`); the output of this model has no pooler field.
a = outputs[1]

In [None]:
a

(tensor([[[-2.7202e-01, -1.8356e-01,  5.7611e-02,  ..., -1.5418e-01,
            2.6787e-01, -3.2099e-01],
          [-3.1669e-01, -2.1807e-01, -8.8446e-04,  ...,  2.1743e-01,
            2.5074e-01, -3.4014e-01],
          [-1.8568e-01, -2.1013e-01, -7.7682e-02,  ..., -6.0388e-02,
            1.7608e-01, -1.6781e-01],
          ...,
          [-9.6796e-02, -5.4610e-02, -5.0362e-02,  ..., -1.1312e-01,
            3.1314e-01, -1.7221e-01],
          [-4.1814e-02, -8.8047e-02, -2.0904e-02,  ...,  1.5962e-01,
            2.2029e-01, -3.3574e-01],
          [-1.7370e-01, -1.3185e-05,  1.1318e-01,  ...,  4.2382e-02,
            1.4670e-01, -3.2963e-01]]], device='cuda:0'),
 tensor([[[-0.1017,  0.0460,  0.0956,  ..., -0.0712,  0.0526,  0.0072],
          [-0.1442, -0.0138,  0.0379,  ..., -0.0178,  0.0609, -0.1824],
          [ 0.0195,  0.0251, -0.0595,  ..., -0.1657, -0.0221, -0.0359],
          ...,
          [ 0.0211, -0.0258, -0.1667,  ..., -0.1015,  0.1922, -0.2106],
          [-0.0554, 

In [None]:
# Final-layer output, fetched by field name.
b = outputs["last_hidden_state"]

In [None]:
b

tensor([[[ 0.0924, -0.0873,  0.2480,  ..., -0.0481,  0.1011, -0.3813],
         [ 0.1171, -0.0870,  0.2565,  ..., -0.0525,  0.0991, -0.4402],
         [ 0.1896, -0.0639,  0.2879,  ..., -0.0714,  0.0727, -0.5391],
         ...,
         [ 0.1721,  0.3426,  0.0415,  ..., -0.0303, -0.1977, -0.6863],
         [ 0.1121,  0.1157,  0.1866,  ..., -0.1068, -0.1563, -0.5571],
         [ 0.0897,  0.0344,  0.2302,  ..., -0.0846, -0.0011, -0.4501]]],
       device='cuda:0')

### AutoModelForAudioClassification class (output_hidden_states = False)

In [None]:
from transformers import AutoModelForAudioClassification

# Load the checkpoint with an audio-classification head sized for `num_labels` classes.
num_labels = 4
model = AutoModelForAudioClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
)
model = model.to(device)

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model

HubertForSequenceClassification(
  (hubert): HubertModel(
    (feature_extractor): HubertFeatureEncoder(
      (conv_layers): ModuleList(
        (0): HubertGroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x HubertNoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x HubertNoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): HubertFeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encod

In [None]:
# Make sure every input tensor lives on the same device as the model: if the
# model is on the GPU and the tensors are not, the forward pass fails because
# the model and its inputs are in different places.
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

In [None]:
inputs

{'input_values': tensor([[0.0386, 0.0337, 0.0322,  ..., 0.0070, 0.0095, 0.0169]],
        device='cuda:0')}

In [None]:
type(inputs)

dict

In [None]:
# Inference only: gradient tracking is switched off for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

In [None]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0317, -0.0334,  0.0317, -0.0024]], device='cuda:0'), hidden_states=None, attentions=None)

As we can see, since we use the `AutoModelForAudioClassification` class, we have a classification head on top of the base HuBERT model, so in the output we get logits, which are the scores for each class. (There is no pooler output in the result.)

In [None]:
# List the names of the fields that are populated in the output.
for key in outputs.keys():
    print(key)

logits


### AutoModelForAudioClassification  class (output_hidden_states = True)

This time we set `output_hidden_states=True`.

In [None]:
from transformers import AutoModelForAudioClassification

# Same classification model as before, now configured to also return the
# hidden states of every layer.
num_labels = 4
model = AutoModelForAudioClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
    output_hidden_states=True,
)
model = model.to(device)

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model

HubertForSequenceClassification(
  (hubert): HubertModel(
    (feature_extractor): HubertFeatureEncoder(
      (conv_layers): ModuleList(
        (0): HubertGroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x HubertNoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x HubertNoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): HubertFeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encod

In [None]:
# Keep every input tensor on the same device as the model.
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

In [None]:
inputs

{'input_values': tensor([[0.0386, 0.0337, 0.0322,  ..., 0.0070, 0.0095, 0.0169]],
        device='cuda:0')}

In [None]:
type(inputs)

dict

In [None]:
# Inference only: gradient tracking is switched off for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

In [None]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-0.0569,  0.0147,  0.0286, -0.0184]], device='cuda:0'), hidden_states=(tensor([[[-2.7202e-01, -1.8356e-01,  5.7611e-02,  ..., -1.5418e-01,
           2.6787e-01, -3.2099e-01],
         [-3.1669e-01, -2.1807e-01, -8.8446e-04,  ...,  2.1743e-01,
           2.5074e-01, -3.4014e-01],
         [-1.8568e-01, -2.1013e-01, -7.7682e-02,  ..., -6.0388e-02,
           1.7608e-01, -1.6781e-01],
         ...,
         [-9.6796e-02, -5.4610e-02, -5.0362e-02,  ..., -1.1312e-01,
           3.1314e-01, -1.7221e-01],
         [-4.1814e-02, -8.8047e-02, -2.0904e-02,  ...,  1.5962e-01,
           2.2029e-01, -3.3574e-01],
         [-1.7370e-01, -1.3185e-05,  1.1318e-01,  ...,  4.2382e-02,
           1.4670e-01, -3.2963e-01]]], device='cuda:0'), tensor([[[-0.1017,  0.0460,  0.0956,  ..., -0.0712,  0.0526,  0.0072],
         [-0.1442, -0.0138,  0.0379,  ..., -0.0178,  0.0609, -0.1824],
         [ 0.0195,  0.0251, -0.0595,  ..., -0.1657, -0.0221, -0.0359],


In [None]:
# List the names of the fields that are populated in the output.
for key in outputs.keys():
    print(key)

logits
hidden_states


As we can see, this time we have `hidden_states` in addition to `logits` parameter.



In [None]:
# Number of hidden-state tensors returned by the model.
len(outputs.hidden_states)

13

The length of the `hidden_states` parameter equals the number of hidden states the model produces: one at the output of each layer plus the initial embedding output. So, all in all, the length equals (12 layers => 12 hidden states + 1 (initial embedding output)) = 13.

In [None]:
outputs["hidden_states"][0].shape

torch.Size([1, 292, 768])

In [None]:
outputs["hidden_states"][1].shape

torch.Size([1, 292, 768])

We can see that the shape of every hidden state is the same; only the embeddings become richer in the upper layers (because of multi-head attention).

In [None]:
# The final entry of the `hidden_states` tuple is the output of the last layer.
x = outputs.hidden_states[-1]

In [None]:
x.shape

torch.Size([1, 292, 768])

In [None]:
# The last positional field of this output (index -1) is the `hidden_states`
# tuple, so its length is the number of hidden-state tensors.
len(outputs[-1])

13