In [None]:
! pip install datasets
! pip install datasets[audio]
# ! pip install jiwer
! pip install torchaudio
! pip install Levenshtein
! pip install timit_per
! pip install -U accelerate
! pip install -U transformers

In [None]:
# ! pip install accelerate>=0.21.0
! pip install -U accelerate
! pip install -U transformers

In [6]:
! pip install jiwer



In [None]:
import gdown

url = 'https://drive.google.com/uc?id=1vIFxMIfXjdT4ie7c6JXWNpNDGaXKDHiH'
output = 'timid_2.'

gdown.download(url, output, quiet=False)

In [None]:
! unzip -o /content/timid_2.zip

In [None]:
from datasets import load_dataset

dataset = load_dataset("timit_asr", data_dir="/content/data/lisa/data", trust_remote_code=True)

train_dataset = dataset["train"]
test_dataset = dataset["test"]

In [13]:
train_dataset.features

{'file': Value(dtype='string', id=None),
 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None),
 'text': Value(dtype='string', id=None),
 'phonetic_detail': Sequence(feature={'start': Value(dtype='int64', id=None), 'stop': Value(dtype='int64', id=None), 'utterance': Value(dtype='string', id=None)}, length=-1, id=None),
 'word_detail': Sequence(feature={'start': Value(dtype='int64', id=None), 'stop': Value(dtype='int64', id=None), 'utterance': Value(dtype='string', id=None)}, length=-1, id=None),
 'dialect_region': Value(dtype='string', id=None),
 'sentence_type': Value(dtype='string', id=None),
 'speaker_id': Value(dtype='string', id=None),
 'id': Value(dtype='string', id=None)}

In [29]:
import torch
import torchaudio
from transformers import AutoModelForCTC, AutoTokenizer, AutoFeatureExtractor, Wav2Vec2Processor

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

resample_layer = torchaudio.transforms.Resample(
    orig_freq=48000,
    new_freq=16000
)

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

In [30]:
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import jiwer

import tqdm

def evaluate_model(test_dataset):
    model.eval()
    total_samples = len(test_dataset)
    total_errors = 0
    total_words = 0
    i=0

    with torch.no_grad(), tqdm.tqdm(total=total_samples) as pbar:
        for sample in test_dataset:
            audio_input, sample_rate = torchaudio.load(sample["file"], normalize=True, channels_first=True, num_frames=100000)  # Adjust num_frames as needed
            audio_input = resample_layer(audio_input)
            inputs = processor(audio_input.squeeze(0), sampling_rate=sample_rate, return_tensors="pt", padding=True)

            input_values = inputs.input_values.to(DEVICE)
            attention_mask = torch.ones_like(input_values).to(DEVICE)

            logits = model(input_values=input_values,
                           attention_mask=attention_mask).logits

            pred_ids = torch.argmax(logits, dim=-1)
            predicted_transcription = processor.batch_decode(pred_ids)[0]
            ground_truth_transcription = sample["text"]

            # per
            error = jiwer.compute_measures(ground_truth_transcription, predicted_transcription)["wer"]
            total_errors += error
            total_words += len(ground_truth_transcription.split())

            pbar.update(1)

    per = total_errors / total_words
    accuracy = 1 - per
    print(f"Accuracy: {accuracy}")


evaluate_model(test_dataset)


100%|██████████| 1680/1680 [12:25<00:00,  2.25it/s]

Accuracy: 0.8842258214761314





In [88]:
import torch
import torchaudio
import jiwer
import tqdm


class LinearHead(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LinearHead, self).__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.linear(x)


class MLPHead(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MLPHead, self).__init__()
        self.fc1 = torch.nn.Linear(input_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, output_dim)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

def evaluate_model(test_dataset, layer_name, head_name):
    model.eval()
    total_samples = len(test_dataset)
    total_errors = 0
    total_words = 0

    i = 0

    with torch.no_grad(), tqdm.tqdm(total=total_samples) as pbar:
        for sample in test_dataset:
            i += 1
            if i > 30: break
            audio_input, sample_rate = torchaudio.load(sample["file"], normalize=True, channels_first=True, num_frames=100000)
            audio_input = resample_layer(audio_input)
            inputs = processor(audio_input.squeeze(0), sampling_rate=sample_rate, return_tensors="pt", padding=True)

            input_values = inputs.input_values.to(DEVICE)
            attention_mask = torch.ones_like(input_values).to(DEVICE)

            if layer_name == "last_hidden_state":
                logits = model(input_values=input_values, attention_mask=attention_mask).logits
            elif layer_name in input_values:
                hidden_states = model(input_values=input_values, attention_mask=attention_mask).hidden_states
                logits = model.from_pretrained(model_name, config=model.config, input_values=hidden_states[layer_name]).logits
            else:
                raise ValueError(f"Invalid layer name: {layer_name}")

            if head_name == "linear":
                head = LinearHead(input_dim=logits.shape[-1], output_dim=len(tokenizer.vocab))
            elif head_name == "mlp":
                hidden_dim = 512
                head = MLPHead(input_dim=logits.shape[-1], hidden_dim=hidden_dim, output_dim=len(tokenizer.vocab))
            else:
                raise ValueError(f"Invalid head name: {head_name}")

            head_logits = head(logits)

            pred_ids = torch.argmax(head_logits, dim=-1)
            predicted_transcription = processor.batch_decode(pred_ids)[0]
            ground_truth_transcription = sample["text"]

            error = jiwer.compute_measures(ground_truth_transcription, predicted_transcription)["wer"]
            total_errors += error
            total_words += len(ground_truth_transcription.split())

            pbar.update(1)

    per = total_errors / total_words
    accuracy = 1 - per
    print(f"\nAccuracy: {accuracy}")

In [86]:
from transformers import AutoProcessor, Wav2Vec2Processor, Wav2Vec2ForCTC, HubertForCTC, AutoTokenizer, AutoFeatureExtractor

model_name = "facebook/hubert-large-ls960-ft"

processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft").to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(model_name)
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)

resample_layer = torchaudio.transforms.Resample(
    orig_freq=48000,
    new_freq=16000
)

Some weights of the model checkpoint at facebook/hubert-large-ls960-ft were not used when initializing HubertForCTC: ['hubert.encoder.pos_conv_embed.conv.weight_g', 'hubert.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForCTC were not initialized from the model checkpoint at facebook/hubert-large-ls960-ft and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-

In [87]:
evaluate_model(test_dataset, "last_hidden_state", "linear")

  2%|▏         | 30/1680 [00:54<49:46,  1.81s/it]

Accuracy: 0.8845785440613027





In [89]:
evaluate_model(test_dataset, "last_hidden_state", "mlp")

  2%|▏         | 30/1680 [01:00<55:48,  2.03s/it]


Accuracy: 0.8850574712643678





In [90]:
import torch
import torchaudio
from transformers import AutoModelForCTC, AutoTokenizer, AutoFeatureExtractor, Wav2Vec2Processor

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

resample_layer = torchaudio.transforms.Resample(
    orig_freq=48000,
    new_freq=16000
)

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

In [92]:
evaluate_model(test_dataset, "last_hidden_state", "linear")

  2%|▏         | 30/1680 [00:17<16:26,  1.67it/s]


Accuracy: 0.8826628352490421





In [91]:
evaluate_model(test_dataset, "last_hidden_state", "mlp")

  2%|▏         | 30/1680 [00:30<27:48,  1.01s/it]


Accuracy: 0.8850574712643678



