In [1]:
import os
os.environ['HF_HOME'] = 'huggingface'
os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = 'True'
import math
from datasets import load_dataset
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
from dataclasses import dataclass, field
from typing import Any, Dict, List, Union
import evaluate
from torch.utils.data import DataLoader



  from .autonotebook import tqdm as notebook_tqdm


In [46]:
!nvidia-smi

Thu May 23 21:33:40 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.76.01              Driver Version: 552.44         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4070 ...    On  |   00000000:01:00.0  On |                  N/A |
|  0%   35C    P8             10W /  285W |    7731MiB /  16376MiB |      9%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Loading the Dataset according to below specifications

https://huggingface.co/docs/datasets/en/audio_load

Prepping the csv from jsonl

In [2]:
import json
import csv
import io

# get the JSON objects from JSONL
jsonl_file = open("asr.jsonl", "r")
jsonl_data = jsonl_file.read()

json_lines = tuple(json_line
                   for json_line in jsonl_data.splitlines()
                   if json_line.strip())
jsons_objs = tuple(json.loads(json_line)
                   for json_line in json_lines)

jsonl_file.close()
# write them into a CSV file
csv_file = open("audio/metadata.csv", "w", newline='')
writer = csv.writer(csv_file)
writer.writerow(["file_name", "transcription"])
writer.writerows((value for key, value in sorted(json_obj.items()) if key in ["audio", "transcript"])
                 for json_obj in jsons_objs)

csv_file.close()

# Setting up the model and dataset

In [2]:
# model_name = 'openai/whisper-medium.en'
model_name = 'openai/whisper-large-v3'
# checkpoint_name = 'whisper-checkpoints/checkpoint-750/'
processor = WhisperProcessor.from_pretrained(model_name, resume_download = None)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
full_audio_path = "audio"
# full_audio_path = "combined_audio_files"
# Use audiofolder metadata to load the audio dataset
ds = load_dataset('audiofolder', data_dir=full_audio_path, split='train')
train_test_split = ds.train_test_split(test_size=0.1, shuffle=True)

trainval = train_test_split['train'].train_test_split(test_size=0.1, shuffle=True)
test_set = train_test_split['test']
train_set = trainval['train']
val_set = trainval['test']

In [11]:
train_set

Dataset({
    features: ['audio', 'transcription', 'input_features', 'input_length', 'labels', 'length'],
    num_rows: 2835
})

In [5]:
test_set

Dataset({
    features: ['audio', 'transcription'],
    num_rows: 350
})

In [6]:
val_set

Dataset({
    features: ['audio', 'transcription'],
    num_rows: 315
})

In [4]:
def prepare_dataset(batch):
    # model_name = 'openai/whisper-medium.en'
    model_name = 'openai/whisper-large-v3'

    # separate import for each process
    from transformers import WhisperProcessor
    processor = WhisperProcessor.from_pretrained(model_name, resume_download= None)

    batch["input_features"] =[processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0] for audio in batch["audio"]]
    batch["input_length"] = [len(b) for b in batch["input_features"]]
    batch["labels"] = processor(text=batch["transcription"]).input_ids
    batch['length'] = batch["input_length"]

    return batch

In [5]:
train_set = train_set.map(prepare_dataset, num_proc=8, batched=True, batch_size=128)
val_set = val_set.map(prepare_dataset, num_proc=8, batched=True, batch_size=128)
test_set = test_set.map(prepare_dataset, num_proc=8, batched=True, batch_size=128)

# This uses 100% CPU oops

Map (num_proc=8):   0%|          | 0/2835 [00:00<?, ? examples/s]Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or 

## Augmentations

In [6]:
from audiomentations import Compose, HighShelfFilter, LowShelfFilter, TimeStretch, BandPassFilter
import numpy as np
# High Shelf Filter + Low Shelf Filter
# Subtle Time Strech (aka Speed Perturbation) of Entire Audio
# Frequency Band Pass

def augment_audio(batch):
    # batch['input_values']  -> (BATCH, variable AUDIO_LENGTH)
    augmented_audios = []
    # print(batch.keys())
    # print(batch)
    # exit()

    if 'input_values' not in batch:
        return batch
    for sample in batch['input_values']:
        sample = np.array(sample)

        augment = Compose([
            HighShelfFilter(max_gain_db=6.0, p=0.3),
            LowShelfFilter(max_gain_db=6.0, p=0.3),
            TimeStretch(min_rate=0.9, max_rate=1.1, p=0.2),
            BandPassFilter(p=0.3)
        ])

        augmented_audio = augment(samples=sample, sample_rate=16000)

        standardized_augmented_audio = processor(audio=augmented_audio, sampling_rate=16000).input_values

        augmented_audios.append(standardized_augmented_audio)

    batch['input_values'] = augmented_audios
    batch['input_length'] = [len(b[0]) for b in batch["input_values"]]
    # batch['length'] = batch['input_length']
    return batch

In [7]:
train_set.set_transform(augment_audio, columns=['audio', 'transcription', 'input_features', 'input_length', 'length', 'labels'])

In [8]:
# purpose of the data collator is to ensure that the inputs and labels are padded correctly

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: WhisperProcessor

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [None]:
# help(DataCollatorSpeechSeq2SeqWithPadding)

Help on class DataCollatorSpeechSeq2SeqWithPadding in module __main__:

class DataCollatorSpeechSeq2SeqWithPadding(builtins.object)
 |  DataCollatorSpeechSeq2SeqWithPadding(processor: transformers.models.whisper.processing_whisper.WhisperProcessor) -> None
 |  
 |  DataCollatorSpeechSeq2SeqWithPadding(processor: transformers.models.whisper.processing_whisper.WhisperProcessor)
 |  
 |  Methods defined here:
 |  
 |  __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]
 |      Call self as a function.
 |  
 |  __eq__(self, other)
 |      Return self==value.
 |  
 |  __init__(self, processor: transformers.models.whisper.processing_whisper.WhisperProcessor) -> None
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __repr__(self)
 |      Return repr(self).
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for inst

In [9]:
metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

## Here is the adding of special tokens

In [12]:
from transformers import WhisperTokenizer, WhisperForConditionalGeneration

# load pre-trained tokenizer and model
ckpt = "openai/whisper-small.en"
tokenizer = WhisperTokenizer.from_pretrained(ckpt, resume_download = None , use_fast=True)
model = WhisperForConditionalGeneration.from_pretrained(
    model_name,  # checkpoint_name
    pad_token_id=processor.tokenizer.pad_token_id,
    mask_time_prob=0.5,  # 0.05
    mask_time_length=10, # 10
    mask_feature_prob=0.5, # 0
    mask_feature_length=10, # 10
    apply_spec_augment=True,
    resume_download = None
)


# define new tokens to add to vocab
new_tokens = ["niner"]

# check if the new tokens are already in the vocabulary
# Dont need to check cos we know
# new_tokens = set(new_tokens) - set(tokenizer.vocab.keys())

# add the tokens to the tokenizer vocabulary
tokenizer.add_tokens(list(new_tokens))

# add new random embeddings for the appended tokens
model.resize_token_embeddings(len(tokenizer))
model.freeze_encoder()

## Below is for without special tokens

In [10]:
model = WhisperForConditionalGeneration.from_pretrained(
    model_name,  # checkpoint_name
    pad_token_id=processor.tokenizer.pad_token_id,
    mask_time_prob=0.5,  # 0.05
    mask_time_length=10, # 10
    mask_feature_prob=0.5, # 0
    mask_feature_length=10, # 10
    apply_spec_augment=True,
    resume_download = None
)
model.freeze_encoder()

# Training Args

In [12]:
import psutil

training_args = Seq2SeqTrainingArguments(
    output_dir="Checkpoints/large-checkpoints",
    overwrite_output_dir =True,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=1e-5, # was 1e-4
    warmup_steps=500,
    # max_steps=5000,
    num_train_epochs=30,
    gradient_checkpointing=True,
    fp16=True,
    # bf16=True,  # for A100
    torch_compile=True,
    fp16_full_eval=True,
    evaluation_strategy="epoch",
    save_strategy='epoch',
    per_device_eval_batch_size=2,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    save_total_limit = 3,
    # load_best_model_at_end=True,
    adam_beta1=0.9,
    adam_beta2=0.98,  # follow fairseq finetuning config
    warmup_ratio=0.22, # follow Ranger21
    weight_decay=1e-4,
    
    # bf16_full_eval=True,  # for A100
    # group_by_length=True,  # slows down
    # evaluation_strategy="epoch",
    # save_strategy='epoch',  # epoch
    # save_safetensors=True,
    # per_device_eval_batch_size=2,
    # save_steps=1,
    # eval_steps=1,
    # logging_steps=100,
    # save_total_limit=3,
    # lr_scheduler_type='cosine',
    # load_best_model_at_end=True,  # True
    # adam_beta1=0.9,
    # adam_beta2=0.98,  # follow fairseq fintuning config
    # warmup_ratio=0.22, # follow Ranger21
    # weight_decay=1e-4,  # follow Ranger21
    # metric_for_best_model="wer",
    # greater_is_better=False,
    # report_to=['tensorboard'],
    # torch_compile=True,
    # remove_unused_columns=False, # Without this causes an error
    # dataloader_num_workers=24 if os.name != 'nt' else 1
    dataloader_num_workers=psutil.cpu_count(logical=True)
)

In [11]:
# class CustomWhisperTrainer(Seq2SeqTrainer):
#     def training_step(self, model: torch.nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
#         """
#         Perform a training step on a batch of inputs.

#         Subclass and override to inject custom behavior.

#         Args:
#             model (:obj:`nn.Module`):
#                 The model to train.
#             inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
#                 The inputs and targets of the model.

#                 The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
#                 argument :obj:`labels`. Check your model's documentation for all accepted arguments.

#         Return:
#             :obj:`torch.Tensor`: The tensor with training loss on this batch.
#         """

#         model.train()
#         inputs = self._prepare_inputs(inputs)
#         loss = self.compute_loss(model, inputs)

#         if self.args.gradient_accumulation_steps > 1:
#             loss = loss / self.args.gradient_accumulation_steps

#         # if os.name != 'nt':
#         #     accelerator.backward(self.scaler.scale(loss))
#         #     # self.scaler.scale(loss).backward()
#         # else:
#         # self.scaler.scale(loss).backward()
        
#         return loss.detach()

In [15]:
# if os.name != 'nt':
#     from accelerate import Accelerator
#     accelerator = Accelerator(mixed_precision='fp16', dynamo_backend='eager')  # FP8 needs transformer_engine package which is only on Linux with Hopper GPUs

In [13]:
def tri_stage_schedule(epoch: int, max_epoch = training_args.num_train_epochs, stage_ratio = [0.1, 0.4, 0.5], peak_lr = training_args.learning_rate, initial_lr_scale=0.01, final_lr_scale=0.05):
    """https://github.com/facebookresearch/fairseq/blob/5ecbbf58d6e80b917340bcbf9d7bdbb539f0f92b/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py#L51"""
    assert sum(stage_ratio) == 1
    current_ratio = epoch / max_epoch
    if current_ratio < stage_ratio[0]:  # linear warmup
        lrs = torch.linspace(initial_lr_scale * peak_lr, peak_lr, int(stage_ratio[0] * max_epoch))
        return lrs[epoch]
    elif stage_ratio[0] <= current_ratio <= stage_ratio[1]:  # constant
        return peak_lr
    else:  # exponential decay
        decay_factor = -math.log(final_lr_scale) / (stage_ratio[2] * max_epoch)
        return peak_lr * math.exp(-decay_factor * stage_ratio[2] * max_epoch)

In [26]:
# ds['train'][0]

{'audio': {'path': None,
  'array': array([0., 0., 0., ..., 0., 0., 0.]),
  'sampling_rate': 16000},
 'transcription': 'Target red and blue fighter plane at heading one five five, engage with machine gun.',
 'input_features': [[-0.963567852973938,
   -0.963567852973938,
   -0.7878323793411255,
   -0.8258864879608154,
   -0.8095105886459351,
   -0.7946726083755493,
   -0.808912992477417,
   -0.805404543876648,
   -0.813245415687561,
   -0.8142907619476318,
   -0.8144005537033081,
   -0.8253769874572754,
   -0.798128604888916,
   -0.7934802770614624,
   -0.8224153518676758,
   -0.8033881187438965,
   -0.8269888162612915,
   -0.8269256353378296,
   -0.8116075992584229,
   -0.7845652103424072,
   -0.8124032020568848,
   -0.8179854154586792,
   -0.8203535079956055,
   -0.7647955417633057,
   -0.7887591123580933,
   -0.8110722303390503,
   -0.7997866868972778,
   -0.8307012319564819,
   -0.7924110889434814,
   -0.7952202558517456,
   -0.8105142116546631,
   -0.8058382272720337,
   -0.8129931

In [None]:
# Debugging Steps
train_dataloader = DataLoader(ds['train'], batch_size=64, shuffle=True)



In [12]:

# model = torch.compile(model,mode='default')

In [39]:

# max_steps = math.ceil(training_args.num_train_epochs * len(ds['train']) / training_args.gradient_accumulation_steps / min(training_args.per_device_train_batch_size, len(ds['train'])))
# optimizer = Ranger21(model.parameters(), num_iterations=max_steps, lr=1e-4)
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-8, foreach=False)  # https://github.com/facebookresearch/fairseq/blob/main/examples/wav2vec/config/finetuning/base_960h.yaml
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=max_steps)
# scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=tri_stage_schedule)  # following FAIR finetuning settings
# scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: x)  # constant LR, stays same throughout, for Ranger21
# torch._dynamo.config.suppress_errors = True
# torch._dynamo.config.verbose=True
# model = torch.compile(model,mode='default')
# model = torch.compile(model. mode = 'default)
# trainer = CustomWhisperTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=ds['train'],
#     eval_dataset=ds['test'],
#     tokenizer=processor.feature_extractor,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics
#     # optimizers=(optimizer, scheduler),
# )
# if os.name != 'nt':  # windows does not support torch.compile yet
#     trainer.model_wrapped, trainer.optimizer, trainer.lr_scheduler = accelerator.prepare(trainer.model_wrapped, trainer.optimizer, trainer.lr_scheduler)

# trainer.train()
# if os.name != 'nt':
#     accelerator.wait_for_everyone()

  return node.target(*args, **kwargs)
  return node.target(*args, **kwargs)
  return target(*args, **kwargs)
  return target(*args, **kwargs)
  return node.target(*args, **kwargs)
  return target(*args, **kwargs)
  return target(*args, **kwargs)
W0514 19:33:09.419000 140349341388800 torch/_dynamo/convert_frame.py:357] torch._dynamo hit config.cache_size_limit (8)
W0514 19:33:09.419000 140349341388800 torch/_dynamo/convert_frame.py:357]    function: '_prepare_4d_causal_attention_mask_for_sdpa' (/usr/local/lib/python3.10/dist-packages/transformers/modeling_attn_mask_utils.py:348)
W0514 19:33:09.419000 140349341388800 torch/_dynamo/convert_frame.py:357]    last reason: L['input_shape'][1] == 30                                   
W0514 19:33:09.419000 140349341388800 torch/_dynamo/convert_frame.py:357] To log all recompilation reasons, use TORCH_LOGS="recompiles".
W0514 19:33:09.419000 140349341388800 torch/_dynamo/convert_frame.py:357] To diagnose recompilation issues, see https://pytorch

AssertionError: Attempted unscale_ but _scale is None.  This may indicate your script did not use scaler.scale(loss or outputs) earlier in the iteration.

In [None]:

generate_kwargs = {"language":"<|tr|>","task": "transcribe"}



In [14]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    tokenizer=processor.feature_extractor,
    data_collator=data_collator,
    compute_metrics=compute_metrics
    # optimizers=(optimizer, scheduler),
)

trainer.train()

  return node.target(*args, **kwargs)
  return node.target(*args, **kwargs)
  return target(*args, **kwargs)
  return target(*args, **kwargs)
  return node.target(*args, **kwargs)
  return target(*args, **kwargs)
  return target(*args, **kwargs)
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Epoch,Training Loss,Validation Loss


W0525 17:06:59.479000 140602411212800 torch/_dynamo/convert_frame.py:357] torch._dynamo hit config.cache_size_limit (8)
W0525 17:06:59.479000 140602411212800 torch/_dynamo/convert_frame.py:357]    function: 'forward' (/usr/local/lib/python3.10/dist-packages/transformers/models/whisper/modeling_whisper.py:1273)
W0525 17:06:59.479000 140602411212800 torch/_dynamo/convert_frame.py:357]    last reason: tensor 'L['input_ids']' stride mismatch at index 0. expected 47, actual 48
W0525 17:06:59.479000 140602411212800 torch/_dynamo/convert_frame.py:357] To log all recompilation reasons, use TORCH_LOGS="recompiles".
W0525 17:06:59.479000 140602411212800 torch/_dynamo/convert_frame.py:357] To diagnose recompilation issues, see https://pytorch.org/docs/master/compile/troubleshooting.html.
W0525 17:07:55.629000 140602411212800 torch/_dynamo/convert_frame.py:357] torch._dynamo hit config.cache_size_limit (8)
W0525 17:07:55.629000 140602411212800 torch/_dynamo/convert_frame.py:357]    function: '_pre

ValueError: Multiple languages detected when trying to predict the most likely target language for transcription. It is currently not supported to transcribe to different languages in a single batch. Please make sure to either force a single language by passing `language='...'` or make sure all input audio is of the same language.

# Inference Code


In [15]:
import os
from tqdm.auto import tqdm
os.environ['HF_HOME'] = 'huggingface'
os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = 'True'
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
weights = "Checkpoints/medium-checkpoints/checkpoint-best"

# print(np.array(waveform, dtype= float))
pipe = pipeline(task="automatic-speech-recognition", model=weights, device=0)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
def clean(annotation):
    if "'" in annotation:
        # print(annotation, f'has \' in {annotation}, removing')
        annotation = annotation.split("'")[0] + annotation.split("'")[1][1:]  # Tokenizer includes "'" but TIL dataset does not, remove the S following '
    return annotation

In [22]:
results = []

for out in tqdm(pipe(KeyDataset(test_set, "audio"), batch_size=5), total=test_set.num_rows):
    results.append(clean(out['text']))

100%|██████████| 350/350 [10:26<00:00,  1.79s/it]


In [41]:
import librosa

import pyloudnorm as pyln
filenames = ["sanity.wav"]*4 

frequency = 16000
wf = []
wfn =[]
for f in filenames:
    waveform, sr = librosa.load(f, sr = frequency)
    
    pna = pyln.normalize.peak(waveform, 1.0)
    meter = pyln.Meter(frequency) # create BS.1770 meter
    loudness = meter.integrated_loudness(pna)
    pna = pyln.normalize.loudness(pna, loudness, 0.0)
    wf.append(pna)
    wfn.append(waveform)



In [23]:
results

['Control tower to all turrets, this is Alpha-1. Deploy interceptor jets to heading three three five. Target is green commercial aircraft. Engage and intercept immediately.',
 'Turret Bravo, target the purple and grey commercial aircraft heading three two zero, deploy anti-air artillery.',
 'Control to turrets, prepare to engage target at heading two four zero. Deploy anti-air artillery against the brown, grey, and white fighter plane. Take aim and fire at will. Over.',
 'Turret Charlie, focus EMP on white light aircraft at heading three three zero. Engage.',
 'Engage, red and blue cargo aircraft at heading one one zero with surface-to-air missiles.',
 'Control tower to air defense turrets, target is silver commercial aircraft at heading one six zero. Deploy anti-air artillery. Target must be intercepted immediately. That is an order.',
 'Turret Bravo, engage surface-to-air missiles, heading three zero five, target yellow helicopter. Turret Echo, standby for further orders.',
 'Control

In [24]:
test_set[0]

{'audio': {'path': None,
  'array': array([ 0.00000000e+00, -3.05175781e-05, -1.22070312e-04, ...,
          3.84521484e-03,  1.12915039e-03, -1.80053711e-03]),
  'sampling_rate': 16000},
 'transcription': 'Control Tower to all turrets, this is Alpha One. Deploy interceptor jets to heading three three five. Target is green commercial aircraft. Engage and intercept immediately.',
 'input_features': [[-0.8216806650161743,
   -0.8001151084899902,
   -0.8236849308013916,
   -0.7944220304489136,
   -0.8236849308013916,
   -0.8052636384963989,
   -0.7973525524139404,
   -0.815966010093689,
   -0.80418860912323,
   -0.8159199953079224,
   -0.8132989406585693,
   -0.8158341646194458,
   -0.7916271686553955,
   -0.8018431663513184,
   -0.7988895177841187,
   -0.7943437099456787,
   -0.8132468461990356,
   -0.8052159547805786,
   -0.798561692237854,
   -0.787926435470581,
   -0.8116097450256348,
   -0.7674583196640015,
   -0.8170151710510254,
   -0.8036720752716064,
   -0.8127151727676392,
   -0

In [43]:
pipe(wf)

[{'text': 'Turrets, prepare to deploy electromagnetic pulse. Heading zero six five, target is grey and white fighter jet. Engage when ready.'},
 {'text': 'Turrets, prepare to deploy electromagnetic pulse. Heading zero six five, target is grey and white fighter jet. Engage when ready.'},
 {'text': 'Turrets, prepare to deploy electromagnetic pulse. Heading zero six five, target is grey and white fighter jet. Engage when ready.'},
 {'text': 'Turrets, prepare to deploy electromagnetic pulse. Heading zero six five, target is grey and white fighter jet. Engage when ready.'}]

In [18]:
from transformers import WhisperProcessor
import numpy as np

# weights = "whisper-checkpoints/1_checkpoint-2000-wer-0.034826/"
weights = "whisper-niner-combined-checkpoints/checkpoint-3905-0.0304512wer/"

processor = WhisperProcessor.from_pretrained('openai/whisper-small.en',resume_download = None)
processor.tokenizer.save_pretrained(weights)
processor.feature_extractor.save_pretrained(weights)

['whisper-niner-combined-checkpoints/checkpoint-3905-0.0304512wer/preprocessor_config.json']

In [19]:
import os
os.environ['HF_HOME'] = 'huggingface'
os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = 'True'
# import torch
# import datasets
# from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset

# import pandas as pd
# from torch.utils.data import DataLoader
# torch.multiprocessing.set_start_method('spawn')

def clean(annotation):
    if "'" in annotation:
        # print(annotation, f'has \' in {annotation}, removing')
        annotation = annotation.split("'")[0] + annotation.split("'")[1][1:]  # Tokenizer includes "'" but TIL dataset does not, remove the S following '
    return annotation

transcriber = pipeline("automatic-speech-recognition", model=weights, device=0)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Commence the predictions on ds['test']

In [16]:
test_set

Dataset({
    features: ['audio', 'transcription', 'input_features', 'input_length', 'labels', 'length'],
    num_rows: 350
})

In [5]:
# dataset = ds['test'].map(prepare_dataset, num_proc=8, batched=True, batch_size=128)

Map (num_proc=8): 100%|██████████| 700/700 [00:29<00:00, 23.41 examples/s]


In [9]:
dataset.num_rows

700

In [19]:


# test_ds = pd.read_csv('Test_Advanced.csv')
results = []

for out in tqdm(transcriber(KeyDataset(test_set, "audio"), batch_size=50), total=test_set.num_rows):
    results.append(clean(out['text']))
# test_ds['annotation'] = results
# test_ds['path'] = test_ds['path'].apply(lambda x: x.split('/')[-1])
# test_ds.to_csv('whisper-submissions/'+weights+'.csv', index=False)

NameError: name 'transcriber' is not defined

In [23]:
results

['Control tower to turrets, target is a purple and grey drone, heading zero niner zero. Deploy EMP. Take it out.',
 'Engage target, red drone, at heading three three zero with electromagnetic pulse.',
 'Engage target, purple, red, and brown missile, heading one niner five, deploy machine gun.',
 'Engage the blue, orange, and grey commercial aircraft heading one six zero. Deploy anti-air artillery.',
 'Control here. Deploy anti-air artillery to heading zero three five. Target is a white, red, and blue light aircraft. Engage and destroy. Over.',
 ' Deploy anti-air artillery, heading three six zero, target yellow, orange, and red commercial aircraft.',
 'Turret Alpha, deploy anti-air artillery at heading zero seven five to intercept the brown fighter plane. Standby for authorization.',
 'Air defense turret, heading two niner zero, deploy EMP on orange fighter plane.',
 'Air defense turret, target the orange fighter plane heading two three zero, deploy EMP.',
 'Turret Bravo, deploy drone c

In [12]:
dataset[0]

{'audio': {'path': None,
  'array': array([ 0.00000000e+00, -6.10351562e-05, -2.44140625e-04, ...,
         -1.77001953e-03, -3.81469727e-03, -3.81469727e-03]),
  'sampling_rate': 16000},
 'transcription': 'Control to turrets, prepare to deploy electromagnetic pulse on target purple light aircraft heading zero six five. Repeat, prepare to deploy electromagnetic pulse on target purple light aircraft heading zero six five. Over.',
 'input_features': [[-0.8024837970733643,
   -0.8197126388549805,
   -0.8162621259689331,
   -0.7974296808242798,
   -0.8007311820983887,
   -0.8113070726394653,
   -0.819406270980835,
   -0.8119014501571655,
   -0.8375890254974365,
   -0.8315097093582153,
   -0.7958790063858032,
   -0.8286324739456177,
   -0.7965573072433472,
   -0.8001447916030884,
   -0.8129847049713135,
   -0.7953317165374756,
   -0.8415418863296509,
   -0.8060276508331299,
   -0.8064930438995361,
   -0.8098506927490234,
   -0.8237791061401367,
   -0.8101183176040649,
   -0.8026570081710815

# Run whisper on 1 audio

In [27]:
import librosa
filename = "RecordingMe.wav" 
model = WhisperForConditionalGeneration.from_pretrained(weights)

frequency = 16000
waveform, sr = librosa.load(filename, sr = frequency)
# print(np.array(waveform, dtype= float))
input_features = processor(
    waveform,
    sampling_rate = frequency,
    return_tensors = "pt"
).input_features





In [17]:
transcriber = pipeline("automatic-speech-recognition", model=weights, device=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [28]:
predicted_ids = model.generate(input_features)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

In [29]:
transcription

[' Turrets, prepare to deploy electromagnetic pulse, heading zero seven nine. Target is yellow fighter jet. Engage.']

In [31]:
import transformers
print(transformers.__version__)

4.40.2


## Fast Whisper

In [1]:
import torch
print('CUDA:',torch.version.cuda)

cudnn = torch.backends.cudnn.version()
cudnn_major = cudnn // 1000
cudnn = cudnn % 1000
cudnn_minor = cudnn // 100
cudnn_patch = cudnn % 100
print( 'cuDNN:', '.'.join([str(cudnn_major),str(cudnn_minor),str(cudnn_patch)]) )

CUDA: 12.1
cuDNN: 8.8.1


## Converter

## Model inference

In [5]:
print(model)

<faster_whisper.transcribe.WhisperModel object at 0x0000025174B20B80>


In [6]:

# print(np.array(waveform, dtype= float))
input_features = processor(
    waveform,
    sampling_rate = frequency,
    return_tensors = "pt"
).input_features

In [11]:
waveform

array([ 0.00156825, -0.00155713,  0.00423736, ..., -0.00077666,
       -0.00231077, -0.0004648 ], dtype=float32)

In [12]:
input_features

tensor([[[-0.0658, -0.2872, -0.1421,  ..., -1.0103, -1.0103, -1.0103],
         [-0.0221, -0.2499, -0.0393,  ..., -1.0103, -1.0103, -1.0103],
         [ 0.0173, -0.1727,  0.0011,  ..., -1.0103, -1.0103, -1.0103],
         ...,
         [-0.3948, -0.4706, -0.4809,  ..., -1.0103, -1.0103, -1.0103],
         [-0.4814, -0.5673, -0.5935,  ..., -1.0103, -1.0103, -1.0103],
         [-0.3784, -0.6411, -0.7374,  ..., -1.0103, -1.0103, -1.0103]]])

In [15]:
segments, info = model.transcribe(waveform, beam_size=5)
# transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
print(segments)

<generator object WhisperModel.generate_segments at 0x000002518480AE30>


In [10]:
for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))

[0.00s -> 7.00s]  Turrets, prepare to deploy electromagnetic pulse, heading zero seven nine. Target is yellow fighter jet. Engage.
