In [1]:
# Show the GPU, driver and CUDA version visible to this runtime before doing any work.
!nvidia-smi

Thu Mar  7 05:41:10 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    Off  | 00000000:00:05.0 Off |                  Off |
| 30%   32C    P8    27W / 300W |      1MiB / 49140MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install librosa evaluate datasets jiwer gcsfs accelerate transformers==4.37.2

Collecting librosa
  Downloading librosa-0.10.1-py3-none-any.whl (253 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.7/253.7 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
Collecting jiwer
  Downloading jiwer-3.0.3-py3-none-any.whl (21 kB)
Collecting gcsfs
  Downloading gcsfs-2024.2.0-py2.py3-none-any.whl (33 kB)
Collecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.37.2
  Downloading transformers-4.37.2-py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m86.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Collecting huggingface-h

In [3]:
import os
from dataclasses import dataclass
from typing import Any, Dict, List, Union

import evaluate
import librosa
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    WhisperFeatureExtractor,
    WhisperForConditionalGeneration,
    WhisperProcessor,
    WhisperTokenizer,
)
from transformers.trainer_utils import get_last_checkpoint

In [4]:
# Load the WER metric and every pretrained Whisper component.
# The checkpoint name and the language/task pair are spelled once and reused,
# so the tokenizer, processor and model are always requested consistently.
BASE_CHECKPOINT = "openai/whisper-large"
DECODE_OPTIONS = {"language": "Hindi", "task": "transcribe"}

print("Initializing...")
metric = evaluate.load("wer")

feature_extractor = WhisperFeatureExtractor.from_pretrained(BASE_CHECKPOINT)
tokenizer = WhisperTokenizer.from_pretrained(BASE_CHECKPOINT, **DECODE_OPTIONS)
processor = WhisperProcessor.from_pretrained(BASE_CHECKPOINT, **DECODE_OPTIONS)
model = WhisperForConditionalGeneration.from_pretrained(BASE_CHECKPOINT)

Initializing...


Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.85k [00:00<?, ?B/s]

In [5]:
def compute_metrics(pred):
    """Compute the word error rate (as a percentage) for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, where ``-100`` marks positions
            that were masked out of the loss).

    Returns:
        dict: ``{"wer": value}`` where ``value`` is 100 times the result of the
        module-level ``metric`` on the decoded predictions and references.
    """
    pred_ids = pred.predictions

    # Replace the -100 loss mask with the pad token so the ids can be decoded.
    # np.where builds a new array, so the caller's ``pred.label_ids`` is left
    # untouched (the previous in-place assignment overwrote it).
    label_ids = np.where(pred.label_ids == -100, tokenizer.pad_token_id, pred.label_ids)

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [6]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Pad audio features and label token ids into one training batch.

    Attributes:
        processor: Object exposing ``feature_extractor.pad`` and
            ``tokenizer.pad`` (both called with ``return_tensors="pt"``).
        decoder_start_token_id: Optional id of the token to strip from the
            front of the labels when every row starts with it. When left as
            ``None`` the tokenizer's ``bos_token_id`` is used, which is the
            previous behaviour.
            NOTE(review): for Whisper tokenizers ``bos_token_id`` is reportedly
            ``<|endoftext|>`` while tokenised labels start with
            ``<|startoftranscript|>``; if so the default check never fires and
            ``model.config.decoder_start_token_id`` should be passed here —
            confirm against the tokenizer in use.
    """

    processor: Any
    decoder_start_token_id: Union[int, None] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` (rows with ``input_features`` and ``labels``).

        Returns:
            The padded feature batch with an added ``"labels"`` tensor in which
            padded positions are set to ``-100``.
        """
        # Audio inputs and label ids are padded separately by their own component.
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Padding positions (attention_mask != 1) become -100.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        start_token_id = self.decoder_start_token_id
        if start_token_id is None:
            start_token_id = self.processor.tokenizer.bos_token_id

        # Drop the leading start token only when every row carries it; the
        # width check avoids indexing column 0 of zero-width labels.
        if labels.size(1) > 0 and bool((labels[:, 0] == start_token_id).all()):
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [7]:
def prepare_dataset(batch):
    """Attach model inputs and target token ids to one dataset row.

    Loads the audio file named by ``batch["path"]`` as 16 kHz mono, stores the
    first entry of the feature extractor's ``input_features`` under
    ``batch["input_features"]`` and the token ids of ``batch["sentence"]``
    under ``batch["labels"]``. The row is updated in place and returned.
    """
    waveform, rate = librosa.load(batch["path"], sr=16000, mono=True)

    extracted = feature_extractor(waveform, sampling_rate=rate)
    batch["input_features"] = extracted.input_features[0]

    encoded = tokenizer(batch["sentence"])
    batch["labels"] = encoded.input_ids

    return batch

In [8]:
# Read the train/test manifests from CSV files given by relative path.
csv_splits = {'train': ['train.csv'], 'test': 'test.csv'}
ds = load_dataset('csv', data_files=csv_splits)

Using custom data configuration default-d2b0043f2f7bafe2


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-d2b0043f2f7bafe2/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-d2b0043f2f7bafe2/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


  0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Clear the forced decoder ids and the suppressed-token list on the model config.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

# Show the raw splits, then run prepare_dataset over every row of each split.
print(ds)
ds = ds.map(prepare_dataset, num_proc=None)

# Collator that pads features and labels with the shared processor.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

DatasetDict({
    train: Dataset({
        features: ['path', 'sentence'],
        num_rows: 315
    })
    test: Dataset({
        features: ['path', 'sentence'],
        num_rows: 147
    })
})


  0%|          | 0/315 [00:00<?, ?ex/s]

  0%|          | 0/147 [00:00<?, ?ex/s]

In [10]:
from transformers import Seq2SeqTrainingArguments

In [11]:
training_args = Seq2SeqTrainingArguments(
    # Output location and reporting; change output_dir to a repo name of your choice.
    output_dir="ckpt/whisper-large-hi-snt",
    push_to_hub=False,
    report_to=["tensorboard"],
    # Optimisation schedule.
    max_steps=2000,
    warmup_steps=38,
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # double this each time the batch size is halved
    # Memory / precision settings.
    fp16=True,
    gradient_checkpointing=True,
    # Step-based evaluation using generation.
    evaluation_strategy="steps",
    eval_steps=20,
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    generation_max_length=255,
    # Checkpointing, logging and best-model selection (lower WER is better).
    save_steps=20,
    logging_steps=1,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
)


In [12]:
# Wire the model, data splits, collator and metric function into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    # The feature extractor (not the text tokenizer) is passed in the `tokenizer` slot.
    tokenizer=processor.feature_extractor,
)

In [13]:
# Write the processor's files into the training output directory.
checkpoint_root = training_args.output_dir
processor.save_pretrained(checkpoint_root)

[]

In [14]:
# Rebind `model` to the weights stored in a specific saved checkpoint.
# NOTE(review): `trainer` was already constructed with the previous `model`
# object, so this rebinding does not change the model the trainer holds —
# confirm whether this load is still needed.
checkpoint_dir = "ckpt/whisper-large-hi-snt/checkpoint-1580"
model = WhisperForConditionalGeneration.from_pretrained(checkpoint_dir)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
# Resume from the newest checkpoint in the output directory when one exists,
# otherwise start a fresh run. Passing `resume_from_checkpoint=True`
# unconditionally fails when the directory holds no checkpoint yet, which
# forced a manual switch between two variants of this call.
output_dir = training_args.output_dir
last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
trainer.train(resume_from_checkpoint=last_checkpoint)

There were missing keys in the checkpoint model loaded: ['proj_out.weight'].
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss,Wer
1600,0.0,0.10836,30.22113
1620,0.0,0.108439,30.22113
1640,0.0,0.108472,30.09828
1660,0.0,0.108516,30.09828
1680,0.0,0.108502,30.09828
1700,0.0,0.108589,30.22113
1720,0.0,0.108586,30.22113
1740,0.0,0.108626,30.22113
1760,0.0,0.108637,30.22113
1780,0.0,0.108689,30.22113


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress

TrainOutput(global_step=2000, training_loss=1.0543145151586942e-06, metrics={'train_runtime': 6656.0625, 'train_samples_per_second': 4.808, 'train_steps_per_second': 0.3, 'total_flos': 6.68798871552e+19, 'train_loss': 1.0543145151586942e-06, 'epoch': 100.0})

In [16]:
# !rm -rf ckpt/whisper-small-bn-snt ckpt/whisper-medium-bn-snt

In [None]:
import time
# NOTE(review): presumably a keep-alive so the hosted session is not reclaimed
# while idle — confirm the intent.
# This loop never terminates on its own: it prints one line, sleeps 60 seconds
# and repeats, so any cell placed after it will not run until it is interrupted.
while True:
    print("This is an infinite loop")
    time.sleep(60)

This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
This is an infinite loop
