In [2]:
!pip install --upgrade --quiet pip
!pip install --upgrade --quiet datasets[audio] transformers accelerate evaluate jiwer tensorboard gradio huggingface_hub pydub

from datasets import load_dataset, DatasetDict, Audio
from transformers import (
    WhisperFeatureExtractor, 
    WhisperTokenizer, 
    WhisperProcessor, 
    WhisperForConditionalGeneration, 
    Seq2SeqTrainingArguments, 
    Seq2SeqTrainer
)
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import torch
import evaluate
from huggingface_hub import notebook_login



[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.16.1 requires tensorboard<2.17,>=2.16, but you have tensorboard 2.18.0 which is incompatible.[0m[31m
[0m

In [3]:
notebook_login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

In [4]:
print("Loading dataset...")
dataset = load_dataset("ahmedafareed/arabicnewscorpus", split="train")

print("Splitting dataset into train/test...")
train_test_split = dataset.train_test_split(test_size=0.2)

common_voice = DatasetDict({
    "train": train_test_split["train"],
    "test": train_test_split["test"]
})

print("Dataset loaded and split successfully!")



Loading dataset...


README.md:   0%|          | 0.00/364 [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/414M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/719M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/2.38G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9414 [00:00<?, ? examples/s]

Splitting dataset into train/test...
Dataset loaded and split successfully!


In [5]:
print("Loading processor components...")
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Arabic", task="transcribe")
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Arabic", task="transcribe")

# Cast audio column to the correct sampling rate
print("Casting audio column...")
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))



Loading processor components...


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Casting audio column...


In [6]:
print("Preparing dataset...")
def prepare_dataset(batch):
    audio = batch["audio"]
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch

# Use parallel processing for efficient dataset mapping
common_voice = common_voice.map(
    prepare_dataset, 
    remove_columns=common_voice.column_names["train"], 
    num_proc=2  # Adjust based on CPU cores for optimal performance
)



Preparing dataset...


  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/7531 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/1883 [00:00<?, ? examples/s]

In [7]:
# Load model and move to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
model.generation_config.language = "arabic"
model.generation_config.task = "transcribe"
model.generation_config.forced_decoder_ids = None



Using device: cuda


config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

In [8]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
        
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
        
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]
        
        batch["labels"] = labels
        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor, decoder_start_token_id=tokenizer.bos_token_id)

# Define evaluation metric
metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = tokenizer.pad_token_id
    
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-arabiccorpus",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=1000,
    gradient_checkpointing=True,
    fp16=torch.cuda.is_available(),
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)



Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]



In [9]:
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

print("Training model...")
processor.save_pretrained(training_args.output_dir)
trainer.train()



  trainer = Seq2SeqTrainer(
max_steps is given, it will override any value given in num_train_epochs


Training model...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss,Wer
1000,0.0785,0.202404,43.384645


You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, 50259], [2, 50359], [3, 50363]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use

TrainOutput(global_step=1000, training_loss=0.44767873990535734, metrics={'train_runtime': 15018.8461, 'train_samples_per_second': 2.131, 'train_steps_per_second': 0.067, 'total_flos': 9.2289611022336e+18, 'train_loss': 0.44767873990535734, 'epoch': 4.246284501061571})

In [1]:
kwargs = {
    "dataset_tags": "ahmedafareed/arabicnewscorpus",
    "dataset": "Arabic News Corpus",
    "language": "ar",
    "model_name": "Whisper Small Arabic",
    "finetuned_from": "openai/whisper-small",
    "tasks": "automatic-speech-recognition",
}
print("Pushing model to the Hub...")
trainer.push_to_hub(**kwargs)

Pushing model to the Hub...


NameError: name 'trainer' is not defined