In [1]:
!pip install transformers datasets accelerate




In [2]:
from datasets import load_from_disk

# Define the dataset path
dataset_path = "/kaggle/input/preprocessed-dataset/preprocessed_dataset"

# Load the dataset
dataset = load_from_disk(dataset_path)

# Inspect the dataset
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['wav', 'input_features', 'labels'],
        num_rows: 6827
    })
    validation: Dataset({
        features: ['wav', 'input_features', 'labels'],
        num_rows: 2252
    })
    test: Dataset({
        features: ['wav', 'input_features', 'labels'],
        num_rows: 2267
    })
})


In [3]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Load Whisper processor and model
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-darija",
    per_device_train_batch_size=4,  # Adjust batch size if needed
    gradient_accumulation_steps=4,  # Effective batch size = 4 * 4 = 16
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_steps=100,
    num_train_epochs=3,
    learning_rate=1e-5,
    warmup_steps=500,
    save_total_limit=2,
    fp16=True,  # Mixed precision for GPU
    predict_with_generate=True,
    report_to="none",
)


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]



In [6]:
def data_collator(features):
    input_features = [feature["input_features"] for feature in features]
    labels = [feature["labels"] for feature in features]

    # Pad input features and labels
    batch = processor.feature_extractor.pad({"input_features": input_features}, return_tensors="pt")
    labels_batch = processor.tokenizer.pad({"input_ids": labels}, return_tensors="pt", padding=True)

    # Replace padding token id with -100 to ignore during loss computation
    labels_batch["input_ids"][labels_batch["input_ids"] == processor.tokenizer.pad_token_id] = -100

    # Combine into a single batch
    batch["labels"] = labels_batch["input_ids"]
    return batch


In [7]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=processor.tokenizer,
    data_collator=data_collator,
)
trainer.train()


  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
500,1.5551,1.464974
1000,0.3135,0.449092




TrainOutput(global_step=1278, training_loss=1.6680029449701683, metrics={'train_runtime': 8099.0809, 'train_samples_per_second': 2.529, 'train_steps_per_second': 0.158, 'total_flos': 5.89810841174016e+18, 'train_loss': 1.6680029449701683, 'epoch': 2.9935559461042764})

In [9]:
# Save the fine-tuned model
model.save_pretrained("/kaggle/working/whisper-darija")
processor.save_pretrained("/kaggle/working/whisper-darija")

print("Model saved to /kaggle/working/whisper-darija")


Model saved to /kaggle/working/whisper-darija


In [10]:
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [13]:
!pip install jiwer


Collecting jiwer
  Downloading jiwer-3.0.5-py3-none-any.whl.metadata (2.7 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading jiwer-3.0.5-py3-none-any.whl (21 kB)
Downloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.5 rapidfuzz-3.11.0


In [15]:
# Add the language to the processor during generation
with torch.no_grad():
    predicted_ids = model.generate(
        inputs.input_features,
        forced_decoder_ids=processor.get_decoder_prompt_ids(language="ar", task="transcribe")
    )



NameError: name 'torch' is not defined

In [None]:
import torch
import evaluate

# Load the WER metric
wer_metric = evaluate.load("wer")

# Generate predictions with specified language and attention mask
predictions = trainer.predict(dataset["test"])

# Decode predictions and references
decoded_preds = processor.batch_decode(predictions.predictions, skip_special_tokens=True)
decoded_refs = [processor.tokenizer.decode(item["labels"], skip_special_tokens=True) for item in dataset["test"]]

# Compute Word Error Rate
wer = wer_metric.compute(predictions=decoded_preds, references=decoded_refs)
print(f"Word Error Rate (WER): {wer}")


In [16]:
pip install sounddevice scipy transformers


Collecting sounddevice
  Downloading sounddevice-0.5.1-py3-none-any.whl.metadata (1.4 kB)
Downloading sounddevice-0.5.1-py3-none-any.whl (32 kB)
Installing collected packages: sounddevice
Successfully installed sounddevice-0.5.1
Note: you may need to restart the kernel to use updated packages.


In [20]:
!pip install pipwin
!pipwin install pyaudio


Collecting pipwin
  Downloading pipwin-0.5.2.tar.gz (7.9 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting docopt (from pipwin)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyprind (from pipwin)
  Downloading PyPrind-2.11.3-py2.py3-none-any.whl.metadata (1.1 kB)
Collecting js2py (from pipwin)
  Downloading Js2Py-0.74-py3-none-any.whl.metadata (868 bytes)
Collecting pySmartDL>=1.3.1 (from pipwin)
  Downloading pySmartDL-1.3.4-py3-none-any.whl.metadata (2.8 kB)
Collecting pyjsparser>=2.5.1 (from js2py->pipwin)
  Downloading pyjsparser-2.7.1.tar.gz (24 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading pySmartDL-1.3.4-py3-none-any.whl (20 kB)
Downloading Js2Py-0.74-py3-none-any.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading PyPrind-2.11.3-py2.py3-none-any.whl (8.4 kB)
Building wheels f

In [25]:
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch

# Load the fine-tuned model
model_path = "./whisper-darija"
processor = WhisperProcessor.from_pretrained(model_path)
model = WhisperForConditionalGeneration.from_pretrained(model_path)

# Path to the audio file
audio_path = "/kaggle/input/audioali/mic_audio2.wav"

# Load the audio file
waveform, sample_rate = torchaudio.load(audio_path)

# Resample the audio to 16 kHz (required by Whisper)
if sample_rate != 16000:
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
    waveform = resampler(waveform)

# Process the audio input
inputs = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt")

# Transcribe the audio
with torch.no_grad():
    predicted_ids = model.generate(
        inputs.input_features,
        forced_decoder_ids=processor.get_decoder_prompt_ids(language="ar", task="transcribe")
    )

# Decode the transcription
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
print(f"Transcription: {transcription}")


Transcription: معليكم بخير لبس عليكم شكرا عودوه تمم زينين الحياه كدهيه


In [27]:
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch

# Load the fine-tuned model
model_path = "./whisper-darija"
processor = WhisperProcessor.from_pretrained(model_path)
model = WhisperForConditionalGeneration.from_pretrained(model_path)

# Path to the audio file
audio_path = "/kaggle/input/audioali/mic_audio4.wav"

# Load the audio file
waveform, sample_rate = torchaudio.load(audio_path)

# Resample the audio to 16 kHz (required by Whisper)
if sample_rate != 16000:
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
    waveform = resampler(waveform)

# Process the audio input
inputs = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt")

# Transcribe the audio
with torch.no_grad():
    predicted_ids = model.generate(
        inputs.input_features,
        forced_decoder_ids=processor.get_decoder_prompt_ids(language="ar", task="transcribe")
    )

# Decode the transcription
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
print(f"Transcription: {transcription}")


Transcription: انا علي اصواب انا وصاحبي احمد ابن براهي ومحمد عيسان ترنهت بروج بس يبينو بلي يمكن نفاين شونيه ويسبر علي الدريجه بشوي يقدر يفهم اللغه ديناه زوينه وعليش لانك نشو بابيده
