In [10]:
import os
import re
import librosa
import torch
import pandas as pd
from torch.nn.utils.rnn import pad_sequence
from datasets import Dataset
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import evaluate
import Levenshtein
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Paths to the data.
# test_path / train_path are the folders scanned for .wav files by create_dataset();
# txt_path is the numbered proverbs file parsed by load_target_text().
# NOTE(review): these are absolute, machine-specific placeholders; adjust before running.
test_path = r"C:\PATH\test"
train_path = r"C:\PATH\train"
txt_path = r"C:\PATH\Paroimies.txt"

# Load the text data
#Reads the file at txt_path line by line.
#Extracts proverbs using a regular expression (re.match), (e.g., 1. First proverb).
def load_target_text(txt_path):
    """Read the numbered proverbs file and return the proverb texts in file order.

    Every non-empty line is expected to look like ``<number>. <proverb>``. The
    numeric prefix is dropped and the remaining text is stripped. Lines that
    do not match are reported on stdout and skipped.

    Args:
        txt_path: Path to the UTF-8 text file (with or without a BOM).

    Returns:
        list[str]: One entry per matching line, in the order the lines were read.
    """
    numbered_line = re.compile(r"^\d+\.\s*(.*)")
    target_texts = []
    # "utf-8-sig" decodes plain UTF-8 identically but also drops a leading BOM
    # (written by some Windows editors). With plain "utf-8" the BOM stays glued
    # to the first line, the "^\d+" pattern fails, the first proverb is skipped
    # and every later proverb moves one position down the list.
    with open(txt_path, 'r', encoding='utf-8-sig') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            match = numbered_line.match(line)
            if match:
                target_texts.append(match.group(1).strip())
            else:
                print(f"Skipping line due to formatting issues: {line}")
    return target_texts

target_texts = load_target_text(txt_path)  # list of proverb strings in file order, numeric prefixes removed

# Preprocess audio and text data
#Resamples the audio to 16 kHz (sr=16000), which is required by Whisper for consistent feature extraction.
def preprocess_audio(file_path, target_text):
    """Load one recording at 16 kHz and pair it with its transcript.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.
        target_text: Transcript to store next to the audio.

    Returns:
        dict: ``{"audio": <loaded samples>, "text": target_text}``.
    """
    # The returned sample rate is not needed: it is fixed by sr=16000.
    waveform, _ = librosa.load(file_path, sr=16000)
    return dict(audio=waveform, text=target_text)

def create_dataset(data_path, target_texts):
    """Build a list of ``{"audio", "text"}`` examples from the .wav files in a folder.

    The leading digits of each filename are read as a 1-based proverb number and
    used to look the transcript up in ``target_texts`` (``1.wav`` -> ``target_texts[0]``).

    Args:
        data_path: Directory containing the recordings.
        target_texts: Proverbs as returned by ``load_target_text``.

    Returns:
        list[dict]: One ``preprocess_audio`` result per usable file, ordered by filename.
        Files without a numeric prefix, or whose number has no matching proverb,
        are left out.
    """
    data = []
    # os.listdir() returns names in arbitrary order; sorting keeps the dataset
    # order reproducible between runs and machines.
    for filename in sorted(os.listdir(data_path)):
        # Compare case-insensitively so "*.WAV" recordings are not ignored.
        if not filename.lower().endswith(".wav"):
            continue
        match = re.match(r"(\d+)", filename)
        if not match:
            continue
        file_id = int(match.group(1))
        if not 0 < file_id <= len(target_texts):
            # Previously such files were kept with an empty transcript, which
            # adds examples with an empty label to training and evaluation.
            print(f"Skipping {filename}: no proverb with number {file_id}")
            continue
        file_path = os.path.join(data_path, filename)
        data.append(preprocess_audio(file_path, target_texts[file_id - 1]))
    return data

# Create training and testing datasets.
# Each is a list of {"audio": ..., "text": ...} dicts (see preprocess_audio).
train_data = create_dataset(train_path, target_texts)
test_data = create_dataset(test_path, target_texts)

# Convert to Hugging Face Dataset format (columns "audio" and "text")
train_dataset = Dataset.from_list(train_data)
test_dataset = Dataset.from_list(test_data)

- **Model weights:** contain the pre-trained parameters of Whisper-small.
- **Processor configuration:** includes details about the expected input features (e.g., spectrogram size, sampling rate) and the tokenizer.

In [11]:
# Load Whisper model and processor
# Both come from the same "openai/whisper-small" checkpoint name: the processor is used
# below for feature extraction, tokenization and decoding; the model is the network to fine-tune.
processor = WhisperProcessor.from_pretrained("openai/whisper-small") 
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")


Converts a single data point (audio + text) into the format Whisper expects for training or inference: input features computed from the audio, and token IDs computed from the text.

In [3]:
# Preprocess dataset for Whisper
def preprocess_batch(batch):
    """Add Whisper ``input_features`` and ``labels`` to one example.

    Args:
        batch: Mapping with an ``"audio"`` entry (samples at 16 kHz) and a
            ``"text"`` entry (the transcript).

    Returns:
        The same mapping, with ``"input_features"`` and ``"labels"`` added. Both
        have their leading dimension removed with ``squeeze(0)``.
    """
    # Audio -> model input features via the processor's feature extractor.
    audio_encoding = processor(batch["audio"], sampling_rate=16000, return_tensors="pt")
    # Transcript -> token IDs via the processor's tokenizer.
    text_encoding = processor.tokenizer(batch["text"], return_tensors="pt", padding="longest")

    features = audio_encoding.input_features
    token_ids = text_encoding.input_ids

    batch["input_features"] = features.squeeze(0)
    batch["labels"] = token_ids.squeeze(0)
    return batch

# Map preprocessing to train and test datasets
# Applies preprocess_batch to every example (map is called without batched=True,
# so presumably one example at a time - TODO confirm against the datasets docs).
# NOTE: train_dataset / test_dataset are rebound here, so this cell is not safe to
# re-run on its own; re-create the datasets first.
train_dataset = train_dataset.map(preprocess_batch)
test_dataset = test_dataset.map(preprocess_batch)

# Remove unnecessary columns
train_dataset = train_dataset.remove_columns(["audio", "text"])
test_dataset = test_dataset.remove_columns(["audio", "text"])
# After this, the training and testing datasets only contain:
# input_features: The processed audio data.
# labels: The processed text data.

Map:   0%|          | 0/1238 [00:00<?, ? examples/s]

Map:   0%|          | 0/262 [00:00<?, ? examples/s]

**Batching input features:** stacks the mel-spectrograms (`input_features`) into a single tensor for efficient parallel processing.

**Padding labels:** pads all text sequences (`labels`) in a batch to the same length, using `-100` as the padding value, which avoids dimension mismatches during training.

**Compatibility with Whisper:** Whisper models require inputs and labels to be provided as batched tensors for training or inference. The data collator ensures that both `input_features` and `labels` meet these requirements.

In [4]:
# Custom Data Collator
class DataCollatorWhisper:
    """Collate preprocessed examples into one batch.

    ``input_features`` are stacked into a single tensor; ``labels`` are
    right-padded with -100 to the length of the longest label in the batch.
    """

    def __call__(self, features):
        """Return ``{"input_features": tensor, "labels": tensor}`` for a list of examples."""
        spectrograms = []
        label_rows = []
        for example in features:
            spectrograms.append(torch.tensor(example["input_features"]))
            label_rows.append(torch.tensor(example["labels"]))
        return {
            "input_features": torch.stack(spectrograms),
            "labels": pad_sequence(label_rows, batch_first=True, padding_value=-100),
        }


data_collator = DataCollatorWhisper()


In [31]:
# Load WER metric
# WER = (substitutions + deletions + insertions) / number of words in the reference
wer_metric = evaluate.load("wer")

# Transliteration map (partial, extend as needed)
greek_to_greeklish = {
    "α": "a", "β": "v", "γ": "g", "δ": "d", "ε": "e", "ζ": "z",
    "η": "i", "θ": "th", "ι": "i", "κ": "k", "λ": "l", "μ": "m",
    "ν": "n", "ξ": "x", "ο": "o", "π": "p", "ρ": "r", "σ": "s",
    "τ": "t", "υ": "y", "φ": "f", "χ": "ch", "ψ": "ps", "ω": "o",
    "ς": "s"
}

def transliterate_to_greeklish(greek_text):
    """Transliterate Greek letters in ``greek_text`` to Latin ("Greeklish").

    Unlike a plain per-character lookup, accented letters (e.g. "ά", "ϊ") and
    uppercase letters are handled too: the accent is dropped and the case of
    the first Latin letter follows the Greek one ("Θ" -> "Th"). Characters that
    are not Greek letters from ``greek_to_greeklish`` are returned unchanged.

    Args:
        greek_text: Text to transliterate.

    Returns:
        str: The transliterated text.
    """
    import unicodedata  # stdlib; local import keeps this block self-contained

    pieces = []
    previous_was_greek = False
    for char in greek_text:
        # Drop combining accents that follow a transliterated Greek letter
        # (input that is already in decomposed form).
        if previous_was_greek and unicodedata.combining(char):
            continue
        # First code point of the canonical decomposition = the bare letter.
        base = unicodedata.normalize("NFD", char)[0]
        latin = greek_to_greeklish.get(base.lower())
        if latin is None:
            pieces.append(char)
            previous_was_greek = False
            continue
        pieces.append(latin.capitalize() if base.isupper() else latin)
        previous_was_greek = True
    return ''.join(pieces)

# Compute metrics
#Decodes predictions and labels into text format.
#Transliterates predictions to Greeklish.
#Computes WER between the predicted and reference texts

def compute_metrics(pred):
    """Compute word error rate between generated and reference transcripts.

    Predictions and labels are decoded to text, predictions are transliterated
    to Greeklish, and WER is computed with ``wer_metric``.

    Args:
        pred: Object with ``predictions`` (generated token IDs) and
            ``label_ids`` (reference token IDs, padded with -100).

    Returns:
        dict: ``{"wer": <float>}``.
    """
    pad_id = processor.tokenizer.pad_token_id

    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    # Work on copies: the arrays belong to the caller and were previously
    # modified in place.
    pred_ids = pred_ids.copy()
    label_ids = pred.label_ids.copy()

    # -100 is not a valid token ID. Labels carry it as padding (see the data
    # collator); generated sequences of different lengths can also end up
    # padded with it when batches are concatenated, so clean both.
    pred_ids[pred_ids == -100] = pad_id
    label_ids[label_ids == -100] = pad_id

    pred_texts = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_texts = processor.batch_decode(label_ids, skip_special_tokens=True)

    # Only predictions are transliterated; references are used as decoded.
    pred_texts_greeklish = [transliterate_to_greeklish(text) for text in pred_texts]

    wer = wer_metric.compute(predictions=pred_texts_greeklish, references=label_texts)
    return {"wer": wer}

# Custom Trainer to force language settings during generation
#Extracts input features and labels from the batch.
#Applies generation (model.generate) with Greek language settings ("el").
#Computes loss when labels are available.
class CustomSeq2SeqTrainer(Seq2SeqTrainer):
    """Seq2SeqTrainer whose evaluation step always generates Greek transcriptions."""

    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        """Run one evaluation step.

        Args:
            model: Model to evaluate.
            inputs: Mapping of batch tensors; ``"input_features"`` is used for
                generation and ``"labels"`` (if present) for the loss.
            prediction_loss_only: When true, only the loss is computed.
            ignore_keys: Names of entries in ``inputs`` to drop before use.

        Returns:
            tuple: ``(loss, generated_tokens, labels)``; the last two are ``None``
            when ``prediction_loss_only`` is true, and ``loss`` is ``None`` when
            the batch has no labels.
        """
        skipped = ignore_keys or []
        device = self.args.device
        inputs = {
            name: tensor.to(device)
            for name, tensor in inputs.items()
            if name not in skipped
        }
        has_labels = "labels" in inputs
        labels = inputs["labels"] if has_labels else None

        def forward_loss():
            # Plain forward pass, no gradients; the loss comes from the model output.
            with torch.no_grad():
                return model(**inputs)["loss"].mean().detach()

        if prediction_loss_only:
            return (forward_loss(), None, None)

        # Force Greek transcription instead of relying on generation defaults.
        with torch.no_grad():
            generated_tokens = model.generate(
                inputs["input_features"], language="el", task="transcribe"
            )

        loss = forward_loss() if has_labels else None
        return (loss, generated_tokens, labels)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-finetuned",   # same directory later used by save_pretrained()
    evaluation_strategy="epoch",        # evaluate once per epoch (see the per-epoch table in the output)
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,
    predict_with_generate=True,         # evaluation goes through CustomSeq2SeqTrainer.prediction_step
    logging_dir="./logs",
    logging_steps=50,
    fp16=False,
)

# Initialize Custom Trainer
# NOTE: `model` is the object loaded in the "Load Whisper model and processor" cell and is
# trained in place, so re-running this cell continues from the already fine-tuned weights.
trainer = CustomSeq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=processor.tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model on eval_dataset (test_dataset)
metrics = trainer.evaluate()
print("Evaluation Metrics:", metrics)


2024/12/01 01:53:17 ERROR mlflow.utils.async_logging.async_logging_queue: Run Id c059f6b7fa3a4ff1abfbd4b26ce8e4cf: Failed to log run data: Exception: Changing param values is not allowed. Param with key='num_train_epochs' was already logged with value='5' for run ID='c059f6b7fa3a4ff1abfbd4b26ce8e4cf'. Attempted logging new value '3'.


Epoch,Training Loss,Validation Loss,Wer
0,0.0032,0.018418,0.192499
2,0.0002,0.013208,0.035852


Evaluation Metrics: {'eval_loss': 0.01320820115506649, 'eval_wer': 0.035852178709321565, 'eval_runtime': 2119.8971, 'eval_samples_per_second': 0.124, 'eval_steps_per_second': 0.016, 'epoch': 2.9806451612903224}


In [32]:
# Save the fine-tuned model and its processor side by side so both can be
# reloaded with from_pretrained() from the same directory.

model.save_pretrained("./whisper-finetuned")
processor.save_pretrained("./whisper-finetuned")


# Evaluate the model
# NOTE(review): this repeats the trainer.evaluate() call made right after training;
# the recorded outputs show identical eval_loss/eval_wer and an eval_runtime of ~2000 s.
results = trainer.evaluate()


In [33]:
results  # display the metrics dict returned by trainer.evaluate()

{'eval_loss': 0.01320820115506649,
 'eval_wer': 0.035852178709321565,
 'eval_runtime': 2067.991,
 'eval_samples_per_second': 0.127,
 'eval_steps_per_second': 0.016,
 'epoch': 2.9806451612903224}

In [13]:

# Path to your saved model directory
# NOTE(review): presumably the directory written by save_pretrained("./whisper-finetuned")
# above, given here as an absolute path - confirm they point at the same place.
model_path = r"C:\PATH\whisper-finetuned"

# Load the model and processor
# This rebinds the notebook-level `model` and `processor` to the reloaded objects.
model = WhisperForConditionalGeneration.from_pretrained(model_path)
processor = WhisperProcessor.from_pretrained(model_path)



In [14]:
# Test on unseen audio
def transcribe_audio(file_path, language="el", task="transcribe"):
    """Transcribe one audio file with the notebook-level ``model`` and ``processor``.

    Generation uses the same language/task settings as
    ``CustomSeq2SeqTrainer.prediction_step``, so transcriptions are produced
    under the configuration the reported WER was measured with.

    Args:
        file_path: Path of the audio file; it is loaded at 16 kHz.
        language: Language passed to ``model.generate`` (default ``"el"``).
        task: Task passed to ``model.generate`` (default ``"transcribe"``).

    Returns:
        str: The decoded transcription with special tokens removed.
    """
    audio, _ = librosa.load(file_path, sr=16000)
    input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
    # Keep the inputs on the same device as the model (it may have been moved
    # to a GPU by the trainer).
    input_features = input_features.to(model.device)
    generated_ids = model.generate(input_features, language=language, task=task)
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return transcription

# Example transcription of a single recording
# NOTE(review): placeholder absolute path; point it at a real .wav file before running.
test_audio_path = r"C:\PATH\audio.wav"
print("Transcription:", transcribe_audio(test_audio_path))

Transcription: Apou polla logia o kosmos en dikos tou.


In [5]:
!pip install  Levenshtein



In [6]:
# Find the nearest proverb using Levenshtein distance
def find_nearest_proverb(transcription, target_texts):
    """Return the proverb closest to ``transcription`` and its edit distance.

    Both strings are lower-cased before the Levenshtein distance is computed.
    If several proverbs share the smallest distance, the first one in
    ``target_texts`` wins.

    Returns:
        tuple: ``(nearest_proverb, distance)``.
    """
    scored = [
        (Levenshtein.distance(transcription.lower(), candidate.lower()), candidate)
        for candidate in target_texts
    ]
    # min() keeps the earliest entry among equal distances.
    distance, nearest_proverb = min(scored, key=lambda pair: pair[0])
    return nearest_proverb, distance

In [7]:
def transcribe_and_suggest(file_path, target_texts):
    """Transcribe a recording and look up the most similar known proverb.

    Args:
        file_path: Audio file passed to ``transcribe_audio``.
        target_texts: Candidate proverbs passed to ``find_nearest_proverb``.

    Returns:
        tuple: ``(transcription, nearest_proverb)``; the distance is not returned.
    """
    heard = transcribe_audio(file_path)
    suggestion, _ = find_nearest_proverb(heard, target_texts)
    return heard, suggestion

In [8]:
# Transcribe one recording and show the closest proverb from target_texts.
# NOTE(review): placeholder absolute path; it rebinds test_audio_path from the earlier example cell.
test_audio_path = r"C:\PATH\audio.wav"
transcription, closest_proverb = transcribe_and_suggest(test_audio_path, target_texts)
print("Transcription:", transcription)
print("Do you mean:", closest_proverb)


Transcription: Apou polla logia o kosmos en dikos tou.
Do you mean: Apou’n antrepetai, o kosmos en dikos tou.


In [9]:
# Load a CSV file
# Column 0 is passed to transcribe_and_suggest() as the audio path and column 1 is
# compared with the results, i.e. it holds the expected text.
# NOTE(review): absolute, user-specific path; the FileNotFoundError recorded below
# refers to an audio path under a different user directory, so the paths stored in
# the CSV have to exist on the machine running this cell.
testing_df = pd.read_csv(r'C:\Users\Foivos\Desktop\DS\Semester3\DSC515\projec\ALISAVOU\train_test_df.csv')

rows_n = testing_df.shape[0]
transcription_counter = 0
closest_counter = 0

# Using all the initial data, find out what percentage of the audio recording's transcriptions are right on the 1st try, and on the 2nd try.
# "1st try" = the raw transcription equals the expected text exactly;
# "2nd try" = the raw transcription or the suggested nearest proverb equals it.
for i in range(rows_n):
    transcription, closest_proverb = transcribe_and_suggest(testing_df.iloc[i, 0], target_texts)
    if transcription==testing_df.iloc[i, 1]:
        transcription_counter+=1
        closest_counter+=1
    elif closest_proverb==testing_df.iloc[i, 1]:
        closest_counter+=1

# Guard against an empty CSV, which would otherwise raise ZeroDivisionError.
transcription_percent = (transcription_counter/rows_n)*100 if rows_n else 0.0
closest_percent = (closest_counter/rows_n)*100 if rows_n else 0.0

print("Transcription percentage: ", transcription_percent)
print("Closest percentage: ", closest_percent)

  audio, sr = librosa.load(file_path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\chris\\Desktop\\Deep Learning\\Project\\DATASET_VOICE\\DATASET_VOICE\\train\\10_F_original.wav'