### Install required libraries

In [None]:
!pip install transformers torchaudio

### Import libraries

In [None]:
import os
import pandas as pd
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
from transformers import WhisperProcessor, WhisperForConditionalGeneration

### Data-loading function (provided)

In [None]:
def load_data(tsv_file, audio_dir, max_samples=100):
    """Load a shuffled sample of audio paths and transcripts from a TSV file.

    The TSV is read with pandas, its rows are shuffled, and at most
    ``max_samples`` rows are kept. Each ``path`` value gets a ``.mp3``
    suffix if it does not already have one and is joined onto ``audio_dir``.

    Args:
        tsv_file: Path to a tab-separated file with ``path`` and
            ``sentence`` columns.
        audio_dir: Directory that is prepended to every ``path`` value.
        max_samples: Maximum number of rows to return. ``None`` returns
            every row; zero or a negative number returns no rows.

    Returns:
        A ``(audio_files, transcripts)`` tuple of equal-length lists. If
        anything goes wrong while loading, the error is printed and
        ``([], [])`` is returned instead of raising.
    """
    print("Loading dataset...\n\n" + "=" * 50 + "\n")

    try:
        df = pd.read_csv(tsv_file, sep='\t')
        df = df.sample(frac=1).reset_index(drop=True)

        # Apply the limit before building the lists so that a limit of zero
        # (or less) really yields nothing.
        if max_samples is not None:
            df = df.head(max(max_samples, 0))

        audio_files = [
            os.path.join(
                audio_dir,
                name if name.endswith(".mp3") else name + ".mp3",
            )
            for name in df['path']
        ]
        transcripts = df['sentence'].tolist()
    except Exception as e:
        # Deliberately best-effort: callers rely on getting empty lists back
        # rather than an exception when the data cannot be loaded.
        print(f"Error loading Common Voice data: {e}\n")
        return [], []

    # Report the real count even when the file had fewer rows than the limit.
    print(f"Finished loading {len(audio_files)} audio files and transcripts.\n\n" + "=" * 50 + "\n")
    return audio_files, transcripts

### Load the pre-trained Whisper processor and model, and add custom tokens

In [None]:
# Load the pre-trained Whisper processor from the "openai/whisper-small"
# checkpoint. Its tokenizer is extended with extra tokens below.
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

# Custom tokens can be added based on frequent Tagalog phrases
# NOTE(review): adding tokens presumably grows the tokenizer's vocabulary, so
# any model used with this processor needs its token embeddings resized to
# len(processor.tokenizer) afterwards — confirm.
special_tokens = ["[tagalog_token]", "[english_token]"]  # Example tokens
processor.tokenizer.add_tokens(special_tokens)

In [None]:
# Load the pre-trained Whisper model from the "openai/whisper-small" checkpoint.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
# The new embedding size is taken from the tokenizer's current length, so this
# line has to run after the custom tokens were added to processor.tokenizer.
model.resize_token_embeddings(len(processor.tokenizer))  # Resize embeddings to include new tokens


### Build the dataset and dataloader

In [None]:
class process_data(Dataset):
    """Dataset that turns (audio file, transcript) pairs into model inputs.

    Each item is a dict with:
        ``input_features``: the processor's features for the audio clip.
        ``labels``: the tokenizer's ids for the transcript (1-D, variable
            length, so batches need padding when collated).

    Args:
        audio_files: Sequence of audio file paths.
        transcripts: Sequence of transcripts, aligned with ``audio_files``.
        processor: Object that is callable on a waveform (returning something
            with ``input_features``) and exposes a ``tokenizer``.

    Raises:
        ValueError: If ``audio_files`` and ``transcripts`` differ in length.
    """

    # The processor is told the audio is sampled at this rate, so every
    # waveform is resampled to it before feature extraction.
    TARGET_SAMPLE_RATE = 16000

    def __init__(self, audio_files, transcripts, processor):
        if len(audio_files) != len(transcripts):
            raise ValueError(
                f"audio_files and transcripts must have the same length "
                f"(got {len(audio_files)} and {len(transcripts)})"
            )
        self.audio_files = audio_files
        self.transcripts = transcripts
        self.processor = processor

    def __len__(self):
        """Return the number of (audio, transcript) pairs."""
        return len(self.audio_files)

    def __getitem__(self, idx):
        """Load, resample and featurize one clip and tokenize its transcript."""
        audio_path = self.audio_files[idx]
        transcript = self.transcripts[idx]

        # torchaudio.load returns (waveform, sample_rate); the waveform is
        # shaped (channels, frames).
        waveform, sample_rate = torchaudio.load(audio_path)

        # Downmix multi-channel audio to mono.
        if waveform.size(0) > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Bring the audio to the rate that is declared to the processor.
        if sample_rate != self.TARGET_SAMPLE_RATE:
            waveform = torchaudio.functional.resample(
                waveform, sample_rate, self.TARGET_SAMPLE_RATE
            )

        input_features = self.processor(
            waveform.squeeze(0).numpy(),
            sampling_rate=self.TARGET_SAMPLE_RATE,
            return_tensors="pt",
        ).input_features

        # Encode the transcription
        labels = self.processor.tokenizer(transcript, return_tensors="pt").input_ids

        # Drop only the leading batch dimension added by return_tensors="pt".
        return {
            "input_features": input_features.squeeze(0),
            "labels": labels.squeeze(0),
        }

# Load the data
tsv_file = "/path/to/train.tsv"
audio_dir = "/path/to/audio_files"
audio_files, transcripts = load_data(tsv_file, audio_dir)

# load_data reports errors by returning empty lists; stop here with a clear
# message instead of failing later inside the DataLoader's sampler.
if not audio_files:
    raise RuntimeError(
        f"No training samples were loaded from {tsv_file!r}; "
        "check tsv_file and audio_dir."
    )


def collate_batch(samples):
    """Combine dataset items into one batch.

    Transcripts tokenize to different lengths, so the label tensors cannot be
    stacked directly. They are right-padded with -100, the value the loss
    ignores, while the input features are stacked as-is.
    """
    input_features = torch.stack([sample["input_features"] for sample in samples])
    labels = torch.nn.utils.rnn.pad_sequence(
        [sample["labels"] for sample in samples],
        batch_first=True,
        padding_value=-100,
    )
    return {"input_features": input_features, "labels": labels}


# Prepare dataset and dataloader
train_dataset = process_data(audio_files, transcripts, processor)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=collate_batch,
)

### Set training parameters and run the training loop

In [None]:
# Set training parameters
num_epochs = 10
output_dir = "/content/checkpoints"
os.makedirs(output_dir, exist_ok=True)

# Use the GPU when one is available and fall back to the CPU otherwise. The
# model must live on the same device as the batches that are fed to it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# Training loop
model.train()
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    epoch_loss = 0.0
    num_batches = 0

    for batch in train_dataloader:
        input_features = batch["input_features"].to(device)
        labels = batch["labels"].to(device)

        # Passing labels makes the model compute the training loss itself.
        outputs = model(input_features=input_features, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    if num_batches:
        print(f"Mean training loss: {epoch_loss / num_batches:.4f}")

    # Save checkpoint every few epochs
    if (epoch + 1) % 2 == 0:  # Save every 2 epochs
        checkpoint_path = os.path.join(output_dir, f"whisper_checkpoint_epoch_{epoch + 1}.pt")
        torch.save(model.state_dict(), checkpoint_path)
        print(f"Checkpoint saved at {checkpoint_path}")

### Save the model after training

In [None]:
# Write the fine-tuned model first and then the processor into the same
# directory, so both are stored side by side.
final_model_path = "/content/fine_tuned_whisper"
for artifact in (model, processor):
    artifact.save_pretrained(final_model_path)
print("Fine-tuned model and processor saved.")