In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/audio-dataset-male-marathi-english/english/txt.done.data
/kaggle/input/audio-dataset-male-marathi-english/english/calculate_duration.pl
/kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_02106.wav
/kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_02167.wav
/kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_04383.wav
/kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_00165.wav
/kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_01895.wav
/kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_02607.wav
/kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_02527.wav
/kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_03849.wav
/kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_01315.wav
/kaggle/input/audio-dataset-male-marathi-english/

In [2]:
import os
import librosa
from datasets import Dataset
import pandas as pd
import re

In [3]:
# Path to your dataset
audio_dir = "/kaggle/input/audio-dataset-male-marathi-english/english/wav"  # Folder containing WAV files
txt_file = "/kaggle/input/audio-dataset-male-marathi-english/english/txt.done.data"  # Your txt.done.data file


In [4]:
# Define a regular expression pattern to match the filename and transcription inside parentheses
pattern = r'\(\s*(\S+)\s+"(.+)"\s*\)'  # Matches: (filename "transcript")

# List to hold data
data = []

# Open the txt.done.data file and extract filenames and transcripts
with open(txt_file, "r") as f:
    for line in f:
        line = line.strip()
        if line:
            match = re.match(pattern, line)
            if match:
                filename = match.group(1).strip("() ")  # Clean filename
                transcript = match.group(2)  # Get transcript
                
                # Construct full audio path using os.path.join
                full_audio_path = os.path.join(audio_dir, f"{filename}.wav")  # Append .wav
                
                # Add the file and transcript to the data list
                data.append({
                    "file": full_audio_path,
                    "text": transcript
                })
            else:
                print(f"Error parsing line: {line}")

# Verify the collected data
print("Collected Data:")
for entry in data:
    print(f"Audio file: {entry['file']}, Transcript: {entry['text']}")

# Create a Dataset from the collected data
dataset = Dataset.from_dict({ 
    "file": [entry["file"] for entry in data], 
    "text": [entry["text"] for entry in data] 
})

def load_audio(batch):
    # Check if the audio file exists
    if not os.path.exists(batch["file"]):
        raise FileNotFoundError(f"Audio file not found: {batch['file']}")

    # Load the audio file and return as array and sampling rate
    audio_array, sampling_rate = librosa.load(batch["file"], sr=16000)  # Whisper expects 16kHz audio
    batch["audio"] = audio_array
    batch["sampling_rate"] = sampling_rate
    return batch

# Apply audio loading function to the dataset
dataset = dataset.map(load_audio)

# Verify the dataset
print("Loaded Dataset Sample:")
# print(dataset[0])

Collected Data:
Audio file: /kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_00001.wav, Transcript:  Author of the danger trail, Philip Steels, etc. 
Audio file: /kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_00002.wav, Transcript:  Not at this particular case, Tom apologized, Whittemore. 
Audio file: /kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_00003.wav, Transcript:  For the twentieth time, that evening, the two men, shook hands. 
Audio file: /kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_00004.wav, Transcript:  Lord, but I'm glad to see you again, Phil. 
Audio file: /kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_00005.wav, Transcript:  Will we ever forget it. 
Audio file: /kaggle/input/audio-dataset-male-marathi-english/english/wav/train_marathimale_00006.wav, Transcript:  God bless 'em, I hope I'll go on seeing them forever.

Map:   0%|          | 0/5578 [00:00<?, ? examples/s]

Loaded Dataset Sample:


In [5]:
dataset.info


DatasetInfo(description='', citation='', homepage='', license='', features={'file': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'audio': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None), 'sampling_rate': Value(dtype='int64', id=None)}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name=None, version=None, splits=None, download_checksums=None, download_size=None, post_processing_size=None, dataset_size=None, size_in_bytes=None)

In [17]:
dataset = dataset.select(range(200))

In [10]:
!pip install evaluate
!pip install jiwer

import evaluate

# Load the Word Error Rate (WER) metric using the evaluate library
wer_metric = evaluate.load("wer")

  pid, fd = os.forkpty()


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Collecting jiwer
  Downloading jiwer-3.0.4-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading jiwer-3.0.4-py3-none-any.whl (21 kB)
Downloading rapidfuzz-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.4 rapidfuzz-3.10.0


Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [11]:
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from safetensors import safe_open

# Step 1: Load the model architecture without weights
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

# Step 2: Path to your safetensors file on Kaggle
safetensor_path = "/kaggle/input/whisper_fine_tuned/pytorch/default/1/model.safetensors"  # Adjust if needed

# Step 3: Load the tensors from the safetensors file
tensors = {}
with safe_open(safetensor_path, framework="pt", device="cpu") as f:
    for k in f.keys():
        tensors[k] = f.get_tensor(k)

# Step 4: Load the weights into the model
model.load_state_dict(tensors, strict=False)

# Set up the processor
processor = WhisperProcessor.from_pretrained("openai/whisper-small")


config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

In [12]:
# Define a function to transcribe audio
def transcribe_audio(file_path):
    # Load the audio file
    audio_array, sampling_rate = librosa.load(file_path, sr=16000)

    # Preprocess the audio
    input_features = processor(
        audio_array, sampling_rate=sampling_rate, return_tensors="pt"
    ).input_features

    # Generate transcription
    with torch.no_grad():
        generated_ids = model.generate(input_features)

    # Decode the generated ids to text
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return transcription


In [20]:
dataset

Dataset({
    features: ['file', 'text', 'audio', 'sampling_rate'],
    num_rows: 200
})

In [22]:
from tqdm import tqdm  # For progress visualization

# Function to transcribe the dataset using a model
def transcribe_dataset(model, processor, dataset):
    transcriptions = []
    for sample in tqdm(dataset, desc="Transcribing"):
        transcription = transcribe_audio(sample["file"])
        transcriptions.append(transcription)
    return transcriptions

# Get transcriptions from both models
print("Transcribing with Model 1...")
transcriptions_model_1 = transcribe_dataset(model, processor, dataset)  # Use your first model instance

# Extract ground truth transcripts
ground_truths = [entry["text"] for entry in dataset]

# Calculate WER for both models
wer_1 = wer_metric.compute(predictions=transcriptions_model_1, references=ground_truths)


Transcribing with Model 1...


Transcribing: 100%|██████████| 200/200 [18:05<00:00,  5.43s/it]


In [23]:
import requests

API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v3-turbo"
headers = {"Authorization": "Bearer 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"}

def query(filename):
    with open(filename, "rb") as f:
        data = f.read()
    response = requests.post(API_URL, headers=headers, data=data)
    return response.json()

def generate_output(dataset):
    transcriptions = []
    for sample in tqdm(dataset, desc="Transcribing"):
        transcription = query(sample["file"])
        transcriptions.append(transcription["text"])
    
    return transcriptions

In [24]:
print("Transcribing with Model 2...")
transcriptions_model_2 = generate_output(dataset)  # Use your second model instance
wer_2 = wer_metric.compute(predictions=transcriptions_model_2, references=ground_truths)

Transcribing with Model 2...


Transcribing: 100%|██████████| 200/200 [02:01<00:00,  1.65it/s]


In [25]:
print(f"WER for Model 1: {wer_1}")
print(f"WER for Model 2: {wer_2}")

WER for Model 1: 0.037703995498030385
WER for Model 2: 0.20934158694428812
