In [None]:
import requests

API_URL = "https://api-inference.huggingface.co/models/openai/whisper-small"
headers = {"Authorization": "Bearer xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"} # Replace with your API key

def query(filename):
    with open(filename, "rb") as f:
        data = f.read()
    response = requests.post(API_URL, headers=headers, data=data)
    return response.json()

output = query("train_marathimale_00001.wav")

In [9]:
print(output)




In [10]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset

# load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
model.config.forced_decoder_ids = None

# load dummy dataset and read audio files
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
sample = ds[0]["audio"]
input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features 

# generate token ids
predicted_ids = model.generate(input_features)
# decode token ids to text
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)

transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)


In [11]:
import librosa

# Load your WAV file using librosa
audio_path = "enhanced_audio.wav"  # Specify your WAV file path here
audio_array, sampling_rate = librosa.load(audio_path, sr=16000)  # Whisper expects a sample rate of 16000

# Process the audio using Whisper's processor
input_features = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt").input_features

# Generate token IDs from the model
predicted_ids = model.generate(input_features)

# Decode the token IDs to get the transcription (skip special tokens for cleaner text)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

# Print or return the transcription
print("Transcription:", transcription[0])


Transcription:  6 spoons of fresh snow, 5-6 slabs of blue cheese, and maybe a snack for a great result.


LOAD DATASET

In [12]:
import os
import librosa
from datasets import Dataset
import pandas as pd
import re

In [13]:
# Path to your dataset
audio_dir = "./marathi_male_english/english/wav"  # Folder containing WAV files
txt_file = "./marathi_male_english/english/txt.done.data"  # Your txt.done.data file


In [14]:
# Define a regular expression pattern to match the filename and transcription inside parentheses
pattern = r'\(\s*(\S+)\s+"(.+)"\s*\)'  # Matches: (filename "transcript")

# List to hold data
data = []

# Open the txt.done.data file and extract filenames and transcripts
with open(txt_file, "r") as f:
    for line in f:
        line = line.strip()
        if line:
            match = re.match(pattern, line)
            if match:
                filename = match.group(1).strip("() ")  # Clean filename
                transcript = match.group(2)  # Get transcript
                
                # Construct full audio path using os.path.join
                full_audio_path = os.path.join(audio_dir, f"{filename}.wav")  # Append .wav
                
                # Add the file and transcript to the data list
                data.append({
                    "file": full_audio_path,
                    "text": transcript
                })
            else:
                print(f"Error parsing line: {line}")

# Verify the collected data
print("Collected Data:")
for entry in data:
    print(f"Audio file: {entry['file']}, Transcript: {entry['text']}")

# Create a Dataset from the collected data
dataset = Dataset.from_dict({ 
    "file": [entry["file"] for entry in data], 
    "text": [entry["text"] for entry in data] 
})

def load_audio(batch):
    # Check if the audio file exists
    if not os.path.exists(batch["file"]):
        raise FileNotFoundError(f"Audio file not found: {batch['file']}")

    # Load the audio file and return as array and sampling rate
    audio_array, sampling_rate = librosa.load(batch["file"], sr=16000)  # Whisper expects 16kHz audio
    batch["audio"] = audio_array
    batch["sampling_rate"] = sampling_rate
    return batch

# Apply audio loading function to the dataset
dataset = dataset.map(load_audio)

# Verify the dataset
print("Loaded Dataset Sample:")
print(dataset[0])

Collected Data:
Audio file: ./marathi_male_english/english/wav\train_marathimale_00001.wav, Transcript:  Author of the danger trail, Philip Steels, etc. 
Audio file: ./marathi_male_english/english/wav\train_marathimale_00002.wav, Transcript:  Not at this particular case, Tom apologized, Whittemore. 
Audio file: ./marathi_male_english/english/wav\train_marathimale_00003.wav, Transcript:  For the twentieth time, that evening, the two men, shook hands. 
Audio file: ./marathi_male_english/english/wav\train_marathimale_00004.wav, Transcript:  Lord, but I'm glad to see you again, Phil. 
Audio file: ./marathi_male_english/english/wav\train_marathimale_00005.wav, Transcript:  Will we ever forget it. 
Audio file: ./marathi_male_english/english/wav\train_marathimale_00006.wav, Transcript:  God bless 'em, I hope I'll go on seeing them forever. 
Audio file: ./marathi_male_english/english/wav\train_marathimale_00007.wav, Transcript:  And you always want to see it, in the superlative degree. 
Audio 

Map: 100%|██████████| 5578/5578 [01:03<00:00, 87.24 examples/s] 


Loaded Dataset Sample:
{'file': './marathi_male_english/english/wav\\train_marathimale_00001.wav', 'text': ' Author of the danger trail, Philip Steels, etc. ', 'audio': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

In [15]:
dataset

Dataset({
    features: ['file', 'text', 'audio', 'sampling_rate'],
    num_rows: 5578
})

In [16]:
print(type(dataset[0]["file"]))


<class 'str'>


In [None]:
from transformers import WhisperProcessor
import numpy as np

processor = WhisperProcessor.from_pretrained("openai/whisper-small")

def preprocess(batch):
    try:
        # Check if the audio is a list of floats
        if not isinstance(batch["audio"], list):
            raise ValueError(f"Expected audio to be a list, got {type(batch['audio'])}")

        # Convert to numpy array for processing
        audio_array = np.array(batch["audio"], dtype=np.float32)
        
        input_features = processor(audio_array, sampling_rate=batch["sampling_rate"], return_tensors="pt").input_features
        labels = processor(batch["text"], return_tensors="pt").input_ids

        batch["input_features"] = input_features.squeeze(0)
        batch["labels"] = labels.squeeze(0)

    except Exception as e:
        print(f"Error processing batch: {batch}")
        print(f"Error message: {str(e)}")
    
    return batch

# Apply preprocessing
dataset = dataset.map(preprocess)

# Remove unnecessary columns afterward
columns_to_remove = ["audio", "sampling_rate", "file", "text"]
dataset = dataset.remove_columns([col for col in columns_to_remove if col in dataset.column_names])

print("Processed Dataset Sample:")
print(dataset[0])


Check on fine tuned model

In [14]:
from safetensors import safe_open

tensors = {}
with safe_open("./model/model.safetensors", framework="pt", device="cpu") as f:
    for k in f.keys():
        tensors[k] = f.get_tensor(k)

In [23]:
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from safetensors import safe_open
import librosa

# Step 1: Load the model architecture without loading weights
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")  # Use any model architecture

# Step 2: Load the tensors from the safetensors file
safetensor_path = "./model/model.safetensors"  # Path to your saved safetensors file
tensors = {}

# Read the safetensors file and get the tensors
with safe_open(safetensor_path, framework="pt", device="cpu") as f:
    for k in f.keys():
        tensors[k] = f.get_tensor(k)

# Step 3: Load the weights into the model
model.load_state_dict(tensors, strict=False)

# Step 4: Set up the processor
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

# Define a function to transcribe audio
def transcribe_audio(file_path):
    # Load the audio file
    audio_array, sampling_rate = librosa.load(file_path, sr=16000)

    # Preprocess the audio
    input_features = processor(
        audio_array, sampling_rate=sampling_rate, return_tensors="pt"
    ).input_features

    # Generate transcription
    with torch.no_grad():
        generated_ids = model.generate(input_features)

    # Decode the generated ids to text
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return transcription


In [24]:
# Example usage
audio_file_path = "./train_marathimale_00020.wav"  # Replace with your audio file path
result = transcribe_audio(audio_file_path)
print("Transcription:", result)


Transcription:  Clubs and balls, and cities, grew to be only memories. 


Load model