In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import librosa
import torch
import pandas as pd
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm

In [3]:
# Pretrained wav2vec 2.0 checkpoint: the processor prepares raw audio and decodes
# predicted ids; the CTC model is moved to the GPU when one is available.
model_name = "facebook/wav2vec2-large-960h-lv60"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

KeyboardInterrupt: 

In [None]:
def transcribe_audio(file_path, processor, model, target_sr=16000):
    """
    Transcribe a single audio file using a pre-trained ASR model.

    Args:
        file_path: Path of the audio file. It is loaded with librosa, resampled
            to ``target_sr`` and down-mixed to mono.
        processor: Callable that converts the waveform into model inputs and
            exposes ``batch_decode`` (e.g. a ``Wav2Vec2Processor``).
        model: CTC model exposing ``device`` and returning an object with a
            ``logits`` attribute when called on ``input_values``.
        target_sr: Sampling rate in Hz the audio is resampled to.

    Returns:
        ``{"audio_path": ..., "transcription": ...}`` on success. On any failure
        the transcription is ``None`` and an ``"error"`` key carries the
        exception message, so one unreadable file does not abort a batch run.
    """
    try:
        audio, _ = librosa.load(file_path, sr=target_sr, mono=True)
        inputs = processor(audio, sampling_rate=target_sr, return_tensors="pt", padding=True).to(model.device)
        # Inference only: without no_grad each forward pass records an autograd
        # graph that is never used, which wastes memory on every file.
        with torch.no_grad():
            logits = model(inputs.input_values).logits
        # Greedy CTC decoding: most likely token per frame, collapsed by batch_decode.
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]
        return {"audio_path": file_path, "transcription": transcription}
    except Exception as e:
        return {"audio_path": file_path, "transcription": None, "error": str(e)}

def process_files_in_parallel(file_paths, processor, model, num_workers=4):
    """
    Process audio files in parallel using multiple workers.

    A thread pool is used rather than a process pool. Worker processes must be
    able to import the submitted function from ``__main__``, which does not work
    for functions defined interactively (as in a notebook), and every task would
    have to pickle ``processor`` and ``model``. Threads share the single
    already-loaded model instead.

    Args:
        file_paths: Iterable of audio file paths.
        processor: Forwarded unchanged to ``transcribe_audio``.
        model: Forwarded unchanged to ``transcribe_audio``.
        num_workers: Maximum number of concurrent worker threads.

    Returns:
        List with one result dict per input file, in completion order (not
        input order).
    """
    # Imported locally so the function does not depend on a file-level import
    # that only brings in the process-based executor.
    from concurrent.futures import ThreadPoolExecutor, as_completed

    results = []
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        # Submit tasks to the executor
        futures = [executor.submit(transcribe_audio, file, processor, model) for file in file_paths]
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing files"):
            results.append(future.result())
    return results

In [None]:
# Base directory of the TORGO dataset. Set the TORGO_BASE_DIR environment variable
# to point at the dataset on another machine; the original location is the fallback.
base_dir = os.environ.get("TORGO_BASE_DIR", "D://Abhinav//Test//SPR_Project//torgo")

In [None]:
# Recursively gather every .wav file found under base_dir
file_paths = [
    os.path.join(root, file)
    for root, _, files in os.walk(base_dir)
    for file in files
    if file.endswith(".wav")
]

# Transcribe the collected files with a pool of workers
num_workers = 4  # Adjust based on the number of CPU cores
transcriptions = process_files_in_parallel(file_paths, processor, model, num_workers=num_workers)

# Tabulate the per-file results and write them out as CSV
output_path = "generated_torgo_transcriptions_parallel.csv"
df = pd.DataFrame(transcriptions)
df.to_csv(output_path, index=False)

print(f"Transcriptions saved to {output_path}")

In [None]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

In [None]:
import os
import librosa
import pandas as pd

# Define paths
audio_dir = "path/to/TORGO/audio"
transcriptions_dir = "path/to/TORGO/transcriptions"


def _collect_pairs(audio_dir, transcriptions_dir):
    """Return one ``{"audio_path", "transcription"}`` record per transcript with audio.

    Walks ``transcriptions_dir`` recursively. For each ``.txt`` file it looks for a
    ``.wav`` file with the same stem directly inside ``audio_dir`` and, when that
    file exists, reads the transcript with surrounding whitespace stripped.
    Transcripts without a matching audio file are skipped.
    """
    pairs = []
    for root, _, files in os.walk(transcriptions_dir):
        for file in files:
            if not file.endswith(".txt"):
                continue
            # Swap only the trailing extension; str.replace would also rewrite any
            # ".txt" appearing earlier in the file name.
            audio_path = os.path.join(audio_dir, file[:-len(".txt")] + ".wav")
            if not os.path.exists(audio_path):
                continue
            with open(os.path.join(root, file), 'r') as f:
                pairs.append({"audio_path": audio_path, "transcription": f.read().strip()})
    return pairs


# Collect audio-transcription pairs
data = _collect_pairs(audio_dir, transcriptions_dir)

# Explicit columns keep the expected schema even when no pairs were found, so
# later column lookups do not fail with a KeyError on an empty frame.
df = pd.DataFrame(data, columns=["audio_path", "transcription"])

# Preprocess audio: Normalize sample rate and ensure mono
def preprocess_audio(file_path, target_sr=16000):
    """Load ``file_path`` with librosa as a mono waveform resampled to ``target_sr``.

    Only the sample array is returned; the sampling rate librosa reports is dropped.
    """
    waveform, _ = librosa.load(file_path, sr=target_sr, mono=True)
    return waveform

df['audio'] = df['audio_path'].apply(preprocess_audio)

# Preprocess transcription: Normalize text
# The facebook/wav2vec2-large-960h-lv60 vocabulary consists of upper-case letters and
# the apostrophe (confirm against the checkpoint's vocab.json), so the text is
# upper-cased rather than lower-cased: lower-case characters would not map to any
# vocabulary entry and would never match the text the model decodes.
# Whitespace runs (including newlines and tabs) are collapsed to a single space
# first so that stripping disallowed characters cannot fuse neighbouring words.
df['transcription'] = (
    df['transcription']
    .str.upper()
    .str.replace(r"\s+", " ", regex=True)
    .str.replace(r"[^A-Z' ]", "", regex=True)
    .str.replace(r" {2,}", " ", regex=True)
    .str.strip()
)


In [None]:
import numpy as np
from librosa.feature import mfcc

def extract_features(audio, sr=16000, n_mfcc=13):
    """Compute MFCCs for ``audio`` and return the coefficient matrix transposed.

    ``sr`` and ``n_mfcc`` are passed straight to ``librosa.feature.mfcc``.
    The transposed result is presumably (frames, n_mfcc) — confirm against librosa.
    """
    coefficients = mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    return coefficients.T

# One MFCC matrix per utterance, computed with the default sr / n_mfcc
df['features'] = df['audio'].apply(extract_features)


In [None]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
import torch

# Checkpoint shared by the CTC model and its tokenizer
# NOTE(review): Wav2Vec2Tokenizer is reported as deprecated in favour of
# Wav2Vec2Processor — confirm against the installed transformers version.
model_name = "facebook/wav2vec2-large-960h-lv60"
model = Wav2Vec2ForCTC.from_pretrained(model_name)
tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)

# Tokenize and process audio
def transcribe_audio(audio, tokenizer, model, target_sr=16000):
    """Transcribe an in-memory waveform with a CTC model using greedy decoding.

    Args:
        audio: Waveform samples, already at ``target_sr``.
        tokenizer: Callable that turns the waveform into ``input_values`` and
            exposes ``batch_decode``.
        model: CTC model exposing ``device`` and returning an object with a
            ``logits`` attribute.
        target_sr: Sampling rate in Hz passed on to the tokenizer.

    Returns:
        The decoded transcription string for the single input.
    """
    inputs = tokenizer(audio, return_tensors="pt", sampling_rate=target_sr, padding=True)
    # Follow the model to whatever device it currently lives on (it may have been
    # moved to the GPU by training); otherwise the forward pass fails with a
    # device mismatch.
    input_values = inputs.input_values.to(model.device)
    # Inference only: skip building an autograd graph.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return tokenizer.batch_decode(predicted_ids)[0]


In [None]:
from datasets import Dataset

# Convert to Hugging Face Dataset
# Only the path and text columns are carried over: prepare_batch reloads the audio
# from disk, and per-row NumPy arrays held in object columns (notably the 2-D MFCC
# matrices) are not accepted by the pandas-to-Arrow conversion.
hf_dataset = Dataset.from_pandas(df[["audio_path", "transcription"]])

# Prepare dataset for Wav2Vec 2.0
def prepare_batch(batch):
    """Add ``input_values`` and ``labels`` to a single example.

    The audio is reloaded from ``batch['audio_path']`` via ``preprocess_audio`` and
    passed through the tokenizer at 16 kHz; the transcription is tokenized into
    label ids. The leading batch dimension of each result is dropped.
    """
    audio = preprocess_audio(batch['audio_path'])
    inputs = tokenizer(audio, return_tensors="pt", sampling_rate=16000, padding=True)
    batch['input_values'] = inputs.input_values[0]
    # NOTE(review): confirm the tokenizer in use returns `input_ids` when called on
    # text; Wav2Vec2Tokenizer.__call__ appears to be oriented to raw audio, in which
    # case a Wav2Vec2Processor / Wav2Vec2CTCTokenizer is needed here.
    batch['labels'] = tokenizer(batch['transcription'], return_tensors="pt").input_ids[0]
    return batch

hf_dataset = hf_dataset.map(prepare_batch, remove_columns=["transcription"])

# The training cell indexes hf_dataset["train"] and hf_dataset["test"], which only
# exist on a split dataset; a fixed seed keeps the split reproducible.
hf_dataset = hf_dataset.train_test_split(test_size=0.1, seed=42)


In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_steps=500,
    logging_dir="./logs",
    logging_steps=100,
)


def _pad_ctc_batch(features):
    """Collate variable-length examples into one padded batch for CTC training.

    Each element of ``features`` is expected to carry ``input_values`` (waveform
    samples) and ``labels`` (token ids). Utterances differ in length, so they
    cannot be stacked directly: inputs are right-padded with zeros, and an
    ``attention_mask`` marks real samples with 1 and padding with 0.
    """
    input_values = [torch.as_tensor(f["input_values"], dtype=torch.float32) for f in features]
    labels = [torch.as_tensor(f["labels"], dtype=torch.long) for f in features]

    lengths = torch.tensor([len(values) for values in input_values])
    padded_inputs = torch.nn.utils.rnn.pad_sequence(input_values, batch_first=True)
    positions = torch.arange(padded_inputs.shape[1])[None, :]
    attention_mask = (positions < lengths[:, None]).long()

    return {
        "input_values": padded_inputs,
        "attention_mask": attention_mask,
        # -100 is the label value the Transformers CTC loss is documented to
        # ignore, so padded label positions do not contribute to training.
        "labels": torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100),
    }


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset["train"],
    eval_dataset=hf_dataset["test"],
    data_collator=_pad_ctc_batch,
    tokenizer=tokenizer,
)

trainer.train()


In [None]:
from datasets import load_metric

# NOTE(review): `load_metric` is reported as removed from recent `datasets` releases
# in favour of the separate `evaluate` package — confirm against the installed version.
wer = load_metric("wer")

def compute_wer(predictions, references):
    """Return the word error rate of ``predictions`` measured against ``references``.

    Both arguments are equally long lists of strings; they are passed unchanged
    to the loaded ``wer`` metric.
    """
    return wer.compute(predictions=predictions, references=references)

# Example evaluation
# Compare case-insensitively: the letter case produced by the decoder and the case
# used when normalizing the reference text are not guaranteed to agree, and the
# metric compares words literally.
predictions = [transcribe_audio(audio, tokenizer, model).lower() for audio in df['audio']]
references = df['transcription'].str.lower().tolist()

print("WER:", compute_wer(predictions, references))
