## Whisper Small 

In [None]:
import os
import librosa
import torch
from tqdm import tqdm
from transformers import WhisperProcessor, WhisperForConditionalGeneration


# Checkpoint identifier handed to from_pretrained(); left blank here and must
# be filled in before the cell is run.
MODEL_NAME = ""
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = WhisperProcessor.from_pretrained(MODEL_NAME, language="ro", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)


# AUDIO_FOLDER is scanned for input .wav files; OUTPUT_FOLDER receives the
# .txt transcriptions and is created if it does not exist yet.
AUDIO_FOLDER = ""
OUTPUT_FOLDER = ""
os.makedirs(OUTPUT_FOLDER, exist_ok=True)


# os.listdir() returns entries in arbitrary order, so sort them to make the
# processing order (and the failure report) reproducible. The extension is
# matched case-insensitively so that files named "*.WAV" are not skipped.
wav_files = sorted(f for f in os.listdir(AUDIO_FOLDER) if f.lower().endswith(".wav"))
# Names of the inputs whose transcription raised an exception.
failed_files = []


# Transcribe every file listed in wav_files and write one UTF-8 .txt per input
# into OUTPUT_FOLDER. An input that raises is recorded in failed_files and gets
# an empty .txt placeholder, so every input still has a matching output file.
for file in tqdm(wav_files, desc="Transcribing", unit="file"):
    audio_file_path = os.path.join(AUDIO_FOLDER, file)
    txt_file_path = os.path.join(OUTPUT_FOLDER, os.path.splitext(file)[0] + ".txt")

    try:
        # sr=16000 asks librosa to resample to 16 kHz while loading.
        audio, sampling_rate = librosa.load(audio_file_path, sr=16000)

        # NOTE(review): the Whisper feature extractor appears to pad/truncate
        # to a fixed 30 s window by default, so longer clips may be cut off —
        # confirm the inputs are short enough.
        audio_input = processor(audio, return_tensors="pt", sampling_rate=sampling_rate)
        input_features = audio_input['input_features'].to(device)

        # NOTE(review): language="ro" / task="transcribe" are only given to the
        # processor; generate() is not told the language here — confirm the
        # checkpoint's generation config already selects Romanian.
        with torch.no_grad():  # inference only, no autograd bookkeeping needed
            generated_ids = model.generate(input_features=input_features)

        transcription = processor.decode(generated_ids[0], skip_special_tokens=True)
        with open(txt_file_path, "w", encoding="utf-8") as f:
            f.write(transcription)

    except Exception as e:
        failed_files.append(file)
        # Report the reason right away; otherwise the cause of the failure is
        # lost and only the file name survives in the final summary.
        print(f"Failed: {file} — {e}")
        # The placeholder write can fail too (e.g. unwritable output folder);
        # keep that from aborting the rest of the batch.
        try:
            with open(txt_file_path, "w", encoding="utf-8") as f:
                f.write("")
        except OSError as write_error:
            print(f"Could not write placeholder for {file}: {write_error}")

# Final summary: either confirm that the whole batch went through, or list
# every input recorded in failed_files, one name per line.
if not failed_files:
    print("\nAll files processed successfully!")
else:
    print("\nThe following files failed to transcribe:", *failed_files, sep="\n")


## Whisper Large

In [None]:
import os
import librosa
import torch
from tqdm import tqdm
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# Checkpoint identifier handed to from_pretrained(); left blank here and must
# be filled in before the cell is run.
MODEL_NAME = ""
processor = WhisperProcessor.from_pretrained(MODEL_NAME, language="ro", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)


# AUDIO_FOLDER is scanned for input .wav files; OUTPUT_FOLDER receives the
# .txt transcriptions and is created if it does not exist yet.
AUDIO_FOLDER = ""
OUTPUT_FOLDER = ""
os.makedirs(OUTPUT_FOLDER, exist_ok=True)


# os.listdir() returns entries in arbitrary order, so sort them to make the
# processing order (and the failure report) reproducible. The extension is
# matched case-insensitively so that files named "*.WAV" are not skipped.
wav_files = sorted(f for f in os.listdir(AUDIO_FOLDER) if f.lower().endswith(".wav"))
# Names of the inputs whose transcription raised an exception.
failed_files = []


# Transcribe every file listed in wav_files and write one UTF-8 .txt per input
# into OUTPUT_FOLDER. An input that raises is recorded in failed_files and gets
# an empty .txt placeholder, so every input still has a matching output file.
for file in tqdm(wav_files, desc="Transcribing", unit="file"):
    audio_file_path = os.path.join(AUDIO_FOLDER, file)
    txt_file_path = os.path.join(OUTPUT_FOLDER, os.path.splitext(file)[0] + ".txt")

    try:
        # sr=16000 asks librosa to resample to 16 kHz while loading.
        audio, sampling_rate = librosa.load(audio_file_path, sr=16000)
        # NOTE(review): the Whisper feature extractor appears to pad/truncate
        # to a fixed 30 s window by default, so longer clips may be cut off —
        # confirm the inputs are short enough.
        audio_input = processor(audio, return_tensors="pt", sampling_rate=sampling_rate)
        input_features = audio_input["input_features"].to(device)

        # NOTE(review): language="ro" / task="transcribe" are only given to the
        # processor; generate() is not told the language here — confirm the
        # checkpoint's generation config already selects Romanian.
        with torch.no_grad():  # inference only, no autograd bookkeeping needed
            # max_new_tokens caps the number of generated tokens per file.
            generated_ids = model.generate(input_features, max_new_tokens=200)

        transcription = processor.decode(generated_ids[0], skip_special_tokens=True)
        with open(txt_file_path, "w", encoding="utf-8") as f:
            f.write(transcription)

    except Exception as e:
        failed_files.append(file)
        # Report the reason right away; otherwise the cause of the failure is
        # lost and only the file name survives in the final summary.
        print(f"Failed: {file} — {e}")
        # The placeholder write can fail too (e.g. unwritable output folder);
        # keep that from aborting the rest of the batch.
        try:
            with open(txt_file_path, "w", encoding="utf-8") as f:
                f.write("")
        except OSError as write_error:
            print(f"Could not write placeholder for {file}: {write_error}")


# Final summary: either confirm that the whole batch went through, or print
# the names collected in failed_files, one per line.
if not failed_files:
    print("\nAll files processed successfully!")
else:
    print("\nThe following files failed to transcribe:")
    print(*failed_files, sep="\n")


## Wav2vec


In [None]:
import os

import librosa
import soundfile as sf
import torch
from tqdm import tqdm
from transformers import AutoProcessor, AutoModelForCTC


# Checkpoint identifiers handed to from_pretrained(); both are left blank here
# and must be filled in before the cell is run.
processor = AutoProcessor.from_pretrained("")
model = AutoModelForCTC.from_pretrained("")


# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


# AUDIO_FOLDER is scanned for input .wav files; OUTPUT_FOLDER receives the
# .txt transcriptions and is created if it does not exist yet.
AUDIO_FOLDER = ""  
OUTPUT_FOLDER = "" 
os.makedirs(OUTPUT_FOLDER, exist_ok=True)


# os.listdir() returns entries in arbitrary order, so sort them to make the
# processing order (and the failure report) reproducible. The extension is
# matched case-insensitively so that files named "*.WAV" are not skipped.
wav_files = sorted(f for f in os.listdir(AUDIO_FOLDER) if f.lower().endswith(".wav"))
# Names of the inputs whose transcription raised an exception.
failed_files = []


# Sample rate declared to the processor for every input. sf.read() returns the
# file's native rate, so anything else is resampled to this value first.
TARGET_SAMPLE_RATE = 16000

# Transcribe every file listed in wav_files with greedy CTC decoding and write
# one UTF-8 .txt per input into OUTPUT_FOLDER. An input that raises is recorded
# in failed_files and gets an empty .txt placeholder.
for file in tqdm(wav_files, desc="Transcribing", unit="file"):
    audio_path = os.path.join(AUDIO_FOLDER, file)
    txt_path = os.path.join(OUTPUT_FOLDER, os.path.splitext(file)[0] + ".txt")

    try:
        audio_input, sr = sf.read(audio_path)
        # Multi-channel audio arrives as a 2-D array; average over axis 1 to
        # obtain a single mono signal.
        if audio_input.ndim > 1:
            audio_input = audio_input.mean(axis=1)

        # Previously the processor was told 16000 regardless of the file's real
        # rate, so non-16 kHz recordings were silently misinterpreted.
        if sr != TARGET_SAMPLE_RATE:
            audio_input = librosa.resample(
                audio_input, orig_sr=sr, target_sr=TARGET_SAMPLE_RATE
            )

        inputs = processor(
            audio_input, return_tensors="pt", sampling_rate=TARGET_SAMPLE_RATE
        )
        input_values = inputs.input_values.to(device)

        with torch.no_grad():  # inference only, no autograd bookkeeping needed
            logits = model(input_values).logits

        # Greedy decoding: take the highest-scoring token id at each position.
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(transcription)

    except Exception as e:
        failed_files.append(file)
        # Print the reason first so it is not lost if the placeholder write
        # below fails as well.
        print(f"Failed: {file} — {e}")
        # The placeholder write can fail too (e.g. unwritable output folder);
        # keep that from aborting the rest of the batch.
        try:
            with open(txt_path, "w", encoding="utf-8") as f:
                f.write("")
        except OSError as write_error:
            print(f"Could not write placeholder for {file}: {write_error}")


# Final summary: either confirm that the whole batch went through, or print
# the names collected in failed_files, one per line.
if not failed_files:
    print("\nAll files processed successfully!")
else:
    print("\nThe following files failed to transcribe:")
    print("\n".join(failed_files))
