# Install requirements

In [None]:
!pip install -U openai-whisper
!pip install --upgrade datasets[audio] transformers accelerate evaluate jiwer tensorboard gradio
!pip install torchaudio
!pip install noisereduce librosa soundfile
!pip install pyannote.audio


In [3]:
from torch.nn.attention import SDPBackend, sdpa_kernel
from IPython.display import Audio
from collections import defaultdict
from pyannote.core import Segment
import noisereduce as nr
import soundfile as sf
from tqdm import tqdm
import pandas as pd
import numpy as np
import torchaudio
import librosa
import torch
import time


In [None]:
import os

from huggingface_hub import login

# Do not paste an access token into the notebook: it ends up in version control
# and in shared copies. Export HF_TOKEN before starting the kernel instead.
# When the variable is unset, token=None makes `login` prompt for the token.
login(token=os.environ.get("HF_TOKEN"))

# Reduce Noise on Audio

In [None]:

# Recording processed in this section.
file_name = "sample.mp3"

# sr=None keeps the file's native sampling rate instead of resampling on load.
data, srate = librosa.load(file_name, sr=None)

# Denoise the signal with noisereduce, then persist the cleaned signal at the
# same sampling rate.
reduced_noise = nr.reduce_noise(y=data, sr=srate)
sf.write("denoised.wav", reduced_noise, srate)

In [None]:
# Inline player for the denoised recording ("denoised.wav").
Audio("denoised.wav")

# Speaker Diarization

In [None]:
from pyannote.audio import Pipeline
from pyannote.audio.pipelines.utils.hook import ProgressHook

import os

# Deliberately NOT called `pipeline`: that name is also the transformers factory
# imported further down, and sharing it makes re-running either cell break the
# other one.
diarization_pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    # Read the token from the environment instead of hardcoding it. When HF_TOKEN
    # is unset this is None and the hub client is left to use its stored login.
    use_auth_token=os.environ.get("HF_TOKEN"),
)

# Prefer the GPU, but keep the cell runnable on CPU-only machines.
diarization_pipeline.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# Run the pipeline on an in-memory waveform.
# NOTE(review): this loads `file_name` (the original recording), not denoised.wav.
waveform, sample_rate = torchaudio.load(file_name)

with ProgressHook() as hook:
    diarization = diarization_pipeline(
        {"waveform": waveform, "sample_rate": sample_rate},
        hook=hook,
    )

# Dump the diarization output to disk using RTTM format.
with open("audio.rttm", "w") as rttm:
    diarization.write_rttm(rttm)

In [None]:
def format_seconds(seconds):
    """Format a duration in seconds as ``M:SS``.

    The value is truncated to an integer first; minutes are not capped at 59,
    so an hour is rendered as ``60:00``.
    """
    whole_minutes, leftover = divmod(int(seconds), 60)
    return f"{whole_minutes}:{leftover:02d}"

# List every speaker turn with its start/stop time and speaker label.
for turn, _, speaker in diarization.itertracks(yield_label=True):
    began = format_seconds(turn.start)
    ended = format_seconds(turn.end)
    print(f"start={began}s stop={ended}s speaker_{speaker}")


In [None]:
# Group the audio of every diarized turn by speaker label.
# Only the first channel of `waveform` is used.
speaker_segments = defaultdict(list)
for segment, _, speaker in diarization.itertracks(yield_label=True):
    first = int(segment.start * sample_rate)
    last = int(segment.end * sample_rate)
    speaker_segments[speaker].append(waveform[0][first:last])

# Write one "<speaker>.wav" per speaker holding all of that speaker's turns
# back to back, stored as float32 samples.
for speaker, segments in speaker_segments.items():
    full_speech = np.concatenate(segments).astype(np.float32)
    sf.write(f"{speaker}.wav", full_speech, sample_rate)


In [None]:
# Inline player for "SPEAKER_00.wav", one of the per-speaker "<speaker>.wav" files.
Audio("SPEAKER_00.wav")

# Test Multiple ASR models

In [4]:
from transformers import (
      AutoTokenizer,
      AutoModelForCausalLM,
      pipeline,
      AutoModelForSpeechSeq2Seq,
      AutoProcessor,
      Wav2Vec2ForCTC,
      Wav2Vec2Processor,
)
from datasets import (
      load_dataset,
      Dataset,
)

## 1. [First ASR Model](https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-persian-v3)

A Fine-Tuned wav2vec on Persian


In [None]:

# Checkpoint of the Persian wav2vec2 model used by the helpers below.
model_name_or_path = "m3hrdadfi/wav2vec2-large-xlsr-persian-v3"

# Run on the GPU when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# The processor and the CTC model are loaded from the same checkpoint.
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
model = Wav2Vec2ForCTC.from_pretrained(model_name_or_path)
model = model.to(device)


def speech_file_to_array_fn(batch):
    """Load one audio file and resample it to the processor's sampling rate.

    Args:
        batch: a single dataset row; ``batch["path"]`` is the audio file to read.

    Returns:
        The same row with ``batch["speech"]`` set to a 1-D mono waveform sampled
        at ``processor.feature_extractor.sampling_rate``.
    """
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    # torchaudio yields (channels, frames). Averaging over the channel axis gives
    # a 1-D signal for mono AND multi-channel files; the former `squeeze()` left
    # stereo input 2-D, which the processor cannot consume as one utterance.
    speech_array = speech_array.mean(dim=0).numpy()
    speech_array = librosa.resample(
        y=speech_array,
        orig_sr=sampling_rate,
        target_sr=processor.feature_extractor.sampling_rate,
    )

    batch["speech"] = speech_array
    return batch


def predict(batch):
    """Transcribe a batch of waveforms with the wav2vec2 CTC model.

    Reads ``batch["speech"]``, pads the utterances to a common length, runs the
    model without gradient tracking, and stores the greedy (arg-max) decoding of
    the logits in ``batch["predicted"]``. Returns the updated batch.
    """
    target_rate = processor.feature_extractor.sampling_rate
    encoded = processor(
        batch["speech"],
        sampling_rate=target_rate,
        return_tensors="pt",
        padding=True,
    )

    with torch.no_grad():
        outputs = model(
            encoded.input_values.to(device),
            attention_mask=encoded.attention_mask.to(device),
        )

    # Greedy decoding: take the most likely token id at every frame.
    best_ids = outputs.logits.argmax(dim=-1)
    batch["predicted"] = processor.batch_decode(best_ids)
    return batch


In [None]:

# Recordings to transcribe. To process a whole folder, build this list from the
# directory listing instead of naming a single file.
audio_files = ["sample.mp3"]

# One row per file: decode each recording to a waveform first, then transcribe
# the rows in batches of four.
dataset = Dataset.from_dict({"path": audio_files}).map(speech_file_to_array_fn)
result = dataset.map(predict, batched=True, batch_size=4)
asr_output = result["predicted"]

In [None]:
# Display the current value of `asr_output`.
asr_output

## 2. [Second ASR Model](https://huggingface.co/ghofrani/xls-r-1b-fa-cv8)
A Fine-Tuned wav2vec on Persian


In [None]:
# Speech-recognition pipeline for the XLS-R 1B checkpoint named below.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="ghofrani/xls-r-1b-fa-cv8",
)

In [None]:
# Transcribe the sample recording and keep only the recognised text.
transcription = pipe("sample.mp3")
asr_output = transcription["text"]
asr_output

## 3. [Third ASR Model](https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-persian)
A Fine-Tuned wav2vec on Persian


In [None]:
# Speech-recognition pipeline for the XLSR-53 Persian checkpoint named below.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="jonatasgrosman/wav2vec2-large-xlsr-53-persian",
)

In [None]:
# Transcribe the sample recording and keep only the recognised text.
transcription = pipe("sample.mp3")
asr_output = transcription["text"]
asr_output

## 4. [Fourth ASR Model](https://github.com/SYSTRAN/faster-whisper)

faster whisper model

In [None]:
!pip install faster-whisper

In [None]:
from faster_whisper import WhisperModel

model_size = "turbo"

# Pick the runtime from the available hardware so the cell also works without a
# GPU (the other model cells in this notebook fall back to CPU the same way).
if torch.cuda.is_available():
    # Run on GPU with FP16.
    # Alternative with a smaller memory footprint: compute_type="int8_float16".
    model = WhisperModel(model_size, device="cuda", compute_type="float16")
else:
    # Run on CPU with INT8.
    model = WhisperModel(model_size, device="cpu", compute_type="int8")


In [None]:
st = time.time()
# The language is auto-detected here; `language="fa"` (and e.g. `beam_size=5`)
# can be passed to `transcribe` to pin it instead.
segments, _ = model.transcribe(
    "sample.mp3",
    vad_filter=True,
    vad_parameters=dict(min_silence_duration_ms=500),
)

# `segments` is a lazy generator: decoding only happens while it is iterated.
# The timer is therefore read AFTER the loop; reading it right after
# `transcribe()` returned would miss the decoding time entirely.
pieces = []
for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
    pieces.append(segment.text)
print("es time:", time.time() - st)

# Same text as before (each segment prefixed by one space), built in one pass.
text = "".join(" " + piece for piece in pieces)

asr_output = text

## 5. [Fifth ASR Model](https://huggingface.co/openai/whisper-large-v3-turbo)

Whisper Large v3 Turbo with SDPA Attention

In [None]:
torch.set_float32_matmul_precision("high")

# Half precision on the first GPU when CUDA is available, float32 on CPU otherwise.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

model_id = "openai/whisper-large-v3-turbo"

# Load the checkpoint with the SDPA attention implementation.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
    attn_implementation="sdpa",
)
model = model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

# Generation defaults: Persian transcription with timestamps, and no forced
# decoder ids.
generation_defaults = {
    "return_timestamps": True,
    "language": 'fa',
    "task": 'transcribe',
    "forced_decoder_ids": None,
}
for option, value in generation_defaults.items():
    setattr(model.generation_config, option, value)

# The audio is processed in 22-second chunks, up to 64 chunks per batch.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
    batch_size=64,
    chunk_length_s=22,
)

In [7]:
# Load at the native rate, then resample to the rate the feature extractor expects.
target_rate = processor.feature_extractor.sampling_rate
sample, sr = librosa.load("sample.mp3", sr=None)
sample = librosa.resample(y=sample, orig_sr=sr, target_sr=target_rate)

In [None]:
# Time one full pass of the Whisper pipeline over the resampled waveform.
# (The call can be wrapped in `with sdpa_kernel(SDPBackend.MATH):`.)
start = time.time()
whisper_result = pipe(sample)
end = time.time()

asr_output = whisper_result['text']
print("Estimated time: ", end - start)
asr_output

# Post-Processing ASR Output Using an LLM

In [11]:

def correct_persian_text(misspelled_text, model_id="google/gemma-3n-e4b-it", max_new_tokens=100, device=None):
    """Ask an instruction-tuned LLM to fix spelling mistakes in Persian text.

    Args:
        misspelled_text: the text to correct (e.g. an ASR transcript).
        model_id: checkpoint passed to the text-generation pipeline.
        max_new_tokens: generation budget; raise it for long transcripts, since
            the correction is cut off once the budget is exhausted.
        device: pipeline device index (0 = first GPU, -1 = CPU). ``None`` selects
            the GPU when CUDA is available and the CPU otherwise.

    Returns:
        ``(corrected_part, result)`` where ``result`` is the raw generated text
        and ``corrected_part`` is ``result`` with the prompt removed and
        surrounding whitespace stripped.

    Note:
        The model is loaded on every call.
    """
    if device is None:
        # A hardcoded device=0 fails on machines without a GPU.
        device = 0 if torch.cuda.is_available() else -1

    # Load the model (assumes you have access or downloaded it locally)
    corrector = pipeline("text-generation", model=model_id, device=device)

    # Prepare the prompt
    prompt = f"""متن زیر شامل برخی کلمات با غلط املایی است. لطفاً نسخهٔ اصلاح‌شدهٔ آن را ارائه بده و تنها متن تصحیح‌شده را بدون هیچ توضیحی برگردان.

              متن: «{misspelled_text}»

              """

    # Generate the correction deterministically (greedy decoding).
    result = corrector(prompt, max_new_tokens=max_new_tokens, do_sample=False)[0]["generated_text"]

    # Removing the prompt leaves only the continuation.
    corrected_part = result.replace(prompt, "").strip()
    return corrected_part, result



In [None]:
# Feed the current `asr_output` to the LLM-based spelling corrector.
text = asr_output
# Commented-out hard-coded sample transcript that can be assigned to `text` instead:
# text = " سلام و درود من میخوام سیرویس و اینترنت هم از ایدی ایس ایل تبدیل به افتیتیه کنم بعد به مخابرات هم زنگ زدم تماس گرفتم گفتن که قرار شده بررسی کنن و خبر بدن که آیا منطقه ما تحت پوشش هست کنن کود پوستی ها که دادیم به سایت گفت تحت پوشش نیست ولی مخابرات قرار شد خودش از روی آدرس بدیری بکنه و خب هنجا بایمون قبر ندارن اگر که جواب مصبته که من حزینه رو باریس بکنم"
crrt_txt, output = correct_persian_text(text)
# Show both the extracted correction and the raw generated text.
crrt_txt, output

# Intent Classification

## 1. Classify with BERT

In [None]:
# 1. Load the zero-shot classification pipeline.
#    The pipeline ranks labels via natural-language inference, so it needs a
#    checkpoint fine-tuned on NLI. Plain `bert-base-uncased` has no trained
#    classification head (and is English-only), which makes its scores arbitrary.
#    XLM-RoBERTa fine-tuned on XNLI is a multilingual, BERT-family NLI model.
classifier = pipeline(
    "zero-shot-classification",
    model="joeddav/xlm-roberta-large-xnli"
)

# 2. Define your text and the categories you want to test
sequence_to_classify = "شرکت اپل از جدیدترین مدل آیفون رونمایی کرد"
candidate_labels = ["ورزشی", "سیاسی", "فناوری", "علمی"]

# 3. Run the classifier and print its result
result = classifier(sequence_to_classify, candidate_labels)
print(result)

## 2. Classify with an LLM

In [14]:


def classify_intent_persian(text, categories, model_id="google/gemma-3n-e4b-it", max_new_tokens=8):
    """Classify the intent of a Persian utterance with a few-shot LLM prompt.

    Args:
        text: the user utterance to classify.
        categories: iterable of intent labels listed in the prompt.
        model_id: causal-LM checkpoint to load.
        max_new_tokens: generation budget for the predicted label.

    Returns:
        The first line generated after the final "نیت:" of the prompt, with
        surrounding whitespace removed (an empty string if nothing was generated).

    Note:
        The tokenizer and model are loaded on every call.
    """
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

    # Build Persian prompt
    category_list = "\n- " + "\n- ".join(categories)
    prompt = f"""
          شما یک دستیار هوش مصنوعی هستید که نیت جمله‌های کاربران را بر اساس دسته‌بندی‌های زیر مشخص می‌کند:
          {category_list}

          کاربر: می‌خواستم بدونم چطور می‌تونم اشتراکم رو لغو کنم؟
          نیت: درخواست اطلاعات

          کاربر: {text}
          نیت:
          """

    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
    # Greedy decoding; `temperature` only applies when sampling, so it is omitted.
    # return_full_text=False yields just the continuation, so the answer is read
    # from directly after the prompt. Splitting the full text on the LAST "نیت:"
    # could instead pick up a further dialogue turn invented by the model.
    continuation = generator(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        return_full_text=False,
    )[0]["generated_text"]

    # The predicted intent is the first non-blank line of the continuation.
    intent_line = continuation.strip().split("\n")[0].strip()
    return intent_line


In [None]:
# Candidate intents and the utterance to classify.
categories = ["درخواست اطلاعات", "شکایت", "احوال‌پرسی", "خداحافظی"]
text = "سلام، یه سوال داشتم درباره‌ی تمدید اشتراک"

# Classify with the default model and token budget, then report the result.
intent = classify_intent_persian(text=text, categories=categories)
print("نیت شناسایی‌شده:", intent)


# Text to Speech (TTS)

In [None]:
!pip install edge-tts

In [None]:
# Run the edge-tts command-line tool with --list-voices.
!edge-tts --list-voices

In [20]:
# Synthesize the sentence with the fa-IR-FaridNeural voice: audio is written to
# output.mp3 and subtitles to hello_in_arabic.srt.
# NOTE(review): the subtitle file name says "arabic" although the voice is fa-IR;
# consider renaming it.
!edge-tts --voice  fa-IR-FaridNeural --text "مشترک گرامی به شماره 245,698 جهت خرید اشتراک باید هزینه ۱۰۰۰ تومان را واریز کنید" --write-media output.mp3 --write-subtitles hello_in_arabic.srt


In [None]:
# Inline player for output.mp3 with autoplay enabled.
Audio('output.mp3', autoplay=True)