In [3]:
# Import necessary libraries
import os
import torch
import soundfile as sf
import re
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from indicate import transliterate  # Transliteration library for Hindi to Latin
from rapidfuzz import fuzz  # For string similarity matching
from metaphone import doublemetaphone  # For phonetic encoding

# Function to check if a word contains Devanagari (Hindi) characters
def is_devanagari(word):
    """Return True if *word* contains at least one Devanagari character.

    A character counts as Devanagari when its code point lies in the
    Unicode block U+0900..U+097F (inclusive).
    """
    return any('\u0900' <= ch <= '\u097F' for ch in word)

# Convert Devanagari script to Latin script (transliteration)
def transliterate_hindi_to_latin(text):
    """Romanize the Devanagari tokens of *text*, leaving other tokens as-is.

    The text is split on whitespace; each token containing Devanagari is
    passed to ``transliterate.hindi2english``. A token is kept unchanged
    when it has no Devanagari, when transliteration raises, or when the
    transliterated result is blank. Tokens are re-joined with single spaces.
    """
    def _romanize(token):
        # Non-Hindi tokens (digits, English words, punctuation) pass through.
        if not is_devanagari(token):
            return token
        try:
            romanized = transliterate.hindi2english(token)
            # An empty/whitespace-only result is treated as a failure.
            return romanized if romanized.strip() else token
        except Exception:
            # Best effort: fall back to the original token on any error.
            return token

    return ' '.join(_romanize(token) for token in text.split())

# Load audio file and ensure it is sampled at 16 kHz
def load_audio(audio_path):
    """Read an audio file and return it as a mono waveform sampled at 16 kHz.

    Args:
        audio_path: Path of the audio file, passed to ``soundfile.read``.

    Returns:
        The samples as a 1-D array. Multi-channel files are down-mixed by
        averaging the channels.

    Raises:
        ValueError: If the file's sampling rate is not 16000 Hz.
    """
    speech, sr = sf.read(audio_path)
    if sr != 16000:
        raise ValueError(f"Sampling rate must be 16kHz. Got {sr} for {audio_path}")
    # soundfile returns (frames, channels) for multi-channel audio; a 2-D
    # array handed to the Whisper processor would be read as a batch of
    # clips, so collapse the channel axis to get a single mono signal.
    if speech.ndim > 1:
        speech = speech.mean(axis=1)
    return speech

# Transcribe audio to Hindi text using Whisper model
def transcribe(audio_path, processor, model, device):
    """Transcribe one audio file and return the decoded text.

    The waveform from ``load_audio`` is turned into model inputs by
    *processor*, moved to *device*, passed to ``model.generate`` with
    gradients disabled, and the first decoded sequence is returned with
    special tokens stripped.
    """
    waveform = load_audio(audio_path)
    features = processor(waveform, sampling_rate=16000, return_tensors="pt")
    features = features.to(device)

    # Inference only: no gradient bookkeeping needed.
    with torch.no_grad():
        token_ids = model.generate(**features)

    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]

# Match keywords using both fuzzy string matching and phonetic similarity
def hybrid_keyword_match(transliterated_text, keyword_set, sim_threshold=85, phonetic=True):
    """Find keywords in text by fuzzy string similarity or phonetic equality.

    The text is lower-cased and split into ``\\w+`` tokens. For each keyword
    the tokens are scanned in order and the first token that satisfies
    either test is recorded:

    * ``fuzz.ratio(keyword, token) >= sim_threshold``  -> ``'string'`` match
    * equal, non-empty primary Double Metaphone codes  -> ``'phonetic'`` match

    Args:
        transliterated_text: Latin-script text to search.
        keyword_set: Iterable of keywords. Keywords are compared
            case-insensitively but reported under their original spelling.
        sim_threshold: Minimum ``fuzz.ratio`` score for a string match.
        phonetic: Whether to also accept phonetic matches.

    Returns:
        A dict mapping each matched keyword to a tuple
        ``(matched_word, similarity_score, match_type)``. Phonetic matches
        are reported with a score of 100. Keywords without a match are
        absent from the dict.
    """
    words = re.findall(r'\w+', transliterated_text.lower())

    # A token's phonetic code does not depend on the keyword, so compute it
    # once per distinct token instead of once per (keyword, token) pair.
    word_codes = {}
    if phonetic:
        word_codes = {word: doublemetaphone(word)[0] for word in set(words)}

    found = {}
    for kw in keyword_set:
        # Tokens were lower-cased above; normalize the keyword the same way
        # so that e.g. "Army" can still match "army" on string similarity.
        kw_norm = kw.lower()
        kw_phon = doublemetaphone(kw_norm)[0] if phonetic else None
        for word in words:
            score = fuzz.ratio(kw_norm, word)  # Fuzzy match score
            if score >= sim_threshold:
                found[kw] = (word, score, 'string')  # String similarity match
                break
            # An empty code (e.g. for digits) must never count as a match.
            if kw_phon and word_codes.get(word) == kw_phon:
                found[kw] = (word, 100, 'phonetic')  # Phonetic match
                break
    return found

# Main function to process all .wav files in a folder
def main(
    audio_folder="C:/Users/WORKSTATIONS/Desktop/BijoyashreeDas/storm_vad-20250602T113932Z-1-001/storm_vad",
    output_file="hindi_transcriptions.txt",
):
    """Transcribe every ``.wav`` file in a folder and report detected keywords.

    For each file the Hindi transcription, its Latin transliteration and the
    keywords found are printed and written to *output_file*. A failure on one
    file is reported and processing continues with the next file.

    Args:
        audio_folder: Directory containing the ``.wav`` files to process.
        output_file: Path of the UTF-8 text file that receives the results
            (overwritten if it exists).
    """
    # Predefined set of English keywords to detect
    keyword_set = {
        "report", "calling", "over", "guide", "army", "commandant",
        "wilko", "point", "checking", "vehicle", "namaste"
    }
    threshold = 85  # Minimum similarity score for a match

    # List the inputs first so that a wrong folder fails before the large
    # model is loaded. The extension test is case-insensitive so files
    # named "*.WAV" are not silently skipped.
    wav_files = sorted(
        name for name in os.listdir(audio_folder)
        if name.lower().endswith(".wav")
    )

    print("Loading Whisper model and processor...")
    processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3").to(device)
    model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="hindi", task="transcribe")
    print("Model and processor loaded.")

    # Open output file to save results
    with open(output_file, "w", encoding="utf-8") as out_f:
        for filename in wav_files:
            audio_path = os.path.join(audio_folder, filename)
            print(f"\n🎧 Processing: {filename}")
            out_f.write(f"\n🎧 File: {filename}\n")

            try:
                # Step 1: Transcribe audio to Hindi text
                hindi_text = transcribe(audio_path, processor, model, device)
                print(f"📝 Hindi transcription: {hindi_text}")
                out_f.write(f"📝 Hindi: {hindi_text}\n")

                # Step 2: Transliterate to Latin characters
                latin_text = transliterate_hindi_to_latin(hindi_text)
                print(f"🔤 Latin transliteration: {latin_text}")
                out_f.write(f"🔤 Latin: {latin_text}\n")

                # Step 3: Detect keywords via string similarity or phonetics
                found_keywords = hybrid_keyword_match(latin_text, keyword_set, threshold)
                if found_keywords:
                    # Sort by keyword: iteration order of a set of strings
                    # can differ between runs, which would make the report
                    # non-reproducible.
                    found_str = ', '.join(
                        f"{kw} ~ {match[0]} ({match[1]}%, {match[2]})"
                        for kw, match in sorted(found_keywords.items())
                    )
                else:
                    found_str = 'None'
                print(f"🔎 Keywords found (≥{threshold}% similarity or phonetic match): {found_str}")
                out_f.write(f"🔎 Keywords found (≥{threshold}% similarity or phonetic match): {found_str}\n")

            except Exception as e:
                # Per-file boundary: record the error and keep going.
                print(f"❌ Error processing {filename}: {e}")
                out_f.write(f"❌ Error: {e}\n")

    print(f"\n✅ Done! Transcriptions saved in {output_file}")

# Entry point: run the batch transcription only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()



TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

 The versions of TensorFlow you are currently using is 2.9.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


Loading Whisper model and processor...
Model and processor loaded.

🎧 Processing: enh_storm_vad163.100000MHz_30KHz_FM_17_Nov_2023_07_40_54_PM.wav
📝 Hindi transcription:  गलती मत करना है इसलिए 7 है, 4 कॉलिंग से रिपोर्ट आई किया 7 13 अब, 14 और 10 से लेना, 13 4 कॉलिंग 7, 4, 4 कॉलिंग 7 4 कॉलिंग 7, 4, 4 कॉलिंग 7 4 कॉलिंग 7, 4, 4 कॉलिंग 7 4 कॉलिंग 7, 4, 4 कॉलिंग 7 4, 4 कॉलिंग 7, 4, 4 कॉलिंग 7
🔤 Latin transliteration: galti mat karna ha isliye 7 ha 4 coling se reported aai kiya 7 13 ab 14 or 10 se lena 13 4 coling 7, 4, 4 coling 7 4 coling 7, 4, 4 coling 7 4 coling 7, 4, 4 coling 7 4 coling 7, 4, 4 coling 7 4, 4 coling 7, 4, 4 coling 7
🔎 Keywords found (≥85% similarity or phonetic match): report ~ reported (85.71428571428572%, string), calling ~ coling (100%, phonetic)

🎧 Processing: enh_storm_vad350.358000MHz_240KHz_FM_18_Nov_2023_09_10_46_AM.wav
📝 Hindi transcription:  टू वन, have you been able to manage one more guide over? चल जिसे बट, बाके के लिए चल जाओ वर चल जाओ जरी आप एनी वेहिकल, like mi