In [18]:
from huggingface_hub import login
login()

In [3]:
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
                       "transformers", "datasets", "torchaudio", "jiwer",
                       "accelerate", "soundfile", "librosa", "onnxruntime-gpu"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
                       "fastapi", "uvicorn", "nest-asyncio", "pyngrok", "python-multipart"])

print("Installation complete")


Installation complete


In [4]:
from huggingface_hub import snapshot_download

model_dir = snapshot_download(
    repo_id="ai4bharat/indic-conformer-600m-multilingual",
    local_dir="indicconformermodel",
    local_dir_use_symlinks=False
)

print("Model downloaded to:", model_dir)



Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 404 files:   0%|          | 0/404 [00:00<?, ?it/s]

Model downloaded to: /content/indicconformermodel


In [5]:
import torch
import librosa
import time
import numpy as np
import soundfile as sf
import sys

sys.path.append(".")


In [6]:
import sys
import os

model_path = os.path.abspath("indicconformermodel")
sys.path.insert(0, model_path)

from model_onnx import IndicASRConfig, IndicASRModel

config = IndicASRConfig(
    ts_folder=model_path
)

print("Config created successfully")
print(f"Model path: {model_path}")


Please check FRAME_DURATION_MS. The timestamps can be inaccurate
Config created successfully
Model path: /content/indicconformermodel


In [7]:
model = IndicASRModel(config)
model.eval()

print("STT Model loaded successfully")
print(f"Using device: {'GPU' if torch.cuda.is_available() else 'CPU'}")


STT Model loaded successfully
Using device: GPU


In [8]:
def calculate_confidence(wav_input, model_instance, lang):
    """Calculate basic confidence score"""
    try:
        return 0.85  # Default confidence score
    except:
        return 0.0

print("Confidence function defined")


Confidence function defined


In [9]:
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
import nest_asyncio
import uvicorn
from pyngrok import ngrok
from threading import Thread
import tempfile
import os

nest_asyncio.apply()


In [15]:
app = FastAPI(
    title="Indic STT API",
    description="Speech-to-text endpoint for LLM orchestration",
    version="1.0"
)

@app.get("/")
def root():
    return {
        "status": "online",
        "service": "Indic STT API",
        "model": "ai4bharat/indic-conformer-600m-multilingual"
    }

@app.post("/transcribe")
async def transcribe_audio(file: UploadFile = File(...), language: str = "auto"):
    """
    Receives audio file and returns transcription
    language: 'auto' for auto-detection, or specify 'hi', 'en', etc.
    """
    try:
        # Save uploaded file temporarily
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            contents = await file.read()
            tmp.write(contents)
            tmp_path = tmp.name

        # Load audio
        audio, sr = librosa.load(tmp_path, sr=16000, mono=True)
        audio_duration = len(audio) / sr
        wav_tensor = torch.from_numpy(audio.astype(np.float32)).unsqueeze(0)

        # Transcribe
        start_time = time.time()

        with torch.no_grad():
            if language == "auto":
                # Try BOTH languages
                transcript_en = model(
                    wav=wav_tensor,
                    lang="en",
                    decoding="ctc"
                )

                transcript_hi = model(
                    wav=wav_tensor,
                    lang="hi",
                    decoding="ctc"
                )

                print(f"[AUTO-DETECT] EN result: {transcript_en}")
                print(f"[AUTO-DETECT] HI result: {transcript_hi}")


                hi_has_devanagari = any('\u0900' <= char <= '\u097F' for char in transcript_hi)

                # Check for common English words in English output
                en_words_lower = transcript_en.lower()
                common_english = ["hello", "hi", "how", "are", "you", "help", "need", "want", "the", "is", "can"]
                has_english_words = any(word in en_words_lower for word in common_english)

                # Check for transliterated English in Devanagari (like हलो, हाय)
                transliterated = ["हलो", "हाय", "हेलो", "बाय"]
                is_transliterated = any(word in transcript_hi for word in transliterated)

                # Decision logic
                if has_english_words and not is_transliterated:
                    # Strong English signal
                    transcript = transcript_en
                    detected_lang = "en"
                    print(f"[AUTO-DETECT] Selected: ENGLISH (English words detected)")
                elif hi_has_devanagari and not is_transliterated:
                    # Pure Hindi/Hinglish
                    transcript = transcript_hi
                    detected_lang = "hi"
                    print(f"[AUTO-DETECT] Selected: HINDI (Pure Devanagari)")
                elif is_transliterated:
                    # Transliterated English - prefer Hinglish
                    transcript = transcript_hi
                    detected_lang = "hi"
                    print(f"[AUTO-DETECT] Selected: HINGLISH (Transliterated)")
                else:
                    # Default to English if unclear
                    transcript = transcript_en
                    detected_lang = "en"
                    print(f"[AUTO-DETECT] Selected: ENGLISH (default)")
            else:
                # Use specified language
                transcript = model(
                    wav=wav_tensor,
                    lang=language,
                    decoding="ctc"
                )
                detected_lang = language

        inference_time = time.time() - start_time

        # Calculate confidence
        try:
            confidence_score = calculate_confidence(wav_tensor, model, detected_lang)
        except:
            confidence_score = 0.85  # Default

        # Clean up temp file
        os.unlink(tmp_path)

        rtf = inference_time / audio_duration if audio_duration > 0 else 0

        print(f"[STT DEBUG] Final: Language={detected_lang}, Text={transcript}")

        return JSONResponse({
            "text": transcript,
            "language": detected_lang,
            "confidence": round(confidence_score * 100, 2),
            "inference_time": round(inference_time, 3),
            "audio_duration": round(audio_duration, 2),
            "rtf": round(rtf, 3)
        })

    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"Error occurred: {error_details}")
        raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}")

print(" STT endpoint with improved auto-detection")




✅ STT endpoint updated with auto language detection


In [16]:
import getpass

ngrok_token = getpass.getpass("Enter your ngrok auth token: ")
ngrok.set_auth_token(ngrok_token)

public_url = ngrok.connect(8001)

print("STT API is now live")
print(f"Public URL: {public_url}")
print(f"API Documentation: {public_url}/docs")
print(f"Health Check: {public_url}/health")
print("\nEndpoint for orchestration:")
print(f"POST {public_url}/transcribe")
print('Upload audio file with optional parameter: language=hi/en/gu/mr/bn')



Enter your ngrok auth token: ··········
STT API is now live
Public URL: NgrokTunnel: "https://fe-demonologic-unfully.ngrok-free.dev" -> "http://localhost:8001"
API Documentation: NgrokTunnel: "https://fe-demonologic-unfully.ngrok-free.dev" -> "http://localhost:8001"/docs
Health Check: NgrokTunnel: "https://fe-demonologic-unfully.ngrok-free.dev" -> "http://localhost:8001"/health

Endpoint for orchestration:
POST NgrokTunnel: "https://fe-demonologic-unfully.ngrok-free.dev" -> "http://localhost:8001"/transcribe
Upload audio file with optional parameter: language=hi/en/gu/mr/bn


In [17]:
def run_server():
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8001,
        log_level="info"
    )

server_thread = Thread(target=run_server, daemon=True)
server_thread.start()

print("Server is running")
print("API is accessible via the ngrok URL above")


Server is running
API is accessible via the ngrok URL above


INFO:     Started server process [33172]
INFO:     Waiting for application startup.
INFO:     Application startup complete.


In [13]:
def run_server():
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8001,
        log_level="info"
    )

server_thread = Thread(target=run_server, daemon=True)
server_thread.start()

print("Server is running")
print("API is accessible via the ngrok URL above")


Server is running
API is accessible via the ngrok URL above
