In [None]:
pip install fastapi uvicorn openai faster_whisper  pyngrok fpdf torch torchaudio transformers soundfile

In [None]:

from fastapi import FastAPI, Form, UploadFile, File, WebSocket, WebSocketDisconnect
from fastapi.responses import HTMLResponse, FileResponse, JSONResponse
from openai import OpenAI
from faster_whisper import WhisperModel               # 🆕 low‑latency decoder
from transformers import AutoTokenizer, VitsModel
from pyngrok import ngrok
from fpdf import FPDF
import soundfile as sf
import numpy as np
import torch, os, time, uvicorn, asyncio, tempfile, threading

# === MedGemma via Ollama ===
# Endpoint and model name can be overridden through environment variables so
# the service can be pointed at a different Ollama host without editing the
# source; the defaults are the values that were previously hard-coded.
OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL", "http://203.124.40.57:11434/v1")
VISION_MODEL = os.environ.get("VISION_MODEL", "puyangwang/medgemma-27b-it:q8")
# The OpenAI client insists on a non-empty api_key; "ollama" is a dummy value.
client = OpenAI(base_url=OLLAMA_BASE_URL, api_key="ollama")

# === STT: faster‑whisper (Large‑v3) ===
STT_MODEL_SIZE = "large-v3"   # or "medium" for < 8 GB GPUs
# Fall back to CPU when no CUDA device is visible instead of crashing while
# loading the models. float16 is a GPU compute type; int8 is used on CPU.
_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
_STT_COMPUTE_TYPE = "float16" if _DEVICE == "cuda" else "int8"
stt_model = WhisperModel(STT_MODEL_SIZE, device=_DEVICE, compute_type=_STT_COMPUTE_TYPE)

# === TTS: Urdu VITS ===
tts_model_id = "hamzamunir/mms-tts-urdu-vits-finetuned-aliakbar"
tts_tokenizer = AutoTokenizer.from_pretrained(tts_model_id)
# Keep the TTS model on the same device that was selected for STT.
tts_model     = VitsModel.from_pretrained(tts_model_id).to(_DEVICE)

# === Ngrok ===
# Prefer a token supplied through the environment so the secret never has to
# be written into the notebook; the placeholder stays as the fallback for
# users who edit the value by hand.
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN", "YOUR_NGROK_TOKEN_HERE")
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Kill all existing tunnels
ngrok.kill()

# Start a fresh tunnel to the port the API server listens on
public_url = ngrok.connect(8000)
print(f"🌐 Public URL: {public_url}")

app = FastAPI(title="Medical AI Diagnosis – Low‑Latency")

# === Load UI ===
# The page template is read once at import time and reused by the HTTP handlers.
INDEX_HTML_PATH = "/kaggle/input/oooooooook/index.html"
with open(INDEX_HTML_PATH, mode="r", encoding="utf-8") as index_file:
    INDEX_HTML = index_file.read()

# === System prompts for MedGemma ===
# Each entry is sent as its own "system" message, in this order, ahead of the
# user's prompt (see medgemma_chat). Together they describe the intake-assistant
# persona, the questions to ask, and the FHIR JSON summary expected at the end.
system_prompts = [
    "You are a friendly, Urdu/English-speaking AI medical assistant trained to perform initial patient intake.",
    "You will greet the patient, ask standard intake questions (e.g., name, age, symptoms, allergies, medications, history), and confirm responses.",
    "Once enough data is collected, summarize the encounter in FHIR format using a QuestionnaireResponse or a Bundle with Patient, Condition, and Observation resources.",
    "Always include: Full name, Gender, Age or birthdate, Reason for visit, Symptoms with duration, Allergies and medications (if mentioned).",
    "Output the FHIR JSON at the end, with one resource per section.",
    "Respond in a polite, conversational tone."
]

@app.get("/", response_class=HTMLResponse)
async def serve_index():
    """Serve the UI template held in ``INDEX_HTML`` as an HTML page."""
    return HTMLResponse(content=INDEX_HTML)

@app.post("/", response_class=HTMLResponse)
async def medgemma_chat(prompt: str = Form(...)):
    """Send the submitted prompt to MedGemma and render the reply into the page.

    Args:
        prompt: Free-text message posted from the HTML form.

    Returns:
        An ``HTMLResponse`` built from ``INDEX_HTML`` with the prompt, the
        model's answer and the elapsed time (seconds, 2 decimals) substituted
        for their placeholders.

    Raises:
        Whatever the OpenAI client raises when the model call fails; the error
        is not caught here.
    """
    import html  # function-scope imports: only this handler needs them
    import re

    msgs = [{"role": "system", "content": p} for p in system_prompts]
    msgs.append({"role": "user", "content": prompt})

    def _ask_model():
        return client.chat.completions.create(
            model=VISION_MODEL, messages=msgs, max_tokens=1024, temperature=0.2
        )

    start = time.time()
    # The client call is synchronous; run it in a worker thread so the event
    # loop (and the audio WebSocket) keeps being serviced while the model thinks.
    res = await asyncio.get_running_loop().run_in_executor(None, _ask_model)
    duration = round(time.time() - start, 2)
    # `content` may be None (e.g. an empty completion); treat that as "".
    answer = (res.choices[0].message.content or "").strip()

    # Both the prompt and the model output are untrusted text: escape them
    # before they are placed into the HTML page.
    substitutions = {
        "{{ prompt or '' }}": html.escape(prompt),
        "{{ response }}": html.escape(answer),
        "{{ processing_time }}": str(duration),
    }
    # Substitute in a single pass so placeholder-looking text inside the prompt
    # or the answer is never re-expanded by a later replacement.
    placeholder_re = re.compile("|".join(re.escape(key) for key in substitutions))
    page = placeholder_re.sub(lambda match: substitutions[match.group(0)], INDEX_HTML)
    return HTMLResponse(page)

# === traditional /stt, /tts, /generate_pdf endpoints remain unchanged (kept for fallback) ===

# ---------------------------------------------------------------------------
# 🖥️  REAL‑TIME STT OVER WEBSOCKET – ultra‑low latency
# ---------------------------------------------------------------------------

# Whisper expects 16 kHz audio.
SAMPLE_RATE = 16_000
# Length of one decode window, in seconds.
BUFFER_DURATION = 1.0
# Number of samples in one decode window.
BUFFER_SAMPLES = int(SAMPLE_RATE * BUFFER_DURATION)

@app.websocket("/ws/audio")
async def ws_audio(websocket: WebSocket):
    """Stream speech-to-text: receive raw PCM, reply with transcripts.

    The client sends binary frames of int16 PCM samples. Samples are
    accumulated and, for every ``BUFFER_SAMPLES`` collected, one window is
    transcribed (language ``"ur"``) and any non-empty text is sent back as a
    text frame. Windows are transcribed and answered in arrival order.

    Args:
        websocket: The connection to serve; it is accepted here and handled
            until the client disconnects.
    """
    await websocket.accept()
    ring_buffer = np.zeros(0, dtype=np.float32)
    leftover = b""                 # dangling byte when a frame splits an int16 sample
    loop = asyncio.get_running_loop()
    decode_lock = asyncio.Lock()   # one decode at a time -> transcripts stay in order
    pending = set()                # strong refs so tasks are not garbage-collected

    def transcribe(audio_np: np.ndarray) -> str:
        """Blocking decode; runs in a worker thread, never on the event loop."""
        segments, _ = stt_model.transcribe(audio_np, language="ur", vad_filter=False, beam_size=1)
        # Iterating the segments is part of the blocking work, so join them here.
        return "".join(seg.text for seg in segments).strip()

    async def decode_and_respond(audio_np: np.ndarray):
        """Transcribe one window off the event loop and send the text back."""
        async with decode_lock:
            transcript = await loop.run_in_executor(None, transcribe, audio_np)
            if not transcript:
                return
            try:
                await websocket.send_text(transcript)
            except (WebSocketDisconnect, RuntimeError):
                # Client went away while we were decoding; nothing to deliver.
                pass

    try:
        while True:
            packet = leftover + await websocket.receive_bytes()   # raw PCM int16
            # np.frombuffer rejects a byte count that is not a multiple of 2;
            # carry an odd trailing byte over to the next frame instead.
            usable = len(packet) - (len(packet) % 2)
            leftover = packet[usable:]
            chunk = np.frombuffer(packet[:usable], dtype=np.int16).astype(np.float32) / 32768.0
            ring_buffer = np.concatenate((ring_buffer, chunk))

            # Drain every complete window, not just the first, so a large
            # frame cannot leave a growing backlog in the buffer.
            while ring_buffer.shape[0] >= BUFFER_SAMPLES:
                to_decode = ring_buffer[:BUFFER_SAMPLES]
                ring_buffer = ring_buffer[BUFFER_SAMPLES:]
                task = asyncio.create_task(decode_and_respond(to_decode))
                pending.add(task)
                task.add_done_callback(pending.discard)
    except WebSocketDisconnect:
        pass
    finally:
        # Stop work that can no longer be delivered to this client.
        for task in tuple(pending):
            task.cancel()

# === Run ===

def _run():
    """Run the uvicorn server for ``app`` on 0.0.0.0:8000 (blocks the calling thread)."""
    server_options = {"host": "0.0.0.0", "port": 8000, "log_level": "info"}
    uvicorn.run(app, **server_options)


# Start the server on a daemon thread so this cell returns immediately.
threading.Thread(
    daemon=True,
    target=_run,
).start()

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'pyngrok'