# Project — Part 3: WebRTC

Abhinav Kumar
12/7/2025

In [3]:
from pathlib import Path
from PIL import Image
import numpy as np
import cv2
import pytesseract
import requests
import textwrap

# Image file that the question-answering helper below OCRs on every call.
# NOTE(review): presumably overwritten by the WebRTC receiver with the most
# recent frame — confirm against the receiver code.
FRAME_PATH = Path("/workspaces/eng-ai-agents/project/frames/latest.png")

# Base URL of the Ollama HTTP server; "/api/generate" is appended when calling it.
OLLAMA_BASE_URL = "http://host.docker.internal:11434"
# Model identifier sent in the "model" field of each generate request.
MODEL_NAME = "qwen2.5:latest"


In [4]:
def call_qwen(system_prompt: str, user_prompt: str, *, timeout: float = 300.0) -> str:
    """Send one System/User prompt to Ollama's ``/api/generate`` and return the reply.

    Args:
        system_prompt: Text placed after the ``System:`` heading.
        user_prompt: Text placed after the ``User:`` heading.
        timeout: Seconds ``requests`` may wait for the connection and for
            each read before giving up. Without a timeout a stalled server
            would block the caller indefinitely.

    Returns:
        The ``response`` field of the JSON reply with surrounding whitespace
        removed, or an empty string when that field is missing or null.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.Timeout: No connection or data arrived within ``timeout``.
    """
    # Assemble the prompt from flat pieces instead of dedenting an f-string:
    # dedent runs *after* interpolation, so any multi-line prompt containing a
    # line at column 0 would cancel the dedent and leave the scaffold indented.
    prompt = f"\nSystem: {system_prompt}\n\nUser: {user_prompt}\n\nAssistant:\n"

    payload = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "stream": False,
    }

    resp = requests.post(
        f"{OLLAMA_BASE_URL}/api/generate",
        json=payload,
        timeout=timeout,
    )
    if not resp.ok:
        # Show the start of the error body before raising so the cause is
        # visible in the notebook output.
        print("Ollama error status:", resp.status_code)
        print("Ollama error body:", resp.text[:500])
        resp.raise_for_status()

    data = resp.json()
    # `or ""` also covers a JSON null in the "response" field.
    return (data.get("response") or "").strip()


In [5]:
def ocr_page_from_file(path: Path, *, scale: float = 1.5, lang: str = "eng") -> str:
    """Run Tesseract OCR on an image file after upscaling and binarizing it.

    Args:
        path: Location of the image to read.
        scale: Factor applied to both image dimensions before OCR. Must be
            positive. Defaults to 1.5.
        lang: Tesseract language code passed to ``image_to_string``.
            Defaults to ``"eng"``.

    Returns:
        The text returned by ``pytesseract.image_to_string``.

    Raises:
        ValueError: ``scale`` is not positive.
        FileNotFoundError: ``path`` does not exist.
    """
    if scale <= 0:
        raise ValueError(f"scale must be positive, got {scale}")
    if not path.exists():
        raise FileNotFoundError(f"No frame found at {path}")

    # The context manager releases the file handle as soon as the pixels
    # have been copied into the array.
    with Image.open(path) as frame:
        rgb = np.array(frame.convert("RGB"))

    height, width = rgb.shape[:2]
    # Enlarge before thresholding — presumably to help OCR on small text.
    enlarged = cv2.resize(
        rgb,
        (int(width * scale), int(height * scale)),
        interpolation=cv2.INTER_CUBIC,
    )

    # Convert straight from RGB; an intermediate BGR copy yields the same
    # grayscale values and is not needed.
    gray = cv2.cvtColor(enlarged, cv2.COLOR_RGB2GRAY)
    # With THRESH_OTSU the threshold value is chosen automatically, so the
    # 0 passed here is ignored.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU + cv2.THRESH_BINARY)

    return pytesseract.image_to_string(binary, lang=lang)


In [6]:
def answer_question_about_current_frame(question: str) -> str:
    """Answer a question about the frame stored at ``FRAME_PATH``.

    The frame is OCR'd with ``ocr_page_from_file`` and the extracted text is
    sent to the model through ``call_qwen`` together with the question.

    Args:
        question: What the user wants to know about the current frame.

    Returns:
        The model's reply, or an explanatory message when ``FRAME_PATH``
        does not exist.
    """
    if not FRAME_PATH.exists():
        return f"Frame file not found at {FRAME_PATH}. Is the WebRTC receiver saving frames?"

    page_text = ocr_page_from_file(FRAME_PATH)

    system_prompt = (
        "You are an AI tutor helping a student understand whatever is on the screen. "
        "You only use information that can reasonably be inferred from the OCR text."
    )

    # Dedent the static template first and fill it in afterwards. Dedenting
    # after interpolation fails as soon as the OCR text spans several lines:
    # its unindented lines make the common margin empty, so the template
    # would keep its indentation.
    template = textwrap.dedent("""
    Here is OCR text from a screenshot of the screen (likely a paper page):

    ---- OCR TEXT START ----
    {page_text}
    ---- OCR TEXT END ----

    Question: {question}
    """)
    user_prompt = template.format(page_text=page_text, question=question)

    return call_qwen(system_prompt, user_prompt)


In [7]:
# Demo cell: request a short summary of the latest captured frame and show it.
ans = answer_question_about_current_frame("Summarize this screen for me in 3–4 sentences.")
print(ans)


This paper introduces EfficientViT, a new vision model designed for efficient high-resolution dense prediction tasks. It uses multi-scale linear attention to achieve global receptive fields and multi-scale learning with lightweight operations, addressing the hardware inefficiency found in previous models like SegFormer and SegNeXt. EfficientViT demonstrates significant performance gains and speedups on various hardware platforms while maintaining or improving accuracy in tasks such as semantic segmentation, super-resolution, and instance segmentation.
