# In [1]:
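"""
Live ASL → Google Search via MediaPipe + ResNet-18 + Selenium

Tracks one hand from the webcam with MediaPipe, classifies the cropped hand
image with a ResNet-18 loaded from a checkpoint, and types each stably
recognized letter into the Google search box through Selenium.
Press `q` in the preview window to quit.

Requirements:
    pip install torch torchvision opencv-python pillow mediapipe selenium
    Download ChromeDriver from https://chromedriver.chromium.org/downloads
    and put it on your PATH so that `webdriver.Chrome()` works.
    The checkpoint file `best_asl_resnet_checkpoint_5.pth` must be in the
    current working directory.
"""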

import cv2
import time
import torch
import torch.nn as nn
import mediapipe as mp
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from torchvision import transforms
from torchvision.models import resnet18, ResNet18_Weights

# ----------------------------
# Selenium Configuration
# ----------------------------
# Page opened in the browser at start-up.
URL: str = "https://www.google.com"
# `name` attribute of Google's search box, used to locate the element.
INPUT_NAME: str = "q"
# Timeout passed to WebDriverWait while waiting for the search box.
WAIT_TIMEOUT: int = 30

# ----------------------------
# ASL Model Configuration
# ----------------------------
# Path of the trained ASL classifier checkpoint (relative to the working dir).
CKPT_PATH: str = r"best_asl_resnet_checkpoint_5.pth"
# Run inference on the GPU when CUDA is available, otherwise on the CPU.
DEVICE: torch.device = torch.device(
    "cuda" if torch.cuda.is_available() else "cpu"
)
# Number of consecutive identical predictions required before a key is sent.
STABILITY_THRESHOLD: int = 15
# Size the hand crop is resized to before it is fed to the network.
INPUT_SIZE: tuple = (224, 224)

# ----------------------------
# 1) Initialize Selenium WebDriver & accept consent if needed
# ----------------------------
driver = webdriver.Chrome()
try:
    driver.maximize_window()
    driver.get(URL)

    # 1a) Accept Google's consent dialog if presented.
    # Only the wait itself is inside the try, so a TimeoutException means
    # exactly "no consent button showed up within 5 seconds".
    try:
        consent_btn = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((
                By.XPATH,
                "//button[.//div[contains(text(),'I agree') or contains(text(),'Accept all')]]"
            ))
        )
    except TimeoutException:
        pass  # no consent dialog
    else:
        consent_btn.click()
        print("✅ Consent dialog accepted")

    # 1b) Wait for the search box to appear
    try:
        input_el = WebDriverWait(driver, WAIT_TIMEOUT).until(
            EC.visibility_of_element_located((By.NAME, INPUT_NAME))
        )
    except TimeoutException:
        print("❌ Could not find the search box. Dumping page source for debug:")
        print(driver.page_source)
        # `exit()` is an interactive helper injected by the `site` module and
        # reports success; raise SystemExit with a non-zero status instead.
        # The outer handler below closes the browser.
        raise SystemExit(1)

    # Focus the box so later send_keys() calls type into it.
    input_el.click()
    print("✅ Search box ready")
except BaseException:
    # Any failure during set-up (including the SystemExit above or Ctrl+C)
    # must not leave a Chrome/ChromeDriver process running.
    driver.quit()
    raise

# ----------------------------
# 2) Load ASL model
# ----------------------------
# Output classes: the letters A-Z followed by 'Blank' (which the keymap maps
# to "no keystroke").
# NOTE(review): the index order is assumed to match the label order used when
# the checkpoint was trained — confirm against the training code.
class_names = [chr(ord('A') + i) for i in range(26)] + ['Blank']

# Build the bare ResNet-18 architecture. No pretrained weights are requested:
# load_state_dict() below is strict by default, so it either overwrites every
# parameter and buffer from the checkpoint or raises. Fetching the ImageNet
# weights first would only add a needless download.
model = resnet18(weights=None)
num_ftrs = model.fc.in_features
# Replace the classification head so it emits one logit per ASL class.
model.fc = nn.Linear(num_ftrs, len(class_names))

# Use the single DEVICE / CKPT_PATH configuration instead of re-deriving the
# device and hard-coding the checkpoint path a second time.
device = DEVICE  # alias kept for any code that still refers to `device`

ckpt = torch.load(CKPT_PATH, map_location=DEVICE)
model.load_state_dict(ckpt['model_state_dict'])
# eval() switches layers such as BatchNorm to inference behaviour.
model.to(DEVICE).eval()

# ----------------------------
# 3) Preprocessing & keymap
# ----------------------------
# Image pipeline applied to every hand crop: resize to the network input size,
# convert the PIL image to a tensor, then normalize each channel with the
# mean/std values conventionally used for ImageNet-trained torchvision models.
transform = transforms.Compose([
    transforms.Resize(INPUT_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Class name -> key to type. Letters are sent in lowercase; 'Blank' maps to
# None, meaning "send nothing".
keymap = {
    name: (None if name == 'Blank' else name.lower())
    for name in class_names
}

# ----------------------------
# 4) MediaPipe Hands setup
# ----------------------------
# Short aliases for the two MediaPipe solution modules used by the live loop.
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

# Hand-tracker options: video (non-static) mode, a single hand, and 0.5
# confidence thresholds for both detection and tracking.
_hand_tracker_options = dict(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)
hands = mp_hands.Hands(**_hand_tracker_options)

# ----------------------------
# 5) Live loop: capture, predict, send_keys
# ----------------------------
# NOTE(review): CAP_DSHOW requests OpenCV's DirectShow backend, which is
# Windows-specific — confirm before running on another platform.
cap = cv2.VideoCapture(0, cv2.CAP_DSHOW)
if not cap.isOpened():
    print("❌ Could not open camera. Check index or permissions.")
    cap.release()
    hands.close()
    driver.quit()
    # Non-zero status: this is a failure, and `exit()` is only an interactive
    # helper provided by the `site` module.
    raise SystemExit(1)
time.sleep(1.0)  # camera warm-up

last_letter = None   # most recent prediction
count = 0            # consecutive frames that produced `last_letter`
buffer_text = ""     # letters typed so far, shown as an overlay

# try/finally guarantees the camera, preview window, hand tracker and browser
# are released even if a frame raises (e.g. a Selenium error) or on Ctrl+C.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            print("⚠️ Frame capture failed, exiting.")
            break

        h, w, _ = frame.shape
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb)

        if results.multi_hand_landmarks:
            hand = results.multi_hand_landmarks[0]
            xs = [p.x for p in hand.landmark]
            ys = [p.y for p in hand.landmark]

            # Landmarks are normalized to the frame size and can fall outside
            # [0, 1] when the hand is partly off-screen. Clamp BOTH edges of
            # the padded box to the frame so a slice bound can never go
            # negative (which would wrap around) or exceed the frame.
            margin = 20
            xmin = min(w, max(0, int(min(xs) * w) - margin))
            ymin = min(h, max(0, int(min(ys) * h) - margin))
            xmax = max(0, min(w, int(max(xs) * w) + margin))
            ymax = max(0, min(h, int(max(ys) * h) + margin))

            # Crop from the already-converted RGB frame instead of converting
            # the BGR crop a second time.
            crop = rgb[ymin:ymax, xmin:xmax]

            # An empty crop (hand entirely outside the frame) cannot be
            # classified; skip prediction for this frame instead of crashing.
            if crop.size:
                pil_img = Image.fromarray(crop)
                inp = transform(pil_img).unsqueeze(0).to(DEVICE)

                with torch.no_grad():
                    out = model(inp)
                    _, pred = out.max(1)
                    letter = class_names[pred.item()]

                # debounce logic: count consecutive identical predictions
                if letter == last_letter:
                    count += 1
                else:
                    last_letter = letter
                    count = 1

                if count == STABILITY_THRESHOLD:
                    key = keymap[letter]
                    if key is not None:  # 'Blank' maps to None: type nothing
                        input_el.send_keys(key)
                        buffer_text += letter
                    # Restart the count so a sign that is held types the same
                    # letter again after another STABILITY_THRESHOLD frames.
                    count = 0

                # draw annotations
                mp_drawing.draw_landmarks(frame, hand, mp_hands.HAND_CONNECTIONS)
                cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)
                # Keep the label inside the frame when the box touches the top.
                cv2.putText(frame, letter, (xmin, max(ymin - 10, 30)),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 0), 2)

        # overlay typed buffer
        cv2.putText(frame, buffer_text, (10, 40),
                    cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2)

        cv2.imshow('ASL → Google Search', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # cleanup
    cap.release()
    cv2.destroyAllWindows()
    hands.close()
    driver.quit()

# Output:
# ✅ Search box ready
