In [1]:
import cv2
import torch
import torch as t
import torch.nn as nn
import torchvision.transforms as transforms
import numpy as np

class Alexnet(nn.Module):
    """AlexNet-style classifier: three convolution stages plus a three-layer MLP head.

    The first ``Linear`` layer expects ``256 * 7 * 7`` features, which is what
    the convolution stack produces for 64x64 inputs
    (64 -> 29 -> 14 -> 14 -> 7 -> 7 along each spatial axis).
    ``forward`` returns 29 raw (un-normalised) scores per sample.
    """

    def __init__(self):
        super().__init__()

        # Feature extractor. Attribute name and layer positions are kept
        # stable so the state_dict keys ("Convlayer.0.weight", ...) do not change.
        feature_layers = [
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=11, stride=2, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
        ]
        self.Convlayer = nn.Sequential(*feature_layers)

        # Classifier head operating on the flattened feature map.
        head_layers = [
            nn.Linear(in_features=256 * 7 * 7, out_features=4096),
            nn.ReLU(),
            nn.Dropout(p=0.8),
            nn.Linear(in_features=4096, out_features=4096),
            nn.ReLU(),
            nn.Dropout(p=0.8),
            nn.Linear(in_features=4096, out_features=29),
        ]
        self.net = nn.Sequential(*head_layers)

    def forward(self, x):
        """Run the conv stack, flatten per sample, and return the class scores."""
        features = self.Convlayer(x)
        batch_size = features.size(0)
        flattened = features.view(batch_size, -1)
        return self.net(flattened)


# Use Apple's Metal (MPS) backend when it is available, otherwise fall back to CPU.
# torch.backends.mps.is_available() is the documented check and also exists on
# older PyTorch releases that lack torch.mps.is_available().
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = Alexnet().to(device)
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
model.load_state_dict(torch.load("model.pth", map_location=device))
model.eval()  # inference mode: Dropout layers become no-ops

# Label for each output index of the model.
# Presumably this matches the class-index order used during training -- verify.
class_labels = ['A', 'B', 'C', 'D', 'DEL', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'NOT', 'O', 'P', 'Q', 'R', 'S', 'SPA', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']

# Preprocessing applied to each cropped frame region before inference:
# ndarray -> PIL image -> 64x64 -> float tensor in [0, 1] -> normalised to [-1, 1].
# The one-element mean/std is applied to every channel.
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])
])

# Region of interest handed to the classifier: top-left corner (x, y) and
# size (w, h) in pixels. Loop-invariant, so it is defined once up front.
x, y, w, h = 300, 300, 600, 600

cap = cv2.VideoCapture(0)
print("📸 Starting camera... Press 'q' to quit.")

# try/finally guarantees the camera handle and the preview window are released
# even when inference raises or the kernel is interrupted; otherwise the device
# stays locked until the kernel is restarted.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            print("⚠️ Camera not found.")
            break

        # NumPy slicing clips to the frame bounds, so the crop can be smaller
        # than w x h, or empty when the frame does not reach (x, y).
        roi = frame[y:y + h, x:x + w]

        if roi.size == 0:
            overlay_text = "ROI outside frame"
        else:
            # --- Preprocess ---
            # Inference runs BEFORE any overlay is drawn: `roi` is a view into
            # `frame`, so drawing first would feed the green border to the model.
            # NOTE(review): OpenCV frames are BGR while ToPILImage treats
            # 3-channel arrays as RGB. If the model was trained on RGB images,
            # convert with cv2.cvtColor(roi, cv2.COLOR_BGR2RGB) here -- confirm
            # against the training pipeline.
            img_tensor = transform(roi).unsqueeze(0).to(device)  # shape: (1, 3, 64, 64)

            # --- Predict ---
            with torch.no_grad():
                outputs = model(img_tensor)
            predicted = outputs.argmax(dim=1)
            label = class_labels[predicted.item()]
            overlay_text = f"Prediction: {label}"

        # --- Display ROI outline and prediction ---
        cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
        cv2.putText(frame, overlay_text, (50, 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

        # Show the frame
        cv2.imshow("ASL Live Detection (PyTorch)", frame)

        # Press 'q' to exit
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # --- Cleanup ---
    cap.release()
    cv2.destroyAllWindows()
    print("👋 Camera closed.")


📸 Starting camera... Press 'q' to quit.
👋 Camera closed.
