In [None]:
from ultralytics import YOLO
import cv2
import torch
import torch.nn as nn
import torchvision.transforms as T
from PIL import Image

In [None]:
# ===========================================================
# 1. DEFINE YOUR MUDRA MODEL (same architecture you trained)
# ===========================================================

class MudraCNN(nn.Module):
    """CNN classifier: five double-convolution stages and one linear head.

    Every stage is two 3x3 convolutions with padding 1 (each followed by ReLU
    and batch norm) and then a max-pool. The pools shrink the spatial size by
    2, 2, 3, 3 and 3 in turn, and the linear head expects 256 * 3 * 3
    features, which is what a 416x416 input produces
    (416 -> 208 -> 104 -> 34 -> 11 -> 3).

    The module layout determines the ``state_dict`` key names (``net.<i>...``
    and ``fc...``); keep it unchanged so existing checkpoints still load.
    """

    # (in_channels, out_channels, pool_kernel) of each stage, in order.
    _STAGES = (
        (3, 32, 2),
        (32, 64, 2),
        (64, 128, 3),
        (128, 256, 3),
        (256, 256, 3),
    )

    def __init__(self, num_classes):
        """Build the network with ``num_classes`` output logits."""
        super().__init__()

        layers = []
        for in_ch, out_ch, pool_kernel in self._STAGES:
            layers.append(self._double_conv(in_ch, out_ch))
            layers.append(nn.MaxPool2d(pool_kernel))

        self.net = nn.Sequential(*layers)
        self.fc = nn.Linear(256 * 3 * 3, num_classes)

    @staticmethod
    def _double_conv(in_ch, out_ch):
        """Return Conv-ReLU-BN twice: ``in_ch -> out_ch``, then ``out_ch -> out_ch``."""
        parts = []
        for src_ch in (in_ch, out_ch):
            parts.append(nn.Conv2d(src_ch, out_ch, 3, padding=1))
            parts.append(nn.ReLU())
            parts.append(nn.BatchNorm2d(out_ch))
        return nn.Sequential(*parts)

    def forward(self, x):
        """Return class logits for the image batch ``x``."""
        features = self.net(x)
        # Collapse everything except the batch dimension for the linear head.
        flat = features.view(features.size(0), -1)
        return self.fc(flat)


# ===========================================================
# 2. LOAD YOUR TRAINED CLASSIFIER
# ===========================================================

def load_mudra_model(path="/mudra_project/hand_model_mudra_green_10_me.pth"):
    """Load the trained mudra classifier from a checkpoint file.

    The checkpoint is read onto the CPU and must provide two entries:
    ``"classes"`` (the class labels; its length sets the number of outputs)
    and ``"state_dict"`` (the weights loaded into ``MudraCNN``).

    Args:
        path: Location of the checkpoint file.

    Returns:
        A ``(model, classes, device)`` tuple: the network in eval mode and
        already moved to ``device``, the labels from the checkpoint, and the
        device name (``"mps"`` when that backend is available, else ``"cpu"``).
    """
    # NOTE(review): torch.load unpickles the file - only use trusted checkpoints.
    ckpt = torch.load(path, map_location="cpu")
    labels = ckpt["classes"]

    if torch.backends.mps.is_available():
        target = "mps"
    else:
        target = "cpu"

    net = MudraCNN(num_classes=len(labels))
    net.load_state_dict(ckpt["state_dict"])
    net.eval()
    net.to(target)

    return net, labels, target


# Classifier preprocessing: convert to a PIL image, resize to 416x416, then
# convert to a tensor. 416x416 is an input size for which MudraCNN's pooling
# stack ends at 3x3, matching its 256*3*3 linear layer.
_classifier_steps = [
    T.ToPILImage(),
    T.Resize((416, 416)),
    T.ToTensor(),
]
transform = T.Compose(_classifier_steps)


@torch.no_grad()
def classify_hand(model, classes, device, crop_bgr):
    """Predict the mudra label for a single cropped hand image.

    Args:
        model: Classifier called on a one-image batch; its output is treated
            as logits.
        classes: Labels indexed by the model's output position.
        device: Device the input tensor is moved to before inference.
        crop_bgr: Image crop in OpenCV's BGR channel order.

    Returns:
        ``(label, confidence)``: the label with the highest softmax
        probability and that probability as a Python float.
    """
    # The module-level `transform` is fed RGB, so swap the channel order first.
    rgb = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB)
    batch = torch.unsqueeze(transform(rgb), 0).to(device)

    # Row 0 is the only image in the batch.
    probabilities = model(batch).softmax(dim=1)[0]
    best_prob, best_idx = probabilities.max(dim=0)

    return classes[int(best_idx)], best_prob.item()


# ===========================================================
# 3. MAIN PIPELINE — DETECTION + CLASSIFICATION
# ===========================================================

def main(
    yolo_model_path="/Users/abhin-zstch1563/Documents/AI/DL/CNN/mudra_project/weights 13-12-2025 16_20_40_118/best.pt",
    camera_index=0,
    conf_threshold=0.5,
):
    """Run live hand detection and mudra classification on a camera feed.

    Each frame is passed to the YOLO detector; every detected box is clipped
    to the frame, cropped, and classified with the mudra model. The resulting
    labels are overlaid on the frame, which is shown in an OpenCV window until
    'q' is pressed or the camera stops delivering frames.

    Args:
        yolo_model_path: Path to the YOLO weights file.
        camera_index: Index passed to ``cv2.VideoCapture``.
        conf_threshold: Confidence threshold passed to the YOLO call.

    Returns:
        None. Prints "Camera error" and returns early if the camera cannot be
        opened.
    """
    # ---- Load YOLO ----
    yolo = YOLO(yolo_model_path)
    print("Loaded YOLO model")

    # ---- Load Mudra Classifier ----
    mudra_model, classes, device = load_mudra_model()
    print("Loaded Mudra classifier")

    # ---- Open Camera ----
    cap = cv2.VideoCapture(camera_index)
    if not cap.isOpened():
        print("Camera error")
        return

    print("Press 'q' to quit")

    # try/finally so the camera and window are released even when the loop is
    # left through an exception (including KeyboardInterrupt).
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # The frame size is the same for every box of this frame.
            frame_h, frame_w = frame.shape[:2]

            # Run YOLO
            results = yolo(frame, stream=True, conf=conf_threshold)

            # Classify every detection BEFORE drawing anything: crops are
            # taken from `frame`, so text drawn early could otherwise end up
            # inside a later crop and be fed to the classifier.
            labels = []
            for result in results:
                for box in result.boxes:
                    # YOLO bounding box
                    x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)

                    # Clip the box to the frame before cropping.
                    x1, y1 = max(0, x1), max(0, y1)
                    x2, y2 = min(frame_w, x2), min(frame_h, y2)

                    crop = frame[y1:y2, x1:x2]
                    if crop.size == 0:
                        continue

                    # ---- CLASSIFY THE CROPPED HAND ----
                    label, conf = classify_hand(
                        mudra_model, classes, device, crop
                    )
                    labels.append(f"{label} ({conf:.2f})")

            # Draw the mudra labels. The detection box itself is not drawn.
            # One text row per detection so several hands do not overprint
            # each other; the first label keeps the (70, 50) position.
            for row, text in enumerate(labels):
                cv2.putText(frame, text, (70, 50 + 40 * row),
                            cv2.FONT_HERSHEY_SIMPLEX,
                            1.0, (0, 200, 100), 2)

            # Show output
            cv2.imshow("Hand Detection + Mudra Classification", frame)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()


# Script entry point: start the live demo only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()


Loaded YOLO model
Loaded Mudra classifier
Press 'q' to quit

0: 256x416 (no detections), 14.8ms
Speed: 1.0ms preprocess, 14.8ms inference, 0.1ms postprocess per image at shape (1, 3, 256, 416)

0: 256x416 (no detections), 24.2ms
Speed: 1.8ms preprocess, 24.2ms inference, 0.2ms postprocess per image at shape (1, 3, 256, 416)

0: 256x416 (no detections), 14.0ms
Speed: 0.6ms preprocess, 14.0ms inference, 0.1ms postprocess per image at shape (1, 3, 256, 416)

0: 256x416 (no detections), 14.4ms
Speed: 0.7ms preprocess, 14.4ms inference, 0.2ms postprocess per image at shape (1, 3, 256, 416)

0: 256x416 (no detections), 13.1ms
Speed: 0.5ms preprocess, 13.1ms inference, 0.1ms postprocess per image at shape (1, 3, 256, 416)

0: 256x416 (no detections), 15.1ms
Speed: 0.7ms preprocess, 15.1ms inference, 0.2ms postprocess per image at shape (1, 3, 256, 416)

0: 256x416 (no detections), 18.7ms
Speed: 0.8ms preprocess, 18.7ms inference, 0.2ms postprocess per image at shape (1, 3, 256, 416)

0: 256x4

: 