In [2]:
import cv2
import os
import pytesseract
import numpy as np
from deepface import DeepFace
from ultralytics import YOLO

# Load the YOLOv8 weights once at import time; detect_objects() calls this
# shared instance for every frame it processes.
model = YOLO("yolov8l.pt")  

# URL of the IP webcam video stream opened by main().
# NOTE(review): this is a hard-coded LAN address — confirm it matches the
# device you are streaming from before running.
cam_url = "http://10.193.209.160:8080/video"

# Directory intended for saved images. It is created at import time if it is
# missing; nothing in the visible code writes into it.
save_dir = "captured_images"
os.makedirs(save_dir, exist_ok=True)

def preprocess_for_ocr(image):
    """Turn a BGR image into a binarised image suitable for OCR.

    The pipeline is: grayscale conversion, a 3x3 Gaussian blur, adaptive
    Gaussian thresholding (block size 11, constant 2) and a morphological
    closing with a 2x2 kernel.

    Args:
        image: BGR image array.

    Returns:
        The single-channel thresholded image after the closing step.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    smoothed = cv2.GaussianBlur(grayscale, (3, 3), 0)
    thresholded = cv2.adaptiveThreshold(
        smoothed,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        11,
        2,
    )
    closing_kernel = np.ones((2, 2), np.uint8)
    return cv2.morphologyEx(thresholded, cv2.MORPH_CLOSE, closing_kernel)

def detect_objects(frame):
    """Detect objects in ``frame``, annotate it, and collect crops for OCR.

    Every detection with confidence >= 0.4 gets a green bounding box and a
    ``"<label> <confidence>"`` caption drawn onto ``frame`` in place.

    Args:
        frame: Image array to run the detector on (the crops are later
            treated as BGR by the OCR preprocessing).

    Returns:
        A tuple ``(frame, text_regions, detected_objects)``:
        ``frame`` is the same array that was passed in, now annotated;
        ``text_regions`` is a list of un-annotated crops for detections whose
        label is in the text-bearing label set; ``detected_objects`` is the
        list of labels of all detections, in detection order.
    """
    results = model(frame, conf=0.4)

    # Crop OCR regions from an untouched copy. Slices of ``frame`` are views,
    # so cropping from the frame being drawn on would bake the green boxes and
    # captions (including ones drawn later for other detections) into the
    # text handed to OCR.
    clean = frame.copy()

    # NOTE(review): with the stock COCO label set only "book" appears to be a
    # valid class name; "screen", "paper" and "signboard" would need custom
    # weights — confirm against model.names.
    text_labels = {"book", "screen", "paper", "signboard"}

    detected_objects = []
    text_regions = []

    for result in results:
        for box in result.boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            label = result.names[int(box.cls[0])]
            conf = float(box.conf[0])

            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(frame, f"{label} {conf:.2f}", (x1, y1 - 5),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            detected_objects.append(label)
            if label in text_labels:
                # Clamp to 0 so a negative coordinate cannot wrap around to
                # the far side of the image via negative slice indices.
                crop = clean[max(y1, 0):max(y2, 0), max(x1, 0):max(x2, 0)]
                # A degenerate (zero-area) box yields an empty crop that
                # would make the OCR preprocessing raise; drop it here.
                if crop.size:
                    text_regions.append(crop)

    return frame, text_regions, detected_objects

def run_ocr(text_regions):
    """Run Tesseract OCR over each region and return the combined text.

    Args:
        text_regions: Iterable of image arrays (crops) to read text from.
            ``None`` entries and zero-area arrays are skipped.

    Returns:
        A single string with one ``"🔤 Text from region N:"`` section per
        region that produced non-blank text, where N is the 1-based position
        of the region in ``text_regions``. Returns ``""`` when no region
        yields any text.
    """
    sections = []
    for idx, region in enumerate(text_regions, start=1):
        # An empty crop would make cv2.cvtColor raise inside the
        # preprocessing step and abort the whole OCR pass, so skip it.
        if region is None or region.size == 0:
            continue

        processed_region = preprocess_for_ocr(region)
        text = pytesseract.image_to_string(processed_region, config="--psm 6 --oem 3").strip()
        if text:
            sections.append(f"\n🔤 Text from region {idx}:\n{text}\n")

    # Strip the leading/trailing newlines contributed by the first and last
    # section so callers get a tidy block of text.
    return "".join(sections).strip()

def detect_emotion(frame):
    """Return the dominant emotion DeepFace reports for ``frame``.

    Args:
        frame: Image array to analyse. Face detection is not enforced
            (``enforce_detection=False``), so the analysis does not raise
            merely because no face was found.

    Returns:
        The ``'dominant_emotion'`` string of the first analysis result, or
        the string ``"None"`` when the analysis fails or yields no result.
    """
    try:
        result = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False)
        # Some DeepFace releases return a single dict for one image rather
        # than a list of per-face dicts; normalise so both shapes are handled
        # instead of silently falling through to "None".
        if isinstance(result, dict):
            result = [result]
        if isinstance(result, list) and result:
            return result[0]['dominant_emotion']
    except Exception as e:
        # Deliberately best-effort: a failed analysis should not stop the
        # live loop, so report it and fall back to the sentinel below.
        print("Error detecting emotion:", e)
    return "None"

def main():
    """Show the IP webcam feed and run analyses on demand via key presses.

    Keys (read from the "Live Feed" window):
        o: detect objects, print their labels, show the annotated frame.
        t: detect text-bearing objects, OCR them, print the text, and show
           the annotated frame.
        e: print the dominant emotion detected in the current frame.
        q: quit.

    The capture handle and all OpenCV windows are released on every exit
    path, including a failed connection and Ctrl+C.
    """
    cap = cv2.VideoCapture(cam_url)

    try:
        if not cap.isOpened():
            print("Error: Unable to access IP webcam")
            return

        while True:
            ret, frame = cap.read()
            if not ret:
                print("Failed to grab frame")
                break

            cv2.imshow("Live Feed", frame)
            key = cv2.waitKey(1) & 0xFF

            if key == ord('o'):
                processed_frame, _, detected_objects = detect_objects(frame)
                # The live window is overwritten by the next frame, so the
                # drawn boxes would never be visible; show them separately.
                cv2.imshow("Detections", processed_frame)
                if detected_objects:
                    print("🔍 Detected Objects:", ", ".join(detected_objects))
                else:
                    print("No objects detected.")

            elif key == ord('t'):
                processed_frame, text_regions, _ = detect_objects(frame)
                cv2.imshow("Detections", processed_frame)
                ocr_text = run_ocr(text_regions)
                if ocr_text:
                    print(ocr_text)
                else:
                    print("No text detected.")

            elif key == ord('e'):
                emotion = detect_emotion(frame)
                print(f"😊 Detected Emotion: {emotion}")

            elif key == ord('q'):
                break

    except KeyboardInterrupt:
        print("\n🔴 Interrupted by user.")

    finally:
        cap.release()
        cv2.destroyAllWindows()
    
if __name__ == "__main__":
    main()


😊 Detected Emotion: happy

0: 384x640 1 person, 1 bed, 2 laptops, 230.1ms
Speed: 2.4ms preprocess, 230.1ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)
🔍 Detected Objects: laptop, person, bed, laptop
