# In [14]:
import cv2
from deepface import DeepFace
import os
import mediapipe as mp
import numpy as np
import pickle
import pyautogui
import time
import pyttsx3
from PIL import Image
import pytesseract

# Set path to Tesseract
# Absolute Windows install path; pytesseract calls this executable for OCR.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Load gesture recognition model
# NOTE: pickle.load can execute arbitrary code while loading; only point this
# at a model file you produced yourself. The path is machine-specific.
with open(r"D:\nada mossad\e-just\4th grade\second term\computer vision\Project\gestures\gesture_rf_model.pkl", 'rb') as f:
    clf = pickle.load(f)

# Label mapping
# Maps the classifier's predicted class index to a label string. The exact
# strings are also compared in handle_gesture() to pick the desktop action,
# so they must stay in sync with that function.
label_mapping = {
    0: 'palm - play/pause',
    1: 'like - volume up',
    2: 'dislike - volume down',
    3: 'peace - scroll up',
    4: 'four - scroll down'
}

# Face authentication
# Both values are written by authenticate_user(); the main loop only runs
# hand tracking / OCR once `authenticated` is True.
authenticated = False
authenticated_user = ""

# Mediapipe Hands
# Video (non-static) mode, at most one hand tracked per frame.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.6)
mp_drawing = mp.solutions.drawing_utils

# Camera setup
# Camera device index 0 and OpenCV's bundled frontal-face Haar cascade.
cap = cv2.VideoCapture(0)
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

# Drawing and gesture variables
draw_mode = False
# Drawing layer blended over every frame with cv2.addWeighted in the main loop.
# NOTE(review): assumes the camera delivers 640x480 frames - confirm; the blend
# needs both images to have the same size.
canvas = np.zeros((480, 640, 3), dtype=np.uint8)
prev_x, prev_y = None, None  # previous fingertip position; None = no stroke in progress
draw_color = (0, 255, 0)  # BGR
prev_gesture = None
last_action_time = 0  # time.time() of the last triggered gesture action
cooldown = 1.5  # seconds before the same gesture may trigger again

# OCR variables
ocr_mode = False
last_ocr_text = ""  # not read anywhere in the visible code of this cell
ocr_cooldown = 2  # seconds; compared with time since last_ocr_time in the main loop
last_ocr_time = 0

# Text-to-speech engine
# Single engine instance shared by speak().
engine = pyttsx3.init()

def speak(text):
    """Read *text* aloud through the shared pyttsx3 engine.

    The utterance is queued and the engine's run loop is driven via
    ``runAndWait()``. Speech is best-effort: any exception is reported
    on stdout and not propagated to the caller.
    """
    try:
        engine.say(text)
        engine.runAndWait()
    except Exception as tts_error:
        print(f"[TTS ERROR]: {tts_error}")

def authenticate_user(frame, faces):
    """Try to log in one of the detected faces against the images in ``users/``.

    Each face region is written to ``current_face.jpg`` and compared with
    every file in the ``users`` folder using ``DeepFace.verify``. On the first
    match the globals ``authenticated`` / ``authenticated_user`` are set (the
    user name is the matching file name without extension), the user is
    greeted, and the function returns without examining further faces.

    Args:
        frame: BGR image the faces were detected in; a green rectangle is
            drawn on it around every face that was examined.
        faces: iterable of ``(x, y, w, h)`` boxes in ``frame`` coordinates.

    Returns:
        None. The outcome is reported through the module globals.
    """
    global authenticated, authenticated_user
    for (x, y, w, h) in faces:
        face_roi = frame[y:y + h, x:x + w]
        if face_roi.size == 0:
            continue

        cv2.imwrite('current_face.jpg', face_roi)

        try:
            user_images = os.listdir("users/")  # Assuming user images are stored in "users" folder
        except OSError as e:
            print(f"[Face Verification Error]: {e}")
            user_images = []

        matched_user = None
        for user_image in user_images:
            user_image_path = os.path.join("users", user_image)
            try:
                result = DeepFace.verify('current_face.jpg', user_image_path, enforce_detection=False)
            except Exception as e:
                # One unreadable / non-image file must not stop the remaining
                # users from being checked.
                print(f"[Face Verification Error]: {e}")
                continue
            if result['verified']:
                matched_user = os.path.splitext(user_image)[0]  # User name is the filename without extension
                break

        cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)

        if matched_user is None:
            print("❌ Unknown User")
            continue

        authenticated = True
        authenticated_user = matched_user
        print(f"✅ User {authenticated_user} authenticated!")
        speak(f"Welcome {authenticated_user}")
        # Stop here: verifying the remaining faces would only repeat the
        # greeting or overwrite the user that was just logged in.
        return

def handle_gesture(gesture):
    """Announce *gesture* and trigger the desktop action bound to it.

    A gesture is acted on when it differs from the previously handled one,
    or when more than ``cooldown`` seconds have passed since the last action.
    Handling updates ``prev_gesture`` and ``last_action_time``, speaks the
    label, and then sends a media key press or a scroll via pyautogui.
    Labels without a binding are spoken but trigger nothing.
    """
    global last_action_time, prev_gesture

    now = time.time()
    is_repeat = gesture == prev_gesture
    if is_repeat and now - last_action_time <= cooldown:
        return

    prev_gesture = gesture
    last_action_time = now
    speak(gesture)

    # Label -> key name for pyautogui.press
    key_bindings = {
        'palm - play/pause': 'playpause',
        'like - volume up': 'volumeup',
        'dislike - volume down': 'volumedown',
    }
    # Label -> scroll amount for pyautogui.scroll
    scroll_bindings = {
        'peace - scroll up': 300,
        'four - scroll down': -300,
    }

    if gesture in key_bindings:
        pyautogui.press(key_bindings[gesture])
    elif gesture in scroll_bindings:
        pyautogui.scroll(scroll_bindings[gesture])

# Main loop: read a frame, authenticate until a user is recognised, then run
# gesture control / air drawing / OCR, show the result and poll the keyboard.
# The try/finally guarantees the camera and window are released even when a
# processing step raises.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            print("[ERROR] Could not read frame from camera.")
            break

        frame_raw = frame.copy()  # Save non-flipped frame for OCR
        frame = cv2.flip(frame, 1)  # Flip for mirrored interaction

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, 1.1, 5, minSize=(100, 100))

        if not authenticated:
            authenticate_user(frame, faces)
        else:
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = hands.process(rgb)

            # Gesture recognition and drawing are disabled while OCR mode is on.
            if results.multi_hand_landmarks and not ocr_mode:
                for hand_landmarks in results.multi_hand_landmarks:
                    # Flattened (x, y) pairs of every landmark: the classifier input.
                    landmarks = [coord for lm in hand_landmarks.landmark for coord in (lm.x, lm.y)]
                    h, w, _ = frame.shape
                    index_tip = hand_landmarks.landmark[8]
                    cx, cy = int(index_tip.x * w), int(index_tip.y * h)

                    if draw_mode:
                        # Extend the stroke from the previous fingertip position.
                        if prev_x is not None:
                            cv2.line(canvas, (prev_x, prev_y), (cx, cy), draw_color, 5)
                        prev_x, prev_y = cx, cy
                    else:
                        prev_x, prev_y = None, None
                        try:
                            prediction = clf.predict(np.array(landmarks).reshape(1, -1))
                            gesture = label_mapping[prediction[0]]
                            handle_gesture(gesture)
                            cv2.putText(frame, f'Gesture: {gesture}', (10, 30),
                                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)
                        except Exception as e:
                            print(f"[Prediction Error]: {e}")

                    mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

            if ocr_mode:
                current_time = time.time()
                if current_time - last_ocr_time > ocr_cooldown:
                    # OCR works on the non-mirrored frame so text is readable.
                    frame = cv2.resize(frame_raw, (640, 480))
                    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
                    _, thresh = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
                    custom_config = r'--oem 3 --psm 6'
                    details = pytesseract.image_to_data(thresh, config=custom_config, output_type=pytesseract.Output.DICT)

                    # Keep only confidently recognised, non-blank words.
                    # int(float(...)) also accepts confidences reported as
                    # float values or strings such as '96.5'.
                    words = [
                        text
                        for text, conf in zip(details['text'], details['conf'])
                        if text.strip() != "" and int(float(conf)) > 90
                    ]
                    detected_text = " ".join(words)

                    # Speak the whole result once instead of re-reading the
                    # growing prefix after every accepted word.
                    if detected_text:
                        speak(detected_text)
                    cv2.putText(frame, detected_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2, cv2.LINE_AA)

                    # Restart the cooldown after OCR and speech have finished;
                    # without this the cooldown check above never throttles.
                    last_ocr_time = time.time()

            # NOTE(review): canvas is 480x640; this blend assumes the frame has
            # the same size - confirm for cameras with another resolution.
            frame = cv2.addWeighted(frame, 1, canvas, 0.5, 0)

        # Display frame
        cv2.imshow("Smart Vision Assistant", frame)

        # Keyboard shortcuts: q quit, d draw mode, c clear canvas, o OCR mode,
        # r/g/b/y/k pick the drawing colour (BGR tuples).
        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'):
            print("Exiting...")
            break
        elif key == ord('d'):
            draw_mode = not draw_mode
            print("Draw mode:", "ON" if draw_mode else "OFF")
        elif key == ord('c'):
            canvas.fill(0)
            print("Canvas cleared")
        elif key == ord('o'):
            ocr_mode = not ocr_mode
            print("OCR Mode:", "ON" if ocr_mode else "OFF")
        elif key in [ord('r'), ord('g'), ord('b'), ord('y'), ord('k')]:
            colors = {'r': (0, 0, 255), 'g': (0, 255, 0), 'b': (255, 0, 0),
                      'y': (0, 255, 255), 'k': (0, 0, 0)}
            draw_color = colors[chr(key)]
            print(f"Color changed to {chr(key).upper()}")
finally:
    # Cleanup
    cap.release()
    cv2.destroyAllWindows()


# Captured output of the cell above:
# ✅ User Nada authenticated!
# Draw mode: ON
# Draw mode: OFF
# OCR Mode: ON
# Exiting...


# GUI

# In [1]:
import cv2
from deepface import DeepFace
import os
import mediapipe as mp
import numpy as np
import pickle
import pyautogui
import time
import pyttsx3
from PIL import Image, ImageTk
import pytesseract
import tkinter as tk
from tkinter import ttk
import threading
import speech_recognition as sr

# Set path to Tesseract
# Absolute Windows install path; pytesseract calls this executable for OCR.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Load gesture recognition model
# NOTE: pickle.load can execute arbitrary code while loading; only point this
# at a model file you produced yourself. The path is machine-specific.
with open(r"D:\nada mossad\e-just\4th grade\second term\computer vision\Project\gestures\gesture_rf_model.pkl", 'rb') as f:
    clf = pickle.load(f)

# Maps the classifier's predicted class index to a label string. The exact
# strings are also compared in handle_gesture(), so keep both in sync.
label_mapping = {
    0: 'palm - play/pause',
    1: 'like - volume up',
    2: 'dislike - volume down',
    3: 'peace - scroll up',
    4: 'four - scroll down'
}

# Mediapipe setup
# Video (non-static) mode, at most one hand tracked per frame.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.6)
mp_drawing = mp.solutions.drawing_utils

# Voice engine
# Single pyttsx3 engine instance shared by every speak() call.
engine = pyttsx3.init()
def speak(text):
    """Read *text* aloud through the shared pyttsx3 engine.

    Speech is best-effort: a failure is reported on stdout instead of being
    silently discarded, and is never propagated to the caller.
    """
    try:
        engine.say(text)
        engine.runAndWait()
    except Exception as e:
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit through while still keeping speech failures non-fatal.
        print(f"[TTS ERROR]: {e}")

# Voice thread control
def voice_command_listener():
    """Listen on the microphone and act on spoken brightness commands.

    Intended as a thread target: calibrates for ambient noise once, then keeps
    listening while ``voice_thread_running`` is true. Each captured phrase is
    transcribed with ``recognize_google``; the first phrase in the table below
    that occurs in the transcript triggers its key press and spoken reply.
    Listen timeouts and unintelligible audio are skipped silently; any other
    error is printed and the loop carries on.
    """
    global voice_thread_running

    # (phrase to look for, key sent through pyautogui, spoken confirmation)
    # NOTE(review): confirm pyautogui recognises these brightness key names.
    phrase_actions = (
        ("increase brightness", "brightnessup", "Increasing brightness."),
        ("decrease brightness", "brightnessdown", "Decreasing brightness."),
    )

    listener = sr.Recognizer()
    microphone = sr.Microphone()
    with microphone as source:
        listener.adjust_for_ambient_noise(source)
    speak("Voice command module is now active.")

    while voice_thread_running:
        try:
            with microphone as source:
                print("Listening for voice commands...")
                audio = listener.listen(source, timeout=5)
            command = listener.recognize_google(audio).lower()
            print(f"You said: {command}")
            for phrase, key_name, reply in phrase_actions:
                if phrase in command:
                    pyautogui.press(key_name)
                    speak(reply)
                    break
        except (sr.WaitTimeoutError, sr.UnknownValueError):
            continue
        except Exception as err:
            print(f"Voice error: {err}")

    speak("Voice command stopped.")

# Flag polled by voice_command_listener(); clearing it asks the thread to stop.
voice_thread_running = False
# The most recently started listener thread, or None before the first start.
voice_thread = None

def toggle_voice_commands():
    """Switch the background voice-command listener on or off.

    Turning it on sets ``voice_thread_running`` and starts a daemon thread
    running ``voice_command_listener`` unless a previous listener thread is
    still alive. Turning it off only clears the flag; the listener exits the
    next time it checks it.
    """
    global voice_thread_running, voice_thread
    if not voice_thread_running:
        voice_thread_running = True
        # A listener that was asked to stop may still be blocked waiting for
        # audio. It will see the flag set again and simply keep running, so
        # starting another thread here would leave two listeners competing
        # for the microphone.
        if voice_thread is None or not voice_thread.is_alive():
            voice_thread = threading.Thread(target=voice_command_listener, daemon=True)
            voice_thread.start()
        speak("Voice control started")
    else:
        voice_thread_running = False
        speak("Voice control stopped")

# Globals
# Camera device index 0 and OpenCV's bundled frontal-face Haar cascade.
cap = cv2.VideoCapture(0)
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

# Written by authenticate_user(); camera_loop() only runs hand tracking / OCR
# once `authenticated` is True.
authenticated = False
authenticated_user = ""

# Drawing layer blended over every frame in camera_loop().
# NOTE(review): assumes the camera delivers 640x480 frames - confirm.
canvas = np.zeros((480, 640, 3), dtype=np.uint8)
draw_mode = False
ocr_mode = False
draw_color = (0, 255, 0)  # BGR
prev_x, prev_y = None, None  # previous fingertip position; None = no stroke in progress
prev_gesture = None
last_action_time = 0  # time.time() of the last triggered gesture action
cooldown = 1.5  # seconds before the same gesture may trigger again
ocr_cooldown = 2  # seconds; compared with time since last_ocr_time in camera_loop()
last_ocr_time = 0

# GUI Setup
root = tk.Tk()
root.title("Smart Vision Assistant")
root.configure(bg="#2e2e2e")

# Dark theme for the ttk buttons and frames.
style = ttk.Style()
style.theme_use("clam")
style.configure("TButton", font=("Segoe UI", 10), padding=6, relief="flat", background="#3a3a3a", foreground="white")
style.configure("TFrame", background="#2e2e2e")

# Label that camera_loop() updates with the latest processed frame.
video_label = tk.Label(root)
video_label.pack(pady=10)

# Functions
def toggle_draw_mode():
    """Flip the global ``draw_mode`` flag and print its new value."""
    global draw_mode
    if draw_mode:
        draw_mode = False
    else:
        draw_mode = True
    print("Draw mode:", draw_mode)

def toggle_ocr_mode():
    """Flip the global ``ocr_mode`` flag and print its new value."""
    global ocr_mode
    if ocr_mode:
        ocr_mode = False
    else:
        ocr_mode = True
    print("OCR mode:", ocr_mode)

def clear_canvas():
    """Erase the drawing layer by zeroing the shared ``canvas`` array in place."""
    canvas[:] = 0
    print("Canvas cleared")

def set_color(color):
    """Store *color* as the global drawing colour and print it.

    The colour buttons pass BGR tuples; the value is stored as given.
    """
    global draw_color
    draw_color = color
    print("Color changed:", draw_color)

def handle_gesture(gesture):
    """Announce *gesture* and trigger the desktop action bound to it.

    A gesture is acted on when it differs from the previously handled one,
    or when more than ``cooldown`` seconds have passed since the last action.
    Handling updates ``prev_gesture`` and ``last_action_time``, speaks the
    label, and then sends a media key press or a scroll via pyautogui.
    Labels without a binding are spoken but trigger nothing.
    """
    global prev_gesture, last_action_time

    now = time.time()
    is_repeat = gesture == prev_gesture
    if is_repeat and now - last_action_time <= cooldown:
        return

    prev_gesture = gesture
    last_action_time = now
    speak(gesture)

    # Label -> key name for pyautogui.press
    key_bindings = {
        'palm - play/pause': 'playpause',
        'like - volume up': 'volumeup',
        'dislike - volume down': 'volumedown',
    }
    # Label -> scroll amount for pyautogui.scroll
    scroll_bindings = {
        'peace - scroll up': 300,
        'four - scroll down': -300,
    }

    if gesture in key_bindings:
        pyautogui.press(key_bindings[gesture])
    elif gesture in scroll_bindings:
        pyautogui.scroll(scroll_bindings[gesture])

def authenticate_user(frame, faces):
    """Try to log in one of the detected faces against the images in ``users/``.

    Each face region is written to ``current_face.jpg`` and compared with
    every file in the ``users`` folder using ``DeepFace.verify``. On the first
    match the globals ``authenticated`` / ``authenticated_user`` are set (the
    user name is the matching file name without extension), the user is
    greeted, and the function returns without examining further faces.

    Args:
        frame: BGR image the faces were detected in; a green rectangle is
            drawn on it around every face that was examined.
        faces: iterable of ``(x, y, w, h)`` boxes in ``frame`` coordinates.

    Returns:
        None. The outcome is reported through the module globals.
    """
    global authenticated, authenticated_user
    for (x, y, w, h) in faces:
        face_roi = frame[y:y + h, x:x + w]
        if face_roi.size == 0:
            continue
        cv2.imwrite('current_face.jpg', face_roi)

        try:
            user_images = os.listdir("users/")
        except OSError as e:
            print(f"Authentication Error: {e}")
            user_images = []

        matched_user = None
        for user_image in user_images:
            user_image_path = os.path.join("users", user_image)
            try:
                result = DeepFace.verify('current_face.jpg', user_image_path, enforce_detection=False)
            except Exception as e:
                # One unreadable / non-image file must not stop the remaining
                # users from being checked.
                print(f"Authentication Error: {e}")
                continue
            if result['verified']:
                matched_user = os.path.splitext(user_image)[0]
                break

        cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)

        if matched_user is not None:
            authenticated = True
            authenticated_user = matched_user
            speak(f"Welcome {authenticated_user}")
            # Stop here: verifying the remaining faces would only repeat the
            # greeting or overwrite the user that was just logged in.
            return

def camera_loop():
    """Process one camera frame, refresh the Tk preview and reschedule itself.

    Until a user is authenticated the frame is only passed to
    ``authenticate_user``. Afterwards the mirrored frame is fed to MediaPipe
    Hands: in draw mode the index fingertip paints on ``canvas``, otherwise
    the landmarks are classified and handed to ``handle_gesture``. While OCR
    mode is on, hand processing is skipped and the non-mirrored frame is
    OCR'd at most once per ``ocr_cooldown`` seconds. The function re-arms
    itself with ``root.after(10, camera_loop)``, also after a failed read.
    """
    global prev_x, prev_y, last_ocr_time

    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame")
        root.after(10, camera_loop)
        return

    frame_raw = frame.copy()  # non-mirrored copy kept for OCR
    frame = cv2.flip(frame, 1)  # mirror for natural interaction
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray, 1.1, 5, minSize=(100, 100))

    if not authenticated:
        authenticate_user(frame, faces)
    else:
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb)

        if results.multi_hand_landmarks and not ocr_mode:
            for hand_landmarks in results.multi_hand_landmarks:
                # Flattened (x, y) pairs of every landmark: the classifier input.
                landmarks = [coord for lm in hand_landmarks.landmark for coord in (lm.x, lm.y)]
                h, w, _ = frame.shape
                index_tip = hand_landmarks.landmark[8]
                cx, cy = int(index_tip.x * w), int(index_tip.y * h)

                if draw_mode:
                    # Extend the stroke from the previous fingertip position.
                    if prev_x is not None:
                        cv2.line(canvas, (prev_x, prev_y), (cx, cy), draw_color, 5)
                    prev_x, prev_y = cx, cy
                else:
                    prev_x, prev_y = None, None
                    try:
                        prediction = clf.predict(np.array(landmarks).reshape(1, -1))
                        gesture = label_mapping[prediction[0]]
                        handle_gesture(gesture)
                        cv2.putText(frame, f'Gesture: {gesture}', (10, 30),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)
                    except Exception as e:
                        print(f"[Prediction Error]: {e}")
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

        if ocr_mode:
            current_time = time.time()
            if current_time - last_ocr_time > ocr_cooldown:
                frame = cv2.resize(frame_raw, (640, 480))
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                blurred = cv2.GaussianBlur(gray, (5, 5), 0)
                _, thresh = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
                custom_config = r'--oem 3 --psm 6'
                details = pytesseract.image_to_data(thresh, config=custom_config, output_type=pytesseract.Output.DICT)

                # Keep only confidently recognised, non-blank words.
                # int(float(...)) also accepts confidences reported as float
                # values or strings such as '96.5'.
                words = [
                    text
                    for text, conf in zip(details['text'], details['conf'])
                    if text.strip() != "" and int(float(conf)) > 90
                ]
                detected_text = " ".join(words)

                # Speak the whole result once instead of re-reading the
                # growing prefix after every accepted word.
                if detected_text:
                    speak(detected_text)
                cv2.putText(frame, detected_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2, cv2.LINE_AA)

                # Restart the cooldown after OCR and speech have finished;
                # without this the cooldown check above never throttles.
                last_ocr_time = time.time()

        # NOTE(review): canvas is 480x640; this blend assumes the frame has
        # the same size - confirm for cameras with another resolution.
        frame = cv2.addWeighted(frame, 1, canvas, 0.5, 0)

    img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    img = Image.fromarray(img)
    imgtk = ImageTk.PhotoImage(image=img)
    # Keep a reference on the widget so the image is not garbage-collected
    # while Tk is still displaying it.
    video_label.imgtk = imgtk
    video_label.configure(image=imgtk)

    root.after(10, camera_loop)

# Button area: one container frame holding two rows stacked vertically.
button_frame = ttk.Frame(root)
button_frame.pack(pady=10)

row1 = ttk.Frame(button_frame)
row1.pack(pady=3)
row2 = ttk.Frame(button_frame)
row2.pack(pady=3)

# First row: mode toggles, canvas reset and voice control, left to right.
control_buttons = (
    ("Toggle Draw Mode", toggle_draw_mode),
    ("Toggle OCR Mode", toggle_ocr_mode),
    ("Clear Canvas", clear_canvas),
    ("Toggle Voice Commands", toggle_voice_commands),
)
for caption, callback in control_buttons:
    control = ttk.Button(row1, text=caption, command=callback)
    control.pack(side=tk.LEFT, padx=5)

# Second row: one button per drawing colour (values are BGR tuples).
color_buttons = [
    ("Red", (0, 0, 255)),
    ("Green", (0, 255, 0)),
    ("Blue", (255, 0, 0)),
    ("Yellow", (0, 255, 255)),
    ("Black", (0, 0, 0))
]
for label, color in color_buttons:
    # The default argument freezes this iteration's colour for the callback.
    swatch = ttk.Button(row2, text=label, command=lambda c=color: set_color(c))
    swatch.pack(side=tk.LEFT, padx=5)

# Schedule the first frame, then hand control to Tk until the window closes.
camera_loop()
root.mainloop()

# Reached once mainloop() returns: release the camera and OpenCV windows.
cap.release()
cv2.destroyAllWindows()





# Final

# In [12]:
import cv2
from deepface import DeepFace
import os
import mediapipe as mp
import numpy as np
import pickle
import pyautogui
import time
import pyttsx3
from PIL import Image, ImageTk
import pytesseract
import tkinter as tk
from tkinter import ttk

# Set path to Tesseract
# Absolute Windows install path; pytesseract calls this executable for OCR.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Load gesture recognition model
# NOTE: pickle.load can execute arbitrary code while loading; only point this
# at a model file you produced yourself. The path is machine-specific.
with open(r"D:\nada mossad\e-just\4th grade\second term\computer vision\Project\gestures\gesture_rf_model.pkl", 'rb') as f:
    clf = pickle.load(f)

# Maps the classifier's predicted class index to a label string. The exact
# strings are also compared in handle_gesture(), so keep both in sync.
label_mapping = {
    0: 'palm - play/pause',
    1: 'like - volume up',
    2: 'dislike - volume down',
    3: 'peace - scroll up',
    4: 'four - scroll down'
}

# Load camera calibration data
# The pickle is expected to hold a dict with the keys 'camera_matrix' and
# 'distortion_coefficients'; both are passed to cv2.undistort in camera_loop().
# The same pickle trust caveat as for the model file applies.
with open(r"D:\nada mossad\e-just\4th grade\second term\computer vision\Project\gestures\camera_calibration_data.pkl", 'rb') as f:
    calibration_data = pickle.load(f)
camera_matrix = calibration_data['camera_matrix']
dist_coeffs = calibration_data['distortion_coefficients']

# Mediapipe setup
# Video (non-static) mode, at most one hand tracked per frame.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.6)
mp_drawing = mp.solutions.drawing_utils

# Voice engine
# Single pyttsx3 engine instance shared by every speak() call.
engine = pyttsx3.init()
def speak(text):
    """Read *text* aloud through the shared pyttsx3 engine.

    Speech is best-effort: a failure is reported on stdout instead of being
    silently discarded, and is never propagated to the caller.
    """
    try:
        engine.say(text)
        engine.runAndWait()
    except Exception as e:
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit through while still keeping speech failures non-fatal.
        print(f"[TTS ERROR]: {e}")

# Globals
# Camera device index 0 and OpenCV's bundled frontal-face Haar cascade.
cap = cv2.VideoCapture(0)
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

# Written by authenticate_user(); camera_loop() only runs hand tracking / OCR
# once `authenticated` is True.
authenticated = False
authenticated_user = ""

# Drawing layer blended over every frame in camera_loop().
# NOTE(review): assumes the camera delivers 640x480 frames - confirm.
canvas = np.zeros((480, 640, 3), dtype=np.uint8)
draw_mode = False
ocr_mode = False
draw_color = (0, 255, 0)  # BGR
prev_x, prev_y = None, None  # previous fingertip position; None = no stroke in progress
prev_gesture = None
last_action_time = 0  # time.time() of the last triggered gesture action
cooldown = 1.5  # seconds before the same gesture may trigger again
ocr_cooldown = 2  # seconds; compared with time since last_ocr_time in camera_loop()
last_ocr_time = 0

# GUI Setup
root = tk.Tk()
root.title("Smart Vision Assistant")
root.configure(bg="#2e2e2e")

# Dark theme for the ttk buttons and frames.
style = ttk.Style()
style.theme_use("clam")
style.configure("TButton", font=("Segoe UI", 10), padding=6, relief="flat", background="#3a3a3a", foreground="white")
style.configure("TFrame", background="#2e2e2e")

# Label that camera_loop() updates with the latest processed frame.
video_label = tk.Label(root)
video_label.pack(pady=10)

# Functions
def toggle_draw_mode():
    """Flip the global ``draw_mode`` flag and print its new value."""
    global draw_mode
    if draw_mode:
        draw_mode = False
    else:
        draw_mode = True
    print("Draw mode:", draw_mode)

def toggle_ocr_mode():
    """Flip the global ``ocr_mode`` flag and print its new value."""
    global ocr_mode
    if ocr_mode:
        ocr_mode = False
    else:
        ocr_mode = True
    print("OCR mode:", ocr_mode)

def clear_canvas():
    """Erase the drawing layer by zeroing the shared ``canvas`` array in place."""
    canvas[:] = 0
    print("Canvas cleared")

def set_color(color):
    """Store *color* as the global drawing colour and print it.

    The colour buttons pass BGR tuples; the value is stored as given.
    """
    global draw_color
    draw_color = color
    print("Color changed:", draw_color)

def handle_gesture(gesture):
    """Announce *gesture* and trigger the desktop action bound to it.

    A gesture is acted on when it differs from the previously handled one,
    or when more than ``cooldown`` seconds have passed since the last action.
    Handling updates ``prev_gesture`` and ``last_action_time``, speaks the
    label, and then sends a media key press or a scroll via pyautogui.
    Labels without a binding are spoken but trigger nothing.
    """
    global prev_gesture, last_action_time

    now = time.time()
    is_repeat = gesture == prev_gesture
    if is_repeat and now - last_action_time <= cooldown:
        return

    prev_gesture = gesture
    last_action_time = now
    speak(gesture)

    # Label -> key name for pyautogui.press
    key_bindings = {
        'palm - play/pause': 'playpause',
        'like - volume up': 'volumeup',
        'dislike - volume down': 'volumedown',
    }
    # Label -> scroll amount for pyautogui.scroll
    scroll_bindings = {
        'peace - scroll up': 300,
        'four - scroll down': -300,
    }

    if gesture in key_bindings:
        pyautogui.press(key_bindings[gesture])
    elif gesture in scroll_bindings:
        pyautogui.scroll(scroll_bindings[gesture])

def authenticate_user(frame, faces):
    """Try to log in one of the detected faces against the images in ``users/``.

    Each face region is written to ``current_face.jpg`` and compared with
    every file in the ``users`` folder using ``DeepFace.verify``. On the first
    match the globals ``authenticated`` / ``authenticated_user`` are set (the
    user name is the matching file name without extension), the user is
    greeted, and the function returns without examining further faces.

    Args:
        frame: BGR image the faces were detected in; a green rectangle is
            drawn on it around every face that was examined.
        faces: iterable of ``(x, y, w, h)`` boxes in ``frame`` coordinates.

    Returns:
        None. The outcome is reported through the module globals.
    """
    global authenticated, authenticated_user
    for (x, y, w, h) in faces:
        face_roi = frame[y:y + h, x:x + w]
        if face_roi.size == 0:
            continue
        cv2.imwrite('current_face.jpg', face_roi)

        try:
            user_images = os.listdir("users/")
        except OSError as e:
            print(f"Authentication Error: {e}")
            user_images = []

        matched_user = None
        for user_image in user_images:
            user_image_path = os.path.join("users", user_image)
            try:
                result = DeepFace.verify('current_face.jpg', user_image_path, enforce_detection=False)
            except Exception as e:
                # One unreadable / non-image file must not stop the remaining
                # users from being checked.
                print(f"Authentication Error: {e}")
                continue
            if result['verified']:
                matched_user = os.path.splitext(user_image)[0]
                break

        cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)

        if matched_user is not None:
            authenticated = True
            authenticated_user = matched_user
            speak(f"Welcome {authenticated_user}")
            # Stop here: verifying the remaining faces would only repeat the
            # greeting or overwrite the user that was just logged in.
            return

def camera_loop():
    """Process one camera frame, refresh the Tk preview and reschedule itself.

    The mirrored frame is undistorted with the loaded calibration data. Until
    a user is authenticated it is only passed to ``authenticate_user``.
    Afterwards it is fed to MediaPipe Hands: in draw mode the index fingertip
    paints on ``canvas``, otherwise the landmarks are classified and handed to
    ``handle_gesture``. While OCR mode is on, hand processing is skipped and
    the raw (non-mirrored, not undistorted) frame is OCR'd at most once per
    ``ocr_cooldown`` seconds. The function re-arms itself with
    ``root.after(10, camera_loop)``, also after a failed read.
    """
    global prev_x, prev_y, last_ocr_time

    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame")
        root.after(10, camera_loop)
        return

    frame_raw = frame.copy()  # non-mirrored copy kept for OCR
    frame = cv2.flip(frame, 1)  # mirror for natural interaction

    # Undistort the frame using the camera calibration data
    # NOTE(review): the frame is mirrored before undistortion. If the
    # calibration was computed on non-mirrored images, undistorting first
    # and flipping afterwards may be the correct order - confirm.
    frame = cv2.undistort(frame, camera_matrix, dist_coeffs)

    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray, 1.1, 5, minSize=(100, 100))

    if not authenticated:
        authenticate_user(frame, faces)
    else:
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb)

        if results.multi_hand_landmarks and not ocr_mode:
            for hand_landmarks in results.multi_hand_landmarks:
                # Flattened (x, y) pairs of every landmark: the classifier input.
                landmarks = [coord for lm in hand_landmarks.landmark for coord in (lm.x, lm.y)]
                h, w, _ = frame.shape
                index_tip = hand_landmarks.landmark[8]
                cx, cy = int(index_tip.x * w), int(index_tip.y * h)

                if draw_mode:
                    # Extend the stroke from the previous fingertip position.
                    if prev_x is not None:
                        cv2.line(canvas, (prev_x, prev_y), (cx, cy), draw_color, 5)
                    prev_x, prev_y = cx, cy
                else:
                    prev_x, prev_y = None, None
                    try:
                        prediction = clf.predict(np.array(landmarks).reshape(1, -1))
                        gesture = label_mapping[prediction[0]]
                        handle_gesture(gesture)
                        cv2.putText(frame, f'Gesture: {gesture}', (10, 30),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)
                    except Exception as e:
                        print(f"[Prediction Error]: {e}")
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

        if ocr_mode:
            current_time = time.time()
            if current_time - last_ocr_time > ocr_cooldown:
                frame = cv2.resize(frame_raw, (640, 480))
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                blurred = cv2.GaussianBlur(gray, (5, 5), 0)
                _, thresh = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
                custom_config = r'--oem 3 --psm 6'
                details = pytesseract.image_to_data(thresh, config=custom_config, output_type=pytesseract.Output.DICT)

                # Keep only confidently recognised, non-blank words.
                # int(float(...)) also accepts confidences reported as float
                # values or strings such as '96.5'.
                words = [
                    text
                    for text, conf in zip(details['text'], details['conf'])
                    if text.strip() != "" and int(float(conf)) > 90
                ]
                detected_text = " ".join(words)

                # Speak the whole result once instead of re-reading the
                # growing prefix after every accepted word.
                if detected_text:
                    speak(detected_text)
                cv2.putText(frame, detected_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2, cv2.LINE_AA)

                # Restart the cooldown after OCR and speech have finished;
                # without this the cooldown check above never throttles.
                last_ocr_time = time.time()

        # NOTE(review): canvas is 480x640; this blend assumes the frame has
        # the same size - confirm for cameras with another resolution.
        frame = cv2.addWeighted(frame, 1, canvas, 0.5, 0)

    img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    img = Image.fromarray(img)
    imgtk = ImageTk.PhotoImage(image=img)
    # Keep a reference on the widget so the image is not garbage-collected
    # while Tk is still displaying it.
    video_label.imgtk = imgtk
    video_label.configure(image=imgtk)

    root.after(10, camera_loop)

# Button area: one container frame holding two rows stacked vertically.
button_frame = ttk.Frame(root)
button_frame.pack(pady=10)

row1 = ttk.Frame(button_frame)
row1.pack(pady=3)
row2 = ttk.Frame(button_frame)
row2.pack(pady=3)

# First row: mode toggles and canvas reset, left to right.
control_buttons = (
    ("Toggle Draw Mode", toggle_draw_mode),
    ("Toggle OCR Mode", toggle_ocr_mode),
    ("Clear Canvas", clear_canvas),
)
for caption, callback in control_buttons:
    control = ttk.Button(row1, text=caption, command=callback)
    control.pack(side=tk.LEFT, padx=5)

# Second row: one button per drawing colour (values are BGR tuples).
color_buttons = [
    ("Red", (0, 0, 255)),
    ("Green", (0, 255, 0)),
    ("Blue", (255, 0, 0)),
    ("Yellow", (0, 255, 255)),
    ("Black", (0, 0, 0))
]
for label, color in color_buttons:
    # The default argument freezes this iteration's colour for the callback.
    swatch = ttk.Button(row2, text=label, command=lambda c=color: set_color(c))
    swatch.pack(side=tk.LEFT, padx=5)

# Schedule the first frame, then hand control to Tk until the window closes.
camera_loop()
root.mainloop()

# Reached once mainloop() returns: release the camera and OpenCV windows.
cap.release()
cv2.destroyAllWindows()
