In [1]:
import torch
import cv2
import pyttsx3
import requests
import json
import numpy as np
import time
import speech_recognition as sr
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import openai

In [2]:
#Loading pre-trained YOLO model
model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained = True)

Using cache found in C:\Users\Administrator/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2024-6-2 Python-3.10.7 torch-2.0.1+cpu CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 


In [3]:
#Initialize Text-to-Speech Engine
engine = pyttsx3.init()

In [4]:
#Initialize OpenAI API(Use your Own People)
openai.api_key = 'Key'

In [5]:
def speak(text):
    engine.say(text)
    engine.runAndWait()

In [6]:
def get_objects_details(object_name):
    try:
        url = f"htps://end.wikipedia.org/api/rest_v1/page/summary/{object_name}"
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            return data.get('extract', f"Information about {object_name}")
        else:
            return f"Information about {object_name} is not available."
    except requests.RequestException as e:
        return f"Could not retrieve information due to an error: {str(e)}."

In [7]:
def get_position(bbox, frame_width):
    center_x = (bbox[0] + bbox[2]) /2
    if center_x < frame_width / 3:
        return "left"
    elif center_x > 2 *frame_width /3:
        return "right"
    else:
        return "center"

In [8]:
def process_frame(frame):
    results = model(frame)
    labels, cord = results.xyxyn[0][:, -1].numpy(), results.xyxyn[0][:, :-1].numpy()
    frame_width = frame.shape[1]

    detected_objects = []
    for i, (label, bbox) in enumerate(zip(labels, cord)) :
        class_name = model.names[int(label)]
        x1,y1,x2,y2, conf = bbox
        x1,y1,x2,y2 = int(x1 * frame_width), int(y1 * frame.shape[0]), int(x2 * frame_width), int(y2 * frame_width[0])

        position = get_position([x1,y1,x2,y2],  frame_width)

        #draw bounding box
        cv2.rectangle(frame, (x1,y1) , (x2,y2), (255, 0,0), 2)
        cv2.putText(frame, f"{class_name} {conf: .2f}", (x1,y1-10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255,0,0))

        speak(f"I see a {class_name} on the {position}.")

        details = get_object_details(class_name)
        speak(details)

        detected_objects.append({"class_name": class_name, "position": position, "details": details})
        time.sleep(2) #Pause to allow TTs to finish speaking
    return frame, detected_objects

In [9]:
def recognize_speech():
    recognizer = sr.Recognizer()
    mic = sr.Microphone()

    with mic as source:
        print ("Listening....")
        recognizer.adjust_for_ambient_noise(source)
        audio = recognizer.listen(source)
    try:
        command = recognizer.recognize_google(audio)
        print(f"Recognized: {command}")
        return command.lower()
    except sr.UnknownValueError:
        return "Sorry, I didn't catch that."
    except sr.RequestError as e:
        return f"Could not request results; {e}"
from transformers import pipeline

#Initialize the Hugging Face pipeline
qa_pipeline = pipeline("question-answering")

def ask_openai(question, context=""):
    try:
        response = qa_pipeline(question=question, context=context)
        answer = response['answer']
        return answer
    except Exception as e:
        return f"An error occured while getting the response: {str(e)}"
    

  from .autonotebook import tqdm as notebook_tqdm
No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
Downloading config.json: 100%|██████████| 473/473 [00:00<?, ?B/s] 
Downloading model.safetensors: 100%|██████████| 261M/261M [01:08<00:00, 3.79MB/s] 
Downloading tokenizer_config.json: 100%|██████████| 49.0/49.0 [00:00<?, ?B/s]
Downloading vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 528kB/s]
Downloading tokenizer.json: 100%|██████████| 436k/436k [00:00<00:00, 661kB/s]


In [11]:
def handle_command(command, detected_objects):
    context = " ".join([obj['details'] for obj in detected_objects])
    if "repeat" in command:
        speak("I will repeat the information.")
        for obj in detected_objects:
            speak(f"I see a {obj['class_name']} on the {obj['position']}. {obj['details']}")
    elif "details" in command:
        for obj in detected_objects:
            if obj['class_name'] in command:
                speak(f"Here are more details about {obj['class_name']}: {obj['details']}")
    else:
        answer = ask_openai(command, context)
        speak(answer)
        speak("Do you need any further assistance?")

In [13]:
def main(source_type="camera"):
    source = 0 if source_type == "camera" else source_type
    cap = cv2.VideoCapture(source)

    if not cap.isOpened():
        print(f"Error: Unable to open video source {source}")
        return
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frame, detected_objects = process_frame(frame)
        cv2.imshow('YOLO Object Detection', frame)

        if cv2.waitkey(1) & 0xFF == ord('q'):
            break

        command = recognize_speech()
        if command:
            handle_command(command, detected_objects)
    cap.release()
    cv2.destroyAllWindows()

if __name__ == "__main__":
    source_type = input("Enter source type (camera/file): ").strip()  
