In [None]:
# 导入所需的库
from ultralytics import YOLO
import cv2
import torch # 确保torch也被导入
import json

In [2]:
# --- 1. 加载预训练的YOLO模型 ---
print("正在加载YOLOv8n模型...")
model = YOLO('backend/resource/yolo11n.pt')
print("模型加载完毕。")

正在加载YOLOv8n模型...
模型加载完毕。


In [3]:
# --- 2. 加载图像 ---
image_path = 'backend/resource/bus.jpg'
try:
    img = cv2.imread(image_path)
    if img is None:
        print(f"错误：无法加载图片，请检查路径是否正确: {image_path}")
        exit()
except Exception as e:
    print(f"加载图片时出错: {e}")
    exit()

print(f"成功加载图片: {image_path}")
# --- 新增：获取图片尺寸 ---
# img.shape 返回一个元组 (height, width, channels)
height, width, _ = img.shape
image_dimensions = {"width": width, "height": height}

print(f"图片尺寸: {width}x{height}")

成功加载图片: backend/resource/bus.jpg
图片尺寸: 810x1080


In [4]:
# --- 3. 执行检测 ---
print("正在对图像进行目标检测...")
results = model(img)
print("检测完成。")

正在对图像进行目标检测...

0: 640x480 4 persons, 1 bus, 152.5ms
Speed: 8.0ms preprocess, 152.5ms inference, 9.8ms postprocess per image at shape (1, 3, 640, 480)
检测完成。


In [5]:
# --- 4. 提取结构化数据 (整合为单一JSON对象) ---
# 这是项目目标[6]中提到的提取关键特征的一部分
detected_objects_list = []
for box in results[0].boxes:
    class_id = int(box.cls[0])
    confidence = float(box.conf[0])
    class_name = model.names[class_id]
    bounding_box = [int(coord) for coord in box.xyxy[0].tolist()]
    
    detected_objects_list.append({
        "class": class_name,
        "confidence": round(confidence, 4),
        "bounding_box": bounding_box
    })

# 创建一个顶层的数据结构来包含所有信息
scene_data = {
    "image_dimensions": image_dimensions,
    "detected_objects": detected_objects_list
}

print("\n--- 提取到的完整场景数据 (Python Dict) ---")
# 使用json.dumps来美化打印输出
json_data = json.dumps(scene_data, indent=2)
print(json_data)


--- 提取到的完整场景数据 (Python Dict) ---
{
  "image_dimensions": {
    "width": 810,
    "height": 1080
  },
  "detected_objects": [
    {
      "class": "bus",
      "confidence": 0.9402,
      "bounding_box": [
        3,
        229,
        796,
        728
      ]
    },
    {
      "class": "person",
      "confidence": 0.8882,
      "bounding_box": [
        671,
        394,
        809,
        878
      ]
    },
    {
      "class": "person",
      "confidence": 0.8783,
      "bounding_box": [
        47,
        399,
        239,
        904
      ]
    },
    {
      "class": "person",
      "confidence": 0.8558,
      "bounding_box": [
        223,
        408,
        344,
        860
      ]
    },
    {
      "class": "person",
      "confidence": 0.6219,
      "bounding_box": [
        0,
        556,
        68,
        872
      ]
    }
  ]
}


In [None]:
# --- 5. 创建LLM提示 ---
# 这是项目目标[7]中提到的关键一步：动态格式化数据为提示
def create_prompt_with_json(json_data):
    # --- 新增：数据结构定义 ---
    schema_description = """
First, understand the data structure I will provide. The data will be a JSON object with the following schema:
- "image_properties": An object containing the overall image dimensions.
  - "width": (integer) The total width of the image in pixels.
  - "height": (integer) The total height of the image in pixels.
- "detected_objects": A list of objects found in the image. Each object in this list has the following structure:
  - "class": (string) The name of the detected object class.
  - "confidence": (float) The model's confidence in the detection, from 0.0 to 1.0.
  - "bounding_box": A list of four integers [x_center , y_center, width, height]
"""
    
    # 构建最终的提示。这部分可以根据需要灵活修改
    # 它包含一个角色指令、原始数据和具体任务
    prompt = f"""You are an AI assistant that describes images based on a list of detected objects provided in JSON format.
{schema_description}
Now, analyze the following specific scene data based on the schema described above.
Detected objects data:
{json_data}
Based on this structured data, please generate a concise and natural language description of the scene. Try to infer relationships between objects and their general locations (e.g., "left side", "center", "behind another object") rather than just listing the raw bounding box coordinates. Focus on creating a human-like, coherent description of what the image likely contains.
"""
    return prompt

prompt = create_prompt_with_json(json_data)

'You are an AI assistant that describes images based on a list of detected objects provided in JSON format.\n\nFirst, understand the data structure I will provide. The data will be a JSON object with the following schema:\n- "image_properties": An object containing the overall image dimensions.\n  - "width": (integer) The total width of the image in pixels.\n  - "height": (integer) The total height of the image in pixels.\n- "detected_objects": A list of objects found in the image. Each object in this list has the following structure:\n  - "class": (string) The name of the detected object class.\n  - "confidence": (float) The model\'s confidence in the detection, from 0.0 to 1.0.\n  - "bounding_box": A list of four integers [x_center , y_center, width, height]\n\nNow, analyze the following specific scene data based on the schema described above.\nDetected objects data:\n{\n  "image_dimensions": {\n    "width": 810,\n    "height": 1080\n  },\n  "detected_objects": [\n    {\n      "class

In [None]:
# Ollama / DeepSeek Configuration
OLLAMA_API_URL = "http://localhost:11434/api/generate"
DEEPSEEK_MODEL_NAME = "deepseek-r1:8b" # 默认使用 "deepseek-llm"，请根据 `ollama list` 的结果修改

def get_local_llm_description(prompt):

    # 将 'YOUR_MODEL_NAME' 替换为 `ollama list` 中显示的模型名称
    # 例如: 'deepseek-llm:8b'
    model_name = DEEPSEEK_MODEL_NAME
    print(f"--- 正在发送提示到本地Ollama模型: {model_name} ---")

    try:
        # 使用 ollama.chat 与本地模型交互
        response = ollama.chat(
            model=model_name,
            messages=[
                {'role': 'user', 'content': prompt}
            ]
        )
        print("--- 已从本地模型接收到响应 ---")
        return response['message']['content']
    except Exception as e:
        print(f"调用本地Ollama模型时出错: {e}")
        return f"Error interacting with local Ollama model: {e}"

ModuleNotFoundError: No module named 'ollama'

In [14]:
# (可选) 如果你还想看到图片，可以取消下面代码的注释
annotated_frame = results[0].plot()
cv2.imshow("YOLOv8 Detection", annotated_frame)
cv2.waitKey(0)
cv2.destroyAllWindows()