# 🎥 **04-Inferencia de Video con YOLO**

Este notebook aplica un modelo YOLO entrenado (o pre-entrenado) para detectar y contar vehículos en un video, guardar la secuencia anotada y reportar métricas de rendimiento (FPS, latencia, tiempo total).

In [1]:
from pathlib import Path
import cv2
import time
from ultralytics import YOLO
from collections import defaultdict
import imageio.v2 as imageio

In [12]:
# Input video that the inference cells below read frame by frame.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
VIDEO_IN = Path("/home/guardiaserver/bogota/vision-urbana-bogota/data/test/Bogot√° traffic video.mp4")

In [13]:
# Weights file of the trained YOLO model that is loaded for inference.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
MODEL_PT = Path("/home/guardiaserver/bogota/vision-urbana-bogota/models/yolov10m/weights/best.pt")

In [14]:
# Output folder for the results; created (including missing parents) if it
# does not exist yet, and left untouched if it already does.
OUTPUT_DIR = Path("/home/guardiaserver/bogota/vision-urbana-bogota/results/video_inference")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [15]:
# Annotated output video: stored in OUTPUT_DIR and named after the input clip.
VIDEO_OUT = OUTPUT_DIR.joinpath(f"{VIDEO_IN.stem}_yolo_test.mp4")

# Inference settings used by the prediction loop.
IMG_SIZE, CONF_THRESH = 640, 0.25   # passed as imgsz / conf to the model
DEVICE = 0                          # 0 -> GPU 0
SHOW_FRAMES = False                 # True to preview frames in a window while running

In [16]:
# Load the checkpoint referenced by MODEL_PT and fuse its layers
# (the original author notes that fusing speeds up inference).
weights_file = str(MODEL_PT)
model = YOLO(weights_file)
model.fuse()
print(f"Modelo {MODEL_PT.name}")

YOLOv10m summary (fused): 369 layers, 16,451,542 parameters, 0 gradients, 63.4 GFLOPs
Modelo best.pt


In [17]:
# Open the input video and fail fast if OpenCV cannot read it.
cap = cv2.VideoCapture(str(VIDEO_IN))
assert cap.isOpened(), f"No se pudo abrir {VIDEO_IN}"

# Stream properties as reported by the capture; the output video reuses them
# so the annotated clip has the same frame rate and frame size as the input.
fps_in  = cap.get(cv2.CAP_PROP_FPS)
width   = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height  = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
print(f"Resoluci√≥n: {width}√ó{height}  |  FPS original: {fps_in:.2f}")

# VideoWriter that stores the annotated output (mp4v codec).
fourcc = cv2.VideoWriter_fourcc(*"mp4v") # type: ignore
writer = cv2.VideoWriter(str(VIDEO_OUT), fourcc, fps_in, (width, height))

# cv2.VideoWriter does not raise when it cannot open the destination (missing
# codec, bad path, invalid fps/size); write() would then drop every frame
# silently. Check explicitly and release the capture before failing.
if not writer.isOpened():
    cap.release()
    raise RuntimeError(f"No se pudo crear el video de salida {VIDEO_OUT}")

Resoluci√≥n: 360√ó640  |  FPS original: 30.00


In [18]:
# Inference and frame writing.
# Reads the input video frame by frame, runs the detector on each frame,
# draws the detections and appends the annotated frame to the output video.
# t_total spans the whole loop (decoding, inference, drawing, encoding), so
# the metrics derived from it are end-to-end figures, not pure model latency.
frame_count, t0 = 0, time.time()
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            # End of stream (or a read failure): stop processing.
            break

        # Inference on a single frame; predict() returns a list, one entry per image.
        res = model.predict(
            frame,
            imgsz=IMG_SIZE,
            conf=CONF_THRESH,
            device=DEVICE,
            stream=False,
            verbose=False
        )[0]

        # Draw the results on the frame.
        annotated = res.plot()

        # Write the annotated frame and count it right away, so the counter
        # always matches the number of frames written (also when ESC is pressed).
        writer.write(annotated)
        frame_count += 1

        # Optional on-screen preview.
        if SHOW_FRAMES:
            cv2.imshow("YOLO Inference", annotated)
            if cv2.waitKey(1) & 0xFF == 27:   # ESC to quit
                break
finally:
    # Always release the handles, even on an exception or a kernel interrupt;
    # otherwise the output file is left unfinalised and the capture stays open.
    t_total = time.time() - t0
    cap.release()
    writer.release()
    if SHOW_FRAMES:
        cv2.destroyAllWindows()

In [19]:
# Performance summary derived from the counters of the inference loop.
# Both ratios are guarded: a video with no decodable frames (frame_count == 0)
# or a zero elapsed time would otherwise raise ZeroDivisionError here.
fps_real = frame_count / t_total if t_total > 0 else 0.0
lat_ms   = 1000 * t_total / frame_count if frame_count else float("nan")

print(f"üìà  Frames procesados : {frame_count}")
print(f"‚è±Ô∏è  Tiempo total      : {t_total:.2f} s")
print(f"‚ö° FPS promedio       : {fps_real:.2f}")
print(f"‚åõ Latencia promedio  : {lat_ms:.2f} ms por frame")
print(f"üéûÔ∏è  Video guardado en : {VIDEO_OUT}")

üìà  Frames procesados : 688
‚è±Ô∏è  Tiempo total      : 3.21 s
‚ö° FPS promedio       : 214.15
‚åõ Latencia promedio  : 4.67 ms por frame
üéûÔ∏è  Video guardado en : /home/guardiaserver/bogota/vision-urbana-bogota/results/video_inference/Bogot√° traffic video_yolo_test.mp4


In [20]:
# Vehicle counting with tracking.
# Tracks cars across frames and counts each track once, the first time the
# centre of its box moves from above the horizontal crossing line to on or
# below it. The annotated video with a running counter is saved to VIDEO_OUT.

# Paths and parameters (re-declared here so the cell carries its own configuration).
VIDEO_IN   = Path("/home/guardiaserver/bogota/vision-urbana-bogota/data/test/Bogot√° traffic video.mp4")
MODEL_PT   = Path("/home/guardiaserver/bogota/vision-urbana-bogota/models/yolov10m/weights/best.pt")
OUTPUT_DIR = Path("/home/guardiaserver/bogota/vision-urbana-bogota/results/video_inference")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
VIDEO_OUT  = OUTPUT_DIR / f"{VIDEO_IN.stem}_count.mp4"

IMG_SIZE    = 640
CONF_THRESH = 0.25
DEVICE      = 0
SHOW_FRAMES = False
TRACKER_YAML= "bytetrack.yaml"

# Load the model.
model = YOLO(str(MODEL_PT))
model.fuse()
print(f"Modelo {MODEL_PT.name}")
# Class index whose name is "car" (case-insensitive); next() raises
# StopIteration if the model has no such class.
car_id = next(i for i, n in model.names.items() if n.lower() == "car")

# Input / output video.
cap = cv2.VideoCapture(str(VIDEO_IN))
assert cap.isOpened(), f"No se pudo abrir {VIDEO_IN}"

fps_in = cap.get(cv2.CAP_PROP_FPS)
W = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
H = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
writer = cv2.VideoWriter(str(VIDEO_OUT), cv2.VideoWriter_fourcc(*"mp4v"), fps_in, (W, H)) # type: ignore
# cv2.VideoWriter fails silently; without this check every write() below
# would be dropped and the cell would still report success.
if not writer.isOpened():
    cap.release()
    raise RuntimeError(f"No se pudo crear el video de salida {VIDEO_OUT}")

# Crossing line: horizontal, at half the frame height.
line_y = int(0.5 * H)
pt1, pt2 = (0, line_y), (W, line_y)

# Counting state.
track_last_pos = {}      # track id -> vertical centre seen in the previous observation
counted_ids = set()      # track ids that have already been counted
total_cars = 0
frame_count, t0 = 0, time.time()

# Height of the top banner (in pixels).
box_h = 60

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # persist=True keeps the tracker state between successive calls so that
        # ids stay stable from frame to frame; only the "car" class is tracked.
        results = model.track(
            frame,
            imgsz=IMG_SIZE,
            conf=CONF_THRESH,
            device=DEVICE,
            persist=True,
            tracker=TRACKER_YAML,
            classes=[car_id],
            verbose=False
        )[0]

        annotated = results.plot()

        # Crossing line.
        cv2.line(annotated, pt1, pt2, color=(0, 255, 255), thickness=2)

        # boxes.id is None on frames where the tracker reports no ids (e.g. no
        # car in view); calling .cpu() on it would raise AttributeError, so
        # such frames are written without any per-track processing.
        boxes = results.boxes
        track_ids = boxes.id if boxes is not None else None
        if track_ids is not None:
            for box, tid in zip(boxes.xyxy.cpu().numpy(), track_ids.cpu().numpy()):
                tid = int(tid)
                x1, y1, x2, y2 = box
                cx, cy = int((x1 + x2) / 2), int((y1 + y2) / 2)

                # First observation of a track uses its own position as the
                # "previous" one, so it can never count on its first frame.
                prev_cy = track_last_pos.get(tid, cy)
                track_last_pos[tid] = cy

                # Count only top-to-bottom crossings, once per track id.
                if prev_cy < line_y <= cy and tid not in counted_ids:
                    total_cars += 1
                    counted_ids.add(tid)

                cv2.circle(annotated, (cx, cy), 4, (0, 0, 255), -1)

        # Filled yellow banner across the top of the frame.
        cv2.rectangle(annotated, (0, 0), (W, box_h), (0, 255, 255), thickness=-1)

        # Counter text in black.
        cv2.putText(annotated, f"Vehiculos contados: {total_cars}",
                    org=(20, int(box_h * 0.7)),
                    fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                    fontScale=1.2,
                    color=(0, 0, 0), thickness=3)

        writer.write(annotated)
        frame_count += 1

        if SHOW_FRAMES:
            cv2.imshow("Conteo Vehicular", annotated)
            if cv2.waitKey(1) & 0xFF == 27:   # ESC to quit
                break
finally:
    # Shutdown: always release the handles so the MP4 is finalised even if the
    # loop is interrupted or raises.
    cap.release()
    writer.release()
    if SHOW_FRAMES:
        cv2.destroyAllWindows()
    t_total = time.time() - t0

print(f"‚úÖ Video generado con {total_cars} autos contados")
print(f"üìç Guardado en: {VIDEO_OUT}")

YOLOv10m summary (fused): 369 layers, 16,451,542 parameters, 0 gradients, 63.4 GFLOPs
Modelo best.pt
‚úÖ Video generado con 18 autos contados
üìç Guardado en: /home/guardiaserver/bogota/vision-urbana-bogota/results/video_inference/Bogot√° traffic video_count.mp4


In [21]:
# Export a short segment of the counting video as an animated GIF.

# Path to the video generated by the counting step; the GIF is written next to it.
video_path = Path("/home/guardiaserver/bogota/vision-urbana-bogota/results/video_inference/Bogot√° traffic video_count.mp4")
gif_path   = video_path.with_suffix(".gif")

# Open the video with OpenCV.
cap = cv2.VideoCapture(str(video_path))
assert cap.isOpened(), f"No se pudo abrir {video_path}"

frames = []
try:
    # Frame rate and estimated duration. A missing or unreadable FPS would
    # make the division below fail with ZeroDivisionError, so check it first.
    fps    = cap.get(cv2.CAP_PROP_FPS)
    total  = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    assert fps > 0, f"No se pudo leer el FPS de {video_path}"
    dur    = total / fps

    # Segment to export (here: from second 10 to second 15), as frame indices.
    start_sec = 10
    end_sec   = 15
    start_f   = int(start_sec * fps)
    end_f     = int(end_sec * fps)

    # Decode sequentially from the start and keep frames start_f..end_f
    # (inclusive), stopping as soon as the segment is complete.
    i = 0
    while i <= end_f:
        ret, frame = cap.read()
        if not ret:
            break
        if i >= start_f:
            # OpenCV decodes to BGR; convert to RGB for imageio.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            # Resize to 640 px wide, keeping the aspect ratio.
            frame_resized = cv2.resize(frame_rgb, (640, int(640 * frame.shape[0] / frame.shape[1])))
            frames.append(frame_resized)
        i += 1
finally:
    cap.release()

# An empty list means the requested segment lies beyond the end of the clip;
# fail with an explicit message instead of an obscure writer error.
assert frames, f"El segmento {start_sec}-{end_sec} s no contiene frames (duracion del video: {dur:.2f} s)"

# Write the GIF (10 fps, looping forever).
imageio.mimsave(str(gif_path), frames, fps=10, loop=0)
print(f"‚úÖ GIF creado en: {gif_path}")

‚úÖ GIF creado en: /home/guardiaserver/bogota/vision-urbana-bogota/results/video_inference/Bogot√° traffic video_count.gif


In [None]:
# Save one frame of an output video as a PNG next to the video file.

# Source video and the index of the frame to extract (for example, frame 400).
video_path = Path("/home/guardiaserver/bogota/vision-urbana-bogota/results/video_inference/4K Road traffic video_count.mp4")
frame_to_capture = 400

# The image goes into the same folder as the video.
output_dir = video_path.parent
save_path = output_dir / f"{video_path.stem}_frame{frame_to_capture}.png"

# Open the video.
cap = cv2.VideoCapture(str(video_path))
assert cap.isOpened(), f"No se pudo abrir {video_path}"

fps          = cap.get(cv2.CAP_PROP_FPS)
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
print(f"üéûÔ∏è  FPS: {fps:.2f} | Frames totales: {total_frames}")

# Seek to the requested frame and decode it; the capture is no longer needed
# once the frame has been read.
cap.set(cv2.CAP_PROP_POS_FRAMES, frame_to_capture)
ret, frame = cap.read()
cap.release()

if not ret:
    print(f"‚ùå No se pudo capturar el frame {frame_to_capture}")
else:
    cv2.imwrite(str(save_path), frame)
    print(f"‚úÖ Imagen guardada: {save_path}")

üéûÔ∏è  FPS: 30.00 | Frames totales: 9184
‚úÖ Imagen guardada: /home/guardiaserver/bogota/vision-urbana-bogota/results/video_inference/4K Road traffic video_count_frame400.png
