# Práctica 4/4b

En esta práctica, empleamos dos modelos de OCR diferentes para hacer una comparación entre ambos. En este caso, empleamos el mismo código para ambos, cambiando únicamente el modelo para la detección OCR.

## EasyOCR


En este código se hace uso de dos modelos de yolov11, uno preentrenado y otro entrenado únicamente para detectar matrículas de coche, haciendo uso de un dataset descargado de Roboflow.
Se mantiene una caché de la última detección que se ha hecho de un objeto, por su track_id, para luego poder comparar las nuevas detecciones con la última, y mantener siempre la mejor.
El proceso de detección y reconocimiento es el siguiente:
Primero se emplea el modelo yolov11 preentrenado, con seguimiento, para detectar personas, bicicletas, coches, motocicletas y guaguas. Para cada detección de las clases de interés (coche, motocicleta y guagua) se recorta la región del vehículo y se emplea el segundo modelo de yolov11 para localizar la matrícula dentro de ese recorte. Si la bbox de la matrícula es aceptable (área y confianza mínimas), se aplican ciertas mejoras a la imagen (escala de grises, reescalado si es pequeña, CLAHE, suavizado y umbralización adaptativa) para facilitar el reconocimiento, y a continuación se pasa por el modelo de OCR (en este caso EasyOCR) para el reconocimiento de caracteres. Como parte del reconocimiento, el resultado obtenido se compara con el último almacenado para dicho objeto, de modo que en la caché quede siempre el mejor resultado.
Además, se mantiene una cuenta de los objetos únicos que se han encontrado de cada clase, y se va escribiendo en un csv que refleja el resultado total de la inferencia.

In [None]:
import cv2
from ultralytics import YOLO
import easyocr

# Vehicle/person tracking with YOLO plus licence-plate reading with EasyOCR.
#
# Per frame: track objects, count unique track ids per class, look for a plate
# inside every car/bus/motorcycle box, OCR it, keep the best reading per track
# in `plate_cache`, annotate the frame and append one CSV row per accepted
# detection (confidence >= 0.6 and a tracker id).

model = YOLO('yolo11n.pt')  # general detector + tracker
model_2 = YOLO('TGC_RBNW/runs/detect/train2/weights/best.pt')  # licence-plate detector

# Indexed by the class id reported by the tracker; "" marks an id with no name here.
classNames = ["person", "bicycle", "car", "motorcycle", "", "bus"]

vid = cv2.VideoCapture("C0142.MP4")
if not vid.isOpened():
    # Without this check the loop below ends immediately and an empty video is written.
    raise OSError("Could not open input video 'C0142.MP4'")
fps = vid.get(cv2.CAP_PROP_FPS)
width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter('salida_easyocr.mp4', fourcc, fps, (width, height))

# Classes whose unique track ids are counted (name -> class id).
classes_to_count = {"person": 0, "car": 2, "motorcycle": 3, "bus": 5}
count_name_by_cls = {cid: name for name, cid in classes_to_count.items()}
seen_ids = {k: set() for k in classes_to_count}
totals_unique = {k: 0 for k in classes_to_count}

reader = easyocr.Reader(['en'], gpu=True)

plate_cache = {}        # track_id -> {"text", "prob", "last_try"}
OCR_TRY_EVERY = 5       # minimum number of frames between OCR attempts for one track
MIN_PLATE_AREA = 2500   # minimum padded plate area (px^2) worth sending to OCR
PAD = 8                 # pixels added around the detected plate box
ALLOW = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"  # characters OCR may return
GOOD_LEN = 5            # a cached reading this long ...
GOOD_PROB = 0.7         # ... and this confident stops further OCR for the track

# Loop-invariant: one CLAHE object is reused for every plate.
clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))


def _new_cache_entry():
    """Return the cache record for a track that has no plate reading yet."""
    return {"text": "", "prob": 0.0, "last_try": -9999}


def _find_plate(vehicle_img):
    """Locate the most confident plate inside a vehicle crop.

    Returns ``(x1, y1, x2, y2, conf)`` in crop coordinates, padded by ``PAD``
    and clamped to the crop, or ``None`` when the detector returns no box.
    """
    candidates = model_2.predict(vehicle_img, classes=[0], verbose=False)[0].boxes
    best_box, best_conf = None, 0.0
    for candidate in candidates:
        cand_conf = float(candidate.conf[0])
        if cand_conf >= best_conf:
            best_box, best_conf = candidate, cand_conf
    if best_box is None:
        return None

    px1, py1, px2, py2 = map(int, best_box.xyxy[0].tolist())
    crop_h, crop_w = vehicle_img.shape[:2]
    px1 = max(0, px1 - PAD)
    py1 = max(0, py1 - PAD)
    px2 = min(crop_w - 1, px2 + PAD)
    py2 = min(crop_h - 1, py2 + PAD)
    return px1, py1, px2, py2, best_conf


def _preprocess_plate(plate_roi):
    """Return a binarised version of a BGR plate crop, ready for OCR."""
    gray = cv2.cvtColor(plate_roi, cv2.COLOR_BGR2GRAY)
    # Upscale small crops before OCR.
    if max(gray.shape) < 220:
        gray = cv2.resize(gray, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
    # Contrast boost, light blur, then adaptive threshold.
    gray = clahe.apply(gray)
    gray = cv2.GaussianBlur(gray, (3, 3), 0)
    return cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 31, 15
    )


def _ocr_plate(binary):
    """OCR a preprocessed plate and return ``(text, mean_confidence)``.

    Fragments below 0.4 confidence are dropped; the rest are joined left to
    right. Returns ``("", 0.0)`` when nothing usable is read.
    """
    fragments = []
    for bbox, text, prob in reader.readtext(binary, detail=1, paragraph=False, allowlist=ALLOW):
        prob = float(prob)
        if text and prob >= 0.4:
            fragments.append((min(point[0] for point in bbox), text, prob))
    if not fragments:
        return "", 0.0
    fragments.sort(key=lambda frag: frag[0])
    joined = "".join(frag[1] for frag in fragments).replace(" ", "").upper()
    mean_prob = sum(frag[2] for frag in fragments) / len(fragments)
    return joined, mean_prob


def _attempt_plate_read(vehicle_img, cached, frame_no):
    """Detect and OCR the plate of one vehicle crop, updating ``cached`` in place.

    Returns ``(plate_box, text)``. ``plate_box`` is ``(x1, y1, x2, y2)`` in crop
    coordinates when a plate passed the area/confidence gate, else ``None``.
    ``text`` is the raw reading of this attempt ("" if none), whether or not it
    was good enough to replace the cached one.
    """
    plate = _find_plate(vehicle_img)
    if plate is None:
        # No plate box at all: last_try is left untouched so the next frame retries.
        return None, ""

    px1, py1, px2, py2, plate_conf = plate
    cached["last_try"] = frame_no
    area = max(0, px2 - px1) * max(0, py2 - py1)
    if area < MIN_PLATE_AREA or plate_conf < 0.5:
        return None, ""

    text, prob = _ocr_plate(_preprocess_plate(vehicle_img[py1:py2, px1:px2]))
    # Replace the cached reading only when the new one is plausible and better.
    if len(text) >= GOOD_LEN and prob >= 0.5 and prob > float(cached["prob"]):
        cached["text"] = text
        cached["prob"] = prob
        with open("plates_easyocr.txt", "a", encoding="utf-8") as fw:
            fw.write(f'{text}\n')
    return (px1, py1, px2, py2), text


frame_idx = 0

try:
    # The CSV is opened once for the whole run instead of once per detection.
    with open("results_easyocr.csv", "a", encoding="utf-8") as csv_file:
        while True:
            ret, img = vid.read()
            if not ret:
                break

            # Tracking with YOLO, restricted to the relevant classes.
            results = model.track(img, persist=True, classes=[0, 1, 2, 3, 5])

            # Crops for plate detection/OCR come from this untouched copy, so
            # boxes and labels drawn on `img` never leak into the OCR input.
            clean = img.copy()

            for box in results[0].boxes:
                x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
                conf = float(box.conf[0])
                cls = int(box.cls[0])

                # Low-confidence or untracked detections are neither drawn nor logged.
                if conf < 0.6 or box.id is None:
                    continue
                tid = int(box.id[0])
                track_id = str(tid)

                # Fall back to the numeric id for unnamed / out-of-range classes.
                cls_name = classNames[cls] if 0 <= cls < len(classNames) and classNames[cls] else str(cls)

                csv_text = ",".join((
                    str(frame_idx), cls_name, f"{conf:.4f}", track_id,
                    str(x1), str(y1), str(x2), str(y2)
                ))

                counted_as = count_name_by_cls.get(cls)
                if counted_as is not None and tid not in seen_ids[counted_as]:
                    seen_ids[counted_as].add(tid)
                    totals_unique[counted_as] += 1

                color = (0, 255, 255) if cls in (2, 3, 5) else (255, 0, 0)
                cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
                label = f"{track_id} {cls_name} {conf:.2f}"
                cv2.putText(img, label, (x1, max(0, y1 - 5)), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

                if cls_name in ("car", "bus", "motorcycle"):
                    # Crop origin in frame coordinates (clamped so slicing never wraps).
                    ox, oy = max(0, x1), max(0, y1)
                    vehicle = clean[oy:y2, ox:x2]

                    cached = plate_cache.setdefault(track_id, _new_cache_entry())
                    settled = cached["prob"] >= GOOD_PROB and len(cached["text"]) >= GOOD_LEN
                    due = frame_idx - cached["last_try"] >= OCR_TRY_EVERY

                    if not settled and due and vehicle.size > 0:
                        plate_box, plate_text = _attempt_plate_read(vehicle, cached, frame_idx)
                        if plate_box is not None:
                            # Plate box is crop-relative: shift it into frame
                            # coordinates before drawing or logging it.
                            fx1, fy1 = ox + plate_box[0], oy + plate_box[1]
                            fx2, fy2 = ox + plate_box[2], oy + plate_box[3]
                            cv2.rectangle(img, (fx1, fy1), (fx2, fy2), (255, 128, 0), 2)
                            if plate_text:
                                csv_text += "," + ",".join((
                                    "license_plate", str(fx1), str(fy1), str(fx2), str(fy2), plate_text
                                ))

                    # Show the best reading known so far for this track.
                    shown = cached["text"]
                    if shown:
                        cv2.putText(img, f"PLATE: {shown}", (x1, y2 + 20),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

                # One CSV line per accepted detection.
                csv_file.write(f"{csv_text}\n")

            out.write(img)
            frame_idx += 1
finally:
    # Release the capture and the writer even if inference fails midway.
    vid.release()
    out.release()

# Unique plates: tracked vehicles that ended with an accepted plate reading.
license_plates = sum(1 for entry in plate_cache.values() if entry["text"])

print("==== TOTALES ÚNICOS ====")
print("license plates:", license_plates)
print("persons:", totals_unique["person"])
print("cars:", totals_unique["car"])
print("motorcycles:", totals_unique["motorcycle"])
print("buses:", totals_unique["bus"])

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.



0: 384x640 4 cars, 1 bus, 86.6ms
Speed: 12.6ms preprocess, 86.6ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 cars, 1 bus, 59.8ms
Speed: 1.9ms preprocess, 59.8ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 cars, 1 bus, 67.3ms
Speed: 1.7ms preprocess, 67.3ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 cars, 1 bus, 99.8ms
Speed: 2.8ms preprocess, 99.8ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 cars, 1 bus, 86.5ms
Speed: 2.1ms preprocess, 86.5ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 cars, 1 bus, 69.3ms
Speed: 1.6ms preprocess, 69.3ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)





0: 384x640 4 cars, 1 bus, 70.7ms
Speed: 1.7ms preprocess, 70.7ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 cars, 1 bus, 91.2ms
Speed: 1.6ms preprocess, 91.2ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 cars, 1 bus, 60.8ms
Speed: 1.8ms preprocess, 60.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 cars, 1 bus, 65.0ms
Speed: 1.7ms preprocess, 65.0ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 cars, 1 bus, 66.5ms
Speed: 1.7ms preprocess, 66.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 cars, 1 bus, 66.7ms
Speed: 2.5ms preprocess, 66.7ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 cars, 1 bus, 65.8ms
Speed: 1.8ms preprocess, 65.8ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 cars, 1 bus, 86.7ms
Speed: 2.6ms preprocess, 86.7ms inference, 1.5ms p

error: OpenCV(4.12.0) D:\a\opencv-python\opencv-python\opencv\modules\highgui\src\window.cpp:1295: error: (-2:Unspecified error) The function is not implemented. Rebuild the library with Windows, GTK+ 2.x or Cocoa support. If you are on Ubuntu or Debian, install libgtk2.0-dev and pkg-config, then re-run cmake or configure script in function 'cvDestroyAllWindows'


## PyTesseract

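En esta celda se realizan únicamente los cambios necesarios respecto al código anterior para sustituir EasyOCR por PyTesseract: la llamada al OCR (`pytesseract.image_to_data` con `--psm 7` y lista blanca de caracteres), la normalización de la confianza que devuelve Tesseract (0–100) al rango 0–1 y un umbral `GOOD_PROB` algo más bajo (0.55). El resto del flujo (detección, seguimiento, preprocesado y caché) es esencialmente el mismo que en la celda de EasyOCR.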

In [3]:
import cv2
from ultralytics import YOLO
import pytesseract
from pytesseract import Output

# Vehicle/person tracking with YOLO plus licence-plate reading with Tesseract.
#
# Per frame: track objects, count unique track ids per class, look for a plate
# inside every car/bus/motorcycle box, OCR it, keep the best reading per track
# in `plate_cache`, annotate the frame and append one CSV row per accepted
# detection (confidence >= 0.6 and a tracker id).

# NOTE(review): machine-specific path to the Tesseract binary; adjust per install.
pytesseract.pytesseract.tesseract_cmd = r"C:/Users/kevin/AppData/Local/Programs/Tesseract-OCR/tesseract.exe"
model = YOLO('yolo11n.pt')  # general detector + tracker
model_2 = YOLO('TGC_RBNW/runs/detect/train2/weights/best.pt')  # licence-plate detector

# Indexed by the class id reported by the tracker; "" marks an id with no name here.
classNames = ["person", "bicycle", "car", "motorcycle", "", "bus"]

vid = cv2.VideoCapture("C0142.MP4")
if not vid.isOpened():
    # Without this check the loop below ends immediately and an empty video is written.
    raise OSError("Could not open input video 'C0142.MP4'")
fps = vid.get(cv2.CAP_PROP_FPS)
width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter('salida_tesseract_uv.mp4', fourcc, fps, (width, height))

# Classes whose unique track ids are counted (name -> class id).
classes_to_count = {"person": 0, "car": 2, "motorcycle": 3, "bus": 5}
count_name_by_cls = {cid: name for name, cid in classes_to_count.items()}
seen_ids = {k: set() for k in classes_to_count}
totals_unique = {k: 0 for k in classes_to_count}

plate_cache = {}        # track_id -> {"text", "prob", "last_try"}
OCR_TRY_EVERY = 5       # minimum number of frames between OCR attempts for one track
MIN_PLATE_AREA = 2500   # minimum padded plate area (px^2) worth sending to OCR
PAD = 8                 # pixels added around the detected plate box
ALLOW = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"  # characters OCR may return
GOOD_LEN = 5            # a cached reading this long ...
GOOD_PROB = 0.55        # ... and this confident stops further OCR for the track

TESS_PSM = 7
TS_CONFIG = f'-l eng --oem 3 --psm {TESS_PSM} -c tessedit_char_whitelist={ALLOW}'

# Loop-invariant: one CLAHE object is reused for every plate.
clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))


def _new_cache_entry():
    """Return the cache record for a track that has no plate reading yet."""
    return {"text": "", "prob": 0.0, "last_try": -9999}


def _find_plate(vehicle_img):
    """Locate the most confident plate inside a vehicle crop.

    Returns ``(x1, y1, x2, y2, conf)`` in crop coordinates, padded by ``PAD``
    and clamped to the crop, or ``None`` when the detector returns no box.
    """
    candidates = model_2.predict(vehicle_img, classes=[0], verbose=False)[0].boxes
    best_box, best_conf = None, 0.0
    for candidate in candidates:
        cand_conf = float(candidate.conf[0])
        if cand_conf >= best_conf:
            best_box, best_conf = candidate, cand_conf
    if best_box is None:
        return None

    px1, py1, px2, py2 = map(int, best_box.xyxy[0].tolist())
    crop_h, crop_w = vehicle_img.shape[:2]
    px1 = max(0, px1 - PAD)
    py1 = max(0, py1 - PAD)
    px2 = min(crop_w - 1, px2 + PAD)
    py2 = min(crop_h - 1, py2 + PAD)
    return px1, py1, px2, py2, best_conf


def _preprocess_plate(plate_roi):
    """Return a binarised version of a BGR plate crop, ready for OCR."""
    gray = cv2.cvtColor(plate_roi, cv2.COLOR_BGR2GRAY)
    # Upscale small crops before OCR.
    if max(gray.shape) < 220:
        gray = cv2.resize(gray, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
    # Contrast boost, light blur, then adaptive threshold.
    gray = clahe.apply(gray)
    gray = cv2.GaussianBlur(gray, (3, 3), 0)
    return cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 31, 15
    )


def _ocr_plate(binary):
    """OCR a preprocessed plate with Tesseract and return ``(text, mean_confidence)``.

    Tesseract confidences (0..100) are normalised to 0..1. Entries with no
    text, a negative or unparsable confidence, or a normalised confidence
    below 0.4 are dropped; the rest are joined left to right.
    Returns ``("", 0.0)`` when nothing usable is read.
    """
    data = pytesseract.image_to_data(
        binary, lang='eng', config=TS_CONFIG, output_type=Output.DICT
    )
    texts = data.get('text', [])
    confs = data.get('conf', [])
    lefts = data.get('left', [0] * len(texts))

    fragments = []
    for raw_text, raw_conf, left in zip(texts, confs, lefts):
        try:
            word_conf = float(raw_conf)
        except (TypeError, ValueError):
            continue  # unparsable confidence: treat the entry as unusable
        if word_conf < 0:
            continue
        cleaned = "".join(ch for ch in raw_text.strip().upper() if ch in ALLOW)
        prob = word_conf / 100.0
        if cleaned and prob >= 0.4:
            fragments.append((float(left), cleaned, prob))

    if not fragments:
        return "", 0.0
    fragments.sort(key=lambda frag: frag[0])
    joined = "".join(frag[1] for frag in fragments).replace(" ", "").upper()
    mean_prob = sum(frag[2] for frag in fragments) / len(fragments)
    return joined, mean_prob


def _attempt_plate_read(vehicle_img, cached, frame_no):
    """Detect and OCR the plate of one vehicle crop, updating ``cached`` in place.

    Returns ``(plate_box, text)``. ``plate_box`` is ``(x1, y1, x2, y2)`` in crop
    coordinates when a plate passed the area/confidence gate, else ``None``.
    ``text`` is the raw reading of this attempt ("" if none), whether or not it
    was good enough to replace the cached one.
    """
    plate = _find_plate(vehicle_img)
    if plate is None:
        # No plate box at all: last_try is left untouched so the next frame retries.
        return None, ""

    px1, py1, px2, py2, plate_conf = plate
    cached["last_try"] = frame_no
    area = max(0, px2 - px1) * max(0, py2 - py1)
    if area < MIN_PLATE_AREA or plate_conf < 0.5:
        return None, ""

    text, prob = _ocr_plate(_preprocess_plate(vehicle_img[py1:py2, px1:px2]))
    # Require GOOD_LEN here too: otherwise a short, confident fragment would be
    # cached, block longer readings with slightly lower confidence, and never
    # satisfy the "settled" test that stops OCR for the track.
    if len(text) >= GOOD_LEN and prob >= 0.5 and prob > float(cached["prob"]):
        cached["text"] = text
        cached["prob"] = prob
    return (px1, py1, px2, py2), text


frame_idx = 0

try:
    # The CSV is opened once for the whole run instead of once per detection.
    with open("results_tesseract_uv.csv", "a", encoding="utf-8") as csv_file:
        while True:
            ret, img = vid.read()
            if not ret:
                break

            # Tracking with YOLO, restricted to the relevant classes.
            results = model.track(img, persist=True, classes=[0, 1, 2, 3, 5])

            # Crops for plate detection/OCR come from this untouched copy, so
            # boxes and labels drawn on `img` never leak into the OCR input.
            clean = img.copy()

            for box in results[0].boxes:
                x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
                conf = float(box.conf[0])
                cls = int(box.cls[0])

                # Low-confidence or untracked detections are neither drawn nor logged.
                if conf < 0.6 or box.id is None:
                    continue
                tid = int(box.id[0])
                track_id = str(tid)

                # Fall back to the numeric id for unnamed / out-of-range classes.
                cls_name = classNames[cls] if 0 <= cls < len(classNames) and classNames[cls] else str(cls)

                csv_text = ",".join((
                    str(frame_idx), cls_name, f"{conf:.4f}", track_id,
                    str(x1), str(y1), str(x2), str(y2)
                ))

                counted_as = count_name_by_cls.get(cls)
                if counted_as is not None and tid not in seen_ids[counted_as]:
                    seen_ids[counted_as].add(tid)
                    totals_unique[counted_as] += 1

                # Draw the tracked box and its label.
                color = (0, 255, 255) if cls in (2, 3, 5) else (255, 0, 0)
                cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
                label = f"{track_id} {cls_name} {conf:.2f}"
                cv2.putText(img, label, (x1, max(0, y1 - 5)), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

                if cls_name in ("car", "bus", "motorcycle"):
                    # Crop origin in frame coordinates (clamped so slicing never wraps).
                    ox, oy = max(0, x1), max(0, y1)
                    vehicle = clean[oy:y2, ox:x2]

                    cached = plate_cache.setdefault(track_id, _new_cache_entry())
                    settled = cached["prob"] >= GOOD_PROB and len(cached["text"]) >= GOOD_LEN
                    due = frame_idx - cached["last_try"] >= OCR_TRY_EVERY

                    if not settled and due and vehicle.size > 0:
                        plate_box, plate_text = _attempt_plate_read(vehicle, cached, frame_idx)
                        if plate_box is not None and plate_text:
                            # Plate box is crop-relative: log it in frame
                            # coordinates, like the vehicle box in the same row.
                            fx1, fy1 = ox + plate_box[0], oy + plate_box[1]
                            fx2, fy2 = ox + plate_box[2], oy + plate_box[3]
                            csv_text += "," + ",".join((
                                "license_plate", str(fx1), str(fy1), str(fx2), str(fy2), plate_text
                            ))

                    # Show the best reading known so far for this track.
                    shown = cached["text"]
                    if shown:
                        cv2.putText(img, f"PLATE: {shown}", (x1, y2 + 20),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

                # One CSV line per accepted detection.
                csv_file.write(f"{csv_text}\n")

            out.write(img)
            frame_idx += 1
finally:
    # Release the capture and the writer even if inference fails midway.
    vid.release()
    out.release()

# Unique plates: tracked vehicles that ended with an accepted plate reading.
license_plates = sum(1 for entry in plate_cache.values() if entry["text"])

print("==== TOTALES ÚNICOS ====")
print("license plates:", license_plates)
print("persons:", totals_unique["person"])
print("cars:", totals_unique["car"])
print("motorcycles:", totals_unique["motorcycle"])
print("buses:", totals_unique["bus"])


0: 384x640 4 cars, 1 bus, 59.2ms
Speed: 1.8ms preprocess, 59.2ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 cars, 1 bus, 57.2ms
Speed: 1.7ms preprocess, 57.2ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 cars, 1 bus, 55.0ms
Speed: 1.6ms preprocess, 55.0ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 cars, 1 bus, 52.8ms
Speed: 1.7ms preprocess, 52.8ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 cars, 1 bus, 52.3ms
Speed: 2.4ms preprocess, 52.3ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 cars, 1 bus, 65.5ms
Speed: 1.7ms preprocess, 65.5ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 cars, 1 bus, 63.3ms
Speed: 1.8ms preprocess, 63.3ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 cars, 1 bus, 52.2ms
Speed: 1.6ms preprocess, 52.2ms inference, 1.6ms 