In [3]:
import cv2
import numpy as np
from ultralytics import YOLO
from tensorflow.keras.models import load_model
import tensorflow_addons as tfa
import imageio as io



TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [13]:
def get_s3ar_unet_mask(image, unet_model):
    """
    Generate a binary segmentation mask with a UNet-style model.

    The image is resized to the spatial size reported by
    ``unet_model.input_shape``, multiplied by 1/255, and passed to
    ``unet_model.predict`` as a batch of one. The prediction is thresholded
    at 0.5.

    Parameters:
    image (numpy.ndarray): Input image as a numpy array.
    unet_model: Model exposing ``input_shape`` and ``predict``.

    Returns:
    numpy.ndarray or None: uint8 mask of zeros and ones at the resolution of
    the prediction (it is not resized back to the input image size), or
    None when ``image`` is None or empty.
    """
    # Validate the frame first so an empty input never touches the model.
    if image is None or image.size == 0:
        print("Warning: Empty image received in get_s3ar_unet_mask.")
        return None

    input_shape = unet_model.input_shape
    img_height, img_width = input_shape[1], input_shape[2]

    # cv2.resize takes the target size as (width, height).
    # The original note says the incoming image is expected in RGB order.
    img = cv2.resize(image, (img_width, img_height))
    img = img / 255.0
    img = img[np.newaxis, ...]  # add the batch dimension

    mask = unet_model.predict(img)

    # When predict returns a list of outputs, the first one is used.
    if isinstance(mask, list):
        mask = mask[0]

    # Keep the first batch item and the first channel of a 4-D prediction.
    if mask.ndim == 4:
        mask = mask[0, :, :, 0]

    mask = (mask > 0.5).astype(np.uint8)  # threshold to a binary mask
    return mask

def get_unet_mask(image, unet_model):
    """
    Segment a colour frame with a two-output UNet and return a boolean mask.

    The frame is converted to grayscale with ``cv2.COLOR_BGR2GRAY``, resized
    to 256 columns by 512 rows, min-max normalised to [-1, 1] and passed to
    ``unet_model.predict`` as a (1, 512, 256, 1) batch. The second value
    returned by ``predict`` is used as the segmentation prediction.

    Parameters:
    image (numpy.ndarray): Colour frame (3 channels, read as BGR).
    unet_model: Model whose ``predict`` returns a pair; the second element
        is the segmentation output.

    Returns:
    numpy.ndarray: Boolean mask with the height and width of ``image``.
    """
    img_orig = image.astype(np.float32)

    # Convert to grayscale, then resize; dsize is (width, height).
    img_orig_gray = cv2.cvtColor(img_orig, cv2.COLOR_BGR2GRAY)
    img = cv2.resize(img_orig_gray, (256, 512))

    # Min-max scale to [-1, 1]; OpenCV allocates the destination itself.
    img = cv2.normalize(img, None, -1, 1, cv2.NORM_MINMAX)
    img = img[None, ..., None]  # add batch and channel axes

    # Only the segmentation output is needed; the first output is ignored.
    _, seg_pred = unet_model.predict(img)
    mask = np.asarray(np.squeeze(seg_pred))

    # Round the predicted values to 0/1.
    mask = np.round(mask)

    # Resize back to the frame size with nearest-neighbour sampling. Bilinear
    # interpolation would create fractional edge values that the bool cast
    # below turns into True, growing the mask.
    frame_height, frame_width = img_orig_gray.shape[:2]
    mask = cv2.resize(mask, (frame_width, frame_height), interpolation=cv2.INTER_NEAREST)
    return mask.astype(bool)

In [4]:
def get_yolo_mask(image, model):
    """
    Generate a segmentation mask using a YOLO model.

    The model is called on the image and every polygon found in
    ``results[0].masks.xy`` is filled into a single mask with the image's
    height and width.

    Parameters:
    image (numpy.ndarray): Input image as a numpy array.
    model (YOLO): Pretrained YOLO model, called as ``model(image)``.

    Returns:
    numpy.ndarray or None: Boolean mask of shape ``image.shape[:2]`` (all
    False when the result carries no masks), or None if any exception is
    raised while processing.
    """
    try:
        original_shape = image.shape[:2]
        results = model(image)

        mask = np.zeros(original_shape, dtype=np.uint8)

        if results[0].masks is not None:
            for polygon in results[0].masks.xy:
                if polygon.shape[0] == 0:  # skip empty polygons
                    continue
                # fillPoly needs integer vertex coordinates.
                cv2.fillPoly(mask, [polygon.astype(np.int32)], 1)
        else:
            print("No masks found in the results")

        return mask.astype(bool)
    except Exception as e:
        # Report only the shape: interpolating the array itself would dump
        # its pixel values into the log.
        image_shape = getattr(image, "shape", None)
        print(f"Error in get_yolo_mask for image with shape {image_shape}: {e}")
        return None


In [5]:
# Find the external contours of a mask and draw them on a copy of the image.
def draw_contours_on_image(mask, img_orig, contour_color=(0, 255, 0), contour_thickness=3):
    """
    Draw the external contours of ``mask`` on a copy of ``img_orig``.

    Parameters:
    mask (numpy.ndarray): Mask; it is cast to uint8 before contour search.
    img_orig (numpy.ndarray): Image to draw on. It is not modified.
    contour_color (tuple): Colour passed to ``cv2.drawContours``.
    contour_thickness (int): Line thickness passed to ``cv2.drawContours``.

    Returns:
    numpy.ndarray: New image with the contours drawn. A 2-D or
    single-channel input is converted with ``cv2.COLOR_GRAY2BGR`` first.
    """
    outlines, _hierarchy = cv2.findContours(
        mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    # Grayscale inputs are expanded to three channels before drawing.
    single_channel = len(img_orig.shape) == 2 or img_orig.shape[2] == 1
    canvas = (
        cv2.cvtColor(img_orig, cv2.COLOR_GRAY2BGR)
        if single_channel
        else img_orig.copy()
    )

    cv2.drawContours(canvas, outlines, -1, contour_color, contour_thickness)
    return canvas


In [17]:
def process_hybrid_video(input_video_path, yolo_model, unet_model, output_video_path, fps=30):
    """
    Segment every frame of a video and write a copy with the contours drawn.

    For each frame the YOLO mask is used when it is non-empty. Otherwise the
    UNet mask is computed, its bounding box is cropped from the frame, and
    the UNet is run again on that crop to obtain the final mask. Frames for
    which no mask or no bounding box is found are skipped and therefore do
    not appear in the output video.

    Parameters:
    input_video_path (str): Video read with ``imageio.mimread``.
    yolo_model: Model passed to ``get_yolo_mask``.
    unet_model: Model passed to ``get_unet_mask``.
    output_video_path (str): Destination file, encoded with the 'mp4v' fourcc.
    fps (float): Frame rate of the output video. Defaults to 30.

    Returns:
    None
    """
    ims = io.mimread(input_video_path, memtest=False)
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    ancho, alto = None, None
    output_video = None

    try:
        for frame in ims:
            img_orig = np.asarray(frame)

            # imageio yields RGB frames, while the consumers below
            # (COLOR_BGR2GRAY in get_unet_mask, YOLO numpy input and
            # cv2.VideoWriter) work in BGR order, so convert once up front.
            if img_orig.ndim == 3 and img_orig.shape[2] == 3:
                img_orig = cv2.cvtColor(img_orig, cv2.COLOR_RGB2BGR)
            img_orig = img_orig.astype(np.float32)

            yolo_mask = get_yolo_mask(img_orig, yolo_model)

            if yolo_mask is None or np.sum(yolo_mask) == 0:
                # YOLO found nothing: fall back to the UNet.
                unet_mask = get_unet_mask(img_orig, unet_model)
                if unet_mask is None:
                    print("Warning: UNet mask is empty, skipping frame.")
                    continue
                x, y, w, h = cv2.boundingRect(unet_mask.astype(np.uint8))
                if w == 0 or h == 0:
                    print("Warning: Empty bounding box, skipping frame.")
                    continue

                # Second pass on the crop around the first UNet detection.
                roi = img_orig[y:y+h, x:x+w]
                refined_mask = get_unet_mask(roi, unet_model)
                if refined_mask is None:
                    print("Warning: UNet mask is empty, skipping frame.")
                    continue

                # OpenCV cannot resize boolean arrays, so cast to uint8.
                # Nearest-neighbour sampling keeps the mask strictly 0/1.
                refined_mask = cv2.resize(
                    refined_mask.astype(np.uint8), (w, h), interpolation=cv2.INTER_NEAREST
                )
                final_mask = np.zeros(img_orig.shape[:2], dtype=np.uint8)
                final_mask[y:y+h, x:x+w] = refined_mask
            else:
                final_mask = yolo_mask

            img_with_contours = draw_contours_on_image(final_mask, img_orig)

            # VideoWriter expects 8-bit frames.
            if img_with_contours.dtype != np.uint8:
                img_with_contours = img_with_contours.astype(np.uint8)

            # The writer is created lazily from the first written frame.
            if ancho is None or alto is None:
                alto, ancho = img_with_contours.shape[:2]
                output_video = cv2.VideoWriter(output_video_path, fourcc, fps, (ancho, alto))

            output_video.write(img_with_contours)
    finally:
        # Always finalise the file, even if a frame raised.
        if output_video is not None:
            output_video.release()


In [18]:
import numpy as np
import cv2
import os
import gc
import tensorflow as tf
from utils.masks import get_unet_mask, get_max_yolo_roi, filter_unet_mask_with_yolo

def process_and_create_segmented_video_with_contours(video_path, yolo_model, unet_model, output_video_path, margin=15):
    """
    Write a copy of a video with the segmentation contours drawn on each frame.

    A single ROI is obtained with ``get_max_yolo_roi``. Each frame is then
    segmented with ``get_unet_mask``, the mask is passed through
    ``filter_unet_mask_with_yolo`` together with that ROI, and the external
    contours of the result are drawn in green on the frame.

    Parameters:
    video_path (str): Input video, opened with ``cv2.VideoCapture``.
    yolo_model: Model forwarded to ``get_max_yolo_roi``.
    unet_model: Model forwarded to ``get_unet_mask``.
    output_video_path (str): Destination file, encoded with the 'mp4v' fourcc.
    margin (int): Forwarded to ``get_max_yolo_roi``.

    Returns:
    None

    Raises:
    IOError: If the input video cannot be opened.
    """
    video_cap = cv2.VideoCapture(video_path)
    if not video_cap.isOpened():
        # Fail loudly instead of writing an empty file and reporting success.
        video_cap.release()
        raise IOError(f"No se pudo abrir el video: {video_path}")

    output_video = None
    try:
        # Frame geometry and rate are copied from the input video.
        frame_width = int(video_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(video_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = video_cap.get(cv2.CAP_PROP_FPS)

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        output_video = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

        # The ROI is computed once for the whole video.
        roi = get_max_yolo_roi(video_path, yolo_model, margin)

        frame_count = 0
        while video_cap.isOpened():
            ret, frame_video = video_cap.read()
            if not ret:
                break

            unet_mask = get_unet_mask(frame_video, unet_model)
            filtered_mask = filter_unet_mask_with_yolo(unet_mask, roi)

            # Bring the filtered mask to the frame size before contour search.
            filtered_mask_resized = cv2.resize(filtered_mask.astype(np.uint8), (frame_width, frame_height))
            contours, _ = cv2.findContours(filtered_mask_resized, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            # Draw on a copy so the decoded frame itself is left untouched.
            frame_with_contours = frame_video.copy()
            cv2.drawContours(frame_with_contours, contours, -1, (0, 255, 0), 2)

            output_video.write(frame_with_contours)
            frame_count += 1

            # Periodic cleanup every 100 frames.
            # NOTE(review): clear_session resets Keras global state; confirm
            # it is safe to call while unet_model is still in use.
            if frame_count % 100 == 0:
                gc.collect()
                tf.keras.backend.clear_session()
    finally:
        # Release both handles even when a frame raised.
        video_cap.release()
        if output_video is not None:
            output_video.release()

    print(f"Video con segmentación y contornos creado en: {output_video_path}")




In [None]:
# Model weights and video locations used by this run.
yolo_model_path = "/home/voicelab/Desktop/segmentation_glottis/models/YOLO/YOLOV8/best_yolov8n-seg-1cls.pt"
unet_model_path = "/home/voicelab/Desktop/segmentation_glottis/models/UNets/S3AR-UNet/s3ar_unet/model/SeARUNet-2/SeARUNet-2.h5"
video_path = '/home/voicelab/Desktop/segmentation_glottis/datasets/videos_VPLab/FN001.avi'
output_video_path = '/home/voicelab/Desktop/segmentation_glottis/models/videos/FN001_YOLO_s3ar-unet.mp4'

# Load both models; the UNet is loaded uncompiled and needs the
# InstanceNormalization layer from TensorFlow Addons as a custom object.
yolo_model = YOLO(yolo_model_path)
unet_model = load_model(
    unet_model_path,
    compile=False,
    custom_objects={'InstanceNormalization': tfa.layers.InstanceNormalization},
)

# Run the pipeline with the default margin.
process_and_create_segmented_video_with_contours(
    video_path, yolo_model, unet_model, output_video_path
)


In [4]:
import numpy as np
import cv2
import gc
import tensorflow as tf
import time
from utils.masks import get_unet_mask, get_max_yolo_roi, filter_unet_mask_with_yolo

def process_and_create_segmented_video_with_contours(video_path, yolo_model, unet_model, output_video_path, margin=15, interval=10):
    """
    Write a copy of a video with segmentation contours and report timings.

    A single ROI is obtained with ``get_max_yolo_roi`` (timed). Each frame is
    then segmented with ``get_unet_mask``, the mask is passed through
    ``filter_unet_mask_with_yolo`` together with that ROI, and the external
    contours of the result are drawn in green on the frame. The time spent
    on the per-frame segmentation steps is averaged and printed every
    ``interval`` frames; the total and per-frame averages are printed at the
    end.

    Parameters:
    video_path (str): Input video, opened with ``cv2.VideoCapture``.
    yolo_model: Model forwarded to ``get_max_yolo_roi``.
    unet_model: Model forwarded to ``get_unet_mask``.
    output_video_path (str): Destination file, encoded with the 'mp4v' fourcc.
    margin (int): Forwarded to ``get_max_yolo_roi``.
    interval (int): Number of frames between progress reports. A value of
        zero or less disables the periodic report.

    Returns:
    None

    Raises:
    IOError: If the input video cannot be opened.
    """
    video_cap = cv2.VideoCapture(video_path)
    if not video_cap.isOpened():
        # Fail loudly instead of writing an empty file and reporting success.
        video_cap.release()
        raise IOError(f"No se pudo abrir el video: {video_path}")

    output_video = None
    try:
        # Frame geometry and rate are copied from the input video.
        frame_width = int(video_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(video_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = video_cap.get(cv2.CAP_PROP_FPS)

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        output_video = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

        # Time the one-off ROI computation. perf_counter is monotonic, so the
        # measured durations cannot be skewed by wall-clock adjustments.
        start_time_yolo = time.perf_counter()
        roi = get_max_yolo_roi(video_path, yolo_model, margin)
        yolo_time = time.perf_counter() - start_time_yolo
        print(f"Tiempo de obtención de la ROI con YOLO: {yolo_time:.2f} segundos")

        unet_times = []
        frame_count = 0
        start_time_total = time.perf_counter()

        while video_cap.isOpened():
            ret, frame_video = video_cap.read()
            if not ret:
                break

            # Timed section: segmentation, ROI filtering, resize, contours.
            start_time_unet = time.perf_counter()
            unet_mask = get_unet_mask(frame_video, unet_model)
            filtered_mask = filter_unet_mask_with_yolo(unet_mask, roi)
            filtered_mask_resized = cv2.resize(filtered_mask.astype(np.uint8), (frame_width, frame_height))
            contours, _ = cv2.findContours(filtered_mask_resized, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            unet_times.append(time.perf_counter() - start_time_unet)

            # Draw on a copy so the decoded frame itself is left untouched.
            frame_with_contours = frame_video.copy()
            cv2.drawContours(frame_with_contours, contours, -1, (0, 255, 0), 2)
            output_video.write(frame_with_contours)
            frame_count += 1

            # Periodic cleanup every 100 frames.
            # NOTE(review): clear_session resets Keras global state; confirm
            # it is safe to call while unet_model is still in use.
            if frame_count % 100 == 0:
                gc.collect()
                tf.keras.backend.clear_session()

            # Guard against interval <= 0, which would otherwise divide by zero.
            if interval > 0 and frame_count % interval == 0:
                avg_unet_time = sum(unet_times[-interval:]) / interval
                print(f"Tiempo promedio por frame para UNet en los últimos {interval} frames: {avg_unet_time:.4f} segundos")

        total_time = time.perf_counter() - start_time_total
        print(f"Tiempo total de procesamiento: {total_time:.2f} segundos")

        # A video that yields no frames has no per-frame average.
        if frame_count > 0:
            avg_total_time_per_frame = total_time / frame_count
            print(f"Tiempo promedio de procesamiento por frame: {avg_total_time_per_frame:.4f} segundos")
        else:
            print("No se procesó ningún frame; no se calcula el promedio por frame.")
    finally:
        # Release both handles even when a frame raised.
        video_cap.release()
        if output_video is not None:
            output_video.release()

    print(f"Video con segmentación y contornos creado en: {output_video_path}")

# Example usage
# process_and_create_segmented_video_with_contours("ruta_del_video.mp4", yolo_model, unet_model, "output_video.mp4", margin=15, interval=10)


In [5]:
# Model weights and video locations used by this run.
yolo_model_path = "/home/voicelab/Desktop/segmentation_glottis/models/YOLO/YOLOV8/best_yolov8n-seg-1cls.pt"
unet_model_path = "/home/voicelab/Downloads/epoch025.h5"
video_path = '/home/voicelab/Desktop/segmentation_glottis/datasets/videos_VPLab/FN001.avi'
output_video_path = '/home/voicelab/Desktop/segmentation_glottis/models/videos/FN001unet.mp4'

# Load both models; the UNet is loaded uncompiled and needs the
# InstanceNormalization layer from TensorFlow Addons as a custom object.
yolo_model = YOLO(yolo_model_path)
unet_model = load_model(
    unet_model_path,
    compile=False,
    custom_objects={'InstanceNormalization': tfa.layers.InstanceNormalization},
)

# Run the pipeline with the default margin and reporting interval.
process_and_create_segmented_video_with_contours(
    video_path, yolo_model, unet_model, output_video_path
)


2024-10-25 23:10:35.306162: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-10-25 23:10:35.311159: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-10-25 23:10:35.312299: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-


0: 640x512 (no detections), 56.5ms
Speed: 2.1ms preprocess, 56.5ms inference, 6.8ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 (no detections), 2.2ms
Speed: 1.1ms preprocess, 2.2ms inference, 0.1ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 (no detections), 2.4ms
Speed: 0.7ms preprocess, 2.4ms inference, 0.2ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 (no detections), 2.1ms
Speed: 0.8ms preprocess, 2.1ms inference, 0.1ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 (no detections), 2.0ms
Speed: 0.6ms preprocess, 2.0ms inference, 0.1ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 1 glottis, 2.4ms
Speed: 0.6ms preprocess, 2.4ms inference, 43.6ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 1 glottis, 2.5ms
Speed: 1.1ms preprocess, 2.5ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 1 glottis, 2.2ms
Speed: 0.7ms preprocess, 2.2ms inference, 0.6ms postprocess per

2024-10-25 23:10:52.211454: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
2024-10-25 23:10:52.222801: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-10-25 23:10:52.434894: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


Tiempo promedio por frame para UNet en los últimos 10 frames: 0.1610 segundos
Tiempo promedio por frame para UNet en los últimos 10 frames: 0.0302 segundos
Tiempo promedio por frame para UNet en los últimos 10 frames: 0.0323 segundos
Tiempo promedio por frame para UNet en los últimos 10 frames: 0.0301 segundos
Tiempo promedio por frame para UNet en los últimos 10 frames: 0.0306 segundos
Tiempo promedio por frame para UNet en los últimos 10 frames: 0.0298 segundos
Tiempo promedio por frame para UNet en los últimos 10 frames: 0.0299 segundos
Tiempo promedio por frame para UNet en los últimos 10 frames: 0.0313 segundos
Tiempo promedio por frame para UNet en los últimos 10 frames: 0.0301 segundos
Tiempo promedio por frame para UNet en los últimos 10 frames: 0.0299 segundos
Tiempo promedio por frame para UNet en los últimos 10 frames: 0.0412 segundos
Tiempo promedio por frame para UNet en los últimos 10 frames: 0.0295 segundos
Tiempo promedio por frame para UNet en los últimos 10 frames: 0.