Benchmark table schema

model_name: STRING 

overall_metrics: STRUCT<"micro_averaged": STRUCT<"f1": float, "recall": float, "precision": float>, "macro_averaged": STRUCT<"f1": float, "recall": float, "precision": float>>

per_class_metrics: STRUCT<"ct": STRUCT<"f1": float, "recall": float, "precision": float>>

iou_threshold: float 

confidence_threshold: float

# ASSUMPTIONS
Only models from the new Ultralytics library will be benchmarked.

In [0]:
from io import BytesIO

import mlflow
import numpy as np
import pandas as pd
import PIL
from torch import no_grad, Tensor, cuda, sigmoid, device, tensor
import torch
import pyspark.sql.types as T
import pyspark.sql.functions as F
from ultralytics.utils import ASSETS
from ultralytics.utils.metrics import ConfusionMatrix
from ultralytics.models.yolo.detect import DetectionPredictor

from tsdb.ml.utils import get_model_tags, cut_square_detection, UCModelName
# from tsdb.preprocessing.images import open_image_binary
# from tsdb.ml.infer import load_model, make_confusion_matrix_udf
from tsdb.ml.drift import get_struct_counts



mlflow.set_registry_uri("databricks-uc")

In [0]:
# torch_device = torch.device("cuda") if cuda.is_available() else torch.device("cpu")
# uc_model_name = UCModelName("edav_dev_csels", "towerscout", "yolov10_models")
# uc_alias = "medium"

# model = mlflow.pytorch.load_model(
#         model_uri=f"models:/{str(uc_model_name)}@{uc_alias}",
#         map_location=torch_device
#     )

In [0]:
# print(model.names)

In [0]:
# df = spark.read.format("delta").table("edav_dev_csels.towerscout.benchmark_scored").selectExpr("*", "results.bboxes as bboxes").limit(50)

# df = df.selectExpr(
#     '*',
#     "transform(bboxes, x -> x.class) AS class",
#     "flatten(transform(bboxes, x -> array(x.x1, x.y1, x.x2, x.y2))) AS bboxes",
# )

# display(df)

# test_img_path = df.first()['path'].split(":")[1]
# raw_bbox = df.first()['results']['bboxes']
# gt_bboxes = []
# gt_classes = []
# for bbox in raw_bbox:
#     gt_bboxes.append([bbox['x1'], bbox['y1'], bbox['x2'], bbox['y2']])
#     gt_classes.append(bbox['class'])

# gt_bboxes = np.array(gt_bboxes)
# gt_classes = np.array(gt_classes)
# print(test_img_path)
# print(raw_bbox)
# print(gt_bboxes)
# print(gt_classes)

In [0]:
# torch_device = device("cuda") if cuda.is_available() else device("cpu")

# yolo_model_name = "edav_dev_csels.towerscout.yolov10_models"
# yolo_alias = "medium"

# yolo_detector = mlflow.pytorch.load_model(
#         model_uri=f"models:/{yolo_model_name}@{yolo_alias}",
#         map_location=torch_device
#     )

# yolo_detector.eval()


In [0]:
# yolo_detector = load_model()

In [0]:
# conf = 0.2
# iou_thres = 0.2
# confusion_matrix = ConfusionMatrix(nc=1, conf=conf, iou_thres=iou_thres)

# test_img = PIL.Image.open(test_img_path)
# yolo_output = yolo_detector(test_img) #.xyxyn

# boxes = yolo_output[0].boxes.xyxyn
# labels = yolo_output[0].boxes.cls
# confs = yolo_output[0].boxes.conf


# print(boxes)
# print(labels)
# print(confs)
# result = torch.cat((boxes, confs.reshape(-1, 1), labels.reshape(-1, 1)), dim=1)
# print(result)

# print(f"Original CM:\n{confusion_matrix.matrix}\n")

# for detections in yolo_output:
#     boxes = detections.boxes.xyxyn
#     labels = detections.boxes.cls
#     confs = detections.boxes.conf
#     #detection = detection.boxes.xyxy #detection.to("cpu")
#     result = torch.cat((boxes, confs.reshape(-1, 1), labels.reshape(-1, 1)), dim=1).to("cpu")
#     confusion_matrix.process_batch(detections=result, gt_bboxes=tensor(gt_bboxes), gt_cls=tensor(gt_classes))
#     #print(detection)


# print(f"New CM:\n{confusion_matrix.matrix}")

In [0]:
from PIL import Image

def open_image_binary(image_binary: bytes) -> "tuple[Image.Image | None, dict | None]":  # pragma: no cover
    """
    Decode an image binary into a PIL image and its EXIF metadata.

    NOTE: Unit testing forgone as BytesIO and Image.open are
    tested already.

    Args:
        image_binary: The raw bytes of the encoded image.

    Returns:
        A tuple `(image, exif)`. `image` is the opened PIL image, or None
        when the bytes could not be opened as an image. `exif` is whatever
        the image's `_getexif()` method returns, or None when the image
        could not be opened or its format plugin does not provide that
        method.
    """
    image_stream = BytesIO(image_binary)

    # Try to read the image; if that fails we fall back to the null image
    # case. PIL reports unreadable/unrecognised data as
    # UnidentifiedImageError, which is a subclass of OSError (as is
    # FileNotFoundError), so OSError covers both.
    try:
        image = PIL.Image.open(image_stream)
        # `_getexif` is only implemented by some format plugins; treat its
        # absence as "no EXIF" instead of failing on an otherwise valid image.
        read_exif = getattr(image, "_getexif", None)
        exif = read_exif() if read_exif is not None else None
    except (OSError, UnicodeDecodeError):
        image = None
        exif = None

    return image, exif

def load_model(uc_model_name: UCModelName, uc_alias: str) -> DetectionPredictor:
    """
    Fetch a YOLO model from the Unity Catalog model registry and wrap it
    in an Ultralytics DetectionPredictor.

    Args:
        uc_model_name: The name of the YOLO model in UC.
        uc_alias: The alias for the YOLO model in UC.

    Returns:
        A DetectionPredictor set up with the loaded model, configured from
        the model's own args with verbose/show/save/save_txt switched off.
    """

    # Models are resolved through the Unity Catalog registry
    mlflow.set_registry_uri("databricks-uc")

    # Prefer the GPU when one is available
    target_device = torch.device("cuda" if cuda.is_available() else "cpu")

    model_uri = f"models:/{str(uc_model_name)}@{uc_alias}"
    detector = mlflow.pytorch.load_model(model_uri=model_uri, map_location=target_device)

    # Reuse the args stored on the model, overriding the source and
    # turning off every display/persistence option.
    overrides = detector.args
    for option, value in (
        ("source", ASSETS),
        ("verbose", False),
        ("show", False),
        ("save", False),
        ("save_txt", False),
    ):
        setattr(overrides, option, value)

    predictor = DetectionPredictor(overrides=overrides)
    predictor.setup_model(detector)

    return predictor


def make_confusion_matrix_udf(uc_model_name: UCModelName, uc_alias: str, conf: float=0.5, iou_thres: float=0.5) -> callable:
    """
    Returns a UDF that performs inference on a batch of labeled images and then computes the confusion matrix
    for each row.

    Args:
        uc_model_name: A UCModelName object that contains the name of the model in UC.
        uc_alias: The alias for the YOLO model in UC.
        conf: The confidence threshold. Ranges between [0,1]
            Predicted bounding boxes with a confidence
            below this threshold will not be considered.
        iou_thres: The IoU threshold. Ranges between [0,1]
            This is used to determine if a bounding box is matched to
            a ground truth bounding box. If a predicted bounding box is
            matched to a ground truth bounding box, the IoU between the
            two bounding boxes must be greater than this threshold.

    Returns:
        A pandas UDF taking (image binaries, flattened ground truth boxes,
        ground truth classes) and returning one confusion matrix per row as
        `array<array<integer>>`.
    """

    @F.pandas_udf(returnType="array<array<integer>>")
    @no_grad()
    def confusion_matrix_udf(batch_image_bins: pd.Series, batch_bboxes: pd.Series, batch_classes: pd.Series) -> pd.Series:
        """
        A UDF that performs inference on a batch of images and then computes the confusion matrix
        for each image using the predicted bounding boxes from the model and ground truth
        bounding boxes and labels from batch_bboxes and batch_classes.

        Args:
            batch_image_bins: A partition of image binaries
            batch_bboxes: A partition of ground truth bounding boxes. Each element in the batch is
                            assumed to be a 1D list of shape (num_bboxes*4,) where the first 4
                            elements are the coordinates (x1,y1,x2,y2) of the first bounding box,
                            the next 4 are the coordinates of the 2nd bounding box and so on.
                            We reshape this into the proper shape (num_bboxes, 4) later in the
                            function. The predictions are compared in *normalized* coordinates,
                            so the ground truth is presumably normalized too -- TODO confirm.
            batch_classes: A partition of ground truth labels for the bounding boxes. Each
                            element in the batch is assumed to be a 1D list of shape (num_bboxes,).
                            The first element is the class label of the first bounding box, the second
                            element is the class label of the 2nd bounding box, and so on.

        Returns:
            One confusion matrix per input row, in input order. Rows whose image
            cannot be opened get the untouched matrix of a fresh ConfusionMatrix
            so that they do not contribute any counts to a later aggregation.
        """

        # load model within UDF to avoid serialization issues
        yolo_detector = load_model(uc_model_name, uc_alias)

        num_classes = len(yolo_detector.model.names)
        outputs = []

        for image_binary, bboxes, classes in zip(batch_image_bins, batch_bboxes, batch_classes):
            confusion_matrix = ConfusionMatrix(nc=num_classes, conf=conf, iou_thres=iou_thres)

            image, _ = open_image_binary(image_binary)

            # A scalar pandas UDF must return exactly one value per input row,
            # so an unreadable image must still produce an output entry; we
            # simply skip inference and emit the untouched matrix for it.
            if image is not None:
                # a null ground truth column is treated as "no boxes"
                if bboxes is None:
                    bboxes = []
                if classes is None:
                    classes = []

                # we reshape the flattened bounding box list to have
                # the correct (non-flattened) shape (num_boxes, 4)
                true_bboxes = np.array(bboxes).reshape(-1, 4)
                true_classes = np.array(classes)

                yolo_output = yolo_detector(image, stream=False)

                boxes = yolo_output[0].boxes.xyxyn  # *normalized* bbox coordinates, reshaped below as (num_boxes, 4)
                labels = yolo_output[0].boxes.cls  # class labels, one per predicted box
                confs = yolo_output[0].boxes.conf  # confidence scores, one per predicted box

                # concatenate the tensors into a single tensor with columns [x1,y1,x2,y2,conf,label]
                # as that is the format expected by Ultralytics' ConfusionMatrix.process_batch() function
                result = torch.cat((boxes, confs.reshape(-1, 1), labels.reshape(-1, 1)), dim=1).to("cpu")

                confusion_matrix.process_batch(detections=result, gt_bboxes=tensor(true_bboxes), gt_cls=tensor(true_classes))

            # Emit plain nested lists of ints so the value maps directly onto
            # the declared array<array<integer>> return type.
            outputs.append(np.asarray(confusion_matrix.matrix).astype(int).tolist())

        return pd.Series(outputs)

    return confusion_matrix_udf


In [0]:
# Smoke test: load the "small" alias of the registered YOLOv10 models entry.
test_model = load_model(UCModelName("edav_dev_csels", "towerscout", "yolov10_models"), "small")

In [0]:
# Show the class names exposed by the loaded model and how many there are;
# the number of names is what the confusion matrix UDF uses as `nc`.
print(test_model.model.names)
print(len(test_model.model.names))

In [0]:
# Read a 100-row sample of the scored benchmark table, keeping the image
# bytes, the modification time and the ground truth bounding box structs.
df = spark.read.format("delta").table("edav_dev_csels.towerscout.benchmark_scored").selectExpr("content", "modificationTime", "results.bboxes as bboxes").limit(100)

# NOTE: the boxes are flattened on purpose. Without flattening, converting the
# nested arrays yields a numpy array of objects (numpy.object_) rather than a
# true 2D numpy array. Flattening only costs one extra (cheap) reshape later on.
df = df.selectExpr(
    '*',
    "transform(bboxes, x -> x.class) AS class",
    "flatten(transform(bboxes, x -> array(x.x1, x.y1, x.x2, x.y2))) AS bboxes_flattened",
)

# Count the ground truth structs matching the filter; the very large time
# window is presumably meant to put all rows in a single window -- TODO confirm
# against get_struct_counts.
df = get_struct_counts(df, "modificationTime", "bboxes", filter_clause="x.class = 0 and x.class_name = 'ct'", time_window_days = 100000000)
aggregated_num_structs = df.agg({"num_structs": "sum"})

print(f"Total ground truth bounding boxes: {aggregated_num_structs.collect()[0][0]}")

display(df.limit(4))

In [0]:
# Build the per-image confusion matrix UDF for the "small" model alias
# (confidence threshold 0.5, IoU threshold 0.2).
uc_model_name = UCModelName("edav_dev_csels", "towerscout", "yolov10_models")
confusion_matrix_udf = make_confusion_matrix_udf(uc_model_name, uc_alias="small", conf=0.5, iou_thres=0.2)

# Add one confusion matrix per row, computed from the image bytes and the
# flattened ground truth boxes / classes.
df_with_conf = df.withColumn("conf_mat", confusion_matrix_udf(F.col("content"), F.col("bboxes_flattened"), F.col("class")))
display(df_with_conf.limit(4))

In [0]:
@F.pandas_udf(returnType="array<array<integer>>")
def sum_arrays(arrays: pd.Series) -> np.ndarray:
    """
    Aggregate a column of 2D arrays by summing them element-wise.

    All arrays in the input Series must have the same shape. The
    Series-in / non-Series-out type hints are what make this a grouped
    aggregation pandas UDF.

    Args:
        arrays: The arrays of one group (or of the whole column for a
            global aggregation).

    Returns:
        The element-wise sum of all the arrays in `arrays`.
    """
    return arrays.sum(axis=0)

In [0]:
# perform a global aggregation on the confusion matrices
# Sum the per-image confusion matrices into one matrix for the whole sample.
cm = df_with_conf.select(sum_arrays(F.col("conf_mat")).alias("total_confusion_matrix"))
display(cm)

In [0]:
# Bring the aggregated matrix to the driver as a numpy array and print the
# total number of counted entries.
confusion_mat = np.array(cm.collect()[0]["total_confusion_matrix"])  # list of lists
print(confusion_mat.sum())

In [0]:
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score


def reconstruct_labels_and_preds(
    confusion_matrix: np.ndarray,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Given a confusion matrix of shape NxN, reconstruct the ground truth and predicted labels.
    NOTE: we assume that the confusion matrix has 'predicted' counts on the y-axis/dimension
    (rows) and 'true' counts on the x-axis/dimension (columns), i.e.
    `confusion_matrix[predicted_class, actual_class]` is the number of samples of
    `actual_class` that were predicted as `predicted_class`.

    Args:
        confusion_matrix: The NxN confusion matrix. Counts may be stored as
            integers or as integer-valued floats.
    Returns:
        A tuple of two 1D numpy arrays of equal length: ground truth labels and
        predicted labels. Entries are grouped by actual class first and by
        predicted class second.
    Raises:
        ValueError: If the matrix is not a square 2D array (or contains
            negative counts).
    """

    matrix = np.asarray(confusion_matrix)
    if matrix.ndim != 2 or matrix.shape[0] != matrix.shape[1]:
        raise ValueError(
            f"Expected a square 2D confusion matrix, got shape {matrix.shape}"
        )

    # Transpose so that counts_by_actual[actual, predicted] matches the
    # documented layout of the input (rows = predicted, columns = true).
    counts_by_actual = matrix.T.astype(np.intp)

    # For every cell, its row index is the actual class and its column index
    # is the predicted class.
    actual_idx, predicted_idx = np.indices(counts_by_actual.shape)
    repeats = counts_by_actual.ravel()

    # Emit each (actual, predicted) pair as many times as it was counted.
    ground_truth = np.repeat(actual_idx.ravel(), repeats)
    predicted = np.repeat(predicted_idx.ravel(), repeats)

    return ground_truth, predicted


def compute_metrics(confusion_matrix: np.ndarray) -> "dict[str, float | np.ndarray]":
    """
    Compute per-class recall, precision, F1 score, and their micro and macro averages.

    Args:
        confusion_matrix: The NxN confusion matrix, where the last class
            index is the background class.

    Returns:
        dict: A dictionary with the keys `micro_<m>`, `macro_<m>` and
        `per_class_<m>` for each metric `<m>` in `f1`, `recall` and
        `precision`. The micro/macro values are floats computed over all
        classes except the background class. The per-class values are arrays
        with one entry per class index (background included), so entry `i`
        always refers to class `i` even when a class never occurs in the data.
    """

    true_labels, preds = reconstruct_labels_and_preds(confusion_matrix)

    all_labels = list(range(confusion_matrix.shape[0]))

    # ignore background label when computing overall metrics see:
    # tp_fp() function of ConfusionMatrix class
    foreground_labels = all_labels[:-1]

    scorers = {
        "f1": f1_score,
        "recall": recall_score,
        "precision": precision_score,
    }

    metrics = {}
    for name, scorer in scorers.items():
        # zero_division=0 yields the same value as sklearn's default ("warn")
        # without emitting a warning for classes that have no samples.
        metrics[f"micro_{name}"] = scorer(
            true_labels, preds, average="micro", labels=foreground_labels, zero_division=0
        )
        metrics[f"macro_{name}"] = scorer(
            true_labels, preds, average="macro", labels=foreground_labels, zero_division=0
        )
        # Passing the labels explicitly keeps the per-class array aligned with
        # the class indices; without it sklearn only reports the classes that
        # actually occur in the data.
        metrics[f"per_class_{name}"] = scorer(
            true_labels, preds, average=None, labels=all_labels, zero_division=0
        )

    return metrics


# Show the aggregated matrix, then rebuild label/prediction vectors from it.
print(confusion_mat)
true_labels, predicted_labels = reconstruct_labels_and_preds(confusion_mat)   

# NOTE: we may want to exclude the background class from the overall precision
# metric, because it appears to always be 0 (which makes sense).
# `confusion_matrix` below is the sklearn function imported in this cell.
print(confusion_matrix(true_labels, predicted_labels))

print(compute_metrics(confusion_mat))

In [0]:
# Scratch check of reconstruct_labels_and_preds on a small hand-written matrix.
test_cm = np.array( [ [0, 8, 0], 
                      [0, 1,  3], 
                      [9,  6, 1]] )


# NOTE(review): these two arrays are not used by the rest of this cell.
preds = np.array([0,0,0, 1,1,1,1, 2,2,2])
true_labels = np.array([0,0,0, 1,1,1,1, 2,2,2])

gt, pred = reconstruct_labels_and_preds(test_cm)
print(test_cm, "\n")
# sklearn's confusion_matrix puts true labels on the rows and predictions on
# the columns, so compare the two printed matrices with that layout in mind.
print(confusion_matrix(gt, pred))

# recall_score(gt, pred, labels=[1,2], average=None)