In [1]:
import pandas as pd
import numpy as np

import motmetrics as mm

In [2]:
# Load the ground-truth table from the comma-separated file as a numeric array.
arr1 = np.loadtxt('multimodal_test/gt_uv.csv', delimiter=',')
# Drop columns 2, 3 and 4. The remaining columns are presumably the normalized
# (u, v) pairs consumed by the conversion cell below — TODO confirm the file
# has exactly five columns, otherwise extra columns survive this step.
gt_uv = np.delete(arr1, [2,3,4], axis=1);

In [3]:
def convert_normalized_gt_to_absolute(gt_uv_normalized, image_width, image_height, track_id=0):
    """
    Converts normalized (u, v) coordinates to pixel coordinates and prepends a track ID.

    Pixel values are obtained by scaling and then truncating toward zero
    (``astype(int)``), not by rounding.

    Args:
        gt_uv_normalized (array-like): Normalized u, v values of shape (N, 2).
            Lists are accepted, a single (u, v) pair of shape (2,) is treated
            as one row, and an empty input yields an empty (0, 3) result.
            Only the first two columns are used if more are present.
        image_width (int): Width of the image in pixels.
        image_height (int): Height of the image in pixels.
        track_id (int): Track ID written into every row. Defaults to 0.

    Returns:
        np.ndarray: Integer array of shape (N, 3) with columns [track_id, u_px, v_px].

    Raises:
        ValueError: If the input is not 1-D/2-D or has fewer than two columns.
    """
    uv = np.asarray(gt_uv_normalized)

    if uv.ndim == 1:
        # A flat input is either empty or one (u, v) pair; lift it to 2-D so
        # the column indexing below works uniformly.
        uv = uv.reshape(0, 2) if uv.size == 0 else uv.reshape(1, -1)

    if uv.ndim != 2 or uv.shape[1] < 2:
        raise ValueError(
            f"gt_uv_normalized must have shape (N, 2); got {uv.shape}"
        )

    # Scale to pixels; astype(int) truncates toward zero.
    u_px = (uv[:, 0] * image_width).astype(int)
    v_px = (uv[:, 1] * image_height).astype(int)
    track_ids = np.full((len(uv), 1), track_id, dtype=int)

    return np.hstack((track_ids, u_px[:, None], v_px[:, None]))

# Convert the normalized ground truth to pixel coordinates for a 1920x1080 image.
output_rows = convert_normalized_gt_to_absolute(gt_uv_normalized=gt_uv, image_width=1920, image_height=1080)
df = pd.DataFrame(output_rows, columns=["id", "u", "v"])
# Prepend a 0-based "frame" column equal to the row position.
# NOTE(review): assumes one ground-truth row per consecutive frame — confirm.
df.insert(0, "frame", range(len(df)))

# Save as frame,id,u,v without the DataFrame index; this file is the gt_csv
# passed to evaluate_tracking in the evaluation cell.
df.to_csv("multimodal_test/tracking_output_gt.csv", index=False)

In [4]:
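# Evaluate RMSE:
# TODO: unfinished loop stub ("for i in ..."); RMSE is currently computed by
# compute_rmse() inside evaluate_tracking() in the evaluation cell.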

In [None]:
# Evaluate IDF1: find IDF1 for each case

def load_csv(file_path):
    """Read a tracking CSV (expected columns: frame,id,u,v) into a DataFrame."""
    return pd.read_csv(file_path)

def create_accumulator(gt_df, pred_df, dist_threshold=50):
    """
    Build a motmetrics accumulator by comparing ground truth and predictions per frame.

    Args:
        gt_df (pd.DataFrame): Ground-truth rows with columns "frame", "id", "u", "v".
        pred_df (pd.DataFrame): Predicted rows with the same columns.
        dist_threshold (float): Gating distance in the units of u/v. Its square
            is passed as ``max_d2`` to ``mm.distances.norm2squared_matrix``.

    Returns:
        mm.MOTAccumulator: Accumulator updated once for every frame that appears
        in either table, in ascending frame order. Frames present in only one
        table are still submitted, with an all-NaN (empty) distance matrix.

    Note:
        The distance matrix holds *squared* Euclidean distances, so
        distance-based summary values such as MOTP are derived from squared
        distances rather than plain pixel distances.
    """
    acc = mm.MOTAccumulator(auto_id=True)

    # Split each table by frame once, instead of re-scanning the whole
    # "frame" column with a boolean mask for every frame.
    gt_by_frame = {frame: rows for frame, rows in gt_df.groupby("frame")}
    pred_by_frame = {frame: rows for frame, rows in pred_df.groupby("frame")}

    # Zero-row stand-ins (same columns) for frames missing from one side.
    no_gt = gt_df.iloc[0:0]
    no_pred = pred_df.iloc[0:0]

    max_d2 = dist_threshold ** 2  # loop-invariant gate for squared distances

    for frame in sorted(gt_by_frame.keys() | pred_by_frame.keys()):
        gt_frame = gt_by_frame.get(frame, no_gt)
        pred_frame = pred_by_frame.get(frame, no_pred)

        gt_ids = gt_frame["id"].tolist()
        gt_coords = gt_frame[["u", "v"]].to_numpy()

        pred_ids = pred_frame["id"].tolist()
        pred_coords = pred_frame[["u", "v"]].to_numpy()

        if len(gt_coords) > 0 and len(pred_coords) > 0:
            dists = mm.distances.norm2squared_matrix(gt_coords, pred_coords, max_d2=max_d2)
        else:
            # One side is empty: hand over a correctly shaped all-NaN matrix
            # (it has zero rows or zero columns) so nothing can be matched.
            dists = np.full((len(gt_coords), len(pred_coords)), np.nan)

        acc.update(gt_ids, pred_ids, dists)

    return acc

def compute_rmse(gt_df, pred_df):
    """
    Root-mean-square (u, v) error between rows sharing the same frame and id.

    Args:
        gt_df (pd.DataFrame): Ground-truth rows with columns "frame", "id", "u", "v".
        pred_df (pd.DataFrame): Predicted rows with the same columns.

    Returns:
        float or None: RMSE over all (frame, id) pairs present in both tables,
        or None when no such pair exists.
    """
    # Rows with a missing id never matched under the previous id-equality
    # filter; drop them up front because a key join would pair NaN with NaN.
    gt_valid = gt_df.dropna(subset=["id"])
    pred_valid = pred_df.dropna(subset=["id"])

    # Join directly on (frame, id) rather than joining on frame alone and
    # filtering afterwards, which materialises a per-frame cross product.
    matched = pd.merge(gt_valid, pred_valid, on=["frame", "id"], suffixes=('_gt', '_pred'))
    if matched.empty:
        return None

    # Squared Euclidean error per matched pair; the square root is applied
    # once, after averaging.
    squared_error = (
        (matched["u_gt"] - matched["u_pred"]) ** 2
        + (matched["v_gt"] - matched["v_pred"]) ** 2
    )
    return np.sqrt(squared_error.mean())

def evaluate_tracking(gt_csv, pred_csv, name="tracker"):
    """
    Print the MOTChallenge metric table and the RMSE for one prediction file.

    Args:
        gt_csv: Path of the ground-truth CSV (read with load_csv).
        pred_csv: Path of the prediction CSV (read with load_csv).
        name (str): Row label used in the rendered metric table.

    Returns:
        None. Results are printed, not returned.
    """
    truth = load_csv(gt_csv)
    predictions = load_csv(pred_csv)

    accumulator = create_accumulator(truth, predictions)
    metrics_host = mm.metrics.create()
    summary = metrics_host.compute(
        accumulator,
        metrics=mm.metrics.motchallenge_metrics,
        name=name,
    )

    report = mm.io.render_summary(
        summary,
        formatters=metrics_host.formatters,
        namemap=mm.io.motchallenge_metric_names,
    )
    print(report)

    # RMSE is reported separately from the motmetrics table.
    rmse = compute_rmse(truth, predictions)
    if rmse is None:
        print("RMSE could not be computed (no matching frames/IDs).")
        return
    print(f"RMSE between ground truth and predictions: {rmse:.2f} pixels")

# Example usage: score one prediction file (presumably the camera + radar
# fusion run, judging by the file name) against the ground-truth CSV that
# this notebook writes to multimodal_test/tracking_output_gt.csv.
evaluate_tracking(
    gt_csv="multimodal_test/tracking_output_gt.csv",
    pred_csv="multimodal_test/tracking_output_camera_1_radar_1.csv",
    name="fusion_tracker"
)
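# Metrics of interest:
# - MOTP (MOT Precision): average localization error of matched pairs (lower = better).
#   Note: the distances fed to motmetrics come from norm2squared_matrix, i.e. they are
#   squared pixel distances, so the MOTP value is in squared-pixel units, not pixels.
# - IDF1: F1 score of ID-based precision and recall, i.e. how well the tracker
#   maintains consistent object identities.
# - FM (Fragmentations): number of interruptions in tracking continuity.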

# Camera + Radar:
# RMSE: 150.58 pixels
# IDF1: 43.6%
# FM: 7

# Radar Only:
# RMSE: 156.86 pixels
# IDF1: 31.9%
# FM: 6

# Camera Only:
# RMSE: 151.3 pixels
# IDF1: 46.8%
# FM: 8


                IDF1   IDP   IDR  Rcll  Prcn GT MT PT ML FP FN IDs  FM   MOTA    MOTP IDt IDa IDm
fusion_tracker 43.6% 43.6% 43.6% 43.6% 43.6%  1  0  1  0 53 53   0   7 -12.8% 462.268   0   0   0
RMSE between ground truth and predictions: 150.58 pixels
