## Setup

In [None]:
from pathlib import Path
import os

In [None]:
# NOTE: don't use relative paths for the models provided by L5.
# Every folder is anchored on an "Experiments" directory located two levels above the
# current working directory, so this cell must run before the working directory is changed.
experiments_directory = Path(os.path.abspath('')).parent.parent / "Experiments"
data_directory = experiments_directory / "data"
prediction_directory = experiments_directory / "prediction"
prediction_evaluation_directory = prediction_directory / "evaluation"
save_directory = prediction_evaluation_directory / "saved_outputs"

# Create the whole tree; folders that already exist are left untouched.
for _directory in (
    experiments_directory,
    data_directory,
    prediction_directory,
    prediction_evaluation_directory,
    save_directory,
):
    _directory.mkdir(parents=True, exist_ok=True)

In [None]:
import os
# Make the evaluation directory the working directory so that relative paths used afterwards
# (such as the requirements.txt written by the next cell) resolve inside it.
os.chdir(prediction_evaluation_directory)

In [None]:
%%writefile requirements.txt
l5kit
pyyaml
wandb
# Extras are written unquoted (quotes are shell syntax, not requirements-file syntax) and carry the version pin.
ray[air]==2.1.0
h5py

In [None]:
from typing import Dict

from tempfile import gettempdir
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torchvision.models.resnet import resnet50
from tqdm import tqdm

from l5kit.configs import load_config_data
from l5kit.data import LocalDataManager, ChunkedDataset
from l5kit.dataset import AgentDataset, EgoDataset
from l5kit.rasterization import build_rasterizer
from l5kit.evaluation import write_pred_csv, compute_metrics_csv, read_gt_csv, create_chopped_dataset
from l5kit.evaluation.chop_dataset import MIN_FUTURE_STEPS
from l5kit.evaluation.metrics import neg_multi_log_likelihood, time_displace, rmse, prob_true_mode, average_displacement_error_oracle, average_displacement_error_mean, final_displacement_error_oracle, final_displacement_error_mean, detect_collision, distance_to_reference_trajectory
from l5kit.geometry import transform_points
from l5kit.visualization import PREDICTED_POINTS_COLOR, TARGET_POINTS_COLOR, draw_trajectory
from prettytable import PrettyTable
from pathlib import Path

import os

# Pull data

In [None]:
import wandb
# Authenticate with Weights & Biases before the run is created in the cells below.
wandb.login()

In [None]:
# Run information
# Metadata of the W&B run that tracks this evaluation; all of it is passed to wandb.init below.
# project_name is also the prefix of the "<project_name>-trials" project queried for candidate models.
project_name = "l5-prediction"
run_name = "evaluate-latest-models"
run_type = "evaluate"  # passed as the run's job_type
run_description = """
Evaluate a prediction Av model and report results
"""
tags = ["evaluate", "prediction"]

In [None]:
 #🪄🐝
# Start the W&B run that tracks this evaluation; `run` is reused below to declare
# input artifacts and to log the evaluation results.
run = wandb.init(
    project=project_name,
    job_type=run_type,
    name=run_name,
    notes=run_description,
    tags=tags
)

In [None]:
# Coordinates of the dataset artifact to evaluate on; combined below into
# "<artifact_project>/<artifact_name>:<artifact_alias>".
artifact_project = "l5-common"
artifact_name = "l5-data"
artifact_alias = "latest"
artifact_type = "dataset"

In [None]:
 #🪄🐝
# Declare the dataset artifact as an input of this run; the returned handle is downloaded in the next cell.
artifact = run.use_artifact(f"{artifact_project}/{artifact_name}:{artifact_alias}", type=artifact_type)

In [None]:
# Download the artifact contents into data_directory; the return value is not needed here.
_ = artifact.download(data_directory)

In [None]:
# Load the agent-motion configuration from the directory the artifact was downloaded into.
cfg = load_config_data(Path(data_directory, "configurations", "agent_motion_config.yaml"))
# Dataset folder inside the download directory; exported as L5KIT_DATA_FOLDER in the next cell.
l5_data_location = Path(data_directory, "dataset")

In [None]:
# Environment values must be strings, hence the str() around the Path.
os.environ["L5KIT_DATA_FOLDER"] = str(l5_data_location)

In [None]:
# NOTE(review): no folder is passed, so this presumably resolves data through the
# L5KIT_DATA_FOLDER environment variable set in the previous cell — confirm.
dm = LocalDataManager()

# Evaluation

Evaluation follows a slightly different protocol than training. When working with time series, we must be absolutely sure to avoid leaking the future in the data.

If we followed the same protocol as training, one could just read ahead in the `.zarr` and forge a perfect solution at run-time, even for a private test set.

As such, **the private test set for the competition has been "chopped" using the `chop_dataset` function**.

In [None]:
# ===== GENERATE AND LOAD CHOPPED DATASET
# Number of frames kept at the start of every scene.
num_frames_to_chop = 100
# The validation loader settings double as the evaluation settings.
eval_cfg = cfg["val_data_loader"]
eval_base_path = create_chopped_dataset(
    dm.require(eval_cfg["key"]),
    cfg["raster_params"]["filter_agents_threshold"],
    num_frames_to_chop,
    cfg["model_params"]["future_num_frames"],
    MIN_FUTURE_STEPS,
)

The result is that **each scene has been reduced to only 100 frames**, and **only valid agents in the 100th frame will be used to compute the metrics**. Because following frames in the scene have been chopped off, we can't just look ahead to get the future of those agents.

In this example, we simulate this pipeline by running `chop_dataset` on the validation set. The function stores:
- a new chopped `.zarr` dataset, in which each scene has only the first 100 frames;
- a numpy mask array where only valid agents in the 100th frame are True;
- a ground-truth file with the future coordinates of those agents;

Please note how the total number of frames is now equal to the number of scenes multiplied by `num_frames_to_chop`.

The remaining frames in each scene have been successfully chopped off from the data.

In [None]:
# ===== LOCATE THE CHOPPED DATASET FILES, THEN OPEN THE DATASET AND LOAD THE MASK
# The chopped zarr keeps the file name of the original dataset, inside eval_base_path.
eval_zarr_path = str(Path(eval_base_path, Path(dm.require(eval_cfg["key"])).name))
eval_mask_path = str(Path(eval_base_path, "mask.npz"))
eval_gt_path = str(Path(eval_base_path, "gt.csv"))

eval_zarr = ChunkedDataset(eval_zarr_path).open()
# The mask array is read from the "arr_0" entry of the npz file.
eval_mask = np.load(eval_mask_path)["arr_0"]

There is a small catch to be aware of when saving the model predictions. The outputs of the models are coordinates in `agent` space and we need to convert them into displacements in `world` space.

To do so, we first convert them back into the `world` space and we then subtract the centroid coordinates.

### Save results
After the model has predicted trajectories for our evaluation set, we can save them in a `csv` file.

During the competition, only the `.zarr` and the mask will be provided for the private test set evaluation.
Your solution is expected to generate a csv file which will be compared to the ground truth one on a separate server.

### Perform Evaluation
Please note that our metric supports multi-modal predictions (i.e. multiple predictions for a single GT trajectory). In that case, you will need to provide a confidence for each prediction (confidences must all be between 0 and 1 and sum to 1).

In this simple example we don't generate multiple trajectories, so we won't pass any confidences vector. Internally, the metric computation will assume a single trajectory with confidence equal to 1.

### Visualise Results
We can also visualise some results from the ego (AV) point of view for those frames of interest (the 100th of each scene).

However, as we chopped off the future from the dataset, **we must use the GT csv if we want to plot the future trajectories of the agents**.


# Run Evaluation

In [None]:
# reduction="none" keeps the element-wise losses, so forward() can mask them with the
# target availabilities before averaging.
criterion = nn.MSELoss(reduction="none")

In [None]:
def forward(data, model, device, criterion):
    """Run the model on one batch and compute its availability-masked loss.

    Args:
        data: batch dictionary providing the "image", "target_positions" and
            "target_availabilities" tensors.
        model: callable applied to the image tensor.
        device: device the batch tensors are moved to.
        criterion: loss called as ``criterion(outputs, targets)``; its result is
            multiplied by the availabilities before being averaged.

    Returns:
        Tuple ``(loss, outputs)``: the mean of the masked loss, and the model
        output reshaped to the shape of the target positions.
    """
    images = data["image"].to(device)
    targets = data["target_positions"].to(device)
    # A trailing axis is appended so the availabilities broadcast against the loss.
    availability_mask = data["target_availabilities"].unsqueeze(-1).to(device)

    outputs = model(images).reshape(targets.shape)
    # Multiplying by the mask zeroes the contribution of unavailable steps.
    masked_loss = criterion(outputs, targets) * availability_mask
    return masked_loss.mean(), outputs

In [None]:
#TODO: Remove reliance on Pandas
import pandas as pd

In [None]:
@torch.no_grad()
def evaluate_model(model, criterion, eval_dataloader, eval_gt_path, eval_dataset, eval_zarr, rasterizer, dir_to_save=".",
                   frames_per_scene=100):
    """Compute prediction metrics for `model` and draw its predictions on one frame per scene.

    The function works in two stages:

    1. Metrics: the model is run over `eval_dataloader`, the predictions are converted to
       world-space offsets, written to ``eval_preds.csv`` and compared with the ground truth.
    2. Visualisation: for the last frame of every scene, the predicted and ground-truth
       trajectories of the agents are drawn on the AV point-of-view image, saved as PNG
       and assembled into ``prediction_animation.gif``.

    Gradients are disabled only for the duration of the call (via the decorator), so the
    global autograd state of the kernel is not changed. The function reads the module-level
    `cfg` and calls the module-level `forward`.

    Args:
        model: network to evaluate; moved to the GPU when one is available and set to eval mode.
        criterion: loss handed to `forward` (the loss value itself is discarded here).
        eval_dataloader: loader yielding the evaluation batches.
        eval_gt_path: path of the ground-truth csv.
        eval_dataset: agent dataset behind `eval_dataloader`, used to look up agents by frame.
        eval_zarr: chopped dataset; only the number of frames is read from it.
        rasterizer: rasterizer used to build the AV dataset and to convert images to RGB.
        dir_to_save: directory receiving the csv, the PNG frames and the GIF.
        frames_per_scene: number of frames in each scene of the chopped dataset; the frame at
            index ``frames_per_scene - 1`` of every scene is visualised.

    Returns:
        Tuple ``(metrics, frame_prediction_details, gif_path, pred_df)``: the result of
        `compute_metrics_csv`, a dict mapping frame number to ``{"image_path": Path}``,
        the path of the GIF, and the predictions csv loaded as a DataFrame.

    Raises:
        ValueError: if no visualised frame contains agents, so no animation can be built.
    """
    # `import PIL` alone does not guarantee that the PIL.Image submodule is loaded.
    from PIL import Image

    #TODO: see if there is a better ray way
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # ==== EVAL LOOP
    model = model.to(device)
    model.eval()

    # store information for evaluation
    future_coords_offsets_pd = []
    timestamps = []
    agent_ids = []

    print("Running metric evaluation...")
    progress_bar = tqdm(eval_dataloader)
    for data in progress_bar:
        _, outputs = forward(data, model, device, criterion)

        # convert agent coordinates into world offsets
        agents_coords = outputs.cpu().numpy()
        world_from_agents = data["world_from_agent"].numpy()
        centroids = data["centroid"].numpy()
        coords_offset = transform_points(agents_coords, world_from_agents) - centroids[:, None, :2]

        future_coords_offsets_pd.append(coords_offset)
        timestamps.append(data["timestamp"].numpy().copy())
        agent_ids.append(data["track_id"].numpy().copy())

    pred_path = Path(dir_to_save, "eval_preds.csv")
    write_pred_csv(pred_path,
       timestamps=np.concatenate(timestamps),
       track_ids=np.concatenate(agent_ids),
       coords=np.concatenate(future_coords_offsets_pd),
    )

    pred_df = pd.read_csv(pred_path)

    metrics = compute_metrics_csv(eval_gt_path, pred_path, [neg_multi_log_likelihood,
                                                        time_displace,
                                                        rmse,
                                                        prob_true_mode,
                                                        average_displacement_error_oracle,
                                                        average_displacement_error_mean,
                                                        final_displacement_error_oracle,
                                                        final_displacement_error_mean,
                                                       ])

    # build a dict to retrieve future trajectories from GT
    # keys are (track_id, timestamp) tuples: concatenating the two strings would be ambiguous
    print("Running scene frame prediction!")
    gt_rows = {}
    for row in read_gt_csv(eval_gt_path):
        gt_rows[(str(row["track_id"]), str(row["timestamp"]))] = row["coord"]

    eval_ego_dataset = EgoDataset(cfg, eval_dataset.dataset, rasterizer)

    scene_frames = []
    frame_prediction_details = {}
    # start from the last frame of the first scene and jump one scene at a time
    for frame_number in tqdm(range(frames_per_scene - 1, len(eval_zarr.frames), frames_per_scene)):
        agent_indices = eval_dataset.get_frame_indices(frame_number)
        if not len(agent_indices):
            continue

        # get AV point-of-view frame
        data_ego = eval_ego_dataset[frame_number]
        im_ego = rasterizer.to_rgb(data_ego["image"].transpose(1, 2, 0))

        predicted_positions = []
        target_positions = []

        for v_index in agent_indices:
            data_agent = eval_dataset[v_index]

            out_net = model(torch.from_numpy(data_agent["image"]).unsqueeze(0).to(device))
            out_pos = out_net[0].reshape(-1, 2).detach().cpu().numpy()
            # store absolute world coordinates
            predicted_positions.append(transform_points(out_pos, data_agent["world_from_agent"]))
            # retrieve target positions from the GT and store as absolute coordinates
            track_id, timestamp = data_agent["track_id"], data_agent["timestamp"]
            target_positions.append(gt_rows[(str(track_id), str(timestamp))] + data_agent["centroid"][:2])

        # convert coordinates to AV point-of-view so we can draw them
        predicted_positions = transform_points(np.concatenate(predicted_positions), data_ego["raster_from_world"])
        target_positions = transform_points(np.concatenate(target_positions), data_ego["raster_from_world"])

        draw_trajectory(im_ego, predicted_positions, PREDICTED_POINTS_COLOR)
        draw_trajectory(im_ego, target_positions, TARGET_POINTS_COLOR)

        frame_path = Path(dir_to_save, f"eval_prediction_frame_{frame_number}.png")
        plt.imsave(frame_path, im_ego)

        scene_frames.append(Image.fromarray(im_ego))
        frame_prediction_details[frame_number] = {"image_path" : frame_path}

    if not scene_frames:
        raise ValueError("No visualised frame contains agents, so the prediction animation cannot be built.")

    print("Saving GIF!")
    gif_path = Path(dir_to_save, "prediction_animation.gif")
    # the first frame is the one being saved, so only the remaining frames are appended
    first_frame, *remaining_frames = scene_frames
    first_frame.save(gif_path, format="GIF", append_images=remaining_frames,
                     save_all=True, duration=1000, loop=0)

    return metrics, frame_prediction_details, gif_path, pred_df


We use the `wandb` Public API to easily grab all the runs from our training project (with any optional filters) and use them as candidate models for the evaluation flow.

In [None]:
 #🪄🐝
api = wandb.Api()

# Candidate models are taken from the runs of the "<project_name>-trials" project.
# The filter dictionary is left empty; to narrow the query (e.g. to a date range) fill it like:
#   {"$and": [{"created_at": {"$lt": "YYYY-MM-DDT##", "$gt": "YYYY-MM-DDT##"}}]}
# To target a specific entity, use f'{wandb_entity}/{project_name}-trials' as the path instead.
model_runs = api.runs(f'{project_name}-trials', {})

# Map each run name to the details of its artifact aliased "latest".
run_model_map = {}
for model_run in model_runs:
    for model_art in model_run.logged_artifacts():
        if "latest" not in model_art.aliases:
            continue
        run_model_map[model_run.name] = dict(
            loss=model_run.summary["loss"],
            avg_loss=model_run.summary["avg_loss"],
            art=model_art,
            run_url=model_run.url,
            run_id=model_run.id,
            run_summary=model_run.summary,
        )

In [None]:
from ray.air.checkpoint import Checkpoint
import PIL
import h5py

Note: during our evaluation flow we do not create a new run for each evaluation. Instead, to keep the context of a model's training together with its evaluation, we simply update the summary metadata of the candidate model runs with the additional evaluation metrics, keeping things unified and streamlined.

In [None]:
def _evaluate_with_map_type(model, map_type, output_directory):
    """Evaluate `model` on the chopped dataset rasterised with `map_type`.

    Args:
        model: network handed to `evaluate_model`.
        map_type: value written to ``cfg["raster_params"]["map_type"]`` before the
            rasterizer is built (e.g. "py_semantic" or "py_satellite").
        output_directory: directory (created if missing) receiving the evaluation outputs.

    Returns:
        The tuple returned by `evaluate_model`:
        ``(metrics, frame_prediction_details, gif_path, pred_df)``.
    """
    # The module-level cfg is switched in place because evaluate_model reads it as well.
    cfg["raster_params"]["map_type"] = map_type
    output_directory.mkdir(parents=True, exist_ok=True)
    rasterizer = build_rasterizer(cfg, dm)
    eval_dataset = AgentDataset(cfg, eval_zarr, rasterizer, agents_mask=eval_mask)
    eval_dataloader = DataLoader(eval_dataset, shuffle=eval_cfg["shuffle"], batch_size=eval_cfg["batch_size"],
                                 num_workers=eval_cfg["num_workers"])
    return evaluate_model(model, criterion, eval_dataloader, eval_gt_path, eval_dataset, eval_zarr, rasterizer,
                          output_directory)


eval_run_table_data = []
# The loop variable is not called `run_name` on purpose: that name holds the name of this evaluation run.
for model_run_name, run_details in run_model_map.items():
    print(model_run_name)
    run_directory = Path(save_directory, model_run_name)
    model_art = run_details["art"]
    # Hack to link evaluated model trials with eval data:
    # `run` is the run keeping track of this evaluation action.
    run.use_artifact(model_art)  #🪄🐝
    model_dir = model_art.download(Path(run_directory, "model"))
    model = Checkpoint.from_directory(model_dir).to_dict()['model']

    print("Running Semantic Evaluation!")
    semantic_eval_metrics, semantic_frame_details, semantic_gif, semantic_pred_df = _evaluate_with_map_type(
        model, "py_semantic", Path(run_directory, "semantic"))

    print("Running Satellite Evaluation!")
    satellite_eval_metrics, satellite_frame_details, satellite_gif, satellite_pred_df = _evaluate_with_map_type(
        model, "py_satellite", Path(run_directory, "satellite"))

    print("Constructing reporting objects...")
    print(satellite_gif)
    semantic_predictions = [wandb.Image(str(details["image_path"]))  #🪄🐝
                            for details in semantic_frame_details.values()]
    satellite_predictions = [wandb.Image(str(details["image_path"]))  #🪄🐝
                             for details in satellite_frame_details.values()]

    all_eval_metrics = {}
    all_eval_metrics.update({f"semantic_{key}": val for key, val in semantic_eval_metrics.items()})
    all_eval_metrics.update({f"satellite_{key}": val for key, val in satellite_eval_metrics.items()})

    # One table row per evaluated run; the artifact and the summary object are not reported.
    run_data = {"run_name": model_run_name}
    run_data.update({key: val for key, val in run_details.items() if key not in ("art", "run_summary")})
    run_data.update(all_eval_metrics)
    run_data.update({"semantic_predictions": semantic_predictions, "satellite_predictions": satellite_predictions})
    run_data.update({"semantic_animation": wandb.Video(str(semantic_gif)),  #🪄🐝
                     "satellite_animation": wandb.Video(str(satellite_gif))})  #🪄🐝

    # Attach the evaluation metrics to the summary of the candidate model's own training run.
    run_details["run_summary"].update(all_eval_metrics)
    eval_run_table_data.append(run_data)

    run.log({f"{model_run_name}_predicted_semantic_coords": wandb.Table(dataframe=semantic_pred_df),  #🪄🐝
             f"{model_run_name}_predicted_satellite_coords": wandb.Table(dataframe=satellite_pred_df)})  #🪄🐝
    print("Done!\n~~~~~~~~~~~~~~~~")

### Log Table containing all details of Eval run

In [None]:
 #🪄🐝
# Column names are taken from the first evaluated run, so at least one run must have been evaluated.
eval_table = wandb.Table(columns = list(eval_run_table_data[0].keys()))

In [None]:
# Add one row per evaluated run; values are passed positionally, so every row dictionary
# must list its keys in the same order as the first one (used for the columns).
for table_data in eval_run_table_data:
    eval_table.add_data(*list(table_data.values()))

In [None]:
# Sanity check: number of rows added to the table.
len(eval_table.data)

In [None]:
# Log the summary table to the evaluation run under the "eval_table" key.
run.log({"eval_table": eval_table})

In [None]:
# Close the evaluation run now that everything has been logged.
run.finish()