## A notebook for exploring video posedata

**Intended use:** the user selects a video that is accompanied by already extracted posedata in a .json file. The notebook provides visualizations that summarize the quality and content of the poses extracted across all frames of the video, as well as armature plots of the detected poses in a selected frame. These can be viewed separately from the source video and even animated.

Note that at present, this only works with .json output files generated via the Open PifPaf command-line tools.

In [None]:
from datetime import datetime, timedelta
import os
from pathlib import Path
from time import sleep
from bokeh.io import output_notebook
from bokeh.layouts import column, row
from bokeh.models import (
    Button,
    CrosshairTool,
    DatetimeTickFormatter,
    Div,
    LinearAxis,
    Range1d,
    Slider,
    Span,
    TapTool,
    Toggle,
)
from bokeh.models.widgets.inputs import Select
from bokeh.plotting import figure, show
from bokeh.themes import Theme
import cv2
from IPython.display import HTML, display
from ipywidgets import Dropdown, Layout
import jsonlines
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from scipy.spatial.distance import cosine


### Build and display the video/posedata selector widget

**Important:** for a video to appear in the dropdown menu, the video and its posedata output file must be present at the path specified in `source_data_folder`, which is by default the folder containing this notebook. The names of the matched video and posedata files should be identical, other than that the posedata file will have .openpifpaf.json appended to the name of the video file.

In [None]:
# The source data folder defaults to the folder this notebook is running in.
# To use a different folder, either set the DATA_FOLDER environment variable
# before launching Jupyter, or assign a path directly, e.g.:
# source_data_folder = Path("/path/to/videos")
source_data_folder = Path(os.getenv("DATA_FOLDER", Path.cwd()))


def get_available_videos(data_folder):
    """
    List the videos in `data_folder` that are accompanied by posedata output.

    A file counts as an available video when its suffix is one of .avi, .mp4,
    .mov, .mkv or .webm (case-insensitive) and a posedata file named
    "<video file name>.openpifpaf.json" exists in the same folder. This is the
    same naming rule the notebook uses when it opens the posedata for the
    selected video.

    `data_folder` may be a str or a Path. Returns the matching video file names
    (names only, not full paths), sorted so the dropdown order is stable.
    """
    video_suffixes = {".avi", ".mp4", ".mov", ".mkv", ".webm"}

    return sorted(
        candidate.name
        for candidate in Path(data_folder).glob("*")
        if candidate.suffix.lower() in video_suffixes
        and candidate.with_name(candidate.name + ".openpifpaf.json").is_file()
    )


# Instructions shown above the dropdown; the <style> rule widens the widget
# label so that the dropdown's description is not truncated.
select_msg = (
    "<style>.widget-label { min-width: 20ex !important; }</style>"
    "<body><p>Please select the video to explore from the dropdown list. To be available in the list, "
    "the pose estimation output .json file and the original video file must share the same name, except "
    f"for the .openpifpaf.json extension, and be stored in {source_data_folder}/.</p></body>"
)

display(HTML(select_msg))

# Only videos that have matching posedata in source_data_folder are offered
video_selector = Dropdown(
    options=get_available_videos(source_data_folder),
    description="Video to explore:",
    disabled=False,
    layout=Layout(width="60%", height="40px"),
)

# Leaving the widget as the last expression of the cell makes the notebook render it
video_selector


### Collect video and per-frame pose metadata for the selected video
Select the video from the drop-down menu above (after running its cell) before running the cell below.

In [None]:
if video_selector.value is None:
    raise ValueError(
        f"No video is selected; check that {source_data_folder} contains a video with matching posedata."
    )

video_file = source_data_folder / video_selector.value

# The posedata file name is the full video file name plus ".openpifpaf.json"
pose_file = f"{video_file}.openpifpaf.json"

print("Video file:", video_file)
print("Posedata file:", pose_file)

# Only the video's metadata is needed here, so release the capture handle
# as soon as it has been read.
cap = cv2.VideoCapture(str(video_file))
try:
    video_frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    video_fps = cap.get(cv2.CAP_PROP_FPS)
    video_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    video_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
finally:
    cap.release()

# The frame rate is the divisor for every per-frame time computed below
if video_fps <= 0:
    raise ValueError(f"Could not determine the frame rate of {video_file}")

print("Video FPS:", video_fps)

print("Processing video and JSON files, please wait...")

# Timestamps are offsets from this date, which is also the date that
# datetime.strptime() assumes when it parses a time-only string.
timestamp_origin = datetime(1900, 1, 1)

pose_data = []

# Per-frame pose data: frame, seconds, num_poses, avg_pose_conf, avg_coords_per_pose
pose_series = {
    "frame": [],
    "seconds": [],
    "timestamp": [],
    "num_poses": [],
    "avg_score": [],
    "avg_coords_per_pose": [],
}

with jsonlines.open(pose_file) as pose_json:
    for frame in pose_json:

        pose_data.append(frame)

        predictions = frame["predictions"]
        num_poses = len(predictions)

        # Frame output is numbered from 1 in the JSON
        frameno = frame["frame"] - 1
        seconds = float(frameno) / video_fps

        pose_series["num_poses"].append(num_poses)
        pose_series["frame"].append(frameno)
        pose_series["seconds"].append(seconds)

        # Construct a timestamp that can be used with Bokeh's DatetimeTickFormatter
        pose_series["timestamp"].append(timestamp_origin + timedelta(seconds=seconds))

        # ??? Do something with the bbox? The avg ratio of bbox area to full screen can indicate closeup
        # vs. long shot (could also run monoloco to get this kind of info + more)

        pose_scores = [pose["score"] for pose in predictions]

        # Keypoints are flat [x, y, confidence, ...] lists, so every third value
        # starting at index 2 is a confidence. Mostly ignore the confidence value,
        # unless it's 0 (coord not found); these seem to be averaged into the full
        # pose "score" already.
        pose_coords_counts = [
            sum(1 for confidence in pose["keypoints"][2::3] if confidence != 0)
            for pose in predictions
        ]

        # Frames without any poses are recorded with averages of 0
        if num_poses > 0:
            avg_score = sum(pose_scores) / num_poses
            avg_coords_per_pose = sum(pose_coords_counts) / num_poses
        else:
            avg_score = 0
            avg_coords_per_pose = 0

        pose_series["avg_score"].append(avg_score)
        pose_series["avg_coords_per_pose"].append(avg_coords_per_pose)

print("Total frames:", len(pose_series["frame"]))

if pose_series["timestamp"]:
    print("Duration:", pose_series["timestamp"][-1].time())
else:
    print("Duration: no frames found in the posedata file")


In [None]:
# Default dimension of single pose viz: the long axis of a normalized pose is
# rescaled to this many units (see rescale_pose_coords), and single poses are
# drawn on a square canvas with this side length (see normalize_and_draw_pose).
POSE_MAX_DIM = 100


def unflatten_pose_data(prediction):
    """
    Split the flat 'keypoints' sequence of an Open PifPaf pose prediction into
    consecutive [x_coord, y_coord, confidence] triples.

    The result is a plain Python list (so its entries can be replaced
    individually) whose elements are 3-element NumPy arrays; a standard
    51-value prediction yields 17 triples.
    """
    flat_keypoints = prediction["keypoints"]
    triple_count = len(flat_keypoints) // 3
    return np.array_split(flat_keypoints, triple_count)


def extract_trustworthy_coords(prediction):
    """
    Convert an Open PifPaf pose prediction from a flat sequence of
    [x, y, confidence] values into a flat NumPy array of coordinates only:
    [x0, y0, x1, y1, ...] (34 values for a 17-keypoint pose).

    Both coordinates of any keypoint with a confidence value of 0 are set to
    NaN, because such keypoints were not actually detected.
    """
    keypoints = np.asarray(prediction["keypoints"], dtype=float).reshape(-1, 3)
    # Copy so that the NaN assignment cannot touch the caller's data
    coords = keypoints[:, :2].copy()
    coords[keypoints[:, 2] == 0] = np.nan
    return coords.flatten()


def get_pose_extent(prediction):
    """
    Get the min and max x and y coordinates of an Open PifPaf pose prediction.

    Keypoints with a confidence value of 0 are not considered. Returns
    [min_x, min_y, max_x, max_y]; all four values are NaN when the pose has no
    keypoint with a non-zero confidence.
    """
    keypoints = np.asarray(prediction["keypoints"], dtype=float).reshape(-1, 3)
    detected = keypoints[keypoints[:, 2] != 0, :2]

    if detected.size == 0:
        return [np.nan, np.nan, np.nan, np.nan]

    min_x, min_y = np.nanmin(detected, axis=0)
    max_x, max_y = np.nanmax(detected, axis=0)

    return [min_x, min_y, max_x, max_y]


def shift_pose_to_origin(prediction):
    """
    Translate the keypoints of an Open PifPaf pose prediction so that the
    minimal corner of the pose's extent lands on the 0,0 origin.
    NOTE: This only returns the modified 'keypoints' portion of the prediction.
    """
    triples = unflatten_pose_data(prediction)
    x_origin, y_origin, _, _ = get_pose_extent(prediction)

    # Keypoints with a confidence of 0 are passed through untouched; they should
    # not be used in any pose representations or calculations, and often (but
    # not always) already have 0,0 coordinates.
    shifted = [
        [triple[0] - x_origin, triple[1] - y_origin, triple[2]]
        if triple[2] != 0
        else triple
        for triple in triples
    ]

    return {"keypoints": np.concatenate(shifted, axis=None)}


def rescale_pose_coords(prediction):
    """
    Rescale the coordinates of an OpenPifPaf pose prediction so that the extent
    of the pose's long axis is equal to the global POSE_MAX_DIM setting. The
    coordinates of the short axis are scaled by the same factor, and then are
    shifted so that the short axis is centered within the POSE_MAX_DIM extent.

    The minimal corner of the pose is subtracted before scaling, so the result
    is the same whether or not the pose was first shifted to the origin.
    Resulting coordinates are rounded to whole numbers. Degenerate poses are
    handled without raising: a pose with no detected keypoints is returned
    unchanged, and a pose whose detected keypoints all coincide is placed at
    the center of the extent.
    NOTE: This only returns the modified 'keypoints' portion of the prediction.
    """
    pose_coords = unflatten_pose_data(prediction)
    min_x, min_y, max_x, max_y = get_pose_extent(prediction)

    x_extent = max_x - min_x
    y_extent = max_y - min_y
    long_extent = np.max([x_extent, y_extent])

    if np.isnan(long_extent):
        # No keypoint has a non-zero confidence, so there is nothing to rescale
        return {"keypoints": np.concatenate(pose_coords, axis=None)}

    # A zero extent would make the scale factor infinite; any finite factor
    # works in that case because every shifted coordinate is 0.
    scale_factor = POSE_MAX_DIM / long_extent if long_extent > 0 else 1.0

    # The long axis fills the whole extent, so its offset rounds to 0; the
    # short axis gets half of the space it leaves unused.
    x_recenter = round((POSE_MAX_DIM - (scale_factor * x_extent)) / 2)
    y_recenter = round((POSE_MAX_DIM - (scale_factor * y_extent)) / 2)

    for i, coords in enumerate(pose_coords):
        # Coordinates with confidence values of 0 are not modified; these should not
        # be used in any pose representations or calculations, and often (but not
        # always) already have 0,0 coordinates.
        if coords[2] == 0:
            continue
        pose_coords[i] = [
            round((coords[0] - min_x) * scale_factor + x_recenter),
            round((coords[1] - min_y) * scale_factor + y_recenter),
            coords[2],
        ]

    return {"keypoints": np.concatenate(pose_coords, axis=None)}


def shift_normalize_rescale_pose_coords(prediction):
    """
    Normalize an Open PifPaf pose prediction in two steps: first move its minimal
    corner to the origin, then rescale it to fit a POSE_MAX_DIM * POSE_MAX_DIM extent.
    NOTE: This only returns the modified 'keypoints' portion of the prediction.
    """
    shifted_prediction = shift_pose_to_origin(prediction)
    return rescale_pose_coords(shifted_prediction)


def compare_poses_cosine(p1, p2):
    """
    Calculate the similarity of the 'keypoints' portions of two Open PifPaf pose predictions
    by computing their cosine distance and subtracting this from 1 (so 1=identical).

    Only the x and y values of each keypoint enter the comparison; the
    confidence values are dropped. Both poses must have the same number of
    keypoints.
    """

    def xy_vector(prediction):
        """Flatten a prediction's keypoints into [x0, y0, x1, y1, ...]."""
        keypoints = np.asarray(prediction["keypoints"], dtype=float)
        return keypoints.reshape(-1, 3)[:, :2].flatten()

    return 1 - cosine(xy_vector(p1), xy_vector(p2))


def compute_joint_angles(prediction):
    """
    Build an additional/alternative feature set for an Open PifPaf pose prediction, composed
    of the angles, measured in radians, of several joints/articulation points on the body (see
    list in code comments below).

    Returns a list with one entry per joint, in the order listed below. An
    entry is NaN when any of the joint's three keypoints has a confidence of 0,
    or when the vertex coincides with one of the other two points (in which
    case the angle is undefined).
    """
    keypoints = np.asarray(prediction["keypoints"], dtype=float).reshape(-1, 3)

    joint_angles = []

    # Joints to use; the middle keypoint of each triple is the vertex of the angle
    joint_angle_points = [
        [3, 5, 11],  # Left ear - left shoulder - left hip
        [4, 6, 12],  # Right ear - right shoulder - right hip
        [11, 5, 7],  # Left hip - left shoulder - left elbow
        [12, 6, 8],  # Right hip - right shoulder - right elbow
        [5, 7, 9],  # Left shoulder - left elbow - left wrist
        [6, 8, 10],  # Right shoulder - right elbow - right wrist
        [5, 11, 13],  # Left shoulder - left hip - left knee
        [6, 12, 14],  # Right shoulder - right hip - right knee
        [11, 13, 15],  # Left hip - left knee - left ankle
        [12, 14, 16],  # Right hip - right knee - right ankle
    ]

    for angle_points in joint_angle_points:
        # Need 3 points to make an angle; if 1 or more are missing, it's a NaN
        if np.any(keypoints[angle_points, 2] == 0):
            joint_angles.append(np.nan)
            continue

        first, vertex, last = angle_points
        ba = keypoints[first, :2] - keypoints[vertex, :2]
        bc = keypoints[last, :2] - keypoints[vertex, :2]

        norm_product = np.linalg.norm(ba) * np.linalg.norm(bc)
        if norm_product == 0:
            # A zero-length segment has no direction
            joint_angles.append(np.nan)
            continue

        # Rounding error can push the ratio just outside arccos's [-1, 1] domain
        cosine_angle = np.clip(np.dot(ba, bc) / norm_product, -1.0, 1.0)
        joint_angles.append(np.arccos(cosine_angle))

    return joint_angles


def compare_poses_angles(joint_angles1, joint_angles2):
    """
    Compute a similarity score (1=identical) for two pose predictions that are
    each represented as a vector of joint angles.

    The score is standard cosine similarity, with the angle measurements treated
    as ordinary numbers, except that NaN entries are skipped: a NaN in either
    vector drops that position from the dot product, and each vector's norm is
    taken over its own non-NaN entries.
    """
    first = np.array(joint_angles1)
    second = np.array(joint_angles2)

    dot_product = np.nansum(first * second)
    first_norm = np.sqrt(np.nansum(np.square(first)))
    second_norm = np.sqrt(np.nansum(np.square(second)))

    return dot_product / (first_norm * second_norm)


In [None]:
def progress_bar(value, max=100):
    """
    Build an HTML <progress> element, spanning the full width of the output
    area, that shows `value` out of `max`.
    """
    return HTML(
        f"""
        <progress
            value='{value}'
            max='{max}'
            style='width: 100%'
        >
            {value}
        </progress>
    """
    )


print("Computing normalized poses for comparison and clustering")
bar = display(progress_bar(0, len(pose_data)), display_id=True)

# Cluster analysis needs every pose as a 1D array holding only the keypoint
# coordinates (no confidence scores), with all poses collected in one flat list.
# A second list, kept in lockstep with the first, records which frame each pose
# came from and its position among that frame's poses.
normalized_pose_data = []
normalized_pose_metadata = []

total_frames = len(pose_data)

for frameno, frame in enumerate(pose_data):

    # Redrawing the progress bar is comparatively slow, so do it every 100th frame
    if frameno % 100 == 0:
        bar.update(progress_bar(frameno, total_frames))

    for poseno, pose in enumerate(frame["predictions"]):
        normalized_pose = shift_normalize_rescale_pose_coords(pose)
        normalized_pose_data.append(extract_trustworthy_coords(normalized_pose))
        normalized_pose_metadata.append({"frameno": frameno, "poseno": poseno})


In [None]:
# Build FAISS indexes of the poses, for fast nearest neighbor similarity search

import faiss

# FAISS can't handle NaNs in the input vectors, so use -1s instead
faiss_pose_data = [tuple(np.nan_to_num(raw_pose, nan=-1).tolist()) for raw_pose in normalized_pose_data]

# Take the vector length from the data (34 for 17 keypoints with x and y each)
# rather than hard-coding it, so the indexes always match the pose vectors.
faiss_L2_input = np.array(faiss_pose_data, dtype="float32")
faiss_pose_dim = faiss_L2_input.shape[1]

# This builds an exact (flat) index based on Euclidean distance
faiss_L2_index = faiss.IndexFlatL2(faiss_pose_dim)
faiss_L2_index.add(faiss_L2_input)

# This builds an exact (flat) index based on inner-product distance,
# which is equivalent to cosine similarity when the inputs are normalized.
# normalize_L2 works in place, so it gets its own copy of the input.
faiss_IP_index = faiss.IndexFlatIP(faiss_pose_dim)
faiss_IP_input = faiss_L2_input.copy()
faiss.normalize_L2(faiss_IP_input)  # Must normalize the inputs!
faiss_IP_index.add(faiss_IP_input)

In [None]:
# This takes a really long time. XXX Try FAISS's k-means
from sklearn.cluster import OPTICS

MIN_SAMPLES_PER_CLUSTER = 50

# The normalized poses hold NaNs for undetected keypoints, and scikit-learn
# rejects input containing NaNs, so replace them with -1 (the same
# placeholder that is used for the FAISS indexes).
optics_input = np.nan_to_num(np.array(normalized_pose_data), nan=-1)

cluster_labels = OPTICS(
    min_samples=MIN_SAMPLES_PER_CLUSTER, metric="sqeuclidean"
).fit_predict(optics_input)


## Representing poses for numerical comparison, clustering

Assumptions:
- Representations derived from different armature-based pose estimation libraries can differ in the number of points and/or angles. One could provide pose "translators" that interpolate or extrapolate the locations of missing points in order to allow comparison of poses from different systems. It's not obvious how often this would need to be done, however, so implementing it might be more trouble than it's worth.
- The core representation logics will be agnostic to the size of the figure in the frame (or even in the environment, if we're lucky enough to be able to determine that); distances will be normalized or eschewed in favor of angles. Size can however still be included as an optional parameter, e.g., the size of the figure's bounding box in pixels.
- Methods will default to 2D, but can accommodate 3D input values by adding further coordinate dimensions or a second angle.
- 3D representations can include an angle indicating the direction of the "front" of the pose. It's possible to do something like this for 2D representations, but rectifying 2D coordinates to a front-on representation may be too noisy and thus not worth the effort. Unless otherwise indicated, 2D representations are always from the camera's point of view.
- *Missing input coordinates* leading to missing features in pose representations: if not interpolated/estimated, these should be represented as NaNs, not 0s, to avoid confusion with legitimate points at the 0,0 origin, or 0-degree angles. The comparison methods may need to support at least two primary modes for dealing with these: a) only poses containing the same features may be compared; all others produce a null result, or b) poses containing different features may be compared, but poses that share a greater number of features should generally count as more similar than poses that share a smaller number of features.

Primary techniques for representing poses:
1. Normalized coordinates (or distances between points). 
1. Angles of three-point armatures (elbows, shoulders, knees, hips)
1. Directions of movement and magnitudes of movement of normalized points 

Note that #3 can be combined constructively with #1 or #2 (turning a pose into a "movelet"), but combining #1 and #2 would be redundant in almost all cases. Obtaining sufficiently accurate data to infer instantaneous directions and magnitude of movement for specific points is not always possible, however.

Pose comparison best practices:
- Convert each pose to a vector, then compute the similarity between them.
- If poses are incomplete, only compare the available data points; points missing from one or both poses are considered null/equal (all relevant distance metrics require this anyway).
- Normalization to ensure scale-invariance of comparisons (considering only 2D coords so far): absolute size, aka distance from camera, shouldn't affect comparison. The most promising method at present involves scaling the largest dimension so that it fits into a set range (e.g., 500 pixels), then scaling the smaller dimension by the same factor. The pose therefore will stretch across the full extent of the larger dimension, and is centered around the middle of the set range of the smaller dimension, which should aid in vector-based comparison.

### Build and launch the explorer app

This displays an interactive chart visualization of the attributes of the posedata in the .json output file across the runtime of the video.

Clicking anywhere in the chart, moving the slider, or clicking the prev/next buttons will select a frame and draw the poses detected in that frame, with the option of displaying the actual image from the source video as the "background."

Please see the cell below the next one if you are running this notebook in VS Code. Note also that the Jupyter server must be running on port 8888 for the explorer app to work in Jupyter/JupyterLab.

In [None]:
# The body part numberings and armature connectors for the 17-keypoint COCO pose format are defined in
# https://github.com/openpifpaf/openpifpaf/blob/main/src/openpifpaf/plugins/coco/constants.py
# Note that the body part numbers in the connector (skeleton) definitions begin with 1, for some reason, not 0,
# so 1 is subtracted from them before they are used to index a pose's keypoints (see add_pose_to_drawing).
# The body part names below follow the joint list in compute_joint_angles, shifted by one; keypoints 1-3 are
# presumably the nose and eyes in the COCO ordering -- confirm against the constants file linked above.
OPP_COCO_SKELETON = [
    (16, 14),  # left ankle - left knee
    (14, 12),  # left knee - left hip
    (17, 15),  # right ankle - right knee
    (15, 13),  # right knee - right hip
    (12, 13),  # left hip - right hip
    (6, 12),  # left shoulder - left hip
    (7, 13),  # right shoulder - right hip
    (6, 7),  # left shoulder - right shoulder
    (6, 8),  # left shoulder - left elbow
    (7, 9),  # right shoulder - right elbow
    (8, 10),  # left elbow - left wrist
    (9, 11),  # right elbow - right wrist
    (2, 3),  # keypoint 2 - keypoint 3 (head)
    (1, 2),  # keypoint 1 - keypoint 2 (head)
    (1, 3),  # keypoint 1 - keypoint 3 (head)
    (2, 4),  # keypoint 2 (head) - left ear
    (3, 5),  # keypoint 3 (head) - right ear
    (4, 6),  # left ear - left shoulder
    (5, 7),  # right ear - right shoulder
]
# One line color per skeleton segment: the color at position i is used to draw
# the segment at position i of OPP_COCO_SKELETON.
OPP_COCO_COLORS = [
    "orangered",
    "orange",
    "blue",
    "lightblue",
    "darkgreen",
    "red",
    "lightgreen",
    "pink",
    "plum",
    "purple",
    "brown",
    "saddlebrown",
    "mediumorchid",
    "gray",
    "salmon",
    "chartreuse",
    "lightgray",
    "darkturquoise",
    "goldenrod",
]

# Poses are drawn at UPSCALE times the output resolution and then scaled back
# down, which smooths the armature lines.
UPSCALE = 3  # See draw_frame()

# Default dimensions of the output visualizations (in screen pixels)
FIGURE_WIDTH = 950
FIGURE_HEIGHT = 500

# XXX ImageDraw doesn't ship with a scalable font, so best to use matplotlib's
import matplotlib

font_path = os.path.join(
    matplotlib.__path__[0], "mpl-data", "fonts", "ttf", "DejaVuSans.ttf"
)
try:
    label_font = ImageFont.truetype(font_path, size=128)
except OSError:
    # The font file is missing or unreadable; with font=None, ImageDraw falls
    # back to its built-in default font for the pose labels.
    label_font = None


def add_pose_to_drawing(pose_prediction, drawing, seqno=None, show_bbox=False):
    """
    Draw the armature of one pose prediction onto a PIL ImageDraw object.

    All coordinates are multiplied by UPSCALE, so `drawing` is expected to
    belong to an image that is UPSCALE times larger than the pose's own
    coordinate space. Skeleton segments are skipped when either of their
    endpoints has a confidence of 0.

    If `show_bbox` is true, the pose's bounding box is outlined; if `seqno` is
    given, `seqno + 1` is written as a label at the box's upper left corner.
    The box is taken from the prediction's 'bbox' entry when present, and from
    the extent of its detected keypoints otherwise.

    Returns `drawing`, to allow chaining.
    """
    pose_coords = unflatten_pose_data(pose_prediction)

    for (start_no, end_no), line_color in zip(OPP_COCO_SKELETON, OPP_COCO_COLORS):
        # Skeleton definitions number the body parts from 1
        start = pose_coords[start_no - 1]
        end = pose_coords[end_no - 1]

        if start[2] == 0 or end[2] == 0:
            continue

        # Scale before truncating to an int, so that all four values are
        # rounded in the same way.
        shape = [
            (int(start[0] * UPSCALE), int(start[1] * UPSCALE)),
            (int(end[0] * UPSCALE), int(end[1] * UPSCALE)),
        ]
        drawing.line(shape, fill=line_color, width=2 * UPSCALE)

    # The bounding box is only needed for the outline and the label
    if not show_bbox and seqno is None:
        return drawing

    if "bbox" in pose_prediction:
        bbox = pose_prediction["bbox"]
    else:
        extent = get_pose_extent(pose_prediction)
        bbox = [extent[0], extent[1], extent[2] - extent[0], extent[3] - extent[1]]

    # A pose without any detected keypoints has a NaN extent, which cannot be drawn
    if any(np.isnan(value) for value in bbox):
        return drawing

    # bbox format for PifPaf is x0, y0, width, height
    # Also note that both PifPaf and PIL/ImageDraw place (0,0) at top left, not bottom left
    upper_left = (int(bbox[0] * UPSCALE), int(bbox[1] * UPSCALE))
    lower_right = (
        int((bbox[0] + bbox[2]) * UPSCALE),
        int((bbox[1] + bbox[3]) * UPSCALE),
    )

    if show_bbox:
        drawing.rectangle([upper_left, lower_right], outline="blue", width=1 * UPSCALE)

    if seqno is not None:
        drawing.text(
            upper_left, str(seqno + 1), font=label_font, align="right", fill="blue"
        )

    return drawing


def normalize_and_draw_pose(pose_prediction):
    """
    Render a single pose, normalized to the POSE_MAX_DIM extent, on a transparent
    square image and return that image at POSE_MAX_DIM x POSE_MAX_DIM pixels.
    """
    normalized_prediction = shift_normalize_rescale_pose_coords(pose_prediction)

    # Draw on an oversized canvas, then shrink it with Lanczos resampling
    # XXX Could also grab the background image and excerpt/scale it to match, if desired...
    canvas_side = POSE_MAX_DIM * UPSCALE
    canvas = Image.new("RGBA", (canvas_side, canvas_side))
    add_pose_to_drawing(normalized_prediction, ImageDraw.Draw(canvas))

    return canvas.resize(
        (POSE_MAX_DIM, POSE_MAX_DIM), resample=Image.Resampling.LANCZOS
    )


def image_from_video_frame(video_file, frameno):
    """
    Grabs the specified frame from the video and converts it into an RGBA array.

    `video_file` may be a str or a Path. Raises ValueError if the frame cannot
    be read (for example, when the file cannot be opened or `frameno` is past
    the end of the video).
    """
    cap = cv2.VideoCapture(str(video_file))
    try:
        cap.set(cv2.CAP_PROP_POS_FRAMES, frameno)
        ret, img = cap.read()
    finally:
        cap.release()

    if not ret or img is None:
        raise ValueError(f"Could not read frame {frameno} from {video_file}")

    # OpenCV decodes frames in BGR channel order
    rgba_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGBA)
    return np.asarray(rgba_img)


def draw_frame(frame, bg_img=None):
    """
    Draws the poses in the specified frame, superimposing them on the frame image, if provided.

    `frame` is one record of the posedata (its 'predictions' are drawn, each
    with its bounding box and a 1-based number). `bg_img` is an optional PIL
    image; a transparent background is used when it is omitted. Returns a new
    PIL image of video_width x video_height pixels.
    """
    # The only way to get smooth(er) lines in the pose armatures via PIL ImageDraw is to upscale the entire
    # image by some factor, draw the lines, then downscale back to the original resolution while applying
    # Lanczos resampling, because ImageDraw doesn't do any native anti-aliasing.
    upscaled_size = (video_width * UPSCALE, video_height * UPSCALE)
    if bg_img is None:
        bg_img = Image.new("RGBA", upscaled_size)
    else:
        bg_img = bg_img.resize(upscaled_size)

    drawing = ImageDraw.Draw(bg_img)

    for i, pose_prediction in enumerate(frame["predictions"]):
        add_pose_to_drawing(pose_prediction, drawing, i, show_bbox=True)

    return bg_img.resize(
        (video_width, video_height), resample=Image.Resampling.LANCZOS
    )


def pil_to_bokeh_image(pil_img, target_width, target_height):
    """
    Convert an RGBA image into the array layout that Bokeh's image_rgba() displays.

    `pil_img` must be a 4-channel (RGBA) PIL image, or an equivalent
    height x width x 4 array of 8-bit values. The rows are flipped vertically,
    and the four bytes of every pixel are packed into one uint32. Only the
    first `target_height` rows and `target_width` columns are copied; any
    remaining pixels are left fully transparent.

    Returns a 2D uint32 array with the same height and width as the image.
    """
    # Reverse the row order, so that the image's top row becomes the last row
    img_array = np.flipud(np.asarray(pil_img))

    img = np.zeros(img_array.shape[:2], dtype=np.uint32)
    # A byte-level view onto the same memory: writing the RGBA bytes through it
    # fills in the packed uint32 pixels.
    view = img.view(dtype=np.uint8).reshape(img_array.shape)

    view[:target_height, :target_width] = img_array[:target_height, :target_width]

    return img


def bkapp(doc):
    """Define and run the Bokeh interactive notebook (Python + Javascript) application"""

    max_y = max(pose_series["avg_coords_per_pose"] + pose_series["num_poses"])

    # This is the main interactive timeline chart
    tl = figure(
        width=FIGURE_WIDTH,
        height=FIGURE_HEIGHT,
        title=video_file.name,
        min_border=10,
        y_range=(0, max_y + 1),
        tools="save,box_zoom,pan,reset",
    )
    # Format the X axis as hour-minute-second timecodes
    tl.x_range = Range1d(min(pose_series["timestamp"]), max(pose_series["timestamp"]))
    tl.xaxis.axis_label = "Time"
    time_formatter = DatetimeTickFormatter(
        hourmin="%H:%M:%S",
        minutes="%H:%M:%S",
        minsec="%H:%M:%S",
        seconds="%Ss",
        milliseconds="%3Nms",
    )
    tl.line(
        pose_series["timestamp"],
        pose_series["num_poses"],
        legend_label="Poses per frame",
        color="blue",
        alpha=0.6,
        line_width=2,
    )
    tl.line(
        pose_series["timestamp"],
        pose_series["avg_coords_per_pose"],
        legend_label="Coords per pose",
        color="red",
        alpha=0.6,
        line_width=2,
    )
    # The left Y axis corresponds to counts of poses and coordinates
    tl.yaxis.axis_label = "Poses or Coords"
    tl.extra_y_ranges = {"avg_score": Range1d(0, 1)}
    tl.line(
        pose_series["timestamp"],
        pose_series["avg_score"],
        y_range_name="avg_score",
        legend_label="Avg pose score",
        color="green",
        alpha=0.4,
        line_width=2,
    )
    # The right Y axis corresponds to the average pose score (from 0 to 1)
    tl.add_layout(
        LinearAxis(y_range_name="avg_score", axis_label="Avg Pose Score"), "right"
    )
    tl.xaxis.formatter = time_formatter
    tl.xaxis.ticker.desired_num_ticks = 10
    tl.legend.click_policy = "hide"
    frame_line = Span(
        location=pose_series["timestamp"][0],
        dimension="height",
        line_color="red",
        line_width=3,
    )
    tl.add_layout(frame_line)

    def tl_tap(event):
        """When the chart is clicked, move the slider to the appropriate frame"""
        # event.x is a timestamp, so it needs to be converted to a frameno
        start_dt = datetime(1900, 1, 1)
        dt = datetime.utcfromtimestamp(event.x / 1000)
        t_delta = dt - start_dt
        clicked_frame = round(t_delta.total_seconds() * video_fps)
        slider.value = clicked_frame

    tl_tap_tool = TapTool()
    tl_crosshair_tool = CrosshairTool()

    def get_frame_info(fn):
        return f"Frame info: {pose_series['num_poses'][fn]} detected poses, {pose_series['avg_coords_per_pose'][fn]:.3f} avg coords/pose, {pose_series['avg_score'][fn]:.3f} avg pose score"

    info_div = Div(text=get_frame_info(0))

    tl.add_tools(tl_tap_tool, tl_crosshair_tool)
    tl.on_event("tap", tl_tap)

    # This is the second figure, where the poses in the selected frame are drawn
    fr = figure(
        x_range=(0, video_width),
        y_range=(0, video_height),
        width=FIGURE_WIDTH,
        height=int(FIGURE_WIDTH / video_width * video_height),
        title="Poses in selected frame",
        tools="save",
    )
    # Add an invisible glyph to suppress the "figure has no renderers" warning
    fr.circle(0, 0, size=0, alpha=0.0)

    pose_info_div = Div(text="Click to poses to compare")

    pose_p1 = figure(
        x_range=(0, POSE_MAX_DIM),
        y_range=(0, POSE_MAX_DIM),
        width=POSE_MAX_DIM * 2,
        height=POSE_MAX_DIM * 2,
        title="",
        tools="",
    )
    # Add an invisible glyph to suppress the "figure has no renderers" warning
    pose_p1.circle(0, 0, size=0, alpha=0.0)

    pose_p2 = figure(
        x_range=(0, POSE_MAX_DIM),
        y_range=(0, POSE_MAX_DIM),
        width=POSE_MAX_DIM * 2,
        height=POSE_MAX_DIM * 2,
        title="",
        tools="",
    )
    # Add an invisible glyph to suppress the "figure has no renderers" warning
    pose_p2.circle(0, 0, size=0, alpha=0.0)

    def background_toggle_handler(event):
        """When the image underlay is toggled on or off, prompt the slider to redraw the frame"""
        slider_callback(None, slider.value, slider.value)

    background_switch = Toggle(label="show background", active=False)
    background_switch.on_click(background_toggle_handler)

    def slider_callback(attr, old, new):
        """When the slider moves, draw the poses in the new frame and show the background if desired"""
        fr.renderers = []
        if background_switch.active:
            cap = cv2.VideoCapture(str(video_file))
            cap.set(1, new)
            ret, img = cap.read()
            rgb_bg = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            rgba_bg = cv2.cvtColor(rgb_bg, cv2.COLOR_RGB2RGBA)
            pil_bg = Image.fromarray(rgba_bg)
            frame_img = draw_frame(pose_data[new], pil_bg)
        else:
            frame_img = draw_frame(pose_data[new])
        img = pil_to_bokeh_image(frame_img, video_width, video_height)
        fr.image_rgba(image=[img], x=0, y=0, dw=img.shape[1], dh=img.shape[0])
        if old != new:
            info_div.text = get_frame_info(new)
            frame_line.location = pose_series["timestamp"][new]
            pose_p1.title.text = ""
            pose_p1.renderers = []
            pose_p2.title.text = ""
            pose_p2.renderers = []
            pose_info_div.text = "Click two poses to compare"

    slider = Slider(
        start=0, end=len(pose_data) - 1, value=0, step=1, title="Selected frame"
    )
    slider.on_change("value", slider_callback)

    def get_pose_extent_maps(frameno):
        pose_extent_maps = []
        for i, pose_prediction in enumerate(pose_data[frameno]["predictions"]):

            if "bbox" in pose_prediction:
                bbox = pose_prediction["bbox"]
            else:
                extent = get_pose_extent(pose_prediction)
                bbox = [
                    extent[0],
                    extent[1],
                    extent[2] - extent[0],
                    extent[3] - extent[1],
                ]

            extent_map = {
                "poseno": i,
                "min_x": bbox[0],
                "min_y": video_height - bbox[3] - bbox[1],
                "max_x": bbox[0] + bbox[2],
                "max_y": video_height - bbox[1],
            }

            pose_extent_maps.append(extent_map)

        return pose_extent_maps

    def match_pose_pixel_maps(x, y, pose_extent_maps):
        """
        Return the "poseno" of every extent map whose box contains point (x, y).

        Box edges count as inside, and the result keeps the order of
        pose_extent_maps.
        """
        return [
            box["poseno"]
            for box in pose_extent_maps
            if box["min_x"] <= x <= box["max_x"] and box["min_y"] <= y <= box["max_y"]
        ]

    def fr_tap(event):
        """
        Handle a click on the frame image by selecting the pose under the cursor.

        The first pose whose bounding region contains the click point is drawn
        into the first empty comparison box (pose_p1, then pose_p2). Once the
        second box is filled, the two poses are passed to compare_poses_cosine
        and compare_poses_angles and the results are written to pose_info_div.
        Clicks that hit no pose, or that arrive while both boxes are already
        filled, are ignored.

        Args:
            event: tap event of the frame figure; its x and y attributes are
                matched against the pose extents of the currently selected frame.
        """
        pose_extent_maps = get_pose_extent_maps(slider.value)
        clicked_poses = match_pose_pixel_maps(event.x, event.y, pose_extent_maps)
        if not clicked_poses:
            return

        # An empty title marks a comparison box as free.
        if pose_p1.title.text == "":
            target_box = pose_p1
        elif pose_p2.title.text == "":
            target_box = pose_p2
        else:
            # Both boxes are occupied; skip rendering an image nobody will see.
            return

        frame_predictions = pose_data[slider.value]["predictions"]
        # Several overlapping poses may match the click; only the first is used.
        clicked_poseno = clicked_poses[0]

        pose_img = pil_to_bokeh_image(
            normalize_and_draw_pose(frame_predictions[clicked_poseno]),
            POSE_MAX_DIM,
            POSE_MAX_DIM,
        )
        target_box.image_rgba(
            image=[pose_img],
            x=0,
            y=0,
            dw=pose_img.shape[1],
            dh=pose_img.shape[0],
        )
        # The title stores the 1-based pose number; it is parsed back into an
        # index whenever the selected pose is needed again.
        target_box.title = f"{clicked_poseno+1}"

        if target_box is pose_p1:
            pose_info_div.text = "Please click another pose for comparison"
            return

        normalized_p1 = shift_normalize_rescale_pose_coords(
            frame_predictions[int(pose_p1.title.text) - 1]
        )
        normalized_p2 = shift_normalize_rescale_pose_coords(
            frame_predictions[int(pose_p2.title.text) - 1]
        )

        # NOTE(review): these values are named "*_distance" but are reported to
        # the user as similarities; confirm which one the helpers return.
        cosine_distance = compare_poses_cosine(normalized_p1, normalized_p2)
        p1_angles = compute_joint_angles(normalized_p1)
        p2_angles = compute_joint_angles(normalized_p2)
        angle_distance = compare_poses_angles(p1_angles, p2_angles)
        pose_info_div.text = f"Cosine similarity between pose keypoints: {(cosine_distance*100):3.3f}% | Similarity between pose joint angles: {(angle_distance*100):3.3f}%"

    # Add a tap tool to the frame figure and route the figure's "tap" events to
    # fr_tap, so that clicking on the frame image selects a pose.
    fr_tap_tool = TapTool()

    fr.add_tools(fr_tap_tool)
    fr.on_event("tap", fr_tap)

    def prev_handler(event):
        """Move the frame slider back by one, never going below frame 0."""
        previous_frameno = slider.value - 1
        slider.value = previous_frameno if previous_frameno > 0 else 0

    def next_handler(event):
        """Move the frame slider forward by one, never going past the last frame."""
        following_frameno = slider.value + 1
        last_frameno = len(pose_data) - 1
        if following_frameno < last_frameno:
            slider.value = following_frameno
        else:
            slider.value = last_frameno

    # "prev" / "next" buttons step the frame slider by one frame via the
    # handlers defined above.
    prev_button = Button(label="prev")
    prev_button.on_click(prev_handler)
    next_button = Button(label="next")
    next_button.on_click(next_handler)

    # Header lines for the two similarity-search result rows.
    L2_search_info_div = Div(text="L2 (Euclidean distance) similar pose search")
    IP_search_info_div = Div(text="IP (cosine distance) similar pose search")

    # Number of matches shown per search, i.e. result boxes per row.
    SIMILAR_POSES_TO_FIND = 4

    # One empty, tool-less figure per L2 result slot.
    similar_L2_poses = [
        figure(
            x_range=(0, POSE_MAX_DIM),
            y_range=(0, POSE_MAX_DIM),
            width=POSE_MAX_DIM * 2,
            height=POSE_MAX_DIM * 2,
            title="",
            tools="",
        )
        for _ in range(SIMILAR_POSES_TO_FIND)
    ]
    # Each box gets a zero-size, fully transparent glyph (presumably so the
    # otherwise empty figure still has a renderer -- TODO confirm).
    for pose_box in similar_L2_poses:
        pose_box.circle(0, 0, size=0, alpha=0.0)

    # The same again for the IP result slots.
    similar_IP_poses = [
        figure(
            x_range=(0, POSE_MAX_DIM),
            y_range=(0, POSE_MAX_DIM),
            width=POSE_MAX_DIM * 2,
            height=POSE_MAX_DIM * 2,
            title="",
            tools="",
        )
        for _ in range(SIMILAR_POSES_TO_FIND)
    ]
    for pose_box in similar_IP_poses:
        pose_box.circle(0, 0, size=0, alpha=0.0)

    def get_similar_poses_handler(event):
        """
        Search both FAISS indexes for poses similar to the pose shown in pose_p1
        and draw the matches into the two result rows.

        NOTE that although the FAISS search should always return the query pose as the
        first search result, we don't want to include it in the displayed results,
        which is why one extra result is requested and the first result is skipped.

        Does nothing when no pose has been selected into pose_p1.
        """
        if pose_p1.title.text == "":
            return

        for pose_box in similar_L2_poses + similar_IP_poses:
            pose_box.renderers = []

        def _draw_matches(distances, indices, pose_boxes):
            """Draw the matches after the first into pose_boxes; return (framenos, scores) as strings."""
            framenos = []
            scores = []
            # Result 0 is expected to be the query pose itself, so results
            # 1..N are paired with the N display boxes.
            for pose_box, match_index, distance in zip(
                pose_boxes, indices[0][1:], distances[0][1:]
            ):
                # An index of -1 is treated as "no match for this slot"
                # (presumably FAISS's filler when fewer results exist -- verify).
                if match_index == -1:
                    continue

                match_metadata = normalized_pose_metadata[match_index]
                match_frameno = match_metadata["frameno"]
                match_poseno = match_metadata["poseno"]
                framenos.append(str(match_frameno))
                scores.append(str(distance))

                match_img = pil_to_bokeh_image(
                    normalize_and_draw_pose(
                        pose_data[match_frameno]["predictions"][match_poseno]
                    ),
                    POSE_MAX_DIM,
                    POSE_MAX_DIM,
                )
                pose_box.image_rgba(
                    image=[match_img],
                    x=0,
                    y=0,
                    dw=match_img.shape[1],
                    dh=match_img.shape[0],
                )
            return framenos, scores

        # The title of pose_p1 holds the selected pose's 1-based number.
        target_pose = extract_trustworthy_coords(
            shift_normalize_rescale_pose_coords(
                pose_data[slider.value]["predictions"][int(pose_p1.title.text) - 1]
            )
        )
        # NaN entries are replaced with -1 before the vector is used as a query.
        pose_query = np.array([np.nan_to_num(target_pose, nan=-1)]).astype("float32")

        # NOTE(review): both indexes are queried with the same NaN-replaced
        # vector. A second, un-replaced variant meant for the IP index used to be
        # built here but was never passed to a search; confirm which input
        # faiss_IP_index expects before changing this.
        n_results = SIMILAR_POSES_TO_FIND + 1
        L2_D, L2_I = faiss_L2_index.search(pose_query, n_results)
        IP_D, IP_I = faiss_IP_index.search(pose_query, n_results)

        L2_match_framenos, L2_match_scores = _draw_matches(
            L2_D, L2_I, similar_L2_poses
        )
        IP_match_framenos, IP_match_scores = _draw_matches(
            IP_D, IP_I, similar_IP_poses
        )

        L2_search_info_div.text = f"L2 matches in frames {', '.join(L2_match_framenos)} | scores {', '.join(L2_match_scores)}"
        IP_search_info_div.text = f"IP matches in frames {', '.join(IP_match_framenos)} | scores {', '.join(IP_match_scores)}"

    def reset_subposes_handler(event):
        """
        Clear the two comparison boxes and every similarity-search result.

        Titles and renderers of pose_p1 / pose_p2 are emptied, the comparison
        info line is blanked, all result boxes lose their renderers, and the two
        search header lines go back to their initial captions.
        """
        for comparison_box in (pose_p1, pose_p2):
            comparison_box.title.text = ""
            comparison_box.renderers = []
        pose_info_div.text = ""

        for result_boxes in (similar_L2_poses, similar_IP_poses):
            for result_box in result_boxes:
                result_box.renderers = []

        L2_search_info_div.text = "L2 (Euclidean distance) similar pose search"
        IP_search_info_div.text = "IP (cosine distance) similar pose search"

    # Buttons that act on the selected poses.
    reset_subposes_button = Button(label="clear")
    reset_subposes_button.on_click(reset_subposes_handler)

    get_similar_poses_button = Button(label="look up 1st pose")
    get_similar_poses_button.on_click(get_similar_poses_handler)

    # Assemble the rows of the page.
    control_row = row(prev_button, next_button, background_switch)
    pose_buttons_column = column(
        children=[reset_subposes_button, get_similar_poses_button]
    )
    subposes_row = row(pose_p1, pose_p2, pose_buttons_column)
    similar_L2_poses_row = row(children=similar_L2_poses)
    similar_IP_poses_row = row(children=similar_IP_poses)

    # Stack everything top to bottom and hand the result to the document.
    layout_column = column(
        children=[
            tl,
            slider,
            info_div,
            control_row,
            fr,
            pose_info_div,
            subposes_row,
            L2_search_info_div,
            similar_L2_poses_row,
            IP_search_info_div,
            similar_IP_poses_row,
        ]
    )

    doc.add_root(layout_column)


# Set up Bokeh output for display inside the notebook.
output_notebook()

# Launch bkapp as an embedded application. NOTE(review): notebook_url is
# hard-coded; presumably it has to match the host:port this notebook is served
# from, so adjust it if yours is not localhost:8889.
show(bkapp, notebook_url="localhost:8889")


**Running the notebook in VS Code:** As of late 2022, if you are running this notebook in VS Code instead of Jupyter or JupyterLab, the above cell will not work (BokehJS will load, but no figures will appear) without the following workaround:

Take note of the error message that appears when you try to run the cell above, particularly the long alphanumeric string suggested as a value for `BOKEH_ALLOW_WS_ORIGIN`. Copy this string, then uncomment the last two lines in the cell below (if they are not already uncommented), paste the alphanumeric string in place of the `INSERT_BOKEH_ALLOW_WS_ORIGIN_VALUE_HERE` text (or in place of whatever value is currently assigned there), run the cell, then try running the cell above to launch the explorer app again. It should work now.

In [None]:
# If you are following the steps above to run the explorer app in VS Code,
# uncomment the following lines (remove the '#'s) before running this cell:
# NOTE(review): the two lines below are currently already uncommented, and the
# assigned value looks like it was pasted from one particular session --
# replace it with the string from your own error message before running.
import os
os.environ["BOKEH_ALLOW_WS_ORIGIN"] = "0jubpfudr7ckf8qfh6dong6lr67pqrvbr5ugu8db8kcm4g6se70e"

### Demo of frame-by-frame pose drawing

The cell below uses a different viz library to draw the poses in each successive frame on an HTML canvas, at the same frame rate as the source video.

Note that this drawing library (`ipycanvas`) doesn't play well with the Bokeh interactive application above, which is why the somewhat clunkier PIL ImageDraw library is used to draw the poses there instead.

In [None]:
from ipycanvas import Canvas, hold_canvas

# Drawing surface sized to the video's width and height, shown in the cell output.
canvas = Canvas(width=video_width, height=video_height, sync_image_data=True)

display(canvas)


def draw_frame_on_canvas(frame, canvas):
    """Draw the skeleton of every pose predicted for one frame onto a canvas.

    Args:
        frame: dict with a "predictions" list. Each prediction holds a flat
            "keypoints" sequence that is read as consecutive triples: the
            first two values are used as x/y line coordinates, and a segment
            is skipped when the third value of either endpoint is 0.
        canvas: drawing surface providing stroke_style, line_width and
            stroke_line().

    Predictions with fewer than three keypoint values are skipped.
    """
    for pose_prediction in frame["predictions"]:
        keypoints = pose_prediction["keypoints"]

        # Integer division: the section count must be a whole number, and
        # np.array_split raises ValueError for a count of 0.
        keypoint_count = len(keypoints) // 3
        if keypoint_count == 0:
            continue

        pose_coords = np.array_split(keypoints, keypoint_count)

        for i, seg in enumerate(OPP_COCO_SKELETON):
            # Skeleton segments refer to keypoints by 1-based number.
            start_point = pose_coords[seg[0] - 1]
            end_point = pose_coords[seg[1] - 1]

            if start_point[2] == 0 or end_point[2] == 0:
                continue

            canvas.stroke_style = OPP_COCO_COLORS[i]
            canvas.line_width = 2

            canvas.stroke_line(
                start_point[0],
                start_point[1],
                end_point[0],
                end_point[1],
            )


# This will "animate" all of the detected poses starting from the beginning of the video
for frame in pose_data:

    # NOTE(review): the clear, the drawing and the sleep all happen inside the
    # hold_canvas() block; presumably the frame's drawing commands are sent to
    # the canvas together when the block exits -- verify against ipycanvas docs.
    with hold_canvas():

        canvas.clear()

        draw_frame_on_canvas(frame, canvas)

        # Wait one frame interval so playback runs at the source video's frame rate.
        sleep(1 / video_fps)
