# 20 Billion Something-Something

Script for processing the 20bn dataset.

In [1]:
%load_ext autoreload
%autoreload 2

## Display video grid

In [2]:
# !pip install mediapipe==0.10.0

In [3]:
# !pip install numpy h5py hdf5plugin Pillow tqdm pandas av seaborn ipywidgets opencv-python 'mediapipe==0.10.0'
# !pip install torch torchvision tensorboard
# !pip install 

In [4]:
import pathlib
import typing

import sys
sys.path.append("..")

from gpred import video_utils
from env import twentybn

def display_video_grid(
    labels: twentybn.dataset.Labels,
    action_instances: typing.List[typing.List[int]],
    path: pathlib.Path,
    num_rows: int = 5
):
    """Displays 3 x N grid of videos.
    
    Args:
        labels: 20BN labels.
        action_instances: List of video ids per action.
        path: Path of videos.
        num_rows: Number of rows to display per batch.
    """
    from IPython.display import clear_output
    import ipywidgets as widgets
    
    next_button = widgets.Button(description="Next")
    
    def assign_button_handler(id_action: int):
        """Assigns click handler to 'Next' button."""
        
        SIZE_BATCH = 3 * num_rows
        num_examples = len(action_instances[id_action])
        idx_example_start = 0
        
        def show_next_video_callback(b: widgets.Button):
            """Called on button click to display next video grid."""
            nonlocal idx_example_start
            with output:
                clear_output()

                print(f"\n{labels.actions[id_action].template}")
                print(f"Examples {idx_example_start}..{idx_example_start + SIZE_BATCH - 1} out of {num_examples}\n")

                idx_examples = range(idx_example_start, idx_example_start + SIZE_BATCH)
                id_videos = [action_instances[id_action][idx_example] for idx_example in idx_examples]

                video_utils.display_video_grid(id_videos, path, labels=[labels.videos[id_video].action_name for id_video in id_videos])

                idx_example_start += SIZE_BATCH
        
        next_button._click_handlers.callbacks = []
        next_button.on_click(show_next_video_callback)

    input_action = widgets.BoundedIntText(value=0, min=0, max=len(labels.actions), description="Action index:")
    output = widgets.interactive_output(assign_button_handler, {"id_action": input_action})

    return widgets.VBox([widgets.HBox([input_action, next_button]), output])

# Generate labels

## Load datasets

### 20BN Something Something

In [5]:
!ls /Something2

SomethingElse  data  labels  videos


In [6]:
!ln -s /Something2/labels ../data/twentybn/labels
!ln -s /Something2/videos ../data/twentybn/videos
!ln -s /Something2/SomethingElse ../data/twentybn/SomethingElse

ln: failed to create symbolic link '../data/twentybn/labels/labels': File exists
ln: failed to create symbolic link '../data/twentybn/videos/videos': File exists
ln: failed to create symbolic link '../data/twentybn/SomethingElse/SomethingElse': File exists


In [7]:
ls ../data/twentybn

[0m[38;5;51mSomethingElse[0m@  hands.pkl  labels.hdf5    val_set.pkl
download.txt    [38;5;51mlabels[0m@    train_set.pkl  [38;5;51mvideos[0m@


In [8]:
import json

import config

paths = config.EnvironmentPaths(environment="twentybn")

print(paths.data)

"""
sth_sth_labels = {
    "{id_action}": "Holding something next to something"
}
"""
with open(paths.data / "labels/labels.json", "r") as f:
    sth_sth_labels = json.load(f)

"""
sth_sth = [
    {
        "id": "78687",
        "label": "holding potato next to vicks vaporub bottle",
        "template": "Holding [something] next to [something]",
        "placeholders": ["potato", "vicks vaporub bottle"],
    }
]
"""
with open(paths.data / "labels/train.json", "r") as f:
    sth_sth_train = json.load(f)

with open(paths.data / "labels/validation.json", "r") as f:
    sth_sth_val = json.load(f)

../data/twentybn


### Something Else

In [9]:
# import tqdm

# """
# sth_else = {
#     "{id}": [
#         {
#             "name": "{id}/####.jpg",
#             "labels": [
#                 {
#                     "box2d": {
#                         "x1": float,
#                         "x2": float,
#                         "y1": float,
#                         "y2": float,
#                     },
#                     "category": "battery",
#                     "gt_annotation": "object 0",
#                     "standard_category": "0000",
#                 }
#             ],
#             "gt_placeholders": ["battery"],
#             "nr_instances": 1},
#         }
#     ]
# }
# """
# sth_else = {}
# for i in range(4):
#     with open(paths.data / f"something_else/bounding_box_smthsmth_part{i+1}.json", "r") as f:
#         for key, frames in json.load(f).items():
#             sth_else[key] = frames

import os
import tqdm
from tqdm import notebook
"""
sth_else = {
    "{id}": [
        {
            "name": "{id}/####.jpg",
            "labels": [
                {
                    "box2d": {
                        "x1": float,
                        "x2": float,
                        "y1": float,
                        "y2": float,
                    },
                    "category": "battery",
                    "gt_annotation": "object 0",
                    "standard_category": "0000",
                }
            ],
            "gt_placeholders": ["battery"],
            "nr_instances": 1},
        }
    ]
}
"""
sth_else = {}
for i in tqdm.notebook.tqdm(range(4)):
    with open(os.path.join(paths.data,f'SomethingElse/bounding_box_smthsmth_part{i+1}.json'), "r") as f:
        for key, frames in json.load(f).items():
            sth_else[key] = frames

  0%|          | 0/4 [00:00<?, ?it/s]

## Reformat labels

In [10]:
import re

# Create template => idx_action map.
idx_actions = {}
for sth_sth_label in sth_sth_val:
    fine_label = sth_sth_label["label"]
    template = sth_sth_label["template"]
    
    coarse_label = re.sub("[\[\]]", "", template)
    idx_action = int(sth_sth_labels[coarse_label])
    idx_actions[template] = idx_action

# Create action labels.
"""
action_labels = [
    {
        "label": "Approaching something with your camera",
        "template": "Approaching [something] with your camera",
    }
]
"""
action_labels = [None] * len(sth_sth_labels)
for sth_sth_label in sth_sth_train:
    if not None in action_labels:
        break
    
    template = sth_sth_label["template"]
    coarse_label = re.sub("[\[\]]", "", template)
    idx_action = idx_actions[template]
    
    action_labels[idx_action] = {
        "label": coarse_label,
        "template": template
    }

# Create video labels.
def process_labels(sth_sth_set, sth_else, idx_actions, video_labels):
    labels = []
    for sth_sth_label in tqdm.tqdm(sth_sth_set):
        id_video = int(sth_sth_label["id"])
        if not str(id_video) in sth_else:
            continue

        template = sth_sth_label["template"]
        id_action = idx_actions[template]
        placeholders = sth_sth_label["placeholders"]

        sth_else_label = sth_else[str(id_video)]
        objects = sth_else_label[0]["gt_placeholders"]

        frames = {}
        for sth_else_frame in sth_else_label:
            idx_frame = int(re.match(r"\d+/(\d+)\.jpg", sth_else_frame["name"])[1]) - 1
            boxes = {}
            for sth_else_box in sth_else_frame["labels"]:
                idx_obj = sth_else_box["standard_category"]
                if idx_obj != "hand":
                    # Simplify integer. JSON key value still need to be strings.
                    idx_obj = str(int(idx_obj))

                box = sth_else_box["box2d"]
                boxes[idx_obj] = [[box["x1"], box["y1"]], [box["x2"], box["y2"]]]
            
            frames[idx_frame] = boxes

        labels.append(id_video)
        video_labels[id_video] = {
            "id_action": id_action,
            "placeholders": placeholders,
            "objects": objects,
            "frames": frames,
        }
    return labels

"""
video_labels = {
    {id_video}: {
        "id_action": id_action,
        "placeholders": ["a potato", "a vicks vaporub bottle"],
        "objects": ["potato", "bottle"],
        "frames": {
            idx_frame: {
                "{id_object/hand}": [[x1, y1], [x2, y2]],
            },
        },
    },
}
train_set = [{video_id}, ...]
val_set = [{video_id}, ...]
"""
unsorted_video_labels = {}
train_set = process_labels(sth_sth_train, sth_else, idx_actions, unsorted_video_labels)
val_set = process_labels(sth_sth_val, sth_else, idx_actions, unsorted_video_labels)

video_labels = {}
for key in sorted(unsorted_video_labels.keys()):
    video_labels[key] = unsorted_video_labels[key]

# Create action instances map.
"""
action_instances = [
    [{id_video}, ...]
]
"""
action_instances = [[] for _ in range(len(action_labels))]
for id_video, video_label in video_labels.items():
    id_action = video_label["id_action"]
    action_instances[id_action].append(id_video)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 168913/168913 [01:17<00:00, 2182.60it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24777/24777 [00:26<00:00, 931.13it/s]


## Save labels

In [11]:
import pickle

# with open(paths.data / "action_labels.pkl", "wb") as f:
#     pickle.dump(action_labels, f)
# with open(paths.data / "action_instances.pkl", "wb") as f:
#     pickle.dump(action_instances, f)
# with open(paths.data / "video_labels.pkl", "wb") as f:
#     pickle.dump(video_labels, f)

with open(paths.data / "train_set.pkl", "wb") as f:
    pickle.dump(train_set, f)
with open(paths.data / "val_set.pkl", "wb") as f:
    pickle.dump(val_set, f)

In [12]:
import typing

import h5py
import numpy as np
import tqdm

from env import twentybn
from gpred import video_utils

def create_video_labels_dataset(video_labels: typing.Dict, action_labels: typing.List, action_instances: typing.List):
    """Stores the Something-Else labels in h5py format.

    h5py = {
        "actions": {
            "id_action": {
                attrs: {
                    "id_action": int,
                    "template": utf8,
                },
                "videos": [V] (num_videos) uint32,
            }
        },
        "videos": {
            "id_video": {
                attrs: {
                    "id_video": int,
                    "id_action": int,
                },
                "objects": [O] (num_objects) utf8,
                "keyframes": [T] (num_keyframes) uint32,
                "boxes": [T, 1 + O, 4] (num_keyframes, hand/num_objects, x1/y1/x2/y2) float32,
            }
        }
    }
    
    Args:
        video_labels: Something-Else labels.
    """
    if os.path.isfile(paths.data / "labels.hdf5"):
        print(paths.data / "labels.hdf5", "exists")
        return
    with h5py.File(paths.data / "labels.hdf5", "w") as f:
        # Prepare action labels.
        dset_actions = f.create_group("actions")
        A = len(action_labels)
        for id_action in range(A):
            grp = dset_actions.create_group(str(id_action))
            grp.attrs.create("id_action", id_action, dtype=np.uint32)
            grp.attrs["template"] = action_labels[id_action]["template"]
            grp.create_dataset("videos", data=np.array(action_instances[id_action], dtype=np.uint32))
        
        # Prepare video labels.
        dset_videos = f.create_group("videos")
        for id_video in tqdm.tqdm(video_labels):
            label = video_labels[id_video]
            grp = dset_videos.create_group(str(id_video))
            grp.attrs.create("id_video", id_video, dtype=np.uint32)
            grp.attrs.create("id_action", label["id_action"], dtype=np.uint32)
            
            O = len(label["objects"])
            grp.attrs.create("objects", label["objects"], shape=(O,), dtype=h5py.string_dtype(encoding="utf-8"))
            
            # Get keyframes from actual video.
            keyframes_video = video_utils.get_keyframes(paths.data / f"videos/{id_video}.webm")
            keyframes = []
            boxes = []
            for keyframe in label["frames"]:
                if not keyframe in keyframes_video:
                    continue
                keyframes.append(keyframe)
                boxes_t = np.full((1 + O, 4), -float("inf"), dtype=np.float32)
                for obj, bbox in label["frames"][keyframe].items():
                    idx_obj = twentybn.utils.object_id_to_idx(obj)
                    boxes_t[idx_obj] = np.array(bbox, dtype=np.float32).flatten()
                boxes.append(boxes_t)
            
            grp.create_dataset("keyframes", data=keyframes, dtype=np.uint32)
            grp.create_dataset("boxes", data=boxes, shape=(len(boxes), 1 + O, 4), dtype=np.float32)
        
        f.create_dataset("video_ids", data=list(video_labels.keys()), dtype=np.uint32)

create_video_labels_dataset(video_labels, action_labels, action_instances)

../data/twentybn/labels.hdf5 exists


In [13]:
# hand detector

# hand_detector.Hand
# hand.palm()
# hand.fingertips()

# Extract pre and post frames

## Functions

In [32]:
import typing

# import sys
# sys.path.append("/scratch/data/repos/grounding-predicates")

import numpy as np
import symbolic

from apps import hand_detector
from env import twentybn
from gpred import video_utils, dnf_utils
import config

def point_inside_rectangle(box: np.ndarray, points: np.ndarray) -> typing.Union[bool, np.ndarray]:
    """Checks whether the points fall inside the rectangle.

    Args:
        box: [4] (x1/y1/x2/y2) corners.
        points: [N, 2] or [2] (x/y).
    Returns:
        Boolean if one point is given, array of booleans [N] otherwise.
    """
    # One point.
    if points.shape == (2,):
        return box[0] <= points[0] and points[0] <= box[2] and box[1] <= points[1] and points[1] <= box[3]
    return (box[0] <= points[:,0]) & (points[:,0] <= box[2]) & (box[1] <= points[:,1]) & (points[:,1] <= box[3])

def box_circle_collision(box: np.ndarray, circle: typing.Tuple[np.ndarray, float]) -> bool:
    """Checks whether the box and circle collide.
    
    Args:
        box: [4] (x1/y1/x2/y2) corners.
        circle: ([2] (x/y) center, radius).
    Returns:
        True if the shapes collide.
    """    
    def line_segment_circle_collision(line_segment: np.ndarray, circle: typing.Tuple[np.ndarray, float]) -> bool:
        """Checks whether the line segment and circle collide.
        
        Args:
            line_segment: [4] (x1/y1/x2/y2) endpoints.
            circle: ([2] (x/y) center, radius).
        Returns:
            True if the shapes collide.
        """
        # [[x1, y1], [x2, y2]]
        endpoints = np.reshape(line_segment, (2, 2))
        #print("endpoints:", endpoints)
        
        # [cx, cy]
        center, radius = circle
        #print("center:", center)
        r2 = radius * radius
        #print("radius2:", r2)
        
        # [[x1 - cx, y1 - cy], [x2 - cx, y2 - cy]]
        dc = center[None, :] - endpoints
        dd = np.sum(dc * dc, axis=1)
        idx_min = np.argmin(dd)
        #print("min:", idx_min, dd)
        
        # Check if closer endpoint is within radius.
        if dd[idx_min] < r2:
            return True
        
        # [x, y]
        origin = endpoints[idx_min]
        v_line = endpoints[1 - idx_min] - origin
        v_line /= np.linalg.norm(v_line)
        v_circle = center - origin
        
        # Check if projection of circle onto line falls outside the segment.
        d_circle_line = v_line.dot(v_circle)
        if d_circle_line < 0:
            #print(":", d_circle_line)
            return False
        
        # Orthogonal distance between circle and line.
        d_circle = v_circle - d_circle_line * v_line
        #print("::", d_circle.dot(d_circle), r2)
        return d_circle.dot(d_circle) < r2
    
    x1, y1, x2, y2 = box
    return (
        point_inside_rectangle(box, circle[0]) or
        line_segment_circle_collision(np.array([x1, y1, x1, y2]), circle) or
        line_segment_circle_collision(np.array([x2, y1, x2, y2]), circle) or
        line_segment_circle_collision(np.array([x1, y1, x2, y1]), circle) or
        line_segment_circle_collision(np.array([x1, y2, x2, y2]), circle)
    )

def box_box_collision(box_a: np.ndarray, box_b: np.ndarray) -> bool:
    """Checks whether the two boxes collide.
    
    Args:
        box_a: [4] (x1/y1/x2/y2) corners.
        box_b: [4] (x1/y1/x2/y2) corners.
    Return:
        Whether the two boxes collide.
    """
    minkowski_0 = box_a[:2] - box_b[2:]
    minkowski_1 = box_a[2:] - box_b[:2]
    return (np.sign(minkowski_0) != np.sign(minkowski_1)).all()

def box_hand_collision(box: np.ndarray, hand: hand_detector.Hand, radius: float = 15) -> bool:
    """Checks whether the box overlaps with any of the fingertips.
    
    Args:
        box: [4] (x1/y1/x2/y2) corners.
        hand: Detected hand.
        radius: Distance from fingertips.
    Returns:
        Whether the box overlaps with any of the fingertips.
    """
    for fingertip in hand.fingertips():
        if box_circle_collision(box, (fingertip, radius)):
            return True
    return False

def identify_contained_hand(box: np.ndarray, detected_hands: typing.List[hand_detector.Hand]) -> typing.Optional[hand_detector.Hand]:
    """Identifies which hand is contained inside the bounding box.
    
    Args:
        box: [4] (x1/y1/x2/y2) corners.
        detected_hands: Detected hands output by `hand_detector.HandDetector`.
    Returns:
        Hand corresponding to the one in the bounding box if any.
    """
    is_contained = np.zeros((len(detected_hands),), dtype=int)
    for i, hand in enumerate(detected_hands):
        points = np.concatenate((hand.palm(), hand.fingertips()), axis=0)
        is_contained[i] = point_inside_rectangle(box, points).sum()
    
    if is_contained.sum() == 0:
        return None
    
    idx_max = is_contained.argmax()
    return detected_hands[idx_max]

class PropositionTestFailure(Exception):
    def __init__(self, message):
        self.message = message

class PropositionUnknown(Exception):
    def __init__(self, message):
        self.message = message

def is_sth_visible(boxes: np.ndarray, idx_object: int, expected: typing.Optional[bool] = None) -> bool:
    """Checks whether the specified object is visible.
    
    Raises a PropositionTestFailure if the expected result is specified and does not match the test result.
    
    Args:
        boxes: [4, 4] (hand/a/b/c, x1/y1/x2/y2) box corners.
        idx_object: Object index (0/1/2/3 for "hand"/"a"/"b"/"c").
        expected: Expected result.
    """
    result = idx_object < boxes.shape[0] and boxes[idx_object, 0] >= 0
    
    if expected is not None and result != expected:
        raise PropositionTestFailure(f"visible({idx_object}) != {expected}")
    
    return result

def is_sth_touching_hand(
    boxes: np.ndarray,
    idx_object: int,
    hand: typing.Optional[hand_detector.Hand],
    expected: typing.Optional[bool] = None
) -> bool:
    """Checks whether the specified object is touching the hand.
    
    Raises a PropositionTestFailure if the expected result is specified and does not match the test result.
    
    Args:
        boxes: [4, 4] (hand/a/b/c, x1/y1/x2/y2) box corners.
        idx_object: Object index (0/1/2/3 for "hand"/"a"/"b"/"c").
        hand: Detected hand, if it exists.
        expected: Expected result.
    """
    if idx_object >= boxes.shape[0] or boxes[0, 0] < 0 or boxes[idx_object, 0] < 0:
        result = False
    elif hand is None:
        raise PropositionUnknown(f"touching({'abc'[idx_object-1]}, hand): No hand detected.")
    else:
        box_hand = boxes[0, :]
        box_obj = boxes[idx_object, :]

        # If boxes don't collide, then they are not touching.
        result = box_box_collision(box_hand, box_obj) and box_hand_collision(box_obj, hand)
    
    if expected is not None and result != expected:
        raise PropositionTestFailure(f"touching({'abc'[idx_object-1]}, hand) != {expected}")
    
    return result

def is_sth_touching_sth(boxes: np.ndarray, idx_object_a: int, idx_object_b, expected: typing.Optional[bool] = None) -> bool:
    """Checks whether one object is touching another.
    
    Returns false if the objects are not overlapping, otherwise raises a PropositionUnknown.
    Raises a PropositionTestFailure if the expected result is specified and does not match the test result.
    
    Args:
        boxes: [4, 4] (hand/a/b/c, x1/y1/x2/y2) box corners.
        idx_object_a: Object index (0/1/2/3 for "hand"/"a"/"b"/"c").
        idx_object_b: Object index (0/1/2/3 for "hand"/"a"/"b"/"c").
        expected: Expected result.
    """
    if max(idx_object_a, idx_object_b) >= boxes.shape[0] or boxes[idx_object_a, 0] < 0 or boxes[idx_object_b, 0] < 0:
        raise PropositionUnknown(f"touching({'abc'[idx_object_a-1]}, {'abc'[idx_object_b-1]}): Missing object.")
    
    box_a = boxes[idx_object_a, :]
    box_b = boxes[idx_object_b, :]
    # If boxes don't collide, then they are not touching.
    result = box_box_collision(box_a, box_b)
    if result:
        raise PropositionUnknown(f"touching({'abc'[idx_object_a-1]}, {'abc'[idx_object_b-1]}): Unable to determine from overlapping boxes.")
    
    if expected is not None and result != expected:
        raise PropositionTestFailure(f"touching({'abc'[idx_object_a-1]}, {'abc'[idx_object_b-1]}) != {expected}")
    
    return result

def generate_tests(
    pddl: symbolic.Pddl,
    s_partial: typing.Optional[np.ndarray] = None
) -> typing.List[typing.Tuple[int, typing.Callable[[typing.Dict, bool], bool]]]:
    """Generate tests for detecting start/end frames.
    
    The first test evaluates whether 'a' is touching the hand by testing whether their bounding boxes overlap.
    The remaining tests evaluates whether all the objects are visible.
    Only the tests specifically required by the action pre/post-conditions should be run.
    
    Args:
        pddl: Pddl instance.
        s_partial: [A, 2, 2, N] (action, pre/post, pos/neg, state) Partial states for all actions. If provided, this function will test which actions are not covered by the tests.
    Returns:
        List of (idx_prop, lambda boxes: bool) tuples where bounding boxes should be passed into the lambda to evaluate the test condition.
    """
    idx_props_visible = [pddl.state_index.get_proposition_index(f"visible({obj})") for obj in ["a", "b", "c", "hand"]]
    idx_props_touching = [pddl.state_index.get_proposition_index(f"touching({sth_a}, {sth_b})") for sth_a, sth_b in [("a", "hand"), ("b", "hand"), ("c", "hand"), ("a", "b"), ("a", "c"), ("b", "c")]]

    tests_visible = [
        (idx_props_visible[0], lambda boxes, hand, expected: is_sth_visible(boxes, 1, expected)),
        (idx_props_visible[1], lambda boxes, hand, expected: is_sth_visible(boxes, 2, expected)),
        (idx_props_visible[2], lambda boxes, hand, expected: is_sth_visible(boxes, 3, expected)),
        (idx_props_visible[3], lambda boxes, hand, expected: is_sth_visible(boxes, 0, expected)),
    ]
    tests_touching = [
        (idx_props_touching[0], lambda boxes, hand, expected: is_sth_touching_hand(boxes, 1, hand, expected)),
        (idx_props_touching[1], lambda boxes, hand, expected: is_sth_touching_hand(boxes, 2, None, expected)),
        (idx_props_touching[2], lambda boxes, hand, expected: is_sth_touching_hand(boxes, 3, None, expected)),
        (idx_props_touching[3], lambda boxes, hand, expected: is_sth_touching_sth(boxes, 1, 2, expected)),
        (idx_props_touching[4], lambda boxes, hand, expected: is_sth_touching_sth(boxes, 1, 3, expected)),
        (idx_props_touching[5], lambda boxes, hand, expected: is_sth_touching_sth(boxes, 2, 3, expected)),
    ]
    
    if s_partial is not None:
        print("Actions not covered by tests:")
        idx_props = idx_props_touching + idx_props_visible
        for id_action, action in enumerate(pddl.actions):
            # [2, N]
            s_pre = s_partial[id_action,0,...]
            s_post = s_partial[id_action,1,...]
            if not s_pre[:,idx_props].any():
                print(id_action, actions[id_action])
    
    return tests_visible + tests_touching

def evaluate_tests(
    tests: typing.List[typing.Tuple[int, typing.Callable[[np.ndarray, hand_detector.Hand, bool], bool]]],
    s_partial: np.ndarray,
    boxes: np.ndarray,
    hand: hand_detector.Hand,
) -> bool:
    """Evaluates whether the propositions given by the tests are satisfied in the partial states.
    
    Any proposition not specified in the partial state is assumed to pass its corresponding test.
    
    Args:
        test: List of (idx_prop, lambda(oxes, hand, expectedd) -> bool) pairs.
        s_partial: [2, N] (pos/neg, num_props) Partial state.
        boxes: [4, 4] (hand/a/b/c, x1/y1/x2/y2) Bounding boxes of objects for the given frame.
        hand: Detected hand.
    Returns:
        True if all the tests are satisfied, raises a PropositionTestFailure otherwise.
    """
    s_pos = s_partial[0]
    s_neg = s_partial[1]
    
    for idx_prop, test in tests:
        if not s_pos[idx_prop] and not s_neg[idx_prop]:
            # Proposition not specified in partial state, so don't test.
            continue
            
        # Either pos or neg is true.
        expected = s_pos[idx_prop]
        test(boxes, hand, expected)
    return True

def precompute_tests(
    tests: typing.List[typing.Tuple[int, typing.Callable[[np.ndarray, hand_detector.Hand, bool], bool]]],
    boxes: np.ndarray,
    hand: hand_detector.Hand,
) -> np.ndarray:
    """Evaluates whether the propositions given by the tests are true.
    
    Any proposition not specified in the partial state is assumed to pass its corresponding test.
    
    Args:
        test: List of (idx_prop, lambda(boxes, hand, expectedd) -> bool) pairs.
        s_partial: [2, N] (pos/neg, num_props) Partial state.
        boxes: [4, 4] (hand/a/b/c, x1/y1/x2/y2) Bounding boxes of objects for the given frame.
        hand: Detected hand.
    Returns:
        [2, Q] (pos/neg, num_tests) Partial state over whether each test returns true.
    """
    results = np.zeros((2, len(tests)), dtype=bool)
    
    # Iterate over all tests.
    for idx_test, (idx_prop, test) in enumerate(tests):
        try:
            # Run test.
            val = test(boxes, hand, None)
        except PropositionUnknown as e:
            # Leave partial state as 0.
            continue

        idx_pos_neg = 1 - val
        results[idx_pos_neg, idx_test] = True
    
    return results

def draw_hands(img: np.ndarray, detected_hands: typing.List[hand_detector.Hand]):
    import PIL
    
    img = PIL.Image.fromarray(img)
    draw = PIL.ImageDraw.Draw(img)

    for hand in detected_hands:
        draw.polygon(hand.palm().flatten().tolist(), outline=(0,255,0))
        for xy in hand.palm():
            box = np.concatenate([xy - 10, xy + 10], axis=0)
            draw.ellipse(box.tolist(), outline=(255,0,0))
        for finger in hand.fingers():
            draw.line(finger.flatten().tolist(), fill=(255,0,255))
            xy = finger[-1]
            box = np.concatenate([xy - 15, xy + 15], axis=0)
            draw.ellipse(box.tolist(), outline=(255,255,255))
            
    img = np.array(img)
    
    return img

def evaluate_hands(
    paths: config.EnvironmentPaths,
    video_label: twentybn.dataset.VideoLabel,
    id_video: int,
) -> typing.List[typing.List[np.ndarray]]:
    """Detects hands in the given video.
    
    Args:
        paths: Environment paths.
        video_label: 20BN label.
        id_video: Video id.
    Returns:
        [T] (num_keyframes) list of [H] (num_hands) lists of [L, 2] (num_landmarks, x/y) float32 arrays.
    """
    # Load video.
    video_frames = video_utils.read_video(paths.data / "videos" / f"{id_video}.webm", video_label.keyframes)
    
    NUM_HANDS = 2
    T = len(video_frames)
    
    # [T] (num_keyframes)
    detected_hands = []
    
    # print("about to call hand detector class")
    # Create hand detector.
    with hand_detector.HandDetector(static_image_mode=False, max_num_hands=NUM_HANDS, min_detection_confidence=0.5) as hands:
        # print("entering class instance????")
        # Iterate over all keyframes.
        for t, img in enumerate(video_frames):
            # [H] (num_hands) list of [L, 2] (num_landmarks, x/y) landmarks.
            # print("video frames being analyzed", t)
            detected = [hand.hand_landmarks for hand in hands.detect(img)]
            detected_hands.append(detected)
    
    return detected_hands

def load_detected_hands(landmarks: typing.List[np.ndarray]) -> typing.List[hand_detector.Hand]:
    """Loads detected hands output by `precompute_hands()`.
    
    Args:
        landmarks: [H] (num_hands) list of [L, 2] (num_landmarks, x/y) landmark arrays.
    Returns:
        List of Hand objects.
    """
    return [hand_detector.Hand(hand_landmark) for hand_landmark in landmarks]
#     detected_hands = []
#     for hand_landmarks in landmarks:
#         if hand_landmarks[0, 0] < 0:
#             break
#         detected_hands.append(hand_detector.Hand(hand_landmarks))
#     return detected_hands

def evaluate_video_conditions(
    paths: config.EnvironmentPaths,
    pddl: symbolic.Pddl,
    video_label: twentybn.dataset.VideoLabel,
    hands: np.ndarray,
    id_video: int,
    s_partial: np.ndarray,
    tests: typing.List[typing.Tuple[int, typing.Callable[[np.ndarray, hand_detector.Hand, bool], bool]]],
    generate_video: bool = False,
) -> np.ndarray:
    """Evaluates the pre/post-conditions for the given video.
    
    Args:
        paths: Environment paths.
        pddl: Pddl instance.
        video_label: 20BN label.
        hands: [T] (num_keyframes) list of [H] (num_hands) lists of [L, 2] (num_landmarks, x/y) landmark arrays for given video.
        id_video: Video id.
        s_partial: [2, 2, N] (pre/post, pos/neg, num_props) Partial state for current action.
        tests: Output of `generate_tests()`.
        generate_video: Whether to generate a video with the object/hand detections.
    Returns:
        [2, T] (pre/post, num_frames) int array indicating whether the frame passes the condition tests (0: False, 1: True, -1: Unknown).
    """

    # Get test propositions.
    # [2, 2, N] -> [2, 2, Q] (pre/post, pos/neg, num_tests)
    idx_props = [idx_prop for idx_prop, test in tests]
    s_expected = s_partial[:, :, idx_props]
    prop_labels = ["pre", "post"] + [pddl.state_index.get_proposition(idx_prop) for idx_prop in idx_props]
    
    # [2, 2, Q] -> [2, Q] (pre/post, num_tests)
    s_expected_pos = s_expected[:, 0, :]
    s_expected_neg = s_expected[:, 1, :]
    
    # Prepare output.
    # [2, T] (pre/post, num_frames)
    T = len(hands)
    test_results = np.zeros((2, T), dtype=np.int8)
    
    if generate_video:
        # Load video.
        video_frames = video_utils.read_video(paths.data / "videos" / f"{id_video}.webm", video_label.keyframes)
        
        video_out = []
    
    # Iterate over all keyframes.
    for t in range(T):
        box_hand = video_label.boxes[t, 0, :]
        if box_hand[0] >= 0:
#                 xy1_hand = np.maximum(0, box_hand[:2].astype(np.int) - 100)
#                 xy2_hand = np.minimum(img.shape[:2][::-1], (box_hand[2:] + 101).astype(np.int))
#                 img_hand = img[xy1_hand[1]:xy2_hand[1], xy1_hand[0]:xy2_hand[0]]
#                 detected_hands = hands.detect(img_hand, xy_offset=xy1_hand)
            detected_hands = load_detected_hands(hands[t])
        else:
            detected_hands = []

        # hand = detected_hands[0] if detected_hands else None
        hand = identify_contained_hand(box_hand, detected_hands)

        # Run tests.
        # [2, Q] (pos/neg, num_tests)
        s_results = precompute_tests(tests, video_label.boxes[t], hand)

        # [2, Q] -> [Q]
        s_results_pos = s_results[0, :]
        s_results_neg = s_results[1, :]

        # [2, Q] (pre/post, num_tests)
        violated = (s_expected_pos & s_results_neg[None, :]) | (s_expected_neg & s_results_pos[None, :])
        unknown = (s_expected_pos & ~s_results_pos[None, :]) | (s_expected_neg & ~s_results_neg[None, :])

        # [2] (pre/post)
        satisfied = np.ones((violated.shape[0],), dtype=np.int8)
        satisfied[unknown.any(axis=1)] = -1
        satisfied[violated.any(axis=1)] = 0
        test_results[:, t] = satisfied

        if generate_video:
            # Load video frame.
            img = video_frames[t]
            
            # Draw hands/bounding boxes.
            img = draw_hands(img, detected_hands)
            img = video_utils.draw_bounding_boxes(img, video_label.boxes[t], ["hand"] + video_label.objects)

            # Convert condition test results to probabilities.
            # [2]
            p_conditions = test_results[:, t].astype(np.float32)
            p_conditions[p_conditions < 0] = 0.5

            # Convert test values to probabilities.
            # [Q]
            p_results = s_results_pos.astype(np.float32) - s_results_neg.astype(np.float32)
            p_results = 0.5 * (p_results + 1)

            # Show pre/post-condition timesteps if available.
            prop_labels_t = prop_labels.copy()
            if t in video_label.pre:
                prop_labels_t[0] = "pre   !!!!!!!!!!!!!!!!!!!!"
            elif t in video_label.post:
                prop_labels_t[1] = "post !!!!!!!!!!!!!!!!!!!!"

            # [2], [Q] -> [2 + Q]
            p_predict = np.concatenate((p_conditions, p_results), axis=0)
            img = video_utils.overlay_predictions(img, p_predict, prop_labels_t)
            video_out.append(img)
    
    if generate_video:
        video_utils.write_video(paths.data / "labeled_videos" / f"{id_video}.webm", video_out)
    
    return test_results

def initialize_hands():
    """Initializes global variables required for `precompute_hands()`.
    
    Each worker process only loads the dataset once and then re-uses them for each action.
    """
    global paths, labels, pddl

    paths = config.EnvironmentPaths(environment="twentybn")

    labels = twentybn.dataset.Labels(paths.data / "labels.hdf5")

    pddl = symbolic.Pddl(str(paths.env / "domain.pddl"), str(paths.env / "problem.pddl"))

def initialize_tests():
    """Initializes global variables required for `process_action()`.
    
    Each worker process only loads the dataset once and then re-uses them for each action.
    """
    global paths, labels, pddl, tests, hands

    initialize_hands()

    tests = generate_tests(pddl)
    
    with open(paths.data / "hands.pkl", "rb") as f:
        hands = pickle.load(f)

def precompute_hands(id_action: int) -> typing.Dict[int, typing.List[typing.List[np.ndarray]]]:
    """Precompues hand detections for all the videos for one action.
    
    Assumes `initialize_hands()` has already been called.
    
    Args:
        id_action: Action id.
    Returns:
        Map from id_video to [T] (num_keyframes) list of [H] (num_hands) list of [L, 2] (num_landmarks, x/y) float32 array of hand detections.
    """
    global paths, labels, pddl
    action = str(pddl.actions[id_action])
    s_partial = dnf_utils.get_partial_state(pddl, action)
    
    # Iterate over all videos of the action.
    detected_hands = {}
    id_videos = labels.actions[id_action].videos
    for id_video in id_videos:
#         print("computing hands for video", id_videos)
        try:
            detected_hands[id_video] = evaluate_hands(paths, labels.videos[id_video], id_video)
            print("fuxxesffully computed ", id_videos)
        except Exception as e:
            print(f";(((( id_action={id_action}:\n{e}")
            with open(f"{id_action}.log", "a") as f:
                f.write(f"{id_video}:\n{e}\n")
    
    return detected_hands

def process_action(id_action: int, generate_video: bool = False, num_videos: typing.Optional[int] = None) -> typing.Dict[int, np.ndarray]:
    """Checks the pre/post-conditions for all the videos for one action.
    
    Assumes `initialize_tests()` has already been called.
    
    Args:
        id_action: Action id.
        generate_video: Whether to generate a video for visualization.
        num_videos: Maximum number of videos per action to process.
    Returns:
        Map from id_video to [2, T] (pre/post, num_frames) int array indicating whether the frame passes the condition tests (0: False, 1: True, -1: Unknown).
    """
    global paths, labels, pddl, tests, hands

    action = str(pddl.actions[id_action])
    s_partial = dnf_utils.get_partial_state(pddl, action)
    
    # Iterate over all videos of the action.
    test_results = {}
    id_videos = labels.actions[id_action].videos if num_videos is None else labels.actions[id_action].videos[:num_videos]
    for id_video in id_videos:
        try:
            test_results[id_video] = evaluate_video_conditions(paths, pddl, labels.videos[id_video], hands[id_video], id_video, s_partial, tests, generate_video)
        except Exception as e:
            print(f"id_action={id_action}:\n{e}")
            with open(f"{id_action}.log", "a") as f:
                f.write(f"{id_video}:\n{e}\n")
    
    return test_results

In [33]:
import h5py
import tqdm

def find_continuous_ones(x: np.ndarray, left_to_right: bool):
    """Finds the maximum run of consecutive ones in the array.
        
        Args:
            x: 1d array.
            left_to_right: Whether to break ties with elements from left to right
        Returns:
            Range of largest run of consecutive ones (idx_start, idx_end).
    """
    x = np.concatenate((np.array([0]), x, np.array([0])))
    diff = x[1:] - x[:-1]
    start = np.squeeze(np.argwhere(diff == 1), axis=1)
    end = np.squeeze(np.argwhere(diff == -1), axis=1)
    
    idx = np.arange(len(start))
    if left_to_right:
        idx = idx[::-1]
    
    if left_to_right: 
        unsorted = np.array([(len(start) - i, end[i] - start[i]) for i in range(len(start))], dtype=[("idx", np.uint32), ("val", np.float32)])
    else:
        unsorted = np.array([(i, end[i] - start[i]) for i in range(len(start))], dtype=[("idx", np.uint32), ("val", np.float32)])
    idx_ranges = np.argsort(unsorted, order=("val", "idx"))[::-1]
    
    ranges = np.stack((start[idx_ranges], end[idx_ranges]), axis=0)
    return ranges

def find_pre_post_boundaries(x_class: np.ndarray) -> typing.Tuple[typing.Optional[int], typing.Optional[int]]:
    """Finds the last certain pre-condition index and first certain post-condition index in their respective clusters.
    
    Args:
        x: [T] float32 class predictions (0-1).
    Returns:
        (last pre index, first post index).
    """
    pre_clusters = find_continuous_ones(x_class < 0.5, left_to_right=True)
    post_clusters = find_continuous_ones(x_class > 0.5, left_to_right=False)
#     print("pre_clusters:", pre_clusters)
#     print("post_clusters:", post_clusters)
    
    # Relax the constraints if one of the clusters is empty.
    if pre_clusters.size == 0:
        pre_clusters = find_continuous_ones(x_class <= 0.5, left_to_right=True)
    if post_clusters.size == 0:
        post_clusters = find_continuous_ones(x_class >= 0.5, left_to_right=False)
    
    idx_pre = 0
    idx_post = 0
    if pre_clusters.size == 0 and post_clusters.size == 0:
        # No clusters.
        return (0, x_class.shape[0])
    elif pre_clusters.size == 0:
        # Only post cluster.
        return (0, post_clusters[0, idx_post])
    elif post_clusters.size == 0:
        # Only pre cluster.
        return (pre_clusters[1, idx_pre], x_class.shape[0])
    
    while pre_clusters[0, idx_pre] >= post_clusters[1, idx_post]:
        if idx_pre >= pre_clusters.shape[1] - 1 and idx_post >= post_clusters.shape[1] - 1:
            return None, None
        
        # Avoid going past the last cluster.
        if idx_pre >= pre_clusters.shape[1] - 1:
            idx_post += 1
            continue
        elif idx_post >= post_clusters.shape[1] - 1:
            idx_pre += 1
            continue
        
        # Keep the larger cluster.
        size_pre = pre_clusters[1, idx_pre] - pre_clusters[0, idx_pre]
        size_post = post_clusters[1, idx_post] - post_clusters[0, idx_post]
        if size_pre > size_post:
            idx_post += 1
            continue
        elif size_pre < size_post:
            idx_pre += 1
            continue

        # Clusters have the same size. Advance the one with the larger succeeding cluster.
        size_pre = pre_clusters[1, idx_pre + 1] - pre_clusters[0, idx_pre + 1]
        size_post = post_clusters[1, idx_post + 1] - post_clusters[0, idx_post + 1]
        if size_pre >= size_post:
            idx_pre += 1
            continue
        elif size_post > size_pre:
            idx_post += 1
            continue
    
    # Make sure pre comes before post.
    post_clusters[0, idx_post] = max(post_clusters[0, idx_post], pre_clusters[0, idx_pre])
    pre_clusters[1, idx_pre] = min(pre_clusters[1, idx_pre], post_clusters[1, idx_post])
    
    return (pre_clusters[1, idx_pre], post_clusters[0, idx_post])

def test_results_to_probabilities(test_results: np.ndarray) -> np.ndarray:
    """Converts [2, T] (pre/post) test results where {0=false, 1=true, -1=unknown}
    to a [2, T] probability vector where {0=pre, 1=post, and 0.5=unknown}.
    
    Args:
        test_results: [2, T] (pre/post, num_timesteps) int32 condition test results.
    Returns:
        [2, T] (pre/post, num_timesteps) float32 probability.
    """
    x = np.array(test_results, dtype=np.float32)
    x_pre = x[0]
    x_post = x[1]
    
    # Set uncertain timesteps leaning to one side.
    idx_maybe_pre_post = (x_pre < 0) & (x_post > 0)
    idx_pre_maybe_post = (x_pre > 0) & (x_post < 0)
    idx_not_pre_maybe_post = (x_pre == 0) & (x_post < 0)
    idx_maybe_pre_not_post = (x_pre < 0) & (x_post == 0)
    idx_not_pre_post = idx_maybe_pre_post | idx_not_pre_maybe_post
    idx_pre_not_post = idx_pre_maybe_post | idx_maybe_pre_not_post
    x[:, idx_not_pre_post] = np.array([0.25, 0.75])[:, None]
    x[:, idx_pre_not_post] = np.array([0.75, 0.25])[:, None]
    
    # Set timesteps where both pre- and post-conditions are true.
    x[:, (x == 1).all(axis=0)] = 0.5
    
    # Set timesteps where neither pre- nor post-conditions are known.
    x[x < 0] = 0.5
    
    return x

def find_pre_post_frames(test_results: np.ndarray) -> typing.Tuple[np.ndarray, np.ndarray]:
    """Computes pre- and post-frames from the test results.
    
    Args:
        test_results: [2, T] int32 condition test results.
    Returns:
        (pre-frames, post-frames).
    """
    # [2, T]
    x_prob = test_results_to_probabilities(test_results)
#     print("x_prob:", x_prob)
    
    # Only choose non-zero elements.
    # [T]
    idx_valid = (x_prob != 0).any(axis=0)
    if (idx_valid == 0).all():
        return np.zeros((0,), dtype=np.uint32), np.zeros((0,), dtype=np.uint32)
    
    # Find pre/post boundaries among non-zero elements.
    # [NZ]
    x_class = x_prob[1, idx_valid]
    idx_nonzero = np.array(idx_valid.nonzero()[0], dtype=np.uint32)
    idx_pre_post = find_pre_post_boundaries(x_class)
    if idx_pre_post[0] is None or idx_pre_post[1] is None:
        return np.zeros((0,), dtype=np.uint32), np.zeros((0,), dtype=np.uint32)
#     print("idx_nonzero:", idx_nonzero)
#     print("idx_pre_post:", idx_pre_post)
    
    # Set boundary as mean between last pre and first post index in NZ.
    idx_boundary = int(0.5 * (idx_pre_post[0] + idx_pre_post[1]) + 0.5)
#     print("idx_boundary:", idx_boundary)
    
    # Convert NZ index to timestep.
    assert idx_boundary <= len(idx_nonzero)
    idx_boundary = idx_nonzero[min(len(idx_nonzero) - 1, idx_boundary)]
    
    # Set pre/post frames to uncertain frames (0.5) within the pre/post boundary.
    # (num_pre, num_post)
    idx_pre = idx_nonzero[(x_class <= 0.5) & (idx_nonzero < idx_boundary)]
    idx_post = idx_nonzero[(x_class >= 0.5) & (idx_nonzero >= idx_boundary)]
    
    return idx_pre, idx_post

def append_pre_post_to_dataset(results: typing.Dict[int, np.ndarray], paths: config.EnvironmentPaths, id_action: typing.Optional[int] = None):
    """Appends pre/post frames to the hdf5 dataset.
    
    Args:
        results: Test results in `condition_test_results.pkl`.
        paths: Environment paths.
        id_action: Process only this action, if not None.
    """
    with h5py.File(paths.data / "labels.hdf5", "a") as f:
        if id_action is None:
            id_videos = np.array(f["video_ids"])
        else:
            id_videos = np.array(f[f"actions/{id_action}/videos"])

        for id_video in tqdm.notebook.tqdm(id_videos):
            idx_pre, idx_post = find_pre_post_frames(results[id_video])

            grp_video = f["videos"][str(id_video)]
            if "pre" in grp_video:
                del grp_video["pre"]
            if "post" in grp_video:
                del grp_video["post"]
            grp_video.create_dataset("pre", data=idx_pre, dtype=np.uint32)
            grp_video.create_dataset("post", data=idx_post, dtype=np.uint32)

## Detect hands

In [34]:
# import sys
# sys.path.append("..")

# import symbolic
# from gpred import dnf_utils
# import config

# paths = config.EnvironmentPaths(environment="twentybn")
# pddl = symbolic.Pddl(str(paths.env / "domain.pddl"), str(paths.env / "problem.pddl"))

In [35]:
# import numpy as np
# def get_dnf(
#     pddl: symbolic.Pddl,
#     action: str,
#     M = None,
# ):
#     """Get DNF proposition indices from cache.

#     Args:
#         pddl: Pddl instance.
#         action: Action call.
#         M: Max number of conjunctions in pre/post-conditions DNFs, computed by
#             compute_max_num_conjunctions().
#     Returns:
#         (dnf, mask), where dnf is a [2 x 2 x N x M] index array with pre- and
#         post- conditions along axis 0 and positive and negative propositions
#         along axis 1. mask is a [2 x M] boolean array indicating the used slots
#         along the M-index (since DNFs have a variable number of conjunctions).
#     """

#     def idx_conditions(
#         conjunctions, N: int, M: int
#     ):
#         """Get the proposition indices for the given pre- or post-conditions.

#         Args:
#             conjunctions: Pre- or post-condition conjunctions.
#             N: Number of propositions.
#             M: Max number of conjunctions.
#         Returns:
#             Two [2 x N x M] arrays, one with proposition indices for the partial
#             state, the other with a mask over the used conjunction slots.
#         """

#         def filter_props(state):
#             """Replace derived predicates with normal ones."""
#             if pddl.name == "twentybn":
#                 from env.twentybn import propositions
#             elif pddl.name == "table":
#                 from env.table import propositions
#             elif pddl.name == "gridworld":
#                 from env.gridworld import propositions
#             else:
#                 raise NotImplementedError(f"Unknown env {pddl.name}.")
#             return set(propositions.alias(str_prop) for str_prop in state)

#         idx = np.zeros((2, N, M), dtype=bool)
#         for j, conj in enumerate(conjunctions):
#             s_pos = filter_props(conj.pos)
#             s_neg = filter_props(conj.neg)
#             idx[0, :, j] = get_indexed_state(pddl.state_index, s_pos)
#             idx[1, :, j] = get_indexed_state(pddl.state_index, s_neg)

#         # Create mask over conjunctions
#         mask = np.zeros((M,), dtype=bool)
#         mask[: len(conjunctions)] = True

#         return idx, mask

#     N = len(pddl.state_index)

#     try:
#         dnf_pre, dnf_post = dnf_utils.get_normalized_conditions(pddl, action)
#     except RuntimeError:
#         raise RuntimeError(f"Could not normalize conditions for {action}.")

#     if M is None:
#         M = max(len(dnf_pre.conjunctions), len(dnf_post.conjunctions))

#     idx_pre, mask_pre = idx_conditions(dnf_pre.conjunctions, N, M)
#     idx_post, mask_post = idx_conditions(dnf_post.conjunctions, N, M)

#     dnf = np.stack((idx_pre, idx_post), axis=0)  # [2 x 2 x N x M] bool array
#     mask = np.vstack((mask_pre, mask_post))  # [2 x M] bool array
#     return dnf, mask

In [36]:
# action = str(pddl.actions[0])
# dnf, mask = get_dnf(pddl, action)
# dnf.shape, mask.shape

In [37]:

# def precompute_hands(id_action: int) -> typing.Dict[int, typing.List[typing.List[np.ndarray]]]:
#     """Precompues hand detections for all the videos for one action.
    
#     Assumes `initialize_hands()` has already been called.
    
#     Args:
#         id_action: Action id.
#     Returns:
#         Map from id_video to [T] (num_keyframes) list of [H] (num_hands) list of [L, 2] (num_landmarks, x/y) float32 array of hand detections.
#     """
#     global paths, labels, pddl
#     action = str(pddl.actions[id_action])
#     s_partial = dnf_utils.get_partial_state(pddl, action)
    
#     # Iterate over all videos of the action.
#     detected_hands = {}
#     id_videos = labels.actions[id_action].videos
#     for id_video in id_videos:
# #         print("computing hands for video", id_videos)
#         try:
#             detected_hands[id_video] = evaluate_hands(paths, labels.videos[id_video], id_video)
#             print("fuxxesffully computed ", id_videos)
#         except Exception as e:
#             print(f";(((( id_action={id_action}:\n{e}")
#             with open(f"{id_action}.log", "a") as f:
#                 f.write(f"{id_video}:\n{e}\n")
    
#     return detected_hands

In [38]:
from importlib import reload
reload(hand_detector)

<module 'apps.hand_detector' from '/scratch/bs3639/ego2023/grounding-predicates/scripts/../apps/hand_detector.py'>

In [20]:
# !wget https://storage.googleapis.com/mediapipe-models/hand_landmarker/hand_landmarker/float16/latest/hand_landmarker.task -O ../apps/hand_landmarker.task 
!wget -q https://storage.googleapis.com/mediapipe-models/hand_landmarker/hand_landmarker/float16/1/hand_landmarker.task -O ../apps/hand_landmarker.task 

In [23]:
initialize_hands()

In [39]:
import concurrent.futures
import pickle
import time

paths = config.EnvironmentPaths(environment="twentybn")
pddl = symbolic.Pddl(str(paths.env / "domain.pddl"), str(paths.env / "problem.pddl"))
A = len(pddl.actions)


hands = {}
with tqdm.notebook.tqdm(total=A) as loop:
    for id_action in range(A):
        hands.update(precompute_hands(id_action))
# Process hand detections for all actions in parallel.
# NUM_WORKERS = 6
# with concurrent.futures.ProcessPoolExecutor(max_workers=NUM_WORKERS, initializer=initialize_hands) as executor:
#     futures = {}
    # for id_action in range(A):
    #     future = executor.submit(precompute_hands, id_action)
    #     futures[future] = id_action
    #     if id_action < NUM_WORKERS:
    #         time.sleep(1)
    
    # hands = {}
    # with tqdm.notebook.tqdm(total=A) as loop:
    #     for future in concurrent.futures.as_completed(futures):
    #         id_action = futures[future]
    #         try:
    #             hands.update(future.result())
    #         except Exception as e:
    #             print(f"Exception for id_action={id_action}:\n{e}")
    #         loop.update(1)

# # Save hand detections.
# with open(paths.data / "hands.pkl", "wb") as f:
#     pickle.dump(hands, f)

  0%|          | 0/174 [00:00<?, ?it/s]

detecting image <class 'numpy.ndarray'> (240, 427, 3)
detecting image <class 'numpy.ndarray'> (240, 427, 3)
detecting image <class 'numpy.ndarray'> (240, 427, 3)
detecting image <class 'numpy.ndarray'> (240, 427, 3)
detecting image <class 'numpy.ndarray'> (240, 427, 3)
detecting image <class 'numpy.ndarray'> (240, 427, 3)
detecting image <class 'numpy.ndarray'> (240, 427, 3)
detecting image <class 'numpy.ndarray'> (240, 427, 3)
detecting image <class 'numpy.ndarray'> (240, 427, 3)
detecting image <class 'numpy.ndarray'> (240, 427, 3)
detecting image <class 'numpy.ndarray'> (240, 427, 3)
detecting image <class 'numpy.ndarray'> (240, 427, 3)
detecting image <class 'numpy.ndarray'> (240, 427, 3)
detecting image <class 'numpy.ndarray'> (240, 427, 3)
detecting image <class 'numpy.ndarray'> (240, 427, 3)
detecting image <class 'numpy.ndarray'> (240, 427, 3)
detecting image <class 'numpy.ndarray'> (240, 427, 3)
detecting image <class 'numpy.ndarray'> (240, 427, 3)
detecting image <class 'nump

KeyboardInterrupt: 

"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"

;(((( id_action=170:
"Unable to open object (object 'pre' doesn't exist)"

;(((( id_action=168:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=167:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=166:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=172:
"Unable to open object (object 'pre' doesn't exist)"


;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"

;(((( id_action=170:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=166:
"Unable to open object (object 'pre' doesn't exist)"

;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=167:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=168:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=172:
"Unable to open object (object 'pre' d

"Unable to open object (object 'pre' doesn't exist)";(((( id_action=168:
"Unable to open object (object 'pre' doesn't exist)"

;(((( id_action=172:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=167:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=170:
"Unable to open object (object 'pre' doesn't exist)"

;(((( id_action=166:
"Unable to open object (object 'pre' doesn't exist)"

;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=168:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=172:
"Unable to open object (object 'pre' doesn't exist)"

;(((( id_action=170:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=167:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=166:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"

;(((( id_action=172:
"Unable to open object (object 'pre' do

"Unable to open object (object 'pre' doesn't exist)"


;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=172:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=168:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=170:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=167:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=166:
"Unable to open object (object 'pre' doesn't exist)"



;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=172:
"Unable to open object (object 'pre' doesn't exist)"

;(((( id_action=167:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=166:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=170:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=172:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=164:
"Unable to open object (object 'pre' d


;(((( id_action=168:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=167:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=172:
"Unable to open object (object 'pre' doesn't exist)"


;(((( id_action=170:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=166:
"Unable to open object (object 'pre' doesn't exist)"

;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=168:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=172:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=167:
"Unable to open object (object 'pre' doesn't exist)"


;(((( id_action=170:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=166:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=172:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=168:
"Unable to open ob

"Unable to open object (object 'pre' doesn't exist)";(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=172:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=167:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=166:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=170:
"Unable to open object (object 'pre' doesn't exist)"

;(((( id_action=168:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=167:
"Unable to open object (object 'pre' doesn't exist)"


;(((( id_action=172:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=166:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=167:
"Unable to open object (object 'pre' doesn't exist)"

;(((( id_action=168:
"Unable to open object (object 'pre' doesn't exist)";(((( id_action=164:
"Unable to open object (object 'pre' do

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open o

"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' d

;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open object (object 'pre' doesn't exist)"
;(((( id_action=164:
"Unable to open o

## Evaluate pre/post-condition tests

In [None]:
initialize_tests()
test_results = {}
for id_action in tqdm.notebook.tqdm(range(len(pddl.actions))):
    test_results.update(process_action(id_action))
with open(paths.data / "condition_test_results.pkl", "wb") as f:
    pickle.dump(test_results, f)

In [None]:
import pickle

paths = config.EnvironmentPaths(environment="twentybn")

with open(paths.data / "condition_test_results.pkl", "rb") as f:
    test_results = pickle.load(f)

append_pre_post_to_dataset(test_results, paths)

## Compute condition test statistics

In [None]:
import typing

import h5py
import numpy as np
import pandas as pd
import tqdm.notebook

import config

def compute_condition_statistics(paths: config.EnvironmentPaths, train_set: typing.List[int], val_set: typing.List[int]):
    """Computes pre/post-condition statistics for the 20BN dataset.
    
    Args:
        paths: Environment paths.
        train_set: Video ids in the original 20BN train set.
        val_set: Video ids in the original 20BN val set.
    Returns:
    | Video | Action | Dataset | Pre | Post |
    """
    df = {
        "Video": [],
        "Action": [],
        "Dataset": [],
        "Pre": [],
        "Post": [],
    }
    val_set = set(val_set)
    with h5py.File(paths.data / "labels.hdf5", "r") as f:
        grp_videos = f["videos"]
        video_ids = np.array(f["video_ids"])
        for id_video in tqdm.notebook.tqdm(video_ids):
            grp_video = grp_videos[str(id_video)]
            
            id_action = int(grp_video.attrs["id_action"])
            
            # Assume video is in either train or val set.
            dataset = "val" if id_video in val_set else "train"
            
            pre = grp_video["pre"].size
            post = grp_video["post"].size
            
            df["Video"].append(id_video)
            df["Action"].append(id_action)
            df["Dataset"].append(dataset)
            df["Pre"].append(pre)
            df["Post"].append(post)
    
    df = pd.DataFrame(df)
    
    return df

In [None]:
import pickle

paths = config.EnvironmentPaths(environment="twentybn")

with open(paths.data/ "train_set.pkl", "rb") as f:
    twentybn_train_set = pickle.load(f)
with open(paths.data / "val_set.pkl", "rb") as f:
    twentybn_val_set = pickle.load(f)

stats = compute_condition_statistics(paths, twentybn_train_set, twentybn_val_set)

## Generate train, val, test splits

In [None]:
import random

import matplotlib.pyplot as plt
import seaborn as sns

import symbolic
from gpred import dnf_utils

def generate_dataset_splits(
    pddl: symbolic.Pddl,
    stats: pd.DataFrame,
    twentybn_train_set: typing.List[int],
    twentybn_val_set: typing.List[int],
) -> typing.Tuple[typing.List[int], typing.List[int], typing.List[int]]:
    """Generates train, val, and test sets.
    
    Train and val sets are taken from the original 20BN train set, randomly selected for each action.
    Test set is taken from the original 20BN val set. The final splits are roughly (85, 15, 15).
    
    Args:
        pddl: Pddl instance.
        stats: Table output by `compute_condition_statistics()`.
    Returns:
        (train_set, val_set, test_set) 3-tuple.
    """
    random.seed(0)
    
    TRAIN_VAL = 1 - 0.15 / 0.85  # Assume original train set is 0.85 of the total.
    
    train_set = []
    val_set = []
    test_set = []
    
    A = len(pddl.actions)
    df = {
        "Action": [],
        "Distribution": [],
        "Dataset": [],
    }

    for id_action, action in enumerate(pddl.actions):
        s_partial = dnf_utils.get_partial_state(pddl, str(action))
        if s_partial.sum() == 0:
            df["Action"] += [id_action, id_action, id_action]
            df["Distribution"] += [0, 0, 0]
            df["Dataset"] += ["train", "val", "test"]
            continue

        stats_a = stats[stats.Action == id_action]
        stats_a = stats_a[(stats_a.Pre > 0) & (stats_a.Post > 0)]

        train_val_ids = list(stats_a.Video[stats_a.Dataset == "train"])
        test_ids = list(stats_a.Video[stats_a.Dataset == "val"])
        
        random.shuffle(train_val_ids)
        train_val_split = int(TRAIN_VAL * len(train_val_ids) + 0.5)
        train_ids = train_val_ids[:train_val_split]
        val_ids = train_val_ids[train_val_split:]

        train_set += train_ids
        val_set += val_ids
        test_set += test_ids
        
        num_train, num_val, num_test = len(train_ids), len(val_ids), len(test_ids)
        num_total = num_train + num_val + num_test
        df["Action"] += [id_action, id_action, id_action]
        df["Distribution"] += [num_train / num_total, num_val / num_total, num_test / num_total]
        df["Dataset"] += ["train", "val", "test"]
    
    plt.subplots(figsize=(5, 40))
    sns.barplot(data=df, y="Action", x="Distribution", hue="Dataset", orient="h")
    plt.xlabel("Proportion of dataset")
    plt.ylabel("Action")
    plt.title("Dataset distribution")
    
    # Preserve original dataset order.
    train_set, val_set, test_set = set(train_set), set(val_set), set(test_set)
    train_set = [id_video for id_video in twentybn_train_set if id_video in train_set]
    val_set = [id_video for id_video in twentybn_train_set if id_video in val_set]
    test_set = [id_video for id_video in twentybn_val_set if id_video in test_set]
    
    return train_set, val_set, test_set

In [None]:
pddl = symbolic.Pddl(str(paths.domain_pddl), str(paths.problem_pddl))

train_set, val_set, test_set = generate_dataset_splits(pddl, stats, twentybn_train_set, twentybn_val_set)

print(f"Train: {len(train_set)}")
print(f"Val: {len(val_set)}")
print(f"Test: {len(test_set)}")

with open(paths.data / "dataset_splits.pkl", "wb") as f:
    pickle.dump((train_set, val_set, test_set), f)

## Analyze condition statistics

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.subplots(figsize=(5, 40))

stats[["Action"]] \
    .assign(Pre = stats.Pre > 0, Post = stats.Post > 0) \
    .groupby("Action", as_index=False) \
    .mean() \
    .melt(id_vars="Action", value_vars=["Pre","Post"], var_name="Condition") \
    .pipe((sns.barplot, "data"), y="Action", x="value", hue="Condition", orient="h")

plt.savefig("figures/pre_post.png", bbox_inches="tight", transparent="True", pad_inches=0)
plt.show()

In [None]:
print(f"Partial: {len(stats[(stats.Pre > 0) | (stats.Post > 0)])} / {len(stats)}")
print(f"Complete: {len(stats[(stats.Pre > 0) & (stats.Post > 0)])} / {len(stats)}")
print(f"Train: {len(stats[(stats.Dataset == 'train') & (stats.Pre > 0) & (stats.Post > 0)])} / {len(stats[stats.Dataset == 'train'])}")
print(f"Val: {len(stats[(stats.Dataset == 'val') & (stats.Pre > 0) & (stats.Post > 0)])} / {len(stats[stats.Dataset == 'val'])}")

## Visualize condition tests

In [None]:
# Sort actions by proportion of videos with pre/post frames (from lowest to highest).

id_actions = np.array(stats[["Action"]] \
    .assign(Pre = stats.Pre > 0, Post = stats.Post > 0) \
    .groupby("Action", as_index=False) \
    .mean() \
    .melt(id_vars="Action", value_vars=["Pre","Post"], var_name="Condition") \
    [["Action", "value"]] \
    .groupby("Action", as_index=False) \
    .min() \
    .sort_values("value") \
    [["Action"]]).squeeze().tolist()

In [None]:
initialize_tests()

In [None]:
# Generate videos for 30 worst-performing actions.

" ".join(str(id_action) for id_action in id_actions[:30])

for id_action in tqdm.notebook.tqdm(id_actions[:30]):
    process_action(id_action, generate_video=True, num_videos=15)

In [None]:
path_videos = paths.data / "labeled_videos"
id_videos = [int(p.stem) for p in path_videos.iterdir() if p.suffix in {".mp4", ".webm"}]

mini_action_instances = [[] for _ in range(len(pddl.actions))]
for id_video in id_videos:
    id_action = labels.videos[id_video].id_action
    mini_action_instances[id_action].append(id_video)

display_video_grid(labels, mini_action_instances, paths.data / "labeled_videos", num_rows=5)

In [None]:
id_action = 60

pddl = symbolic.Pddl(str(paths.env / "domain.pddl"), str(paths.env / "problem.pddl"))
test_results.update(process_action(id_action))
with open(paths.data / "condition_test_results.pkl", "wb") as f:
    pickle.dump(test_results, f)

append_pre_post_to_dataset(test_results, paths, id_action=id_action)
_ = process_action(id_action, generate_video=True, num_videos=15)

In [None]:
id_video = 43454
find_pre_post_frames(test_results[id_video])

# Load labels

In [None]:
import pickle

import config

paths = config.EnvironmentPaths(environment="twentybn")

"""
action_labels = [
    {
        "label": "Approaching something with your camera",
        "template": "Approaching [something] with your camera",
    }
]
"""
with open(paths.data / "action_labels.pkl", "rb") as f:
    action_labels = pickle.load(f)

"""
action_instances = [
    [{id_video}, ...]
]
"""
with open(paths.data / "action_instances.pkl", "rb") as f:
    action_instances = pickle.load(f)

"""
video_labels = {
    {id_video}: {
        "id_action": id_action,
        "placeholders": ["a potato", "a vicks vaporub bottle"],
        "objects": ["potato", "bottle"],
        "frames": {
            idx_frame: {
                "{id_object/hand}": [[x1, y1], [x2, y2]],
            },
        },
    },
}
"""
with open(paths.data / "video_labels.pkl", "rb") as f:
    video_labels = pickle.load(f)

"""
train_set = [{id_video}, ...]
"""
with open(paths.data / "train_set.pkl", "rb") as f:
    train_set = pickle.load(f)

"""
val_set = [{id_video}, ...]
"""
with open(paths.data / "val_set.pkl", "rb") as f:
    val_set = pickle.load(f)

"""
video_ranges = {
    {id_video}: (
        [idx_pre_frames, ...],
        [idx_post_frames, ...]
    )
}
"""
with open(paths.data / "video_ranges.pkl", "rb") as f:
    video_ranges = pickle.load(f)

# Generate hdf5 datasets

In [None]:
import concurrent
import pathlib
import random
import typing

import h5py
import numpy as np
try:
    tqdm
except:
    import tqdm.notebook as tqdm

from env import twentybn
from gpred import video_utils

random.seed(0)

"""
dataset.hdf5 = {
  "id_video": [
    "images": [2, 3 + num_objects, H, W], uint8
    "boxes": [num_objects, 4] (x1, y1, x2, y2), float32
  ],
  "videos": [int, ...],
  "actions": [int, ...]
}
"""

def collect_written_videos(f: h5py.File, dataset: typing.List[int]):
    """Record output videos and actions to the dataset.
    
    Args:
        f: Dataset file.
        dataset: List of video ids from the train/val set.
    """
    videos = []
    actions = []
    for id_video in dataset:
        if not str(id_video) in f.keys():
            continue
        videos.append(id_video)
        actions.append(video_labels[id_video]["id_action"])

    dset_videos = f.create_dataset("videos", (len(videos),), dtype=np.uint32)
    dset_videos[:] = videos
    dset_actions = f.create_dataset("actions", (len(videos),), dtype=np.uint8)
    dset_actions[:] = actions

def extract_pre_post(dataset: typing.List[int], filename: str, path: pathlib.Path) -> typing.Dict[int, typing.Tuple]:
    """Extract pre/post frames and save them to an hdf5 dataset.
    
    Args:
        dataset: List of video ids from the train/val set.
        filename: Name of dataset. The file will be saved as filename.hdf5.
        path: Path of dataset.
    """
    def extract_pre_post_worker(id_video: int):
        if id_video not in video_ranges:
            return

        # Get pre/post video frames.
        path_video = path / f"videos/{id_video}.webm"
        keyframes = list(video_labels[id_video]["frames"].keys())
        pre_frames, post_frames = video_ranges[id_video]
        selected_keyframes = [random.choice(pre_frames), random.choice(post_frames)]
        
        try:
            pre_post_frames = video_utils.read_video(path_video, selected_keyframes)
        except:
            return
        if pre_post_frames is None:
            return

        # Write pre/post frames to dataset.
        height, width = pre_post_frames.shape[1:3]
        masks, indexed_boxes = twentybn.utils.create_bbox_masks(id_video, (height, width), video_labels, selected_keyframes)
        boxes = indexed_boxes[:, :, 1:]
        
        # ([2, 3, H, W], [2, 4, H, W]) => [2, 7, H, W]
        images = np.concatenate((np.moveaxis(pre_post_frames, 3, 1), masks), axis=1)
        
        grp = f.create_group(str(id_video))
        dset_images = grp.create_dataset("images", images.shape, dtype=np.uint8)
        dset_images[...] = images
        dset_boxes = grp.create_dataset("boxes", boxes.shape, dtype=np.float32)
        dset_boxes[...] = boxes
    
    with h5py.File(path / f"{filename}.hdf5", "w") as f:
        for id_video in tqdm.tqdm(dataset):
            extract_pre_post_worker(id_video)
#         with concurrent.futures.ThreadPoolExecutor(60) as pool:
#             futures = [pool.submit(extract_pre_post_worker, id_video) for id_video in dataset]
#             with tqdm.tqdm(total=len(dataset)) as pbar:
#                 for result in concurrent.futures.as_completed(futures):
#                     pbar.update(1)
        
        collect_written_videos(f, dataset)

#extract_pre_post(train_set[:10000], "pre_post_train_mini", paths.data)
#extract_pre_post(val_set[:10000], "pre_post_val_mini", paths.data)
extract_pre_post(train_set, "pre_post_train", paths.data)
extract_pre_post(val_set, "pre_post_val", paths.data)

## predicate.hdf5

In [None]:
import pathlib
import typing

import h5py
import hdf5plugin
import numpy as np
import tqdm.notebook

import symbolic
from gpred import video_utils
from env import twentybn

def create_predicate_dataset(
    pddl: symbolic.Pddl,
    labels: twentybn.dataset.Labels,
    dataset: typing.List[int],
    filename: str,
    path: pathlib.Path
):
    """Extracts pre/post frames and save them to an hdf5 dataset.
    
    Args:
        pddl: Pddl instance.
        labels: 20BN labels.
        dataset: List of video ids from the train/val set.
        filename: Name of dataset. The file will be saved as filename.hdf5.
        path: Path of dataset.
    """
    def extract_pre_post_worker(id_video: int):
        video_label = labels.videos[id_video]
        if video_label.pre.size == 0 or video_label.post.size == 0:
            return

        # Get pre/post video frames.
        path_video = path / f"videos/{id_video}.webm"
        keyframes = video_label.keyframes[np.concatenate((video_label.pre, video_label.post))]
        images = video_utils.read_video(path_video, keyframes)
        
        t_post = len(video_label.pre)
        pre_images = images[:t_post]
        post_images = images[t_post:]
        
        # [T, 16, 3, 4] (num_selected_frames, num_arg_combos, roi/arg_a/arg_b, x1/y1/x2/y2)
        pre_boxes = twentybn.utils.split_bbox_args(pddl, video_label, video_label.pre)
        post_boxes = twentybn.utils.split_bbox_args(pddl, video_label, video_label.post)
        
        grp = f.create_group(str(id_video))
        dset_pre_frames = grp.create_dataset("pre_frames", data=video_label.pre, dtype=np.uint32)
        dset_post_frames = grp.create_dataset("post_frames", data=video_label.post, dtype=np.uint32)
        
        H, W = pre_images[0].shape[:2]
        T_pre = len(pre_images)
        T_post = len(post_images)
#         print(len(pre_images), pre_images[0].shape, (T_pre, H, W, 3))
        dset_pre_images = grp.create_dataset("pre_images", data=pre_images, shape=(T_pre, H, W, 3), chunks=(1, H, W, 3), dtype=np.uint8, **hdf5plugin.Blosc(cname="lz4hc"))
        dset_post_images = grp.create_dataset("post_images", data=post_images, shape=(T_post, H, W, 3), chunks=(1, H, W, 3), dtype=np.uint8, **hdf5plugin.Blosc(cname="lz4hc"))
        
        dset_pre_boxes = grp.create_dataset("pre_boxes", data=pre_boxes, dtype=np.float32)
        dset_post_boxes = grp.create_dataset("post_boxes", data=post_boxes, dtype=np.float32)
    
    def collect_written_videos(f: h5py.File, dataset: typing.List[int]):
        """Record output videos and actions to the dataset.

        Args:
            f: Dataset file.
            dataset: List of video ids from the train/val set.
        """
        videos = []
        actions = []
        for id_video in dataset:
            if not str(id_video) in f.keys():
                continue
            videos.append(id_video)
            actions.append(labels.videos[id_video].id_action)

        dset_videos = f.create_dataset("videos", data=videos, dtype=np.uint32)
        dset_actions = f.create_dataset("actions", data=actions, dtype=np.uint32)
    
    with h5py.File(path / f"{filename}.hdf5", "w") as f:
        for id_video in tqdm.notebook.tqdm(dataset):
            extract_pre_post_worker(id_video)
        
        collect_written_videos(f, dataset)

In [None]:
import pickle

import config

paths = config.EnvironmentPaths(environment="twentybn")

pddl = symbolic.Pddl(str(paths.domain_pddl), str(paths.problem_pddl))

labels = twentybn.dataset.Labels()

print(labels.videos)
# with open(paths.data / "dataset_splits.pkl", "rb") as f:
#     train_set, val_set, test_set = pickle.load(f)

In [None]:
create_predicate_dataset(pddl, labels, train_set, "predicate_train", paths.data)
create_predicate_dataset(pddl, labels, val_set, "predicate_val", paths.data)
create_predicate_dataset(pddl, labels, test_set, "predicate_test", paths.data)

In [None]:
import pathlib
import typing

import h5py
import numpy as np
import symbolic
import tqdm

from env import twentybn
from gpred import video_utils
import config

paths = config.EnvironmentPaths(environment="twentybn")

pddl = symbolic.Pddl(str(paths.env / "domain.pddl"), str(paths.env / "problem.pddl"))

"""
dataset.hdf5 = {
  "id_video": [
    "pre_frames": [num_pre_frames], int,
    "post_frames": [num_post_frames], int,
    "pre_images": [num_pre_frames, H, W, 3], uint8,
    "post_images": [num_post_frames, H, W, 3], uint8,
    "pre_boxes": [num_pre_frames, num_arg_combos, 3, 4] (roi/arg_a/arg_b, x1/y1/x2/y2), float32,
    "pre_boxes": [num_post_frames, num_arg_combos, 3, 4] (roi/arg_a/arg_b, x1/y1/x2/y2), float32,
  ],
  "videos": [int, ...],
  "actions": [int, ...]
}
"""



#create_predicate_dataset(train_set[:10000], "predicate_train_mini", paths.data)
#create_predicate_dataset(val_set[:1000], "predicate_val_mini", paths.data)
#create_predicate_dataset(train_set, "predicate_train", paths.data)
#create_predicate_dataset(val_set, "predicate_val", paths.data)

In [None]:
import pathlib
import typing

import h5py
import numpy as np
import symbolic
import tqdm

from env import twentybn
from gpred import video_utils
import config

paths = config.EnvironmentPaths(environment="twentybn")

pddl = symbolic.Pddl(str(paths.env / "domain.pddl"), str(paths.env / "problem.pddl"))

"""
dataset.hdf5 = {
  "id_video": [
    "pre_frames": [num_pre_frames], int,
    "post_frames": [num_post_frames], int,
    "pre_images": [num_pre_frames, H, W, 3], uint8,
    "post_images": [num_post_frames, H, W, 3], uint8,
    "pre_boxes": [num_pre_frames, num_arg_combos, 3, 4] (roi/arg_a/arg_b, x1/y1/x2/y2), float32,
    "pre_boxes": [num_post_frames, num_arg_combos, 3, 4] (roi/arg_a/arg_b, x1/y1/x2/y2), float32,
  ],
  "videos": [int, ...],
  "actions": [int, ...]
}
"""

def collect_written_videos(f: h5py.File, dataset: typing.List[int]):
    """Record output videos and actions to the dataset.
    
    Args:
        f: Dataset file.
        dataset: List of video ids from the train/val set.
    """
    videos = []
    actions = []
    for id_video in dataset:
        if not str(id_video) in f.keys():
            continue
        videos.append(id_video)
        actions.append(video_labels[id_video]["id_action"])

    dset_videos = f.create_dataset("videos", (len(videos),), dtype=np.uint32)
    dset_videos[:] = videos
    dset_actions = f.create_dataset("actions", (len(videos),), dtype=np.uint8)
    dset_actions[:] = actions

def create_predicate_dataset(labels: twentybn.dataset.Labels, dataset: typing.List[int], filename: str, path: pathlib.Path):
    """Extracts pre/post frames and save them to an hdf5 dataset.
    
    Args:
        labels: 20BN labels.
        dataset: List of video ids from the train/val set.
        filename: Name of dataset. The file will be saved as filename.hdf5.
        path: Path of dataset.
    """
    def extract_pre_post_worker(id_video: int):
        if id_video not in video_ranges:
            return

        # Get pre/post video frames.
        path_video = path / f"videos/{id_video}.webm"
        keyframes = list(video_labels[id_video]["frames"].keys())
        pre_frames, post_frames = video_ranges[id_video]
        
        try:
            # [T, H, W, 3]
            pre_images = video_utils.read_video(path_video, pre_frames)
            post_images = video_utils.read_video(path_video, post_frames)
        except:
            return
        if pre_images is None or post_images is None:
            return

        # [T, 16, 3, 4] (num_selected_frames, num_arg_combos, roi/arg_a/arg_b, x1/y1/x2/y2)
        pre_boxes = twentybn.utils.split_bbox_args(pddl, id_video, video_labels, pre_frames)
        post_boxes = twentybn.utils.split_bbox_args(pddl, id_video, video_labels, post_frames)
        
        grp = f.create_group(str(id_video))
        dset_pre_frames = grp.create_dataset("pre_frames", (len(pre_frames),), dtype=int)
        dset_pre_frames[...] = pre_frames
        dset_post_frames = grp.create_dataset("post_frames", (len(post_frames),), dtype=int)
        dset_post_frames[...] = post_frames
        
        dset_pre_images = grp.create_dataset("pre_images", pre_images.shape, dtype=np.uint8, compression="gzip", compression_opts=4)
        dset_pre_images[...] = pre_images
        dset_post_images = grp.create_dataset("post_images", post_images.shape, dtype=np.uint8, compression="gzip", compression_opts=4)
        dset_post_images[...] = post_images
        
        dset_pre_boxes = grp.create_dataset("pre_boxes", pre_boxes.shape, dtype=np.float32)
        dset_pre_boxes[...] = pre_boxes
        dset_post_boxes = grp.create_dataset("post_boxes", post_boxes.shape, dtype=np.float32)
        dset_post_boxes[...] = post_boxes
    
    with h5py.File(path / f"{filename}.hdf5", "w") as f:
        for id_video in tqdm.notebook.tqdm(dataset):
            extract_pre_post_worker(id_video)
        
        collect_written_videos(f, dataset)

#extract_pre_post(train_set[:10000], "pre_post_train_mini", paths.data)
#extract_pre_post(val_set[:10000], "pre_post_val_mini", paths.data)

#create_predicate_dataset(train_set[:10000], "predicate_train_mini", paths.data)
#create_predicate_dataset(val_set[:1000], "predicate_val_mini", paths.data)
#create_predicate_dataset(train_set, "predicate_train", paths.data)
#create_predicate_dataset(val_set, "predicate_val", paths.data)

# Analyze dataset distribution

In [None]:
import math

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tqdm

from gpred import dnf_utils


def plot_predicate_counts(stats: pd.DataFrame):
    """Plots predicates (x) vs. count (y).
    
    Args:
        stats: Longform dataframe output by `compute_pddl_statistics()`.
    """
    f, ax = plt.subplots(figsize=(20, 10))

    sns.set_style("whitegrid")
    g = sns.countplot(data=stats.sort_values("Predicate"), x="Predicate", hue="Label")
    for item in g.get_xticklabels():
        item.set_rotation(90)

def plot_dnfs(stats: pd.DataFrame):
    """Plots a heatmap of actions vs. propositions specified by their DNFs.
    
    Args:
        stats: Longform table output by compute_pddl_statistics().
    """
    SIZE_SECTION = 10
    CMAP = sns.diverging_palette(10, 130, n=100)
    
    df_action_v_prop = stats.astype({"Label": float}).pivot(index=["Action", "Condition"], columns="Proposition", values="Label")
    num_rows = len(df_action_v_prop)
    num_sections = math.ceil(num_rows / SIZE_SECTION)

    f, axs = plt.subplots(num_sections, 1, figsize=(25, num_sections * 5))

    for i in tqdm.notebook.tqdm(range(num_sections)):
        plt.subplot(num_sections, 1, i + 1)
        g = sns.heatmap(data=df_action_v_prop[i*SIZE_SECTION:min(len(df_action_v_prop), (i+1)*SIZE_SECTION)], square=True, cmap=CMAP, linewidths=0.5, linecolor="#eee", cbar_kws={"shrink": 0.5})
        
def plot_predicate_weights(w: np.ndarray):
    """Plots predicates (x) vs. weight (y).
    
    Args:
        stats: Longform dataframe output by `compute_pddl_statistics()`.
    """
    f, ax = plt.subplots(figsize=(20, 10))
    
    df = pd.DataFrame(w.T, columns=["Pos", "Neg"], index=[str(pred) for pred in pddl.predicates])
    df.reset_index(level=0, inplace=True)
    df = pd.melt(df, id_vars=["index"], value_vars=["Pos","Neg"])
    df = df.rename(columns={"index": "Predicate", "variable": "Label", "value": "Weight"})

    sns.set_style("whitegrid")
    g = sns.barplot(data=df.sort_values("Predicate"), x="Predicate", y="Weight", hue="Label")
    for item in g.get_xticklabels():
        item.set_rotation(90)


In [None]:
import h5py

with h5py.File(paths.data / "predicate_val.hdf5","r") as f:
    actions = [str(action) for action in pddl.actions]
    action_instances = [actions[idx_action] for idx_action in f["actions"]]

In [None]:
stats = dnf_utils.compute_pddl_statistics(pddl)

In [None]:
pddl = symbolic.Pddl(str(paths.env / "domain.pddl"), str(paths.problem_pddl))
stats = dnf_utils.compute_pddl_statistics(pddl)

plot_predicate_counts(stats)

In [None]:
w = dnf_utils.compute_predicate_class_weights(pddl, action_instances=action_instances)
plot_predicate_weights(np.minimum(1, w))

In [None]:
w_inv = dnf_utils.compute_predicate_class_weights(pddl)
plot_predicate_weights(w_inv)

In [None]:
plot_dnfs(stats)

### Find video resolution ranges

In [None]:
with h5py.File(paths.data / "pre_post_train.hdf5", "r") as f:
    H_max = 0
    H_min = 10000
    W_max = 0
    W_min = 10000
    for id_video in tqdm.tqdm(f["videos"]):
        dim = f[str(id_video)]["images"].shape[2:]
        H_min = min(H_min, dim[0])
        H_max = max(H_max, dim[0])
        W_min = min(W_min, dim[1])
        W_max = max(W_max, dim[1])

with h5py.File(paths.data / "pre_post_val.hdf5", "r") as f:
    for id_video in tqdm.tqdm(f["videos"]):
        dim = f[str(id_video)]["images"].shape[2:]
        H_min = min(H_min, dim[0])
        H_max = max(H_max, dim[0])
        W_min = min(W_min, dim[1])
        W_max = max(W_max, dim[1])

print(H_max, W_max, H_min, W_min)

### List videos with mismatching placeholders

In [None]:
for id_video, video_label in video_labels.items():
    if len(video_label["objects"]) != len(video_label["placeholders"]):
        if not video_label["id_action"] in (102, 144):
            print(id_video, video_label["id_action"], video_label["objects"], video_label["placeholders"])