In [33]:
import pickle
from glob import glob
from pathlib import Path
import os
from tqdm import tqdm
import pandas as pd
import zarr

from src.models.vision import get_encoder
from src.data.process_demos import encode_demo

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
base_dir = Path(os.environ["FURNITURE_DATA_DIR"])

rollout_dir = base_dir / "raw" / "sim_rollouts"

## Index the raw rollout data

Now done in a standalone script `src.data.index_rollouts`

In [11]:
paths = glob(str(rollout_dir / "**/*.pkl"), recursive=True)

In [12]:
# Make a new index file in this directory specifying which rollouts were successful and for which task
file_path = rollout_dir / "index.csv"

# Check if the file already exists
if file_path.exists():
    print("Index file already exists, exiting")
else:
    print("Creating index file")
    # Create the index file
    with open(file_path, "w") as f:
        f.write("path,furniture,success\n")

Index file already exists, exiting


In [13]:
# Get a set of all the paths that are already in the index file
read_idxs = set(pd.read_csv(file_path)["path"])
remaining_paths = [p for p in paths if p not in read_idxs]
len(read_idxs), len(remaining_paths)

(272, 6728)

In [10]:
# Process all the rollouts not already in the index file
for path in tqdm(remaining_paths):
    with open(path, "rb") as f:
        rollout = pickle.load(f)

    # Check if the rollout was successful
    success = rollout["success"]

    # Get the furniture name
    furniture = rollout["furniture"]

    # Append the path to the index file
    with open(file_path, "a") as f:
        f.write(f"{path},{furniture},{success}\n")

  4%|▍         | 272/7000 [13:00<5:21:37,  2.87s/it]


KeyboardInterrupt: 

## Augment an existing Zarr array with new data from the index

In [26]:
base_dir = Path("/data/scratch/ankile/furniture-data/data")

In [27]:
zarr_path = (
    base_dir
    / "processed"
    / "sim"
    / "feature_separate_small"
    / "vip"
    / "one_leg"
    / "data_aug.zarr"
)

store = zarr.open(str(zarr_path), mode="a")

In [29]:
if "rollout_paths" not in store:
    store.create_dataset("rollout_paths", shape=(0,), dtype=str)

In [36]:
# Read in the index file as a dataframe
index = pd.read_csv(file_path)

index = index[index["success"] == True]

# Get the paths to all the successful rollouts
paths = index["path"].values

# Compare with the paths already in the zarr file
zarr_paths = store["rollout_paths"][:]
paths = [p for p in paths if p not in zarr_paths]

len(paths)

62

In [37]:
# Get an encoder
encoder = get_encoder("vip", freeze=True, device="cuda:0")
batch_size = 1024



In [39]:
# Check length of actions before
store["action"].shape

(259179, 8)

In [55]:
store["episode_ends"] = list(store["episode_ends"]) + [1]

In [None]:
store["episode_ends"]

In [53]:
# Iterate over the paths and add them to the zarr file
end_index = 0

for path in tqdm(paths[:1]):
    with open(path, "rb") as f:
        data = pickle.load(f)

    # Cut off the last observation because it is not used
    # data["observations"] = data["observations"][:-1]
    assert len(data["actions"]) == len(data["observations"]), f"Mismatch in {path}"

    # store["action"].append(data["actions"])
    # store["rewards"].append(data["rewards"])
    # store["skills"].append(data["skills"])
    store["episode_ends"].append(end_index + len(data["actions"]))

    break
    store["episode_ends"].append(end_index := end_index + len(data["actions"]))
    store["furniture"].append(data["furniture"])

    obs = data["observations"]
    demo_robot_states, demo_features1, demo_features2 = encode_demo(
        encoder, batch_size, obs
    )
    store["robot_state"].append(demo_robot_states)
    store["features1"].append(demo_features1)
    store["features2"].append(demo_features2)
    store["rollout_paths"].append(path)

  0%|          | 0/1 [00:00<?, ?it/s]

600
(511,)





IndexError: tuple index out of range

In [None]:
# Check length of actions after
store["action"].shape