diff --git a/.gitignore b/.gitignore
index f4d759f..e8aaad7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,5 @@ build
 dist
 *.egg-info/
 src/teach/analysis/.ipynb_checkpoints/
-pip-wheel-metadata/
\ No newline at end of file
+pip-wheel-metadata/
+*.pyc
\ No newline at end of file
diff --git a/README.md b/README.md
index 7d20cd6..5d70c86 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 Aishwarya Padmakumar*, Jesse Thomason*, Ayush Shrivastava, Patrick Lange, Anjali Narayan-Chen, Spandana Gella, Robinson Piramuthu, Gokhan Tur, Dilek Hakkani-Tur
 
 TEACh is a dataset of human-human interactive dialogues to complete tasks in a simulated household environment.
-The code is licensed under the MIT License (see SOFTWARELICENSE), images are licensed under Apache 2.0
+The code and model weights are licensed under the MIT License (see SOFTWARELICENSE), images are licensed under Apache 2.0
 (see IMAGESLICENSE) and other data files are licensed under CDLA-Sharing 1.0 (see DATALICENSE).
 Please include appropriate licensing and attribution when using our data and code, and please cite our paper.
 
@@ -113,8 +113,263 @@ teach_eval \
     --inference_output_dir $OUTPUT_DIR \
     --split valid_seen \
     --metrics_file $METRICS_FILE
+```
+
+## TEACh Benchmark Challenge
+
+For participation in the challenge, you will need to submit a Docker image containing your code and model.
+Docker containers using your image will serve your model as an HTTP API following the [TEACh API Specification](#teach-api-specification).
+For your convenience, we have included the `teach_api` command, which implements this API and is compatible with models implementing `teach.inference.teach_model.TeachModel`, the same interface used by `teach_inference`.
+
+We have also included two sample Docker images, using `teach.inference.sample_model.SampleModel` and `teach.inference.et_model.ETModel` respectively, in
+[`docker/`](./docker).
+
+When evaluating a submission, the submitted container will be started with access to a single GPU and no internet access. For details see [Step 3 - Start your container](#step-3---start-your-container).
+
+The main evaluation code invoking your submission will also be run as a Docker container. It reuses the `teach_inference` CLI command together with `teach.inference.remote_model.RemoteModel` to call the HTTP API running in your container. For details on how to start it locally see [Step 4 - Start the evaluation](#step-4---start-the-evaluation).
+
+### Testing Locally
+
+The following steps assume you have [downloaded the data](#downloading-the-dataset) to `/home/ubuntu/teach-dataset` and followed [Prerequisites](#prerequisites) and [Remote Server Setup](#remote-server-setup).
+
+
+#### Step 0 - Setup Environment
+
+```buildoutcfg
+export HOST_DATA_DIR=/home/ubuntu/teach-dataset
+export HOST_IMAGES_DIR=/home/ubuntu/images
+export HOST_OUTPUT_DIR=/home/ubuntu/output
+export API_PORT=5000
+export SUBMISSION_PK=168888
+export INFERENCE_GPUS='"device=0"'
+export API_GPUS='"device=1"'
+export SPLIT=valid_seen
+export DOCKER_NETWORK=no-internet
+
+mkdir -p $HOST_IMAGES_DIR $HOST_OUTPUT_DIR
+docker network create --driver=bridge --internal $DOCKER_NETWORK
+```
+Note: If you run on a machine that only has a single GPU, set `API_GPUS='"device=0"'`.
+
+#### Step 1 - Build the `remote-inference-runner` container
+
+```buildoutcfg
+docker build -t remote-inference-runner -f docker/Dockerfile.RemoteInferenceRunner .
+```
+
+#### Step 2 - Build your container
+
+Note: When customizing the images for your own usage, do not edit the following or your submission will fail:
+- `teach_api` options: `--data_dir /data --images_dir /images --split $SPLIT`
+- `EXPOSE 5000` and don't change the port the flask API listens on
+
+For the `SampleModel` example, the corresponding command is:
+
+```buildoutcfg
+docker build -t teach-model-api-samplemodel -f docker/Dockerfile.TEAChAPI-SampleModel .
+```
+
+For the baseline models, use the corresponding command below, replacing `MODEL_VARIANT=et` with
+the desired variant, e.g. `et_plus_a`.
+
+```buildoutcfg
+mkdir -p ./models
+mv $HOST_DATA_DIR/baseline_models ./models/
+mv $HOST_DATA_DIR/et_pretrained_models ./models/
+docker build --build-arg MODEL_VARIANT=et -t teach-model-api-etmodel -f docker/Dockerfile.TEAChAPI-ETModel .
+```
+
+#### Step 3 - Start your container
+
+For the `SampleModel` example, the corresponding command is:
+
+```buildoutcfg
+docker run -d --rm \
+    --gpus $API_GPUS \
+    --name TeachModelAPI \
+    --network $DOCKER_NETWORK \
+    -e SPLIT=$SPLIT \
+    -v $HOST_DATA_DIR:/data:ro \
+    -v $HOST_IMAGES_DIR/$SUBMISSION_PK:/images:ro \
+    -t teach-model-api-samplemodel
+```
+
+For the baseline models, just replace the image name, e.g. if you followed the commands above:
+
+```buildoutcfg
+docker run -d --rm \
+    --gpus $API_GPUS \
+    --name TeachModelAPI \
+    --network $DOCKER_NETWORK \
+    -e SPLIT=$SPLIT \
+    -v $HOST_DATA_DIR:/data:ro \
+    -v $HOST_IMAGES_DIR/$SUBMISSION_PK:/images:ro \
+    -t teach-model-api-etmodel
+```
+
+Verify the API is running with:
+
+```buildoutcfg
+docker exec TeachModelAPI curl @TeachModelAPI:5000/ping
+
+Output:
+{"action":"Look Up","obj_relative_coord":[0.1,0.2]}
+```
+
+#### Step 4 - Start the evaluation
+
+```buildoutcfg
+docker run --rm \
+    --privileged \
+    -e DISPLAY=:0 \
+    -e NVIDIA_DRIVER_CAPABILITIES=all \
+    --name RemoteInferenceRunner \
+    --network $DOCKER_NETWORK \
+    --gpus $INFERENCE_GPUS \
+    -v /tmp/.X11-unix:/tmp/.X11-unix:ro \
+    -v $HOST_DATA_DIR:/data:ro \
+    -v $HOST_IMAGES_DIR/$SUBMISSION_PK:/images \
+    -v $HOST_OUTPUT_DIR/$SUBMISSION_PK:/output \
+    remote-inference-runner teach_inference \
+    --data_dir /data \
+    --output_dir /output \
+    --images_dir /images \
+    --split $SPLIT \
+    --metrics_file /output/metrics_file \
+    --model_module teach.inference.remote_model \
+    --model_class RemoteModel \
+    --model_api_host_and_port "@TeachModelAPI:$API_PORT"
+```
+
+#### Step 5 - Results
+
+The evaluation metrics will be in `$HOST_OUTPUT_DIR/$SUBMISSION_PK/metrics_file`.
+Images for each episode will be in `$HOST_IMAGES_DIR/$SUBMISSION_PK`.
+
+### Running without docker
+
+You may want to test your implementation without rebuilding Docker images. You can do so by directly calling the `teach_api` CLI command, e.g.
+
+Using the `teach.inference.sample_model.SampleModel`:
+
+```buildoutcfg
+export DATA_DIR=/home/ubuntu/teach-dataset
+export IMAGE_DIR=/tmp/images
+
+teach_api \
+    --data_dir $DATA_DIR \
+    --images_dir $IMAGE_DIR
+```
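+
+Once the API is up, you can check that it is reachable by querying the `/ping` route defined in `src/teach/cli/api.py`. A minimal sketch, assuming the API is running locally on its default port 5000 and that the `requests` package is available:
+
+```python
+import requests
+
+# The /ping route returns a dummy action, confirming the server is reachable.
+resp = requests.get("http://localhost:5000/ping")
+print(resp.status_code)  # expected: 200
+print(resp.json())       # expected: {'action': 'Look Up', 'obj_relative_coord': [0.1, 0.2]}
+```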
+
+Using `teach.inference.et_model.ETModel`, assuming you have already moved the models from the teach-dataset location to
+`./models` following the instructions in [Step 2 - Build your container](#step-2---build-your-container):
+
+```buildoutcfg
+export DATA_DIR=/home/ubuntu/teach-dataset
+export IMAGE_DIR=/tmp/images
+
+teach_api \
+    --data_dir $DATA_DIR \
+    --images_dir $IMAGE_DIR \
+    --split valid_seen \
+    --model_module teach.inference.et_model \
+    --model_class ETModel \
+    --model_dir ./models/baseline_models/et \
+    --visual_checkpoint ./models/et_pretrained_models/fasterrcnn_model.pth \
+    --object_predictor ./models/et_pretrained_models/maskrcnn_model.pth \
+    --seed 4
+```
+
+The corresponding command for running `teach_inference` against such an API
+without a container uses `teach.inference.remote_model.RemoteModel`:
+
+```buildoutcfg
+export DATA_DIR=/home/ubuntu/teach-dataset
+export OUTPUT_DIR=/home/ubuntu/output/valid_seen
+export METRICS_FILE=/home/ubuntu/output/valid_seen/metrics
+export IMAGE_DIR=/tmp/images
+
+teach_inference \
+    --data_dir $DATA_DIR \
+    --output_dir $OUTPUT_DIR \
+    --split valid_seen \
+    --metrics_file $METRICS_FILE \
+    --model_module teach.inference.remote_model \
+    --model_class RemoteModel \
+    --model_api_host_and_port 'localhost:5000' \
+    --images_dir $IMAGE_DIR
+```
+
+### Smaller split
+
+For faster turnaround times, it may be useful to locally create a smaller split in `$DATA_DIR/edh_instances/test_seen`
+with a handful of files from `$DATA_DIR/edh_instances/valid_seen`.
+
+### Runtime Checks
+
+The TEACh Benchmark Challenge places a maximum time limit of 36 hours when using all GPUs of a `p3.16xlarge` instance.
+The best way to verify that your code is likely to satisfy this requirement would be to use a script to run two Docker evaluation processes in sequence on a `p3.16xlarge` EC2 instance, one for the `valid_seen` split and one for the `valid_unseen` split.
+Note that you will need to specify `export API_GPUS='"device=1,2,3,4,5,6,7"'` (we reserve GPU 0 for `ai2thor` in our runs) to use all GPUs, and your model code will need to place different instances of the model on different GPUs for this test (see the use of `process_index` in `ETModel.set_up_model()` for an example).
+Also note that while the test splits are close in size to the validation splits, they are not identical, so your runtime estimate will necessarily be an approximation.
+
+### TEACh API Specification
+
+As mentioned above, `teach_api` already implements this API and it is usually not necessary to implement it yourself. During evaluation of submissions, EDH instances without ground truth, along with the images corresponding to the EDH instances' histories, will be available in `/data`. `/images` will contain images produced during inference at runtime. `teach_api` already handles loading these and passes them to your implementation of `teach.inference.teach_model.TeachModel`.
+
+#### Start EDH Instance
+
+This endpoint will be called once at the start of processing a new EDH instance. Currently, we ensure that the API processes only a single EDH instance from start to finish, i.e. once called, it can be assumed that the previous EDH instance has completed.
+
+URL : `/start_new_edh_instance`
+Method : `POST`
+Payload:
+
+```json
+{
+    "edh_name": "[name of the EDH instance file]"
+}
+```
+
+Responses:
+
+Status Code: `200`
+Response: `success`
+
+Status Code: `500`
+Response: `[error message]`
+
+
+#### Get next action
+
+This endpoint will be called at each timestep during inference to get the next predicted action from the model.
+ +URL : `/get_next_action` +Method : `POST` +Payload: + +```json +{ + "edh_name": "[name of the EDH instance file]", + "img_name": "[name of the image taken in the simulator after the previous action]", + "prev_action": "[JSON string representation of previous action]", // this is optional +} +``` + +Responses: + +Status Code: `200` + +```json +{ + "action": "[An action name from all_agent_actions]", + "obj_relative_coord": [0.1, 0.5] // see teach.inference.teach_model.TeachModel.get_next_action +} +``` + +Status Code: `500` +Response: `[error message]` + ## Security See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. @@ -123,3 +378,5 @@ See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more inform The code is licensed under the MIT License (see SOFTWARELICENSE), images are licensed under Apache 2.0 (see IMAGESLICENSE) and other data files are licensed under CDLA-Sharing 1.0 (see DATALICENSE). + + diff --git a/docker/Dockerfile.RemoteInferenceRunner b/docker/Dockerfile.RemoteInferenceRunner new file mode 100644 index 0000000..91aa91a --- /dev/null +++ b/docker/Dockerfile.RemoteInferenceRunner @@ -0,0 +1,36 @@ +FROM ubuntu:18.04 + +# install python3.8 +RUN apt update && \ + apt install --no-install-recommends -y build-essential software-properties-common && \ + add-apt-repository -y ppa:deadsnakes/ppa && \ + apt install --no-install-recommends -y python3.8 python3.8-dev python3.8-distutils && \ + apt clean && rm -rf /var/lib/apt/lists/* + +# register the version in alternatives and set higher priority to 3.8 +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 2 + +RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y --no-install-recommends \ + ffmpeg \ + vim \ + curl + +# upgrade pip to latest version +RUN curl -s https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ + python3 get-pip.py --force-reinstall && \ + rm get-pip.py + +COPY ./requirements.txt ./requirements.txt +RUN pip install -r requirements.txt + +# copy all source code into the image +COPY . . + +ENV PYTHONPATH /src +RUN pip install -e . + +# Download AI2Thor executable +RUN python3 -c "from teach.settings import get_settings; from teach.simulators.simulator_THOR import COMMIT_ID, TEAChController; TEAChController(base_dir=get_settings().AI2THOR_BASE_DIR, download_only=True, commit_id=COMMIT_ID);" + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/docker/Dockerfile.TEAChAPI-ETModel b/docker/Dockerfile.TEAChAPI-ETModel new file mode 100644 index 0000000..bac456b --- /dev/null +++ b/docker/Dockerfile.TEAChAPI-ETModel @@ -0,0 +1,29 @@ +FROM python:3.8 +ARG MODEL_VARIANT et + +# download model files into et_models and copy over +RUN mkdir -p et_models +COPY models/baseline_models/$MODEL_VARIANT et_models + +# Download Faster RCNN and Mask RCNN and copy over +RUN mkdir -p et_pretrained_models +COPY models/et_pretrained_models et_pretrained_models + +# upgrade pip to latest version +RUN curl -s https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ + python3 get-pip.py --force-reinstall && \ + rm get-pip.py + +COPY ./requirements.txt ./requirements.txt +RUN pip install -r requirements.txt +COPY . . +RUN pip install -e . 
+EXPOSE 5000 + +ENV ET_ROOT=/src/teach/modeling/ET/ +ENV PYTHONPATH=/src:$ET_ROOT +ENV SPLIT=valid_seen + +CMD teach_api --model_module teach.inference.et_model --model_class ETModel --data_dir /data --images_dir /images \ +--split $SPLIT --visual_checkpoint /et_pretrained_models/fasterrcnn_model.pth \ +--object_predictor /et_pretrained_models/maskrcnn_model.pth --model_dir /et_models diff --git a/docker/Dockerfile.TEAChAPI-SampleModel b/docker/Dockerfile.TEAChAPI-SampleModel new file mode 100644 index 0000000..4d10ee8 --- /dev/null +++ b/docker/Dockerfile.TEAChAPI-SampleModel @@ -0,0 +1,16 @@ +FROM python:3.8 + +# upgrade pip to latest version +RUN curl -s https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ + python3 get-pip.py --force-reinstall && \ + rm get-pip.py + +COPY ./requirements.txt ./requirements.txt +RUN pip install -r requirements.txt +COPY . . +RUN pip install -e . +EXPOSE 5000 + +ENV SPLIT=valid_seen + +CMD teach_api --model_module teach.inference.sample_model --model_class SampleModel --data_dir /data --images_dir /images --split $SPLIT --seed 4 diff --git a/requirements.txt b/requirements.txt index 50c4be8..1998244 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,27 @@ pydub==0.24.1 python-Levenshtein tqdm pillow -pydantic==1.8.2 \ No newline at end of file +pydantic==1.8.2 +numpy +pandas +opencv-python +vocab +revtok +Pillow +sacred +etaprogress +scikit-video +lmdb +gtimer +filelock +termcolor +torch==1.7.1 +torchvision==0.8.2 +tensorboardX==1.8 +Flask +flask_restful +future +pandoc +six +typing +ConfigArgParse \ No newline at end of file diff --git a/setup.py b/setup.py index 14e82e6..63584e3 100644 --- a/setup.py +++ b/setup.py @@ -26,10 +26,11 @@ "teach_eval = teach.cli.eval:main", "teach_inference = teach.cli.inference:main", "teach_replay = teach.cli.replay:main", + "teach_api = teach.cli.api:main", ] }, include_package_data=True, - python_requires=">=3.6", + python_requires=">=3.7", zip_safe=False, install_requires=[ "ai2thor==3.1.0", diff --git a/src/teach/cli/api.py b/src/teach/cli/api.py new file mode 100644 index 0000000..1a4f35b --- /dev/null +++ b/src/teach/cli/api.py @@ -0,0 +1,221 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +import json +import os +from argparse import ArgumentParser +from os.path import isfile + +from flask import Flask, jsonify, request +from flask_restful import reqparse +from PIL import Image + +from teach.utils import dynamically_load_class, load_images + +app = Flask(__name__) +app.config["JSONIFY_PRETTYPRINT_REGULAR"] = False +app.logger.info("initialize flask server") + + +def parse_args(): + arg_parser = ArgumentParser() + arg_parser.add_argument( + "--data_dir", + type=str, + required=True, + help='Base data directory containing subfolders "games" and "edh_instances', + ) + arg_parser.add_argument( + "--images_dir", + type=str, + required=True, + help="Images directory containing inference image output", + ) + arg_parser.add_argument( + "--split", + type=str, + default="valid_seen", + choices=["train", "valid_seen", "valid_unseen", "test_seen", "test_unseen"], + help="One of train, valid_seen, valid_unseen, test_seen, test_unseen", + ) + arg_parser.add_argument( + "--model_module", + type=str, + default="teach.inference.sample_model", + help="Path of the python module to load the model class from.", + ) + arg_parser.add_argument( + "--model_class", type=str, default="SampleModel", help="Name of the TeachModel class to use during inference." 
+ ) + arg_parser.add_argument( + "--use_edh_file", dest="use_edh_file", action="store_true", help="Use edh file instead of request json." + ) + arg_parser.add_argument( + "--use_img_file", dest="use_img_file", action="store_true", help="Use img file instead of request bytes." + ) + return arg_parser.parse_known_args() + + +teach_args, model_args = parse_args() +model_class = dynamically_load_class(teach_args.model_module, teach_args.model_class) +process_index, num_processes = 1, 1 +model = model_class(process_index, num_processes, model_args=model_args) + + +def _get_edh_instance(req_args): + if teach_args.use_edh_file: + if not req_args.edh_name: + return None, "request parameter edh_name does not have a value" + edh_instance_path = os.path.join(teach_args.data_dir, "edh_instances", teach_args.split, req_args.edh_name) + if not isfile(edh_instance_path): + return None, f"edh file={edh_instance_path} does not exist" + with open(edh_instance_path) as handle: + edh_instance = json.load(handle) + else: + edh_instance = json.loads(req_args.edh_instance) + return edh_instance, None + + +def _get_img(req_args): + if not req_args.img_name: + return None, "request parameter img_name does not have a value" + if teach_args.use_img_file: + img_path = os.path.join(teach_args.images_dir, req_args.img_name) + if not isfile(img_path): + return None, f"image file={img_path} does not exist" + img = Image.open(img_path) + else: + img_file = request.files.get("img") + if not img_file: + return None, f"image is not set in request with key='img'" + img = Image.open(img_file) + return img, None + + +def _get_edh_history_images(edh_name, edh_instance): + edh_history_images = [] + history_file_names = edh_instance["driver_image_history"] + if not history_file_names: + return edh_history_images, None + + try: + if not teach_args.use_img_file: + images = request.files.getlist("edh_history_images") + if images: + for img in images: + edh_history_images.append(Image.open(img)) + + if not edh_history_images: + image_dir = os.path.join(teach_args.data_dir, "images", teach_args.split, edh_instance["game_id"]) + edh_history_images = load_images(image_dir, history_file_names) + + except Exception: + err_msg = f"failed to load history images edh_name={edh_name}" + app.logger.error(err_msg, exc_info=True) + return None, err_msg + + if not edh_history_images: + err_msg = f"history images are empty for edh_name={edh_name} for history_file_names={history_file_names}" + app.logger.error(err_msg) + return None, err_msg + + return edh_history_images, None + + +@app.route("/get_next_action", methods=["POST"]) +def get_next_action(): + req_args = get_next_action_parse_args() + edh_instance, err_msg = _get_edh_instance(req_args) + if err_msg: + return err_msg, 500 + img, err_msg = _get_img(req_args) + if err_msg: + return err_msg, 500 + prev_action = json.loads(req_args.prev_action) if req_args.prev_action else None + try: + action, obj_relative_coord = model.get_next_action(img, edh_instance, prev_action) + except Exception as e: + err_msg = f"failed to get_next_action with edh_name={req_args.edh_name}" + app.logger.error(err_msg, exc_info=True) + return err_msg, 500 + app.logger.debug(f"model.get_next_action returns action={action}, obj_relative_coord={obj_relative_coord}") + resp = jsonify(action=action, obj_relative_coord=obj_relative_coord) + return resp, 200 + + +@app.route("/start_new_edh_instance", methods=["POST"]) +def start_new_edh_instance(): + req_args = start_new_edh_instance_parse_args() + 
app.logger.info(f"start_new_edh_instance with edh_name={req_args.edh_name}") + edh_instance, err_msg = _get_edh_instance(req_args) + if err_msg: + return err_msg, 500 + edh_history_images, err_msg = _get_edh_history_images(req_args.edh_name, edh_instance) + if err_msg: + return err_msg, 500 + try: + model.start_new_edh_instance(edh_instance, edh_history_images) + except Exception as e: + err_msg = f"failed to start_new_edh_instance with edh_name={req_args.edh_name}" + app.logger.error(err_msg, exc_info=True) + return err_msg, 500 + return "success", 200 + + +@app.route("/") +@app.route("/ping") +@app.route("/test") +def test(): + resp = jsonify(action="Look Up", obj_relative_coord=[0.1, 0.2]) + return resp, 200 + + +def get_next_action_parse_args(): + parser = reqparse.RequestParser() + parser.add_argument( + "img_name", + type=str, + help="Image name for PIL Image containing agent's egocentric image.", + ) + parser.add_argument( + "edh_name", + type=str, + help="EDH instance file name.", + ) + parser.add_argument( + "prev_action", + type=str, + help="One of None or a dict with keys 'action' and 'obj_relative_coord' containing returned values.", + ) + parser.add_argument( + "edh_instance", + type=str, + help="One of None or a dict with keys 'action' and 'obj_relative_coord' containing returned values.", + ) + args = parser.parse_args() + return args + + +def start_new_edh_instance_parse_args(): + parser = reqparse.RequestParser() + parser.add_argument( + "edh_name", + type=str, + help="EDH instance file name.", + ) + parser.add_argument( + "edh_instance", + type=str, + help="One of None or a dict with keys 'action' and 'obj_relative_coord' containing returned values.", + ) + args = parser.parse_args() + return args + + +def main(): + app.run(host="0.0.0.0", port=5000) + app.logger.info("started flask server") + + +if __name__ == "__main__": + main() diff --git a/src/teach/cli/download.py b/src/teach/cli/download.py index fd1dacb..0053a48 100644 --- a/src/teach/cli/download.py +++ b/src/teach/cli/download.py @@ -10,6 +10,7 @@ import boto3 from botocore import UNSIGNED from botocore.config import Config +from tqdm import tqdm DEFAULT_DATASET_BUCKET_NAME = "teach-dataset" DEFAULT_DIRECTORY = "/tmp/teach-dataset" @@ -19,9 +20,26 @@ "experiment_games.tar.gz", "images_and_states.tar.gz", "tfd_instances.tar.gz", + "baseline_models.tar.gz", + "et_pretrained_models.tar.gz", ] +def update_download_progressbar(t): + def inner(bytes_amount): + t.update(bytes_amount) + + return inner + + +def download_with_progressbar(s3_resource, bucket_name, key, directory): + file_object = s3_resource.Object(bucket_name=bucket_name, key=key) + total_file_size = file_object.content_length + bucket = s3_resource.Bucket(bucket_name) + with tqdm(total=total_file_size, unit="B", unit_scale=True, desc=key) as t: + bucket.download_file(Key=key, Filename=f"{directory}/{key}", Callback=update_download_progressbar(t)) + + def download_dataset(directory, key=None, bucket_name=DEFAULT_DATASET_BUCKET_NAME): """ Download file from the S3 bucket to the target directory. 
@@ -31,33 +49,39 @@ def download_dataset(directory, key=None, bucket_name=DEFAULT_DATASET_BUCKET_NAM if not os.path.exists(directory): os.makedirs(directory) s3_resource = boto3.resource("s3", region_name="us-east-1", config=Config(signature_version=UNSIGNED)) - bucket = s3_resource.Bucket(bucket_name) if key: print(f"Downloading s3://{bucket_name}/{key} to {directory}") - bucket.download_file(Key=key, Filename=f"{directory}/{key}") + download_with_progressbar(s3_resource, bucket_name, key, directory) else: for file_name in FILE_LIST: print(f"Downloading s3://{bucket_name}/{file_name} to {directory}") - bucket.download_file(Key=file_name, Filename=f"{directory}/{file_name}") + download_with_progressbar(s3_resource, bucket_name, file_name, directory) except Exception as e: print(f"Exception reading from: {bucket_name}") print(f"Exception: {str(e)}") +def extract_all_with_progress(archive, directory): + members = archive.getmembers() + for member in tqdm(iterable=members, total=len(members)): + archive.extract(member=member, path=directory) + + def extract_dataset(directory, file_name=None): """ Extract extract archive file(s) in the given directory. """ print(f"Extracting dataset to {directory}") if file_name: + print(f"Extracting file: {file_name}") with tarfile.open(os.path.join(directory, file_name)) as archive: - archive.extractall(directory) - print(f"Extracted file: {file_name}") + extract_all_with_progress(archive, directory) + else: for file_name in FILE_LIST: + print(f"Extracting file: {file_name}") with tarfile.open(os.path.join(directory, file_name)) as archive: - archive.extractall(directory) - print(f"Extracted file: {file_name}") + extract_all_with_progress(archive, directory) def process_arguments(): @@ -105,7 +129,7 @@ def main(): print("Input directory:", directory) print("Input skip-extract:", skip_extract) - print("Input skip-download:", skip_extract) + print("Input skip-download:", skip_download) print("Input file:", file_name) if not skip_download: diff --git a/src/teach/cli/eval.py b/src/teach/cli/eval.py index 16cf4bf..65f764a 100644 --- a/src/teach/cli/eval.py +++ b/src/teach/cli/eval.py @@ -36,8 +36,8 @@ def main(): "--split", type=str, default="valid_seen", - choices=["train", "valid_seen", "valid_unseen"], - help="One of train, valid_seen, valid_unseen", + choices=["train", "valid_seen", "valid_unseen", "test_seen", "test_unseen"], + help="One of train, valid_seen, valid_unseen, test_seen, test_unseen", ) arg_parser.add_argument( "--max_init_tries", @@ -105,7 +105,7 @@ def main(): results["traj_stats"] = traj_stats with open(args.metrics_file, "w") as h: - json.dump(traj_stats, h) + json.dump(results, h) if __name__ == "__main__": diff --git a/src/teach/cli/inference.py b/src/teach/cli/inference.py index 57fe9fd..5675889 100644 --- a/src/teach/cli/inference.py +++ b/src/teach/cli/inference.py @@ -2,11 +2,12 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
# SPDX-License-Identifier: MIT-0 - -import argparse +import glob import json +import multiprocessing as mp import os from argparse import ArgumentParser +from datetime import datetime from teach.eval.compute_metrics import aggregate_metrics from teach.inference.inference_runner import InferenceRunner, InferenceRunnerConfig @@ -24,6 +25,18 @@ def main(): required=True, help='Base data directory containing subfolders "games" and "edh_instances', ) + arg_parser.add_argument( + "--images_dir", + type=str, + required=True, + help="Images directory for episode replay output", + ) + arg_parser.add_argument( + "--use_img_file", + dest="use_img_file", + action="store_true", + help="synchronous save images with model api use the image file instead of streaming image", + ) arg_parser.add_argument( "--output_dir", type=str, @@ -34,8 +47,8 @@ def main(): "--split", type=str, default="valid_seen", - choices=["train", "valid_seen", "valid_unseen"], - help="One of train, valid_seen, valid_unseen", + choices=["train", "valid_seen", "valid_unseen", "test_seen", "test_unseen"], + help="One of train, valid_seen, valid_unseen, test_seen, test_unseen", ) arg_parser.add_argument( "--edh_instance_file", @@ -72,33 +85,49 @@ def main(): "--model_class", type=str, default="SampleModel", help="Name of the TeachModel class to use during inference." ) arg_parser.add_argument( - "model_args", nargs=argparse.REMAINDER, help="Any unknown arguments will be captured and passed to the model" + "--replay_timeout", type=int, default=500, help="The timeout for playing back the interactions in an episode." ) - args = arg_parser.parse_args() + + start_time = datetime.now() + args, model_args = arg_parser.parse_known_args() if args.edh_instance_file: edh_instance_files = [args.edh_instance_file] else: + inference_output_files = glob.glob(os.path.join(args.output_dir, "inference__*.json")) + finished_edh_instance_files = [os.path.join(fn.split("__")[1]) for fn in inference_output_files] edh_instance_files = [ os.path.join(args.data_dir, "edh_instances", args.split, f) for f in os.listdir(os.path.join(args.data_dir, "edh_instances", args.split)) + if f not in finished_edh_instance_files ] + if not edh_instance_files: + print( + f"all the edh instances have been ran for input_dir={os.path.join(args.data_dir, 'edh_instances', args.split)}" + ) + exit(1) runner_config = InferenceRunnerConfig( data_dir=args.data_dir, split=args.split, output_dir=args.output_dir, + images_dir=args.images_dir, metrics_file=args.metrics_file, num_processes=args.num_processes, max_init_tries=args.max_init_tries, max_traj_steps=args.max_traj_steps, max_api_fails=args.max_api_fails, model_class=dynamically_load_class(args.model_module, args.model_class), - model_args=args.model_args, + replay_timeout=args.replay_timeout, + model_args=model_args, + use_img_file=args.use_img_file, ) runner = InferenceRunner(edh_instance_files, runner_config) metrics = runner.run() + inference_end_time = datetime.now() + logger.info("Time for inference: %s" % str(inference_end_time - start_time)) + results = aggregate_metrics(metrics, args) print("-------------") print( @@ -121,10 +150,17 @@ def main(): print("PLW GC: %.3f" % (results["path_length_weighted_goal_condition_success_rate"])) print("-------------") - results["traj_metrics"] = metrics + results["traj_stats"] = metrics with open(args.metrics_file, "w") as h: - json.dump(metrics, h) + json.dump(results, h) + + end_time = datetime.now() + logger.info("Total time for inference and evaluation: %s" % str(end_time - 
start_time)) if __name__ == "__main__": + # Using spawn method, parent process creates a new and independent child process, + # which avoid sharing unnecessary resources. + # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods + mp.set_start_method("spawn", force=True) main() diff --git a/src/teach/eval/compute_metrics.py b/src/teach/eval/compute_metrics.py index 2818303..287c87b 100644 --- a/src/teach/eval/compute_metrics.py +++ b/src/teach/eval/compute_metrics.py @@ -14,10 +14,14 @@ def evaluate_traj(success, edh_instance, traj_len, final_gc_total, final_gc_sati edh_instance["expected_init_goal_conditions_total"], edh_instance["expected_init_goal_conditions_satisfied"] ) final_gc_satisfied = min(final_gc_total, final_gc_satisfied) - goal_condition_success_rate = 1.0 - ( - (final_gc_total - final_gc_satisfied) - / (edh_instance["expected_init_goal_conditions_total"] - init_gc_satisfied) - ) + + total_goal_conditions = edh_instance["expected_init_goal_conditions_total"] - init_gc_satisfied + # TODO: Remove this after testing and recheck EDH instances to remove any where there is nothing to do + if total_goal_conditions != 0: + unsatisfied_goal_conditions = final_gc_total - final_gc_satisfied + goal_condition_success_rate = 1.0 - (unsatisfied_goal_conditions / total_goal_conditions) + else: + goal_condition_success_rate = 1 # SPL gt_path_len = len(edh_instance["driver_actions_future"]) @@ -35,7 +39,7 @@ def evaluate_traj(success, edh_instance, traj_len, final_gc_total, final_gc_sati "path_len_weighted_success_spl": float(plw_s_spl), "goal_condition_spl": float(pc_spl), "path_len_weighted_goal_condition_spl": float(plw_pc_spl), - "path_len_weight": int(gt_path_len), + "gt_path_len": int(gt_path_len), "success": int(success), "traj_len": int(traj_len), } @@ -135,6 +139,7 @@ def load_traj_metrics(output_file, pred_actions_file, args): traj_metrics = create_new_traj_metrics(edh_instance) traj_metrics["game_id"] = edh_instance["game_id"] traj_metrics["instance_id"] = edh_instance["instance_id"] + traj_metrics["gt_path_len"] = len(edh_instance["driver_actions_future"]) traj_metrics.update( evaluate_traj( success, edh_instance, len(pred_actions), final_goal_conditions_total, final_goal_conditions_satisfied diff --git a/src/teach/inference/et_model.py b/src/teach/inference/et_model.py new file mode 100644 index 0000000..925f329 --- /dev/null +++ b/src/teach/inference/et_model.py @@ -0,0 +1,188 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 + +import argparse +import os +from pathlib import Path +from typing import List + +import numpy as np +import torch +from alfred import constants +from alfred.data import GuidesEdhDataset +from alfred.data.preprocessor import Preprocessor +from alfred.utils import data_util, eval_util, model_util + +from teach.inference.actions import obj_interaction_actions +from teach.inference.teach_model import TeachModel +from teach.logger import create_logger + +logger = create_logger(__name__) + + +class ETModel(TeachModel): + """ + Wrapper around ET Model for inference + """ + + def __init__(self, process_index: int, num_processes: int, model_args: List[str]): + """Constructor + + :param process_index: index of the eval process that launched the model + :param num_processes: total number of processes launched + :param model_args: extra CLI arguments to teach_eval will be passed along to the model + """ + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=1, help="Random seed") + parser.add_argument("--device", type=str, default="cuda", help="cpu or cuda") + parser.add_argument("--model_dir", type=str, required=True, help="Model folder name under $ET_LOGS") + parser.add_argument("--checkpoint", type=str, default="latest.pth", help="latest.pth or model_**.pth") + parser.add_argument("--object_predictor", type=str, required=True, help="Path to MaskRCNN model checkpoint") + parser.add_argument("--visual_checkpoint", type=str, required=True, help="Path to FasterRCNN model checkpoint") + parser.add_argument( + "--skip_edh_history", + action="store_true", + default=False, + help="Specify this to ignore actions and image frames in EDH history", + ) + + args = parser.parse_args(model_args) + args.dout = args.model_dir + self.args = args + + logger.info("ETModel using args %s" % str(args)) + np.random.seed(args.seed) + + self.et_model_args = None + self.object_predictor = None + self.model = None + self.extractor = None + self.vocab = None + self.preprocessor = None + self.set_up_model(process_index) + + self.input_dict = None + self.cur_edh_instance = None + + def set_up_model(self, process_index): + os.makedirs(self.args.dout, exist_ok=True) + model_path = os.path.join(self.args.model_dir, self.args.checkpoint) + logger.info("Loading model from %s" % model_path) + + self.et_model_args = model_util.load_model_args(model_path) + dataset_info = data_util.read_dataset_info_for_inference(self.args.model_dir) + train_data_name = self.et_model_args.data["train"][0] + train_vocab = data_util.load_vocab_for_inference(self.args.model_dir, train_data_name) + + self.object_predictor = eval_util.load_object_predictor(self.args) + if model_path is not None: + torch.cuda.empty_cache() + gpu_count = torch.cuda.device_count() + logger.info(f"gpu_count: {gpu_count}") + device = f"cuda:{process_index % gpu_count}" if self.args.device == "cuda" else self.args.device + self.args.device = device + logger.info(f"Loading model agent using device: {device}") + self.model, self.extractor = eval_util.load_agent(model_path, dataset_info, self.args, for_inference=True) + + self.vocab = {"word": train_vocab["word"], "action_low": self.model.vocab_out} + self.preprocessor = Preprocessor(vocab=self.vocab) + + def start_new_edh_instance(self, edh_instance, edh_history_images, edh_name=None): + self.model.reset() + + self.cur_edh_instance = data_util.process_traj( + edh_instance, Path(os.path.join("test", edh_instance["instance_id"])), 0, self.preprocessor + ) + 
feat_numpy = {"lang": GuidesEdhDataset.load_lang(self.cur_edh_instance)} + _, self.input_dict, _ = data_util.tensorize_and_pad( + [(self.cur_edh_instance, feat_numpy)], self.args.device, constants.PAD + ) + + if not self.args.skip_edh_history and edh_history_images is not None and len(edh_history_images) > 0: + img_features = self.extractor.featurize(edh_history_images, batch=32) + self.model.frames_traj = img_features + self.model.frames_traj = torch.unsqueeze(self.model.frames_traj, dim=0) + self.model.action_traj = torch.tensor( + [ + self.vocab["action_low"].word2index(action["action_name"]) + for action in edh_instance["driver_action_history"] + ], + device=self.args.device, + ) + self.model.action_traj = torch.unsqueeze(self.model.action_traj, 0) + + def get_next_action(self, img, edh_instance, prev_action, img_name=None, edh_name=None): + """ + Sample function producing random actions at every time step. When running model inference, a model should be + called in this function instead. + :param img: PIL Image containing agent's egocentric image + :param edh_instance: EDH instance + :param prev_action: One of None or a dict with keys 'action' and 'obj_relative_coord' containing returned values + from a previous call of get_next_action + :param img_name: image file name + :param edh_name: EDH instance file name + :return action: An action name from all_agent_actions + :return obj_relative_coord: A relative (x, y) coordinate (values between 0 and 1) indicating an object in the image; + The TEACh wrapper on AI2-THOR examines the ground truth segmentation mask of the agent's egocentric image, selects + an object in a 10x10 pixel patch around the pixel indicated by the coordinate if the desired action can be + performed on it, and executes the action in AI2-THOR. 
+ """ + img_feat = self.extractor.featurize([img], batch=1) + self.input_dict["frames"] = img_feat + + with torch.no_grad(): + prev_api_action = None + if prev_action is not None and "action" in prev_action: + prev_api_action = prev_action["action"] + m_out = self.model.step(self.input_dict, self.vocab, prev_action=prev_api_action) + + m_pred = model_util.extract_action_preds( + m_out, self.model.pad, self.vocab["action_low"], clean_special_tokens=False + )[0] + action = m_pred["action"] + + obj = None + if action in obj_interaction_actions and len(m_pred["object"]) > 0 and len(m_pred["object"][0]) > 0: + obj = m_pred["object"][0][0] + + predicted_click = None + if obj is not None: + predicted_click = self.get_obj_click(obj, img) + logger.debug("Predicted action: %s, obj = %s, click = %s" % (str(action), str(obj), str(predicted_click))) + + # Assume previous action succeeded if no better info available + prev_success = True + if prev_action is not None and "success" in prev_action: + prev_success = prev_action["success"] + + # remove blocking actions + action = self.obstruction_detection(action, prev_success, m_out, self.model.vocab_out) + return action, predicted_click + + def get_obj_click(self, obj_class_idx, img): + rcnn_pred = self.object_predictor.predict_objects(img) + obj_class_name = self.object_predictor.vocab_obj.index2word(obj_class_idx) + candidates = list(filter(lambda p: p.label == obj_class_name, rcnn_pred)) + if len(candidates) == 0: + return [np.random.uniform(), np.random.uniform()] + index = np.argmax([p.score for p in candidates]) + mask = candidates[index].mask[0] + predicted_click = list(np.array(mask.nonzero()).mean(axis=1)) + predicted_click = [ + predicted_click[0] / mask.shape[1], + predicted_click[1] / mask.shape[0], + ] + return predicted_click + + def obstruction_detection(self, action, prev_action_success, m_out, vocab_out): + """ + change 'MoveAhead' action to a turn in case if it has failed previously + """ + if action != "Forward" or prev_action_success: + return action + dist_action = m_out["action"][0][0].detach().cpu() + idx_rotateR = vocab_out.word2index("Turn Right") + idx_rotateL = vocab_out.word2index("Turn Left") + action = "Turn Left" if dist_action[idx_rotateL] > dist_action[idx_rotateR] else "Turn Right" + logger.debug("Blocking action is changed to: %s" % str(action)) + return action diff --git a/src/teach/inference/inference_runner.py b/src/teach/inference/inference_runner.py index dd9630c..8e34f22 100644 --- a/src/teach/inference/inference_runner.py +++ b/src/teach/inference/inference_runner.py @@ -5,7 +5,10 @@ import multiprocessing as mp import os import time +from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass +from os.path import isdir +from pathlib import Path from typing import List, Type from PIL import Image @@ -17,7 +20,12 @@ from teach.inference.teach_model import TeachModel from teach.logger import create_logger from teach.replay.episode_replay import EpisodeReplay -from teach.utils import create_task_thor_from_state_diff, save_dict_as_json, with_retry +from teach.utils import ( + create_task_thor_from_state_diff, + load_images, + save_dict_as_json, + with_retry, +) definitions = Definitions(version="2.0") action_id_to_info = definitions.map_actions_id2info @@ -29,13 +37,16 @@ class InferenceRunnerConfig: data_dir: str split: str output_dir: str + images_dir: str model_class: Type[TeachModel] model_args: List[str] metrics_file: str = "metrics.json" num_processes: int = 1 max_init_tries: int = 3 
max_traj_steps: int = 1000 - max_api_fails: int = 3 + max_api_fails: int = 30 + use_img_file: bool = False + replay_timeout: int = 500 class InferenceRunner: @@ -66,15 +77,20 @@ def _get_metrics_files(config): @staticmethod def _launch_processes(edh_instance_files, config: InferenceRunnerConfig): processes = [] + ers = [] try: for process_index in range(config.num_processes): - process = InferenceRunner._launch_process(process_index, edh_instance_files, config) + er = EpisodeReplay("thor", ["ego", "allo", "targetobject"]) + ers.append(er) + process = InferenceRunner._launch_process(process_index, edh_instance_files, config, er) processes.append(process) finally: InferenceRunner._join_processes(processes) + for er in ers: + er.simulator.shutdown_simulator() @staticmethod - def _launch_process(process_index, edh_instance_files, config: InferenceRunnerConfig): + def _launch_process(process_index, edh_instance_files, config: InferenceRunnerConfig, er: EpisodeReplay): num_files = len(edh_instance_files) num_files_per_process = InferenceRunner._get_num_files_per_process( num_files=num_files, num_processes=config.num_processes @@ -87,65 +103,109 @@ def _launch_process(process_index, edh_instance_files, config: InferenceRunnerCo files_to_process = edh_instance_files[start_index:end_index] - process = mp.Process(target=InferenceRunner._run, args=(process_index, files_to_process, config)) + process = mp.Process(target=InferenceRunner._run, args=(process_index, files_to_process, config, er)) process.start() time.sleep(0.1) return process @staticmethod - def _run(process_index, files_to_process, config: InferenceRunnerConfig): + def _run(process_index, files_to_process, config: InferenceRunnerConfig, er: EpisodeReplay): metrics_file = InferenceRunner._get_metrics_file_name_for_process(process_index, config.metrics_file) metrics = dict() model = config.model_class(process_index, config.num_processes, model_args=config.model_args) for file_index, instance_file in enumerate(files_to_process): - instance_id, instance_metrics = InferenceRunner._run_edh_instance(instance_file, config, model) - metrics[instance_id] = instance_metrics - save_dict_as_json(metrics, metrics_file) + try: + instance_id, instance_metrics = InferenceRunner._run_edh_instance(instance_file, config, model, er) + metrics[instance_id] = instance_metrics + save_dict_as_json(metrics, metrics_file) + + logger.info(f"Instance {instance_id}, metrics: {instance_metrics}") + logger.info(f"Process {process_index} completed {file_index + 1} / {len(files_to_process)} instances") + except Exception: + err_msg = f"exception happened for instance={instance_file}, continue with the rest" + logger.error(err_msg, exc_info=True) + continue - logger.info(f"Process {process_index} completed {file_index + 1} / {len(files_to_process)} instances") + @staticmethod + def _load_edh_history_images(edh_instance, config: InferenceRunnerConfig): + image_file_names = edh_instance["driver_image_history"] + image_dir = os.path.join(config.data_dir, "images", config.split, edh_instance["game_id"]) + return load_images(image_dir, image_file_names) @staticmethod - def _run_edh_instance(instance_file, config: InferenceRunnerConfig, model: TeachModel): + def _run_edh_instance(instance_file, config: InferenceRunnerConfig, model: TeachModel, er: EpisodeReplay): edh_instance = InferenceRunner._load_edh_instance(instance_file) edh_check_task = create_task_thor_from_state_diff(edh_instance["state_changes"]) game_file = InferenceRunner._get_game_file(edh_instance, config) metrics 
= create_new_traj_metrics(edh_instance) - logger.debug(f"Processing instance {edh_instance['instance_id']}") + instance_id = edh_instance["instance_id"] + logger.debug(f"Processing instance {instance_id}") try: init_success, er = with_retry( - fn=lambda: InferenceRunner._initialize_episode_replay(edh_instance, game_file, edh_check_task), + fn=lambda: InferenceRunner._initialize_episode_replay( + edh_instance, game_file, edh_check_task, config.replay_timeout, er + ), retries=config.max_init_tries - 1, check_first_return_value=True, ) except Exception: init_success = False - logger.error("Failed to initialize episode replay", exc_info=True) + logger.error(f"Failed to initialize episode replay for instance={instance_id}", exc_info=True) + + edh_history_images = None + try: + if not config.use_img_file: + edh_history_images = InferenceRunner._load_edh_history_images(edh_instance, config) + except Exception: + init_success = False + logger.error(f"Failed to load_edh_history_images for {instance_id}", exc_info=True) metrics["init_success"] = init_success if not init_success: return edh_instance["instance_id"], metrics - prev_action = None - er.simulator.is_record_mode = True - pred_actions = list() - - traj_steps_taken = 0 - for _ in range(config.max_traj_steps): - traj_steps_taken += 1 - img = InferenceRunner._get_latest_ego_image(er) - action, obj_relative_coord = model.get_next_action(img, edh_instance, prev_action) - step_success = InferenceRunner._execute_action(er.simulator, action, obj_relative_coord) - InferenceRunner._update_metrics(metrics, action, obj_relative_coord, step_success) - prev_action = {"action": action, "obj_relative_coord": obj_relative_coord} - pred_actions.append(prev_action) - if InferenceRunner._should_end_inference(action, metrics, config.max_api_fails): - break + model_started_success = False + try: + model_started_success = model.start_new_edh_instance(edh_instance, edh_history_images, instance_file) + except Exception: + model_started_success = False + metrics["error"] = 1 + logger.error(f"Failed to start_new_edh_instance for {instance_id}", exc_info=True) + + if model_started_success: + prev_action = None + er.simulator.is_record_mode = True + pred_actions = list() + + traj_steps_taken = 0 + for _ in range(config.max_traj_steps): + traj_steps_taken += 1 + try: + img = InferenceRunner._get_latest_ego_image(er) + image_name = InferenceRunner._save_image(config, edh_instance, img, traj_steps_taken) + action, obj_relative_coord = model.get_next_action( + img, edh_instance, prev_action, image_name, instance_file + ) + step_success = InferenceRunner._execute_action(er.simulator, action, obj_relative_coord) + InferenceRunner._update_metrics(metrics, action, obj_relative_coord, step_success) + prev_action = {"action": action, "obj_relative_coord": obj_relative_coord} + pred_actions.append(prev_action) + except Exception as e: + logger.error( + f"_run_edh_instance Exception: {str(e)} for instance_id={instance_id}, " + f"traj_steps_taken={traj_steps_taken}", + exc_info=True, + ) + metrics["error"] = 1 + break + if InferenceRunner._should_end_inference(action, metrics, config.max_api_fails): + break ( success, @@ -162,15 +222,16 @@ def _run_edh_instance(instance_file, config: InferenceRunnerConfig, model: Teach ) metrics.update(metrics_diff) - pred_actions_file = os.path.join(config.output_dir, "pred_actions__" + edh_instance["instance_id"] + ".json") + os.makedirs(config.output_dir, exist_ok=True) + pred_actions_file = os.path.join(config.output_dir, "pred_actions__" + 
instance_id + ".json") with open(pred_actions_file, "w") as handle: json.dump(pred_actions, handle) er.simulator.dir_out = config.output_dir - output_file = os.path.join(config.output_dir, "inference__" + edh_instance["instance_id"] + ".json") - er.simulator.done(file_name=output_file) + output_file = os.path.join(config.output_dir, "inference__" + instance_id + ".json") + er.simulator.save(file_name=output_file) - return edh_instance["instance_id"], metrics + return instance_id, metrics @staticmethod def _check_episode_progress(er, task): @@ -184,8 +245,8 @@ def _check_episode_progress(er, task): return success, final_goal_conditions_total, final_goal_conditions_satisfied @staticmethod - def _initialize_episode_replay(edh_instance, game_file, task): - er = EpisodeReplay("thor", ["ego", "allo", "targetobject"]) + def _initialize_episode_replay(edh_instance, game_file, task, replay_timeout, er: EpisodeReplay): + start_time = time.perf_counter() er.set_episode_by_fn_and_idx(game_file, 0, 0) edh_interactions = list() for interaction in edh_instance["interactions"][: edh_instance["pred_start_idx"]]: @@ -193,7 +254,14 @@ def _initialize_episode_replay(edh_instance, game_file, task): edh_interactions.append(Interaction.from_dict(interaction, action["action_type"])) er.episode.interactions = edh_interactions - init_success, _ = er.play_episode(task=task, shutdown_on_finish=False) + init_success = False + with ThreadPoolExecutor() as tp: + future = tp.submit(er.play_episode, task=task, shutdown_on_finish=False) + logger.info(f"Started episode replay with timeout: {replay_timeout} sec") + init_success, _ = future.result(timeout=replay_timeout) + + elapsed_time = time.perf_counter() - start_time + logger.info(f"Elapsed time for episode replay: {elapsed_time}") return init_success, er if init_success else None @@ -262,3 +330,26 @@ def _get_num_files_per_process(num_files, num_processes): def _join_processes(processes): for process in processes: process.join() + + @staticmethod + def _save_image(config, edh_instance, img, traj_steps_taken): + image_name = f"img__{edh_instance['instance_id']}_{traj_steps_taken}.jpeg" + if config.use_img_file: + InferenceRunner._save_image_sync(img, image_name, config) + else: + InferenceRunner._save_image_async(img, image_name, config) + return image_name + + @staticmethod + def _save_image_async(img, image_name, config: InferenceRunnerConfig): + process = mp.Process(target=InferenceRunner._save_image_sync, args=(img, image_name, config)) + process.start() + return image_name + + @staticmethod + def _save_image_sync(img, image_name, config: InferenceRunnerConfig): + if not isdir(config.images_dir): + Path(config.images_dir).mkdir(parents=True, exist_ok=True) + image_path = os.path.join(config.images_dir, image_name) + img.save(image_path) + return image_name diff --git a/src/teach/inference/remote_model.py b/src/teach/inference/remote_model.py new file mode 100644 index 0000000..2aec046 --- /dev/null +++ b/src/teach/inference/remote_model.py @@ -0,0 +1,100 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 + +import json +import logging +import sys +from argparse import ArgumentParser +from io import BytesIO +from typing import List + +import requests + +from teach.inference.teach_model import TeachModel + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) +logger.addHandler(logging.StreamHandler(sys.stdout)) + +TEACH_MODEL_API_URL_PREDICT = "http://{}/get_next_action" +TEACH_MODEL_API_URL_START_EDH = "http://{}/start_new_edh_instance" +TEACH_MODEL_API_URL_TEST = "http://{}/test" + + +class RemoteModelException(Exception): + def __init__(self, message): + super().__init__(message) + + +def assign_api_by_process_idx(host_and_ports, process_index): + splits = host_and_ports.split(",") + if process_index >= len(splits): + raise RemoteModelException(f"process_index={process_index} can't be handled by available APIs:{splits}") + return splits[process_index].strip() + + +class RemoteModel(TeachModel): + def __init__(self, process_index: int, num_processes: int, model_args: List[str]): + + parser = ArgumentParser() + parser.add_argument( + "--model_api_host_and_port", + type=str, + default="localhost:5000", + help="Teach Model API hosts and ports, E.g.:api1:5000,api2:5000", + ) + args = parser.parse_args(model_args) + + host_and_port = assign_api_by_process_idx(args.model_api_host_and_port, process_index) + self.test_url = TEACH_MODEL_API_URL_TEST.format(host_and_port) + self.predict_url = TEACH_MODEL_API_URL_PREDICT.format(host_and_port) + self.start_edh_url = TEACH_MODEL_API_URL_START_EDH.format(host_and_port) + + def get_next_action(self, img, edh_instance, prev_action, img_name=None, edh_name=None): + if not img or not edh_instance: + logger.warning("either img or edh_instance is None") + return None, None + img_in_memory = BytesIO() + img.save(img_in_memory, "jpeg") + img_in_memory.seek(0) + data = { + "img_name": img_name, + "edh_name": edh_name, + "prev_action": json.dumps(prev_action) if prev_action else None, + "edh_instance": json.dumps(edh_instance), + } + + resp = requests.post(self.predict_url, data=data, files={"img": (img_name, img_in_memory, "image/jpeg")}) + + if resp.status_code != 200: + logger.debug(f"failed sending data={data}") + raise RemoteModelException(resp.text) + + resp_json = resp.json() + action = resp_json.get("action") + obj_relative_coord = resp_json.get("obj_relative_coord") + return action, obj_relative_coord + + def test_connection(self): + resp = requests.get(self.test_url) + return resp.status_code == 200 + + def start_new_edh_instance(self, edh_instance, edh_history_images, edh_name=None): + images = [] + if edh_history_images: + idx = 0 + for image in edh_history_images: + img_in_memory = BytesIO() + image.save(img_in_memory, "jpeg") + img_in_memory.seek(0) + images.append(("edh_history_images", (f"history{idx}", img_in_memory, "image/jpeg"))) + idx += 1 + + data = {"edh_name": edh_name, "edh_instance": json.dumps(edh_instance)} + resp = requests.post(self.start_edh_url, data=data, files=images) + + if resp.status_code != 200: + logger.debug(f"failed sending data={data}") + raise RemoteModelException(resp.text) + + return True diff --git a/src/teach/inference/sample_model.py b/src/teach/inference/sample_model.py index e95b3f6..425422a 100644 --- a/src/teach/inference/sample_model.py +++ b/src/teach/inference/sample_model.py @@ -33,14 +33,15 @@ def __init__(self, process_index: int, num_processes: int, model_args: List[str] logger.info(f"SampleModel using seed {args.seed}") np.random.seed(args.seed) - def 
get_next_action(self, img, edh_instance, prev_action): + def get_next_action(self, img, edh_instance, prev_action, img_name=None, edh_name=None): """ - Sample function producing random actions at every time step. When running model inference, a model should be - called in this function instead. + This method will be called at each timestep during inference to get the next predicted action from the model. :param img: PIL Image containing agent's egocentric image :param edh_instance: EDH instance :param prev_action: One of None or a dict with keys 'action' and 'obj_relative_coord' containing returned values from a previous call of get_next_action + :param img_name: image file name + :param edh_name: EDH instance file name :return action: An action name from all_agent_actions :return obj_relative_coord: A relative (x, y) coordinate (values between 0 and 1) indicating an object in the image; The TEACh wrapper on AI2-THOR examines the ground truth segmentation mask of the agent's egocentric image, selects @@ -55,3 +56,14 @@ def get_next_action(self, img, edh_instance, prev_action): np.random.uniform(high=0.99), ] return action, obj_relative_coord + + def start_new_edh_instance(self, edh_instance, edh_history_images, edh_name=None): + """ + Since this class produces random actions at every time step, no particular setup is needed. When running model + inference, this would be a suitable place to preprocess the dialog, action and image history + :param edh_instance: EDH instance + :param edh_history_images: List of images as PIL Image objects (loaded from files in + edh_instance['driver_image_history']) + :param edh_name: EDH instance file name + """ + pass diff --git a/src/teach/inference/teach_model.py b/src/teach/inference/teach_model.py index 2a9b0e7..5528739 100644 --- a/src/teach/inference/teach_model.py +++ b/src/teach/inference/teach_model.py @@ -19,16 +19,30 @@ def __init__(self, process_index: int, num_processes: int, model_args: List[str] """ @abstractmethod - def get_next_action(self, img, edh_instance, prev_action): + def get_next_action(self, img, edh_instance, prev_action, img_name=None, edh_name=None): """ This method will be called at each timestep during inference to get the next predicted action from the model. :param img: PIL Image containing agent's egocentric image :param edh_instance: EDH instance :param prev_action: One of None or a dict with keys 'action' and 'obj_relative_coord' containing returned values from a previous call of get_next_action + :param img_name: image file name + :param edh_name: EDH instance file name :return action: An action name from all_agent_actions :return obj_relative_coord: A relative (x, y) coordinate (values between 0 and 1) indicating an object in the image; The TEACh wrapper on AI2-THOR examines the ground truth segmentation mask of the agent's egocentric image, selects an object in a 10x10 pixel patch around the pixel indicated by the coordinate if the desired action can be performed on it, and executes the action in AI2-THOR. 
""" + + @abstractmethod + def start_new_edh_instance(self, edh_instance, edh_history_images, edh_name=None): + """ + This method will be called at the start of each EDH instance after the environment has been set to the + initial state by replaying history actions but before any actions are requested from the model by calling + get_next_action + :param edh_instance: EDH instance + :param edh_history_images: List of images as PIL Image objects (loaded from files in + edh_instance['driver_image_history']) + :param edh_name: EDH instance file name + """ diff --git a/src/teach/logger.py b/src/teach/logger.py index 8c45c93..9d00805 100644 --- a/src/teach/logger.py +++ b/src/teach/logger.py @@ -3,6 +3,7 @@ import logging +import sys from teach.settings import get_settings @@ -18,4 +19,8 @@ def create_logger(name: str = None, level=logging.DEBUG): logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(name if name else __name__) logger.setLevel(level) + handler = logging.StreamHandler(sys.stdout) + formatter = logging.Formatter("[%(threadName)s-%(process)s-%(levelname)s] %(name)s: %(message)s") + handler.setFormatter(formatter) + logger.addHandler(handler) return logger diff --git a/src/teach/modeling/ET/LICENSE b/src/teach/modeling/ET/LICENSE new file mode 100644 index 0000000..65554f2 --- /dev/null +++ b/src/teach/modeling/ET/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 ALFRED + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/src/teach/modeling/ET/README.md b/src/teach/modeling/ET/README.md new file mode 100644 index 0000000..5d34ff6 --- /dev/null +++ b/src/teach/modeling/ET/README.md @@ -0,0 +1,81 @@ +# Episodic Transformer based EDH Baseline Model + +This subdirectory is based on the [Episodic Transformer (E.T.) repository](https://github.com/alexpashevich/E.T.) which builds on the [ALFRED repository](https://github.com/askforalfred/alfred). +The E.T. model is adapted here for the TEACh EDH benchmark. +Note that we have removed files not used when running E.T. on TEACh, and many files have been significantly modified. + +The following instructions to train and evaluate an E.T. model on TEACh assume that you have the TEACh dataset downloaded. +If running on a laptop, it might be desirable to mimic the folder structure of the TEACh dataset, but using only a small number of games from each split, and their corresponding images and EDH instances. 
+
+Set some useful environment variables
+```buildoutcfg
+export ET_DATA=/tmp/teach-dataset
+export TEACH_ROOT_DIR=/path/to/teach/repo
+export ET_LOGS=/path/to/store/checkpoints
+export VENV_DIR=/path/to/folder/to/store/venv
+export TEACH_SRC_DIR=$TEACH_ROOT_DIR/src
+export ET_ROOT=$TEACH_SRC_DIR/teach/modeling/ET
+export INFERENCE_OUTPUT_PATH=/path/to/store/inference/execution/files
+```
+Create a virtual environment
+
+```buildoutcfg
+python3 -m venv $VENV_DIR/teach_env
+source $VENV_DIR/teach_env/bin/activate
+cd $TEACH_ROOT_DIR
+pip install --upgrade pip
+pip install -r requirements.txt
+export PYTHONPATH=$TEACH_SRC_DIR:$ET_ROOT:$PYTHONPATH
+```
+
+Download the ET pretrained checkpoints for the Faster RCNN and Mask RCNN models
+```buildoutcfg
+wget http://pascal.inrialpes.fr/data2/apashevi/et_checkpoints.zip
+unzip et_checkpoints.zip
+mv pretrained $ET_LOGS/
+rm et_checkpoints.zip
+```
+
+Perform ET preprocessing (this extracts image features and does some processing of EDH jsons)
+```buildoutcfg
+python -m alfred.data.create_lmdb \
+    with args.visual_checkpoint=$ET_LOGS/pretrained/fasterrcnn_model.pth \
+    args.data_input=edh_instances \
+    args.task_type=edh \
+    args.data_output=lmdb_edh \
+    args.vocab_path=None
+```
+Note: If running on a laptop on a small subset of the data, use `args.vocab_path=$ET_ROOT/files/human.vocab` and add `args.device=cpu`.
+
+
+Train a model (adjust the `train.epochs` value in this command to specify the number of desired train epochs)
+```buildoutcfg
+python -m alfred.model.train with exp.model=transformer \
+    exp.name=teach_et_trial \
+    exp.data.train=lmdb_edh \
+    train.epochs=20 \
+    train.seed=2
+```
+Note: If running on a laptop on a small subset of the data, add `exp.device=cpu` and `exp.num_workers=1`.
+
+Copy certain necessary files to the model folder so that we do not have to access training info at inference time.
+```buildoutcfg
+cp $ET_DATA/lmdb_edh/data.vocab $ET_LOGS/teach_et_trial
+cp $ET_DATA/lmdb_edh/params.json $ET_LOGS/teach_et_trial
+```
+
+Evaluate the trained model
+```buildoutcfg
+cd $TEACH_ROOT_DIR
+python src/teach/cli/inference.py \
+    --model_module teach.inference.et_model \
+    --model_class ETModel \
+    --data_dir $ET_DATA \
+    --output_dir $INFERENCE_OUTPUT_PATH/inference__teach_et_trial \
+    --split valid_seen \
+    --metrics_file $INFERENCE_OUTPUT_PATH/metrics__teach_et_trial.json \
+    --seed 4 \
+    --model_dir teach_et_trial \
+    --object_predictor $ET_LOGS/pretrained/maskrcnn_model.pth \
+    --device cpu
+```
\ No newline at end of file
diff --git a/src/teach/modeling/ET/alfred/README.md b/src/teach/modeling/ET/alfred/README.md
new file mode 100644
index 0000000..53b39b5
--- /dev/null
+++ b/src/teach/modeling/ET/alfred/README.md
@@ -0,0 +1,46 @@
+# Files Structure
+
+```
+/data
+    create_lmdb.py   (script to create an LMDB dataset out of trajectory files)
+    preprocessor.py  (class to preprocess trajectories annotations and actions)
+    process_tests.py (script to process test splits for leaderboard evaluation)
+    zoo/base.py      (base class for LMDB dataset loading using multiple threads)
+    zoo/alfred.py    (class to load an LMDB dataset for an E.T. training)
+    zoo/speaker.py   (class to load an LMDB dataset for a translation pretraining)
+/env
+    reward.py        (rewards definitions)
+    tasks.py         (tasks definitions)
+    thor_env.py      (interface between AI2Thor and E.T.
code) +/eval + eval_agent.py (script to evaluate an agent on full tasks or subgoals) + eval_master.py (class for multi-process evaluation) + eval_subgoals.py (functions for subgoal evaluation) + eval_task.py (functions for full task evaluation) + leaderboard.py (script to evaluate an agent on test splits) +/gen + constants.py (list of constants) + generate_trajs.py (script to generate new trajectories) + goal_library.py (library defining goals using PDDL) + render_trajs.py (script to render existing trajectories) +/model + train.py (script for models training) + base.py (base class for E.T. and translator models) + learned.py (class with main train routines) + speaker.py (translator model) + transformer.py (E.T. model) +/nn + attention.py (basic attention mechanisms) + dec_object.py (object decoder class) + enc_lang.py (language encoder class) + enc_visual.py (visual observations encoder class) + enc_vl.py (multimodal encoder class) + encodings.py (positional and temporal encodings) + transforms.py (visual observations transformations) +/utils + data_util.py (data handling utils) + eval_util.py (evaluation utils) + helper_util.py (help utils) + metric_util.py (utils to compute scores) + model_util.py (utils for E.T. and translation models) +``` diff --git a/src/teach/modeling/ET/alfred/config.py b/src/teach/modeling/ET/alfred/config.py new file mode 100644 index 0000000..844f136 --- /dev/null +++ b/src/teach/modeling/ET/alfred/config.py @@ -0,0 +1,214 @@ +from sacred import Ingredient +from sacred.settings import SETTINGS + +exp_ingredient = Ingredient("exp") +train_ingredient = Ingredient("train") +eval_ingredient = Ingredient("eval") +dagger_ingredient = Ingredient("dagger") + +SETTINGS.CONFIG.READ_ONLY_CONFIG = False + + +@exp_ingredient.config +def cfg_exp(): + # HIGH-LEVEL MODEL SETTINGS + # where to save model and/or logs + name = "default" + # model to use + model = "transformer" + # which device to use + device = "cuda" + # number of data loading workers or evaluation processes (0 for main thread) + num_workers = 12 + # we can fine-tune a pre-trained model + pretrained_path = None + # run the code on a small chunk of data + fast_epoch = False + + # Set this to 1 if running on a Mac and to large numbers like 250 if running on EC2 + lmdb_max_readers = 1 + + # DATA SETTINGS + data = { + # dataset name(s) for training and validation + "train": None, + # additional dataset name(s) can be specified for validation only + "valid": "", + # specify the length of each dataset + "length": 30000, + # what to use as annotations: {'lang', 'lang_frames', 'frames'} + "ann_type": "lang", + # Train dataloader type - sample or shuffle ("sample" results in sampling length points per epoch with + # replacement and "shuffle" results in iterating through the train dataset in random order per epoch + "train_load_type": "shuffle", + } + + lang_pretrain_over_history_subgoals = False + + +@eval_ingredient.config +def cfg_eval(): + # which experiment to evaluate (required) + exp = None + # which checkpoint to load ('latest.pth', 'model_**.pth') + checkpoint = "latest.pth" + # which split to use ('train', 'valid_seen', 'valid_unseen') + split = "valid_seen" + use_sample_for_train = True + use_random_actions = False + no_lang = False + no_vision = False + + # shuffle the trajectories + shuffle = False + # max steps before episode termination + max_steps = 1000 + # max API execution failures before episode termination + max_fails = 10 + # subgoals to evaluate independently, eg:all or GotoLocation,PickupObject or 
0,1 + subgoals = "" + # smooth nav actions (might be required based on training data) + smooth_nav = False + # forward model with expert actions (only for subgoals) + no_model_unroll = False + # no teacher forcing with expert (only for subgoals) + no_teacher_force = False + # run in the debug mode + debug = False + # X server number + x_display = "0" + # range of checkpoints to evaluate, (9, 20, 2) means epochs 9, 11, 13, 15, 17, 19 + # if None, only 'latest.pth' will be evaluated + eval_range = (9, 20, 1) + # object predictor path + object_predictor = None + + # Is this evaluation for EDH instances or TFD instances? + eval_type = "edh" + + # Set this to 1 if running on a Mac and to large numbers like 250 if running on EC2 + # lmdb_max_readers = 1 + + # Set this to true if the model was trained (and should for inference try to get a wide view) + wide_view = False + + force_retry = False + + +@train_ingredient.config +def cfg_train(): + # GENERAL TRANING SETTINGS + # random seed + seed = 1 + # load a checkpoint from a previous epoch (if available) + resume = True + # whether to print execution time for different parts of the code + profile = False + + # For ablations + no_lang = False + no_vision = False + + # HYPER PARAMETERS + # batch size + batch = 8 + # number of epochs + epochs = 20 + # optimizer type, must be in ('adam', 'adamw') + optimizer = "adamw" + # L2 regularization weight + weight_decay = 0.33 + # learning rate settings + lr = { + # learning rate initial value + "init": 1e-4, + # lr scheduler type: {'linear', 'cosine', 'triangular', 'triangular2'} + "profile": "linear", + # (LINEAR PROFILE) num epoch to adjust learning rate + "decay_epoch": 10, + # (LINEAR PROFILE) scaling multiplier at each milestone + "decay_scale": 0.1, + # (COSINE & TRIANGULAR PROFILE) learning rate final value + "final": 1e-5, + # (TRIANGULAR PROFILE) period of the cycle to increase the learning rate + "cycle_epoch_up": 0, + # (TRIANGULAR PROFILE) period of the cycle to decrease the learning rate + "cycle_epoch_down": 0, + # warm up period length in epochs + "warmup_epoch": 0, + # initial learning rate will be divided by this value + "warmup_scale": 1, + } + # weight of action loss + action_loss_wt = 1.0 + # weight of object loss + object_loss_wt = 1.0 + # weight of subgoal completion predictor + # subgoal_aux_loss_wt = 0.1 + subgoal_aux_loss_wt = 0 + # weight of progress monitor + # progress_aux_loss_wt = 0.1 + progress_aux_loss_wt = 0 + # maximizing entropy loss (by default it is off) + entropy_wt = 0.0 + + # Should train loss be computed over history actions? 
(default False) + compute_train_loss_over_history = False + + # TRANSFORMER settings + # size of transformer embeddings + demb = 768 + # number of heads in multi-head attention + encoder_heads = 12 + # number of layers in transformer encoder + encoder_layers = 2 + # how many previous actions to use as input + num_input_actions = 1 + # which encoder to use for language encoder (by default no encoder) + encoder_lang = { + "shared": True, + "layers": 2, + "pos_enc": True, + "instr_enc": False, + } + # which decoder to use for the speaker model + decoder_lang = { + "layers": 2, + "heads": 12, + "demb": 768, + "dropout": 0.1, + "pos_enc": True, + } + # do not propagate gradients to the look-up table and the language encoder + detach_lang_emb = False + + # DROPOUTS + dropout = { + # dropout rate for language (goal + instr) + "lang": 0.0, + # dropout rate for Resnet feats + "vis": 0.3, + # dropout rate for processed lang and visual embeddings + "emb": 0.0, + # transformer model specific dropouts + "transformer": { + # dropout for transformer encoder + "encoder": 0.1, + # remove previous actions + "action": 0.0, + }, + } + + # ENCODINGS + enc = { + # use positional encoding + "pos": True, + # use learned positional encoding + "pos_learn": False, + # use learned token ([WORD] or [IMG]) encoding + "token": False, + # dataset id learned encoding + "dataset": False, + } + + use_alfred_weights = False diff --git a/src/teach/modeling/ET/alfred/constants.py b/src/teach/modeling/ET/alfred/constants.py new file mode 100644 index 0000000..6a72af8 --- /dev/null +++ b/src/teach/modeling/ET/alfred/constants.py @@ -0,0 +1,114 @@ +import os + +######################################################################################################################## +# General Settings + +ET_ROOT = os.environ["ET_ROOT"] +ET_DATA = os.environ["ET_DATA"] if "ET_DATA" in os.environ else None +ET_LOGS = os.environ["ET_LOGS"] if "ET_LOGS" in os.environ else None + +PAD = 0 + +######################################################################################################################## + +# TRAIN AND EVAL SETTINGS +# evaluation on multiple GPUs +NUM_EVAL_WORKERS_PER_GPU = 3 +# vocabulary file name +VOCAB_FILENAME = "data.vocab" +# vocabulary with object classes +OBJ_CLS_VOCAB = "files/obj_cls.vocab" + +############################# + +OBJECTS_ACTIONS = [ + "None", + "AlarmClock", + "Apple", + "AppleSliced", + "ArmChair", + "BaseballBat", + "BasketBall", + "Bathtub", + "BathtubBasin", + "Bed", + "Book", + "Bowl", + "Box", + "Bread", + "BreadSliced", + "ButterKnife", + "CD", + "Cabinet", + "Candle", + "Cart", + "CellPhone", + "Cloth", + "CoffeeMachine", + "CoffeeTable", + "CounterTop", + "CreditCard", + "Cup", + "Desk", + "DeskLamp", + "DiningTable", + "DishSponge", + "Drawer", + "Dresser", + "Egg", + "Faucet", + "FloorLamp", + "Fork", + "Fridge", + "GarbageCan", + "Glassbottle", + "HandTowel", + "Kettle", + "KeyChain", + "Knife", + "Ladle", + "Laptop", + "Lettuce", + "LettuceSliced", + "Microwave", + "Mug", + "Newspaper", + "Ottoman", + "Pan", + "Pen", + "Pencil", + "PepperShaker", + "Pillow", + "Plate", + "Plunger", + "Pot", + "Potato", + "PotatoSliced", + "RemoteControl", + "Safe", + "SaltShaker", + "Shelf", + "SideTable", + "Sink", + "SinkBasin", + "SoapBar", + "SoapBottle", + "Sofa", + "Spatula", + "Spoon", + "SprayBottle", + "Statue", + "StoveBurner", + "TVStand", + "TennisRacket", + "TissueBox", + "Toilet", + "ToiletPaper", + "ToiletPaperHanger", + "Tomato", + "TomatoSliced", + "Vase", + "Watch", + 
"WateringCan", + "WineBottle", +] diff --git a/src/teach/modeling/ET/alfred/data/__init__.py b/src/teach/modeling/ET/alfred/data/__init__.py new file mode 100644 index 0000000..ea8b64a --- /dev/null +++ b/src/teach/modeling/ET/alfred/data/__init__.py @@ -0,0 +1,2 @@ +from alfred.data.zoo.guides_edh import GuidesEdhDataset +from alfred.data.zoo.guides_speaker import GuidesSpeakerDataset diff --git a/src/teach/modeling/ET/alfred/data/create_lmdb.py b/src/teach/modeling/ET/alfred/data/create_lmdb.py new file mode 100644 index 0000000..c62a171 --- /dev/null +++ b/src/teach/modeling/ET/alfred/data/create_lmdb.py @@ -0,0 +1,305 @@ +import copy +import json +import logging +import os +import pickle +import re +import shutil +import threading +from pathlib import Path + +import torch +from alfred import constants +from alfred.data.preprocessor import Preprocessor +from alfred.nn.enc_visual import FeatureExtractor +from alfred.utils import data_util, helper_util, model_util +from progressbar import ProgressBar +from sacred import Experiment, Ingredient +from vocab import Vocab + +from teach.logger import create_logger + +logger = create_logger(__name__, level=logging.INFO) + +args_ingredient = Ingredient("args") +ex = Experiment("create_data", ingredients=[args_ingredient]) + + +@args_ingredient.config +def cfg_args(): + # name of the output dataset + data_output = "lmdb_teach_edh" + # where to load the original ALFRED dataset images and jsons from + data_input = "edh_instances" + task_type = "edh" + # whether to overwrite old data in case it exists + overwrite = False + # number of processes to run the data processing in (0 for main thread) + num_workers = 4 + # debug run with only 16 entries + fast_epoch = False + + # VISUAL FEATURES SETTINGS + # visual archi (resnet18, fasterrcnn, maskrcnn) + visual_archi = "fasterrcnn" + # where to load a pretrained model from + visual_checkpoint = None + # which images to use (by default: RGBs) + image_folder = "images" + # feature compression + compress_type = "4x" + # which device to use + device = "cuda" + + # LANGUAGE ANNOTATIONS SETTINGS + # generate dataset with subgoal annotations instead of human annotations + subgoal_ann = False + # use an existing vocabulary if specified (None for starting from scratch) + vocab_path = "files/base.vocab" + + +def process_feats(traj_paths, extractor, lock, image_folder, save_path): + (save_path / "feats").mkdir(exist_ok=True) + if str(save_path).endswith("/worker00"): + with lock: + progressbar = ProgressBar(max_value=traj_paths.qsize()) + progressbar.start() + while True: + with lock: + if traj_paths.qsize() == 0: + break + traj_path = Path(traj_paths.get()) + filename_new = "{}:{}".format(traj_path.parts[-2], re.sub(".json", ".pt", traj_path.name)) + # extract features with th extractor + images = data_util.read_traj_images(traj_path, image_folder) + if images is None or len(images) == 0: + raise RuntimeError( + "Failed to find images with image_folder =", + image_folder, + ", traj_path =", + traj_path.parts, + ) + feat = data_util.extract_features(images, extractor) + if feat is not None: + torch.save(feat, save_path / "feats" / filename_new) + with lock: + with open(save_path.parents[0] / "processed_feats.txt", "a") as f: + f.write(str(traj_path) + "\n") + model_util.update_log(save_path.parents[0], stage="feats", update="increase", progress=1) + if str(save_path).endswith("/worker00"): + progressbar.update(progressbar.max_value - traj_paths.qsize()) + if str(save_path).endswith("/worker00"): + progressbar.finish() 
+ + +def process_jsons(traj_paths, preprocessor, lock, save_path): + save_path.mkdir(exist_ok=True) + (save_path / "masks").mkdir(exist_ok=True) + (save_path / "jsons").mkdir(exist_ok=True) + if str(save_path).endswith("/worker00"): + with lock: + progressbar = ProgressBar(max_value=len(traj_paths)) + progressbar.start() + while True: + with lock: + if len(traj_paths) == 0: + break + traj_path = Path(traj_paths.pop()) + with traj_path.open() as f: + traj_orig = json.load(f) + + trajs = [data_util.process_traj(traj_orig, traj_path, 0, preprocessor)] + + # save masks and traj jsons + filename = "{}:{}".format(traj_path.parts[-2], re.sub(".json", ".pkl", traj_path.name)) + with (save_path / "jsons" / filename).open("wb") as f: + pickle.dump(trajs, f) + # report the progress + with lock: + model_util.update_log(save_path.parents[0], stage="jsons", update="increase", progress=1) + if str(save_path).endswith("/worker00"): + progressbar.update(progressbar.max_value - len(traj_paths)) + if str(save_path).endswith("/worker00"): + progressbar.finish() + + +def get_traj_paths(input_path, processed_files_path, fast_epoch): + if (input_path / "processed.txt").exists(): + # the dataset was generated locally + with (input_path / "processed.txt").open() as f: + traj_paths = [line.strip() for line in f.readlines()] + traj_paths = [line.split(";")[0] for line in traj_paths if line.split(";")[1] == "1"] + traj_paths = [str(input_path / line) for line in traj_paths] + else: + # the dataset was downloaded from ALFRED servers + traj_paths_all = sorted([str(path) for path in input_path.glob("*/*.json")]) + traj_paths = traj_paths_all + if fast_epoch: + traj_paths = traj_paths[::20] + num_files = len(traj_paths) + if processed_files_path is not None and processed_files_path.exists(): + if str(processed_files_path).endswith(constants.VOCAB_FILENAME): + traj_paths = [] + else: + with processed_files_path.open() as f: + processed_files = set([line.strip() for line in f.readlines()]) + traj_paths = [traj for traj in traj_paths if traj not in processed_files] + traj_paths = [Path(path) for path in traj_paths] + return traj_paths, num_files + + +def run_in_parallel(func, num_workers, output_path, args, use_processes=False): + if num_workers == 0: + args.append(output_path / "worker00") + func(*args) + else: + threads = [] + for idx in range(num_workers): + args_worker = copy.copy(args) + [output_path / "worker{:02d}".format(idx)] + if not use_processes: + ThreadClass = threading.Thread + else: + ThreadClass = torch.multiprocessing.Process + thread = ThreadClass(target=func, args=args_worker) + thread.start() + threads.append(thread) + for thread in threads: + thread.join() + + +def gather_data(output_path, num_workers): + for dirname in ("feats", "masks", "jsons"): + if (output_path / dirname).is_dir(): + shutil.rmtree(output_path / dirname) + (output_path / dirname).mkdir() + for dirname in ("feats", "masks", "jsons"): + for path_file in output_path.glob("worker*/{}/*".format(dirname)): + if path_file.stat().st_size == 0: + continue + path_symlink = output_path / dirname / path_file.name + link_file = True + if path_symlink.is_symlink(): + # this file was already linked + if path_file.stat().st_size > path_symlink.stat().st_size: + # we should replace the previously linked file with a new one + link_file = True + path_symlink.unlink() + else: + # we should keep the previously linked file + link_file = False + if link_file: + path_symlink.symlink_to(path_file) + + partitions = ("train", "valid_seen", "valid_unseen", 
"test_seen", "test_unseen") + if not (output_path / ".deleting_worker_dirs").exists(): + for partition in partitions: + logger.info("Processing %s trajectories" % partition) + feats_files = output_path.glob("feats/{}:*.pt".format(partition)) + feats_files = sorted([str(path) for path in feats_files]) + jsons_files = [p.replace("/feats/", "/jsons/").replace(".pt", ".pkl") for p in feats_files] + (output_path / partition).mkdir(exist_ok=True) + data_util.gather_feats(feats_files, output_path / partition / "feats") + data_util.gather_jsons(jsons_files, output_path / partition / "jsons.pkl") + + logger.info("Removing worker directories") + (output_path / ".deleting_worker_dirs").touch() + for worker_idx in range(max(num_workers, 1)): + worker_dir = output_path / "worker{:02d}".format(worker_idx) + shutil.rmtree(worker_dir) + for dirname in ("feats", "masks", "jsons"): + shutil.rmtree(output_path / dirname) + os.remove(output_path / ".deleting_worker_dirs") + os.remove(output_path / "processed_feats.txt") + + +@ex.automain +def main(args): + torch.multiprocessing.set_start_method("spawn") + args = helper_util.AttrDict(**args) + if args.data_output is None: + raise RuntimeError("Please, specify the name of output dataset") + + # set up the paths + output_path = Path(constants.ET_DATA) / args.data_output + input_path = Path(constants.ET_DATA) / args.data_input + logger.info("Creating a dataset {} using data from {}".format(args.data_output, input_path)) + if not input_path.is_dir(): + raise RuntimeError("The input dataset {} does not exist".format(input_path)) + if output_path.is_dir() and args.overwrite: + logger.info("Erasing the old directory") + shutil.rmtree(output_path) + output_path.mkdir(exist_ok=True) + + # read which files need to be processed + trajs_list, num_files = get_traj_paths(input_path, output_path / constants.VOCAB_FILENAME, args.fast_epoch) + model_util.save_log( + output_path, + progress=num_files - len(trajs_list), + total=num_files, + stage="jsons", + ) + logger.info("Creating a dataset with {} trajectories using {} workers".format(num_files, args.num_workers)) + logger.info("Processing JSONs and masks ({} were already processed)".format(num_files - len(trajs_list))) + + # first process jsons and masks + if len(trajs_list) > 0: + lock = threading.Lock() + preprocessor = data_util.get_preprocessor(Preprocessor, args.subgoal_ann, lock, args.vocab_path, args.task_type) + run_in_parallel( + process_jsons, + args.num_workers, + output_path, + args=[trajs_list, preprocessor, lock], + ) + vocab_copy = {} + for key, vocab in preprocessor.vocab.items(): + vocab_copy[key] = Vocab.from_dict(vocab.to_dict()) + torch.save(vocab_copy, output_path / constants.VOCAB_FILENAME) + + # read which features need to be extracted + trajs_list, num_files_again = get_traj_paths(input_path, output_path / "processed_feats.txt", args.fast_epoch) + assert num_files == num_files_again + model_util.save_log( + output_path, + progress=num_files - len(trajs_list), + total=num_files, + stage="feats", + ) + logger.info("Extracting features ({} were already processed)".format(num_files - len(trajs_list))) + + # then extract features + extractor = FeatureExtractor( + args.visual_archi, + args.device, + args.visual_checkpoint, + share_memory=True, + compress_type=args.compress_type, + ) + if len(trajs_list) > 0: + manager = torch.multiprocessing.Manager() + lock = manager.Lock() + trajs_queue = manager.Queue() + for path in trajs_list: + trajs_queue.put(path) + args_process_feats = [trajs_queue, extractor, 
lock, args.image_folder] + run_in_parallel( + process_feats, + args.num_workers, + output_path, + args=args_process_feats, + use_processes=True, + ) + + # finally, gather all the data + gather_data(output_path, args.num_workers) + # save dataset info to a file + feat_shape = extractor.feat_shape + params = { + "feat_shape": feat_shape, + "visual_checkpoint": args.visual_checkpoint, + "visual_archi": args.visual_archi, + "compress_type": args.compress_type, + } + with (output_path / "params.json").open("w") as f: + json.dump(params, f, sort_keys=True, indent=4) + logger.info("The dataset was saved to {}".format(output_path)) diff --git a/src/teach/modeling/ET/alfred/data/preprocessor.py b/src/teach/modeling/ET/alfred/data/preprocessor.py new file mode 100644 index 0000000..b315a9b --- /dev/null +++ b/src/teach/modeling/ET/alfred/data/preprocessor.py @@ -0,0 +1,84 @@ +import copy + +import revtok +from alfred.utils import data_util +from vocab import Vocab + + +class Preprocessor(object): + def __init__(self, vocab, subgoal_ann=False, is_test_split=False, frame_size=300): + self.subgoal_ann = subgoal_ann + self.is_test_split = is_test_split + self.frame_size = frame_size + + if vocab is None: + self.vocab = { + "word": Vocab(["<>", "<>", "<>", "<>"]), + "action_low": Vocab(["<>", "<>", "<>", "<>"]), + "action_high": Vocab(["<>", "<>", "<>", "<>"]), + } + else: + self.vocab = vocab + + self.word_seg = self.vocab["word"].word2index("<>", train=False) + + @staticmethod + def numericalize(vocab, words, train=True): + """ + converts words to unique integers + """ + if not train: + new_words = set(words) - set(vocab.counts.keys()) + if new_words: + # replace unknown words with <> + words = [w if w not in new_words else "<>" for w in words] + return vocab.word2index(words, train=train) + + def process_language(self, ex, traj, r_idx, is_test_split=False): + if self.is_test_split: + is_test_split = True + + instr_anns = [utterance for (speaker, utterance) in ex["dialog_history"]] + instr_anns = [revtok.tokenize(data_util.remove_spaces_and_lower(instr_ann)) for instr_ann in instr_anns] + instr_anns = [[w.strip().lower() for w in instr_ann] for instr_ann in instr_anns] + traj["ann"] = { + "instr": [instr_ann + ["<>"] for instr_ann in instr_anns], + } + traj["ann"]["instr"] += [["<>"]] + if "num" not in traj: + traj["num"] = {} + traj["num"]["lang_instr"] = [ + self.numericalize(self.vocab["word"], x, train=not is_test_split) for x in traj["ann"]["instr"] + ] + + def tokenize_and_numericalize(self, dialog_history, numericalize=True, train=False): + instr_anns = [utterance for (speaker, utterance) in dialog_history] + + # tokenize annotations + instr_anns = [revtok.tokenize(data_util.remove_spaces_and_lower(instr_ann)) for instr_ann in instr_anns] + + instr_anns = [[w.strip().lower() for w in instr_ann] for instr_ann in instr_anns] + instr = [instr_ann + ["<>"] for instr_ann in instr_anns] + + instr += [["<>"]] + + if numericalize: + instr = [self.numericalize(self.vocab["word"], word, train=train) for word in instr] + instr = sum(instr, []) # flatten + return instr + + def process_actions(self, ex, traj): + if "num" not in traj: + traj["num"] = {} + traj["num"]["driver_actions_low"] = list() + traj["num"]["driver_actions_pred_mask"] = list() + for action in ex["driver_action_history"]: + action_dict_with_idx = copy.deepcopy(action) + action_dict_with_idx["action"] = (self.vocab["action_low"].word2index(action["action_name"], train=True),) + traj["num"]["driver_actions_low"].append(action_dict_with_idx) 
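+            # history actions are tagged with pred_mask 0 and future actions (appended below) with
+            # pred_mask 1; downstream code uses this mask to restrict the training loss to future
+            # actions unless compute_train_loss_over_history is enabled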
+ traj["num"]["driver_actions_pred_mask"].append(0) + for action in ex["driver_actions_future"]: + action_dict_with_idx = copy.deepcopy(action) + action_dict_with_idx["action"] = (self.vocab["action_low"].word2index(action["action_name"], train=True),) + traj["num"]["driver_actions_low"].append(action_dict_with_idx) + traj["num"]["driver_actions_pred_mask"].append(1) diff --git a/src/teach/modeling/ET/alfred/data/zoo/base.py b/src/teach/modeling/ET/alfred/data/zoo/base.py new file mode 100644 index 0000000..69eae9b --- /dev/null +++ b/src/teach/modeling/ET/alfred/data/zoo/base.py @@ -0,0 +1,138 @@ +import logging +import os +import pickle +import warnings + +import lmdb +import numpy as np +import torch +from alfred import constants +from alfred.utils import data_util +from torch.utils.data import Dataset as TorchDataset + +from teach.logger import create_logger + +logger = create_logger(__name__, level=logging.INFO) + + +class BaseDataset(TorchDataset): + def __init__(self, name, partition, args, ann_type): + logger.debug("Dataset __init__ with args %s" % str(args)) + path = os.path.join(constants.ET_DATA, name) + self.partition = partition + self.name = name + self.args = args + if ann_type not in ("lang", "frames", "lang_frames"): + raise ValueError("Unknown annotation type: {}".format(ann_type)) + self.ann_type = ann_type + self.test_mode = False + self.pad = constants.PAD + + # read information about the dataset + self.dataset_info = data_util.read_dataset_info(name) + if self.dataset_info["visual_checkpoint"]: + logger.info("Visual checkpoint for data preprocessing: %s" % str(self.dataset_info["visual_checkpoint"])) + + # load data + self._length = self.load_data(path) + if self.args.fast_epoch: + self._length = 16 + logger.info("%s dataset size = %d" % (partition, self._length)) + + # load vocabularies for input language and output actions + vocab = data_util.load_vocab(name, ann_type) + self.vocab_in = vocab["word"] + out_type = "action_low" if args.model == "transformer" else "action_high" + self.vocab_out = vocab[out_type] + logger.debug("Loaded vocab_out: %s" % str(self.vocab_out.to_dict()["index2word"])) + # if several datasets are used, we will translate outputs to this vocab later + self.vocab_translate = None + + def load_data(self, path, feats=True, jsons=True): + """ + load data + """ + # do not open the lmdb database open in the main process, do it in each thread + if feats: + self.feats_lmdb_path = os.path.join(path, self.partition, "feats") + + # load jsons with pickle and parse them + if jsons: + with open(os.path.join(path, self.partition, "jsons.pkl"), "rb") as jsons_file: + jsons = pickle.load(jsons_file) + self.jsons_and_keys = [] + for idx in range(len(jsons)): + key = "{:06}".format(idx).encode("ascii") + if key in jsons: + task_jsons = jsons[key] + for json in task_jsons: + # compatibility with the evaluation + if "task" in json and isinstance(json["task"], str): + pass + else: + json["task"] = "/".join(json["root"].split("/")[-3:-1]) + # add dataset idx and partition into the json + json["dataset_name"] = self.name + self.jsons_and_keys.append((json, key)) + # if the dataset has script annotations, do not add identical data + if len(set([str(j["ann"]["instr"]) for j in task_jsons])) == 1: + break + + # return the true length of the loaded data + return len(self.jsons_and_keys) if jsons else None + + def load_frames(self, key): + """ + load image features from the disk + """ + if not hasattr(self, "feats_lmdb"): + self.feats_lmdb, self.feats = 
self.load_lmdb(self.feats_lmdb_path) + feats_bytes = self.feats.get(key) + feats_numpy = np.frombuffer(feats_bytes, dtype=np.float32).reshape(self.dataset_info["feat_shape"]) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + frames = torch.tensor(feats_numpy) + return frames + + def load_lmdb(self, lmdb_path): + """ + load lmdb (should be executed in each worker on demand) + """ + database = lmdb.open( + lmdb_path, + readonly=True, + lock=False, + readahead=False, + meminit=False, + max_readers=self.args.lmdb_max_readers, + ) + cursor = database.begin(write=False) + return database, cursor + + def __len__(self): + """ + return dataset length + """ + return self._length + + def __getitem__(self, idx): + """ + get item at index idx + """ + raise NotImplementedError + + @property + def id(self): + return self.partition + ":" + self.name + ";" + self.ann_type + + def __del__(self): + """ + close the dataset + """ + if hasattr(self, "feats_lmdb"): + self.feats_lmdb.close() + if hasattr(self, "masks_lmdb"): + self.masks_lmdb.close() + + def __repr__(self): + return "{}({})".format(type(self).__name__, self.id) diff --git a/src/teach/modeling/ET/alfred/data/zoo/guides_edh.py b/src/teach/modeling/ET/alfred/data/zoo/guides_edh.py new file mode 100644 index 0000000..52dca95 --- /dev/null +++ b/src/teach/modeling/ET/alfred/data/zoo/guides_edh.py @@ -0,0 +1,107 @@ +import logging +import os + +import torch +from alfred import constants +from alfred.data.zoo.base import BaseDataset + +from teach.logger import create_logger + +logger = create_logger(__name__, level=logging.INFO) + + +class GuidesEdhDataset(BaseDataset): + def __init__(self, name, partition, args, ann_type): + super().__init__(name, partition, args, ann_type) + # preset values + self._load_features = True + self._load_frames = True + # load the vocabulary for object classes + vocab_obj_file = os.path.join(constants.ET_ROOT, constants.OBJ_CLS_VOCAB) + logger.info("Loading object vocab from %s" % vocab_obj_file) + self.vocab_obj = torch.load(vocab_obj_file) + + def load_data(self, path): + return super().load_data(path, feats=True, jsons=True) + + def __getitem__(self, idx): + task_json, key = self.jsons_and_keys[idx] + feat_dict = {} + if self._load_features: + feat_dict = self.load_features(task_json) + if self._load_frames: + feat_dict["frames"] = self.load_frames(key) + + # Add a stop action and duplicate the last frame + feat_dict["action"].append(self.vocab_out.word2index("Stop")) + feat_dict["frames"] = torch.cat((feat_dict["frames"], torch.unsqueeze(feat_dict["frames"][-1, :], 0)), 0) + feat_dict["obj_interaction_action"].append(0) + feat_dict["driver_actions_pred_mask"].append(0) + + if self.args.no_lang: + feat_dict["lang"] = [self.vocab_in.word2index("<>")] + elif self.args.no_vision: + feat_dict["frames"] = torch.rand(feat_dict["frames"].shape) + + return task_json, feat_dict + + def load_features(self, task_json): + """ + load features from task_json + """ + feat = dict() + # language inputs + feat["lang"] = GuidesEdhDataset.load_lang(task_json) + + # action outputs + if not self.test_mode: + # low-level action + feat["action"] = GuidesEdhDataset.load_action(task_json, self.vocab_out) + feat["obj_interaction_action"] = [ + a["obj_interaction_action"] for a in task_json["num"]["driver_actions_low"] + ] + feat["driver_actions_pred_mask"] = task_json["num"]["driver_actions_pred_mask"] + feat["object"] = self.load_object_classes(task_json, self.vocab_obj) + + return feat + + @staticmethod + def 
load_lang(task_json): + """ + load numericalized language from task_json + """ + return sum(task_json["num"]["lang_instr"], []) + + @staticmethod + def load_action(task_json, vocab_orig, action_type="action_low"): + """ + load action as a list of tokens from task_json + """ + if action_type == "action_low": + # load low actions + lang_action = [[vocab_orig.word2index(a["action_name"]) for a in task_json["num"]["driver_actions_low"]]] + lang_action = sum(lang_action, []) + elif action_type == "action_high_future": + if "future_subgoals" in task_json: + lang_action = [vocab_orig.word2index(w) for w in task_json["future_subgoals"]] + else: + lang_action = [0] + elif action_type == "action_high_all": + lang_action = [ + vocab_orig.word2index(w) for w in task_json["history_subgoals"] + task_json["future_subgoals"] + ] + else: + raise NotImplementedError("Unknown action_type {}".format(action_type)) + return lang_action + + def load_object_classes(self, task_json, vocab=None): + """ + load object classes for interactive actions + """ + object_classes = [] + for idx, action in enumerate(task_json["num"]["driver_actions_low"]): + if self.args.compute_train_loss_over_history or task_json["num"]["driver_actions_pred_mask"][idx] == 1: + if action["oid"] is not None: + object_class = action["oid"].split("|")[0] + object_classes.append(object_class if vocab is None else vocab.word2index(object_class)) + return object_classes diff --git a/src/teach/modeling/ET/alfred/data/zoo/guides_speaker.py b/src/teach/modeling/ET/alfred/data/zoo/guides_speaker.py new file mode 100644 index 0000000..0e4c456 --- /dev/null +++ b/src/teach/modeling/ET/alfred/data/zoo/guides_speaker.py @@ -0,0 +1,32 @@ +from alfred.data.zoo.base import BaseDataset +from alfred.data.zoo.guides_edh import GuidesEdhDataset + + +class GuidesSpeakerDataset(BaseDataset): + def load_data(self, path): + return super(GuidesSpeakerDataset, self).load_data(path, feats=True, masks=False, jsons=True) + + def __getitem__(self, idx): + task_json, key = self.jsons_and_keys[idx] + # load language and frames if asked first + feat_dict = {} + feat_dict["lang"] = GuidesEdhDataset.load_lang(task_json) + if "frames" in self.ann_type: + feat_dict["frames"] = self.load_frames(key) + + # load output actions + if self.args.lang_pretrain_over_history_subgoals: + feat_dict["action"] = GuidesEdhDataset.load_action( + task_json, self.vocab_out, self.vocab_translate, "action_high_all" + ) + else: + feat_dict["action"] = GuidesEdhDataset.load_action( + task_json, self.vocab_out, self.vocab_translate, "action_high_future" + ) + + # remove all the lang key/value pairs if only frames are used as input + if self.ann_type == "frames": + keys_lang = [key for key in feat_dict if key.startswith("lang")] + for key in keys_lang: + feat_dict.pop(key) + return task_json, feat_dict diff --git a/src/teach/modeling/ET/alfred/model/base.py b/src/teach/modeling/ET/alfred/model/base.py new file mode 100644 index 0000000..11525cb --- /dev/null +++ b/src/teach/modeling/ET/alfred/model/base.py @@ -0,0 +1,63 @@ +from alfred.utils import data_util +from torch import nn + + +class Model(nn.Module): + def __init__(self, args, embs_ann, vocab_out, pad, seg, for_inference=False): + """ + Abstract model + """ + nn.Module.__init__(self) + self.args = args + self.vocab_out = vocab_out + self.pad, self.seg = pad, seg + if for_inference: + model_dir = args["model_dir"] + dataset_info = data_util.read_dataset_info_for_inference(model_dir) + else: + dataset_info = 
data_util.read_dataset_info(args.data["train"][0]) + self.visual_tensor_shape = dataset_info["feat_shape"][1:] + + # create language and action embeddings + self.embs_ann = nn.ModuleDict({}) + for emb_name, emb_size in embs_ann.items(): + self.embs_ann[emb_name] = nn.Embedding(emb_size, args.demb) + + # dropouts + self.dropout_vis = nn.Dropout(args.dropout["vis"], inplace=True) + self.dropout_lang = nn.Dropout2d(args.dropout["lang"]) + + def init_weights(self, init_range=0.1): + """ + init linear layers in embeddings + """ + for emb_ann in self.embs_ann.values(): + emb_ann.weight.data.uniform_(-init_range, init_range) + + def compute_metrics(self, model_out, gt_dict, metrics_dict, verbose): + """ + compute model-specific metrics and put it to metrics dict + """ + raise NotImplementedError + + def forward(self, vocab, **inputs): + """ + forward the model for multiple time-steps (used for training) + """ + raise NotImplementedError() + + def compute_batch_loss(self, model_out, gt_dict): + """ + compute the loss function for a single batch + """ + raise NotImplementedError() + + def compute_loss(self, model_outs, gt_dicts): + """ + compute the loss function for several batches + """ + # compute losses for each batch + losses = {} + for dataset_key in model_outs.keys(): + losses[dataset_key] = self.compute_batch_loss(model_outs[dataset_key], gt_dicts[dataset_key]) + return losses diff --git a/src/teach/modeling/ET/alfred/model/learned.py b/src/teach/modeling/ET/alfred/model/learned.py new file mode 100644 index 0000000..78c81d8 --- /dev/null +++ b/src/teach/modeling/ET/alfred/model/learned.py @@ -0,0 +1,161 @@ +import collections +import json +import logging +import os +from importlib import import_module + +import gtimer as gt +from alfred.utils import data_util, model_util +from tensorboardX import SummaryWriter +from torch import nn +from tqdm import tqdm + +from teach.logger import create_logger + +logger = create_logger(__name__, level=logging.INFO) + + +class LearnedModel(nn.Module): + def __init__(self, args, embs_ann, vocab_out, for_inference=False): + """ + Abstract model + """ + nn.Module.__init__(self) + self.args = args + self.embs_ann = embs_ann + self.vocab_out = vocab_out + # sentinel tokens + self.pad, self.seg = 0, 1 + # summary self.writer + self.summary_writer = None + # create the model to be trained + ModelClass = import_module("alfred.model.{}".format(args.model)).Model + self.model = ModelClass(args, embs_ann, vocab_out, self.pad, self.seg, for_inference) + + def run_train(self, loaders, info, optimizer=None): + """ + training loop + """ + # prepare dictionaries + loaders_train = dict(filter(lambda x: "train" in x[0], loaders.items())) + assert len(set([len(loader) for loader in loaders_train.values()])) == 1 + vocabs_in = { + "{};{}".format(loader.dataset.name, loader.dataset.ann_type): loader.dataset.vocab_in + for loader in loaders.values() + } + epoch_length = len(next(iter(loaders_train.values()))) + logger.debug("In LearnedModel.run_train, epoch_length = %d" % epoch_length) + # initialize summary writer for tensorboardX + self.summary_writer = SummaryWriter(log_dir=self.args.dout) + # dump config + with open(os.path.join(self.args.dout, "config.json"), "wt") as f: + json.dump(vars(self.args), f, indent=2) + # optimizer + optimizer, schedulers = model_util.create_optimizer_and_schedulers( + info["progress"], self.args, self.parameters(), optimizer + ) + # make sure that all train loaders have the same length + assert len(set([len(loader) for loader in 
loaders_train.values()])) == 1 + model_util.save_log( + self.args.dout, + progress=info["progress"], + total=self.args.epochs, + stage="train", + best_loss=info["best_loss"], + iters=info["iters"], + ) + + # display dout + logger.info("Saving to: %s" % self.args.dout) + for epoch in range(info["progress"], self.args.epochs): + logger.info("Epoch {}/{}".format(epoch, self.args.epochs)) + self.train() + train_iterators = {key: iter(loader) for key, loader in loaders_train.items()} + metrics = {key: collections.defaultdict(list) for key in loaders_train} + gt.reset() + + for _ in tqdm(range(epoch_length), desc="train"): + # sample batches + batches = data_util.sample_batches(train_iterators, self.args.device, self.pad, self.args) + gt.stamp("data fetching", unique=False) + + # do the forward passes + model_outs, losses_train = {}, {} + for batch_name, (traj_data, input_dict, gt_dict) in batches.items(): + if "lang" not in input_dict: + raise RuntimeError("In learned.run_train, lang not in input_dict") + model_outs[batch_name] = self.model.forward( + vocabs_in[batch_name.split(":")[-1]], action=gt_dict["action"], **input_dict + ) + info["iters"]["train"] += len(traj_data) if ":" not in batch_name else 0 + gt.stamp("forward pass", unique=False) + # compute losses + losses_train = self.model.compute_loss( + model_outs, + {key: gt_dict for key, (_, _, gt_dict) in batches.items()}, + ) + + # do the gradient step + optimizer.zero_grad() + sum_loss = sum([sum(loss.values()) for name, loss in losses_train.items()]) + sum_loss.backward() + optimizer.step() + gt.stamp("optimizer", unique=False) + + # compute metrics + for dataset_name in losses_train.keys(): + self.model.compute_metrics( + model_outs[dataset_name], + batches[dataset_name][2], + metrics["train:" + dataset_name], + self.args.compute_train_loss_over_history, + ) + for key, value in losses_train[dataset_name].items(): + metrics["train:" + dataset_name]["loss/" + key].append(value.item()) + metrics["train:" + dataset_name]["loss/total"].append(sum_loss.detach().cpu().item()) + gt.stamp("metrics", unique=False) + if self.args.profile: + logger.info(gt.report(include_itrs=False, include_stats=False)) + + # save the checkpoint + logger.info("Saving models...") + stats = {"epoch": epoch} + model_util.save_model(self, "model_{:02d}.pth".format(epoch), stats, optimizer=optimizer) + model_util.save_model(self, "latest.pth", stats, symlink=True) + + # compute metrics for train + logger.info("Computing train metrics...") + metrics = {data: {k: sum(v) / len(v) for k, v in metr.items()} for data, metr in metrics.items()} + stats = { + "epoch": epoch, + "general": {"learning_rate": optimizer.param_groups[0]["lr"]}, + **metrics, + } + + # save the checkpoint + logger.info("Saving models...") + model_util.save_model(self, "model_{:02d}.pth".format(epoch), stats, optimizer=optimizer) + model_util.save_model(self, "latest.pth", stats, symlink=True) + # write averaged stats + for loader_id in stats.keys(): + if isinstance(stats[loader_id], dict): + for stat_key, stat_value in stats[loader_id].items(): + # for comparison with old epxs, maybe remove later + summary_key = "{}/{}".format( + loader_id.replace(":", "/").replace("lmdb/", "").replace(";lang", "").replace(";", "_"), + stat_key.replace(":", "/").replace("lmdb/", ""), + ) + self.summary_writer.add_scalar(summary_key, stat_value, info["iters"]["train"]) + # dump the training info + model_util.save_log( + self.args.dout, + progress=epoch + 1, + total=self.args.epochs, + stage="train", + 
best_loss=info["best_loss"], + iters=info["iters"], + ) + model_util.adjust_lr(self.args, epoch, schedulers) + logger.info( + "{} epochs are completed, all the models were saved to: {}".format(self.args.epochs, self.args.dout) + ) diff --git a/src/teach/modeling/ET/alfred/model/speaker.py b/src/teach/modeling/ET/alfred/model/speaker.py new file mode 100644 index 0000000..3c2e809 --- /dev/null +++ b/src/teach/modeling/ET/alfred/model/speaker.py @@ -0,0 +1,234 @@ +import logging + +import numpy as np +import torch +from alfred.model import base +from alfred.nn.enc_lang import EncoderLang +from alfred.nn.enc_visual import FeatureFlat +from alfred.nn.enc_vl import EncoderVL +from alfred.nn.encodings import PosLangEncoding +from alfred.utils import model_util +from torch import nn +from torch.nn import functional as F + +from teach.logger import create_logger + +logger = create_logger(__name__, level=logging.INFO) + + +class Model(base.Model): + def __init__(self, args, embs_ann, vocab_out, pad, seg, for_inference=False): + """ + speaker model + """ + super().__init__(args, embs_ann, vocab_out, pad, seg, for_inference) + + # encoder and visual embeddings + self.encoder_vl, self.encoder_lang = None, None + if any("frames" in ann_type for ann_type in args.data["ann_type"]): + # create a multi-modal encoder + self.encoder_vl = EncoderVL(args) + # create feature embeddings + self.vis_feat = FeatureFlat(input_shape=self.visual_tensor_shape, output_size=args.demb) + else: + # create an encoder for language only + self.encoder_lang = EncoderLang(args.encoder_layers, args, embs_ann) + + # decoder parts + decoder_layer = nn.TransformerDecoderLayer( + args.demb, + args.decoder_lang["heads"], + args.decoder_lang["demb"], + args.decoder_lang["dropout"], + ) + self.decoder = nn.TransformerDecoder(decoder_layer, args.decoder_lang["layers"]) + self.enc_pos = PosLangEncoding(args.demb) if args.decoder_lang["pos_enc"] else None + self.emb_subgoal = nn.Embedding(len(vocab_out), args.demb) + + # final touch + self.init_weights() + + def encode_vl(self, vocab, **inputs): + """ + apply the VL encoder to the inputs + """ + lang = inputs["lang"] if "lang" in inputs else None + frames = inputs["frames"] if "frames" in inputs else None + device = lang.device if lang is not None else frames.device + assert inputs is not None or frames is not None + batch_size = len(lang if lang is not None else frames) + # embed language if the model should see them + if lang is not None: + emb_lang = self.embed_lang(lang, self.embs_ann[vocab.name]) + lengths_lang = inputs["lengths_lang"] + else: + emb_lang = torch.zeros([batch_size, 0, self.args.demb]).to(device) + lengths_lang = torch.tensor([0] * batch_size) + + # embed frames if the model should see them + if frames is not None: + emb_frames = self.embed_frames(frames) + lengths_frames = inputs["lengths_frames"] + length_frames_max = inputs["length_frames_max"] + else: + emb_frames = torch.zeros([batch_size, 0, self.args.demb]).to(device) + lengths_frames, length_frames_max = torch.tensor([0] * batch_size), 0 + # speaker does not use the actions + emb_actions = torch.zeros([batch_size, 0, self.args.demb]).to(device) + lengths_actions = torch.tensor([0] * batch_size) + # encode inputs + hiddens, hiddens_padding = self.encoder_vl( + emb_lang, + emb_frames, + emb_actions, + lengths_lang, + lengths_frames, + lengths_actions, + length_frames_max, + attn_masks=False, + ) + return hiddens, hiddens_padding + + def encode_lang(self, vocab, lang_pad): + """ + apply the language encoder to the 
inputs + """ + embedder_lang = self.embs_ann[vocab.name] + emb_lang, lengths_lang = self.encoder_lang(lang_pad, embedder_lang, vocab, self.pad) + emb_padding = torch.zeros(emb_lang.shape[:2], device=emb_lang.device).bool() + for i, len_l in enumerate(lengths_lang): + emb_padding[i, len_l:] = True + return emb_lang, emb_padding + + def encode_inputs(self, vocab, **inputs): + """ + apply the VL or language encoder to the inputs + """ + if self.encoder_vl is not None: + hiddens, hiddens_padding = self.encode_vl(vocab, **inputs) + else: + hiddens, hiddens_padding = self.encode_lang(vocab, inputs["lang"]) + return hiddens, hiddens_padding + + def forward(self, vocab, **inputs): + """ + forward the model for multiple time-steps (used for training) + """ + # pass inputs to the encoder + hiddens, hiddens_padding = self.encode_inputs(vocab, **inputs) + hiddens = self.enc_pos(hiddens) if self.enc_pos else hiddens + # generate masks + lang_target = inputs["action"] + target_mask = model_util.triangular_mask(lang_target.size(1), lang_target.device) + # right shift the targets + lang_target = lang_target.clone().detach() + lang_target = torch.roll(lang_target, 1, 1) + lang_target[:, 0] = self.seg + # embed targets and add position encodings + target = self.embed_lang(lang_target, self.emb_subgoal) + target = self.enc_pos(target) if self.enc_pos else target + + # decode the outputs with transformer + decoder_out = self.decoder( + tgt=target.transpose(0, 1), + memory=hiddens.transpose(0, 1), + # to avoid looking at the future tokens (the ones on the right) + tgt_mask=target_mask, + # avoid looking on padding of the src + memory_key_padding_mask=hiddens_padding, + ).transpose(0, 1) + # apply a linear layer + decoder_out_flat = decoder_out.reshape(-1, self.args.demb) + lang_out_flat = decoder_out_flat.mm(self.emb_subgoal.weight.t()) + output = {"lang": lang_out_flat.view(len(decoder_out), -1, lang_out_flat.shape[-1])} + return output + + def embed_frames(self, frames_pad): + """ + take a list of frames tensors, pad it, apply dropout and extract embeddings + """ + self.dropout_vis(frames_pad) + frames_4d = frames_pad.view(-1, *frames_pad.shape[2:]) + frames_pad_emb = self.vis_feat(frames_4d).view(*frames_pad.shape[:2], -1) + return frames_pad_emb + + def embed_lang(self, lang_pad, embedder): + """ + embed goal+instr language + """ + lang_pad_emb = embedder(lang_pad) + lang_pad_emb = self.dropout_lang(lang_pad_emb) + return lang_pad_emb + + def compute_batch_loss(self, model_out, gt_dict): + """ + language translation loss function + """ + p_lang = model_out["lang"].view(-1, model_out["lang"].shape[-1]) + l_lang = gt_dict["action"].view(-1) + loss_lang = F.cross_entropy(p_lang, l_lang, reduction="none").mean() + return {"lang": loss_lang} + + def init_weights(self, init_range=0.1): + """ + init embeddings uniformly + """ + super().init_weights(init_range) + self.emb_subgoal.weight.data.uniform_(-init_range, init_range) + + def compute_metrics(self, model_out, gt_dict, metrics_dict, verbose=False): + """ + compute exact matching and f1 score for action predictions + """ + pred_tokens = model_out["lang"].max(2)[1].tolist() + pred_lang = model_util.tokens_to_lang(pred_tokens, self.vocab_out, {self.pad}, join=False) + gt_lang = model_util.tokens_to_lang(gt_dict["action"], self.vocab_out, {self.pad}, join=False) + pred_lang_strs = [" ".join(s) for s in pred_lang] + gt_lang_strs = [" ".join(s) for s in gt_lang] + model_util.compute_f1_and_exact(metrics_dict, pred_lang_strs, gt_lang_strs, "lang") + if verbose: + 
logger.debug("Lang GT:\n{}".format(gt_lang_strs[0])) + logger.debug("Lang predictions:\n{}".format(pred_lang_strs[0])) + logger.debug("EM = {}, F1 = {}".format(metrics_dict["lang/exact"][-1], metrics_dict["lang/f1"][-1])) + + def translate(self, vocab_in, max_decode=300, num_pad_stop=3, **inputs): + """ + lang and frames has shapes [1, LEN] + """ + # prepare + batch_size = len(inputs["lang"] if "lang" in inputs else inputs["frames"]) + device = (inputs["lang"] if "lang" in inputs else inputs["frames"]).device + # pass inputs to the encoder + hiddens, hiddens_padding = self.encode_inputs(vocab_in, **inputs) + assert len(hiddens) == batch_size + + # start the decoding + lang_cur = [[self.seg] for _ in range(batch_size)] + for i in range(max_decode): + tensor_cur = torch.tensor(lang_cur).to(device) + emb_cur = self.embed_lang(tensor_cur, self.emb_subgoal) + if self.enc_pos: + emb_cur = self.enc_pos(emb_cur) + mask_cur = model_util.triangular_mask(i + 1, device) + + decoder_out = self.decoder( + tgt=emb_cur.transpose(0, 1), + memory=hiddens.transpose(0, 1), + tgt_mask=mask_cur, + # avoid looking on padding of the src + memory_key_padding_mask=hiddens_padding, + ).transpose(0, 1) + + # apply a linear layer + decoder_out_flat = decoder_out.reshape(-1, self.args.demb) + lang_out_flat = decoder_out_flat.mm(self.emb_subgoal.weight.t()) + lang_out = lang_out_flat.view(batch_size, -1, lang_out_flat.shape[-1]) + tokens_out = lang_out.max(2)[1] + for j in range(batch_size): + lang_cur[j].append(tokens_out[i, -1].item()) + if len(tokens_out[0]) > num_pad_stop and (np.array(lang_cur)[:, -num_pad_stop:] == self.pad).all(): + break + + lang_result = [l[1:] for l in lang_cur] + lang_result = [[t for t in tokens if t != self.pad] for tokens in lang_result] + return lang_result diff --git a/src/teach/modeling/ET/alfred/model/train.py b/src/teach/modeling/ET/alfred/model/train.py new file mode 100755 index 0000000..dfc26f6 --- /dev/null +++ b/src/teach/modeling/ET/alfred/model/train.py @@ -0,0 +1,205 @@ +import logging +import os +import random +import shutil + +import numpy as np +import torch +from alfred import constants +from alfred.config import exp_ingredient, train_ingredient +from alfred.data import GuidesEdhDataset, GuidesSpeakerDataset +from alfred.model.learned import LearnedModel +from alfred.utils import data_util, helper_util, model_util +from sacred import Experiment + +from teach.logger import create_logger + +ex = Experiment("train", ingredients=[train_ingredient, exp_ingredient]) + +logger = create_logger(__name__, level=logging.INFO) + + +def prepare(train, exp): + """ + create logdirs, check dataset, seed pseudo-random generators + """ + # args and init + args = helper_util.AttrDict(**train, **exp) + args.dout = os.path.join(constants.ET_LOGS, args.name) + args.data["train"] = args.data["train"].split(",") + args.data["valid"] = args.data["valid"].split(",") if args.data["valid"] else [] + num_datas = len(args.data["train"]) + len(args.data["valid"]) + for key in ("ann_type",): + args.data[key] = args.data[key].split(",") + if len(args.data[key]) == 1: + args.data[key] = args.data[key] * num_datas + if len(args.data[key]) != num_datas: + raise ValueError("Provide either 1 {} or {} separated by commas".format(key, num_datas)) + # set seeds + torch.manual_seed(args.seed) + random.seed(a=args.seed) + np.random.seed(args.seed) + # make output dir + logger.info("Train args: %s" % str(args)) + if not os.path.isdir(args.dout): + os.makedirs(args.dout) + return args + + +def 
load_only_matching_layers(model, pretrained_model, train_lmdb_name): + pretrained_dict = {} + model_dict = model.state_dict() + + logger.debug("Pretrained Model keys: %s" % str(pretrained_model["model"].keys())) + logger.debug("Model state dict keys: %s" % str(model_dict.keys())) + + for name, param in pretrained_model["model"].items(): + model_name = name + if name not in model_dict.keys(): + model_name = name.replace("lmdb_human", train_lmdb_name) + if model_name not in model_dict.keys(): + logger.debug("No matching key ignoring %s" % model_name) + continue + + if param.size() == model_dict[model_name].size(): + logger.debug( + "Matched name and size: %s %s %s" % (name, str(param.size()), str(model_dict[model_name].size())) + ) + pretrained_dict[model_name] = param + else: + logger.debug("Mismatched size: %s %s %s" % (name, str(param.size()), str(model_dict[model_name].size()))) + logger.debug("Matched keys: %s" % str(pretrained_dict.keys())) + return pretrained_dict + + +def create_model(args, embs_ann, vocab_out): + """ + load a model and its optimizer + """ + prev_train_info = model_util.load_log(args.dout, stage="train") + if args.resume and os.path.exists(os.path.join(args.dout, "latest.pth")): + # load a saved model + loadpath = os.path.join(args.dout, "latest.pth") + model, optimizer = model_util.load_model(loadpath, args.device, prev_train_info["progress"] - 1) + assert model.vocab_out.contains_same_content(vocab_out) + model.args = args + else: + # create a new model + if not args.resume and os.path.isdir(args.dout): + shutil.rmtree(args.dout) + model = LearnedModel(args, embs_ann, vocab_out) + model = model.to(torch.device(args.device)) + optimizer = None + if args.pretrained_path: + if "/" not in args.pretrained_path: + # a relative path at the logdir was specified + args.pretrained_path = model_util.last_model_path(args.pretrained_path) + logger.info("Loading pretrained model from {}".format(args.pretrained_path)) + pretrained_model = torch.load(args.pretrained_path, map_location=torch.device(args.device)) + if args.use_alfred_weights: + pretrained_dict = load_only_matching_layers(model, pretrained_model, args.data["train"][0]) + model_dict = model.state_dict() + model_dict.update(pretrained_dict) + model.load_state_dict(model_dict) + loaded_keys = pretrained_dict.keys() + else: + model.load_state_dict(pretrained_model["model"], strict=False) + loaded_keys = set(model.state_dict().keys()).intersection(set(pretrained_model["model"].keys())) + assert len(loaded_keys) + logger.debug("Loaded keys: %s", str(loaded_keys)) + # put encoder on several GPUs if asked + if torch.cuda.device_count() > 1: + logger.info("Parallelizing the model") + model.model = helper_util.DataParallel(model.model) + return model, optimizer, prev_train_info + + +def load_data(name, args, ann_type, valid_only=False): + """ + load dataset and wrap them into torch loaders + """ + partitions = ([] if valid_only else ["train"]) + ["valid_seen", "valid_unseen"] + datasets = [] + for partition in partitions: + if args.model == "speaker": + dataset = GuidesSpeakerDataset(name, partition, args, ann_type) + elif args.model == "transformer": + dataset = GuidesEdhDataset(name, partition, args, ann_type) + else: + raise ValueError("Unknown model: {}".format(args.model)) + datasets.append(dataset) + return datasets + + +def wrap_datasets(datasets, args): + """ + wrap datasets with torch loaders + """ + batch_size = args.batch // len(args.data["train"]) + loader_args = { + "num_workers": args.num_workers, + "drop_last": 
(torch.cuda.device_count() > 1), + "collate_fn": helper_util.identity, + } + if args.num_workers > 0: + # do not prefetch samples, this may speed up data loading + loader_args["prefetch_factor"] = 1 + + loaders = {} + for dataset in datasets: + if dataset.partition == "train": + if args.data["train_load_type"] == "sample": + weights = [1 / len(dataset)] * len(dataset) + num_samples = 16 if args.fast_epoch else (args.data["length"] or len(dataset)) + num_samples = num_samples // len(args.data["train"]) + sampler = torch.utils.data.WeightedRandomSampler(weights, num_samples=num_samples, replacement=True) + loader = torch.utils.data.DataLoader(dataset, batch_size, sampler=sampler, **loader_args) + else: + loader = torch.utils.data.DataLoader(dataset, args.batch, shuffle=True, **loader_args) + else: + loader = torch.utils.data.DataLoader(dataset, args.batch, shuffle=(not args.fast_epoch), **loader_args) + loaders[dataset.id] = loader + return loaders + + +def process_vocabs(datasets, args): + """ + assign the largest output vocab to all datasets, compute embedding sizes + """ + # find the longest vocabulary for outputs among all datasets + for dataset in datasets: + logger.debug("dataset.id = %s, vocab_out = %s" % (dataset.id, str(dataset.vocab_out))) + vocab_out = sorted(datasets, key=lambda x: len(x.vocab_out))[-1].vocab_out + # make all datasets to use this vocabulary for outputs translation + for dataset in datasets: + dataset.vocab_translate = vocab_out + # prepare a dictionary for embeddings initialization: vocab names and their sizes + embs_ann = {} + for dataset in datasets: + embs_ann[dataset.name] = len(dataset.vocab_in) + return embs_ann, vocab_out + + +@ex.automain +def main(train, exp): + """ + train a network using an lmdb dataset + """ + # parse args + args = prepare(train, exp) + # load dataset(s) and process vocabs + datasets = [] + ann_types = iter(args.data["ann_type"]) + for name, ann_type in zip(args.data["train"], ann_types): + datasets.extend(load_data(name, args, ann_type)) + for name, ann_type in zip(args.data["valid"], ann_types): + datasets.extend(load_data(name, args, ann_type, valid_only=True)) + # assign vocabs to datasets and check their sizes for nn.Embeding inits + embs_ann, vocab_out = process_vocabs(datasets, args) + logger.debug("In train.main, vocab_out = %s" % str(vocab_out)) + # wrap datasets with loaders + loaders = wrap_datasets(datasets, args) + # create the model + model, optimizer, prev_train_info = create_model(args, embs_ann, vocab_out) + # start train loop + model.run_train(loaders, prev_train_info, optimizer=optimizer) diff --git a/src/teach/modeling/ET/alfred/model/transformer.py b/src/teach/modeling/ET/alfred/model/transformer.py new file mode 100644 index 0000000..a38ebec --- /dev/null +++ b/src/teach/modeling/ET/alfred/model/transformer.py @@ -0,0 +1,256 @@ +import torch +from alfred.model import base +from alfred.nn.dec_object import ObjectClassifier +from alfred.nn.enc_lang import EncoderLang +from alfred.nn.enc_visual import FeatureFlat +from alfred.nn.enc_vl import EncoderVL +from alfred.nn.encodings import DatasetLearnedEncoding +from alfred.utils import model_util +from torch import nn +from torch.nn import functional as F + + +class Model(base.Model): + def __init__(self, args, embs_ann, vocab_out, pad, seg, for_inference=False): + """ + transformer agent + """ + super().__init__(args, embs_ann, vocab_out, pad, seg, for_inference) + + # encoder and visual embeddings + self.encoder_vl = EncoderVL(args) + # pre-encoder for language 
tokens + self.encoder_lang = EncoderLang(args.encoder_lang["layers"], args, embs_ann) + # feature embeddings + self.vis_feat = FeatureFlat(input_shape=self.visual_tensor_shape, output_size=args.demb) + # dataset id learned encoding (applied after the encoder_lang) + self.dataset_enc = None + if args.enc["dataset"]: + self.dataset_enc = DatasetLearnedEncoding(args.demb, args.data["train"]) + # embeddings for actions + self.emb_action = nn.Embedding(len(vocab_out), args.demb) + # dropouts + self.dropout_action = nn.Dropout2d(args.dropout["transformer"]["action"]) + + # decoder parts + encoder_output_size = args.demb + self.dec_action = nn.Linear(encoder_output_size, args.demb) + self.dec_object = ObjectClassifier(encoder_output_size) + + # skip connection for object predictions + self.object_feat = FeatureFlat(input_shape=self.visual_tensor_shape, output_size=args.demb) + + # progress monitoring heads + if self.args.progress_aux_loss_wt > 0: + self.dec_progress = nn.Linear(encoder_output_size, 1) + if self.args.subgoal_aux_loss_wt > 0: + self.dec_subgoal = nn.Linear(encoder_output_size, 1) + + # final touch + self.init_weights() + self.reset() + + def forward(self, vocab, **inputs): + """ + forward the model for multiple time-steps (used for training) + """ + # embed language + output = {} + emb_lang, lengths_lang = self.embed_lang(inputs["lang"], vocab) + emb_lang = self.dataset_enc(emb_lang, vocab) if self.dataset_enc else emb_lang + + # embed frames and actions + emb_frames, emb_object = self.embed_frames(inputs["frames"]) + lengths_frames = inputs["lengths_frames"] + emb_actions = self.embed_actions(inputs["action"]) + assert emb_frames.shape == emb_actions.shape + lengths_actions = lengths_frames.clone() + length_frames_max = inputs["length_frames_max"] + + # concatenate language, frames and actions and add encodings + encoder_out, _ = self.encoder_vl( + emb_lang, + emb_frames, + emb_actions, + lengths_lang, + lengths_frames, + lengths_actions, + length_frames_max, + ) + # use outputs corresponding to visual frames for prediction only + encoder_out_visual = encoder_out[:, lengths_lang.max().item() : lengths_lang.max().item() + length_frames_max] + + # get the output actions + decoder_input = encoder_out_visual.reshape(-1, self.args.demb) + action_emb_flat = self.dec_action(decoder_input) + action_flat = action_emb_flat.mm(self.emb_action.weight.t()) + action = action_flat.view(*encoder_out_visual.shape[:2], *action_flat.shape[1:]) + + # get the output objects + emb_object_flat = emb_object.view(-1, self.args.demb) + decoder_input = decoder_input + emb_object_flat + object_flat = self.dec_object(decoder_input) + objects = object_flat.view(*encoder_out_visual.shape[:2], *object_flat.shape[1:]) + output.update({"action": action, "object": objects}) + + # (optionally) get progress monitor predictions + if self.args.progress_aux_loss_wt > 0: + progress = torch.sigmoid(self.dec_progress(encoder_out_visual)) + output["progress"] = progress + if self.args.subgoal_aux_loss_wt > 0: + subgoal = torch.sigmoid(self.dec_subgoal(encoder_out_visual)) + output["subgoal"] = subgoal + return output + + def embed_lang(self, lang_pad, vocab): + """ + take a list of annotation tokens and extract embeddings with EncoderLang + """ + assert lang_pad.max().item() < len(vocab) + embedder_lang = self.embs_ann[vocab.name] + emb_lang, lengths_lang = self.encoder_lang(lang_pad, embedder_lang, vocab, self.pad) + if self.args.detach_lang_emb: + emb_lang = emb_lang.clone().detach() + return emb_lang, lengths_lang + + 
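# A minimal, self-contained sketch of the weight-tied action head used in
# Model.forward above (demb, vocabulary size and batch shape are made-up toy values):
# decoder features are mapped back to demb and scored against the same embedding
# table that encodes previous actions.
import torch
from torch import nn

demb, num_actions, batch, num_frames = 8, 10, 2, 5
emb_action = nn.Embedding(num_actions, demb)       # also embeds previous actions
dec_action = nn.Linear(demb, demb)
encoder_out_visual = torch.randn(batch, num_frames, demb)
flat = dec_action(encoder_out_visual.reshape(-1, demb))
logits = flat.mm(emb_action.weight.t())            # scores over the action vocabulary
action = logits.view(batch, num_frames, num_actions)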
def embed_frames(self, frames_pad): + """ + take a list of frames tensors, pad it, apply dropout and extract embeddings + """ + self.dropout_vis(frames_pad) + frames_4d = frames_pad.view(-1, *frames_pad.shape[2:]) + frames_pad_emb = self.vis_feat(frames_4d).view(*frames_pad.shape[:2], -1) + frames_pad_emb_skip = self.object_feat(frames_4d).view(*frames_pad.shape[:2], -1) + return frames_pad_emb, frames_pad_emb_skip + + def embed_actions(self, actions): + """ + embed previous actions + """ + emb_actions = self.emb_action(actions) + emb_actions = self.dropout_action(emb_actions) + return emb_actions + + def reset(self): + """ + reset internal states (used for real-time execution during eval) + """ + self.frames_traj = torch.zeros(1, 0, *self.visual_tensor_shape) + self.action_traj = torch.zeros(1, 0).long() + + def step(self, input_dict, vocab, prev_action=None): + """ + forward the model for a single time-step (used for real-time execution during eval) + """ + frames = input_dict["frames"] + device = frames.device + if prev_action is not None: + prev_action_int = vocab["action_low"].word2index(prev_action) + prev_action_tensor = torch.tensor(prev_action_int)[None, None].to(device) + self.action_traj = torch.cat((self.action_traj.to(device), prev_action_tensor), dim=1) + self.frames_traj = torch.cat((self.frames_traj.to(device), frames[None]), dim=1) + # at timestep t we have t-1 prev actions so we should pad them + action_traj_pad = torch.cat((self.action_traj.to(device), torch.zeros((1, 1)).to(device).long()), dim=1) + model_out = self.forward( + vocab=vocab["word"], + lang=input_dict["lang"], + lengths_lang=input_dict["lengths_lang"], + length_lang_max=input_dict["length_lang_max"], + frames=self.frames_traj.clone(), + lengths_frames=torch.tensor([self.frames_traj.size(1)]), + length_frames_max=self.frames_traj.size(1), + action=action_traj_pad, + ) + step_out = {} + for key, value in model_out.items(): + # return only the last actions, ignore the rest + step_out[key] = value[:, -1:] + return step_out + + def compute_batch_loss(self, model_out, gt_dict): + """ + loss function for Seq2Seq agent + """ + losses = dict() + + # action loss + action_pred = model_out["action"].view(-1, model_out["action"].shape[-1]) + action_gt = gt_dict["action"].view(-1) + pad_mask = action_gt != self.pad + + # Calculate loss only over future actions + action_pred_mask = gt_dict["driver_actions_pred_mask"].view(-1) + + action_loss = F.cross_entropy(action_pred, action_gt, reduction="none") + action_loss *= pad_mask.float() + if not self.args.compute_train_loss_over_history: + action_loss *= action_pred_mask.float() + action_loss = action_loss.mean() + losses["action"] = action_loss * self.args.action_loss_wt + + # object classes loss + if len(gt_dict["object"]) > 0: + object_pred = model_out["object"] + object_gt = torch.cat(gt_dict["object"], dim=0) + + if self.args.compute_train_loss_over_history: + interact_idxs = gt_dict["obj_interaction_action"].view(-1).nonzero(as_tuple=False).view(-1) + else: + interact_idxs = ( + (gt_dict["driver_actions_pred_mask"] * gt_dict["obj_interaction_action"]) + .view(-1) + .nonzero(as_tuple=False) + .view(-1) + ) + if interact_idxs.nelement() > 0: + object_pred = object_pred.view(object_pred.shape[0] * object_pred.shape[1], *object_pred.shape[2:]) + object_loss = model_util.obj_classes_loss(object_pred, object_gt, interact_idxs) + losses["object"] = object_loss * self.args.object_loss_wt + + # subgoal completion loss + if self.args.subgoal_aux_loss_wt > 0: + subgoal_pred = 
model_out["subgoal"].squeeze(2) + subgoal_gt = gt_dict["subgoals_completed"] + subgoal_loss = F.mse_loss(subgoal_pred, subgoal_gt, reduction="none") + subgoal_loss = subgoal_loss.view(-1) * pad_mask.float() + subgoal_loss = subgoal_loss.mean() + losses["subgoal_aux"] = self.args.subgoal_aux_loss_wt * subgoal_loss + + # progress monitoring loss + if self.args.progress_aux_loss_wt > 0: + progress_pred = model_out["progress"].squeeze(2) + progress_gt = gt_dict["goal_progress"] + progress_loss = F.mse_loss(progress_pred, progress_gt, reduction="none") + progress_loss = progress_loss.view(-1) * pad_mask.float() + progress_loss = progress_loss.mean() + losses["progress_aux"] = self.args.progress_aux_loss_wt * progress_loss + + # maximize entropy of the policy if asked + if self.args.entropy_wt > 0.0: + policy_entropy = -F.softmax(action_pred, dim=1) * F.log_softmax(action_pred, dim=1) + policy_entropy = policy_entropy.mean(dim=1) + policy_entropy *= pad_mask.float() + losses["entropy"] = -policy_entropy.mean() * self.args.entropy_wt + + return losses + + def init_weights(self, init_range=0.1): + """ + init embeddings uniformly + """ + super().init_weights(init_range) + self.dec_action.bias.data.zero_() + self.dec_action.weight.data.uniform_(-init_range, init_range) + self.emb_action.weight.data.uniform_(-init_range, init_range) + + def compute_metrics(self, model_out, gt_dict, metrics_dict, compute_train_loss_over_history): + """ + compute exact matching and f1 score for action predictions + """ + preds = model_util.extract_action_preds(model_out, self.pad, self.vocab_out, lang_only=True) + stop_token = self.vocab_out.word2index("Stop") + gt_actions = model_util.tokens_to_lang(gt_dict["action"], self.vocab_out, {self.pad, stop_token}) + model_util.compute_f1_and_exact(metrics_dict, [p["action"] for p in preds], gt_actions, "action") + model_util.compute_obj_class_precision( + metrics_dict, gt_dict, model_out["object"], compute_train_loss_over_history + ) diff --git a/src/teach/modeling/ET/alfred/nn/attention.py b/src/teach/modeling/ET/alfred/nn/attention.py new file mode 100644 index 0000000..b73b5b5 --- /dev/null +++ b/src/teach/modeling/ET/alfred/nn/attention.py @@ -0,0 +1,44 @@ +from torch import nn +from torch.nn import functional as F + + +class SelfAttn(nn.Module): + """ + self-attention with learnable parameters + """ + + def __init__(self, dhid): + super().__init__() + self.scorer = nn.Linear(dhid, 1) + # scorer: dhid x 1 + + def forward(self, inp): + # inp: batch_size x seq_len x dhid + scores = F.softmax(self.scorer(inp), dim=1) + # scores: batch_size x seq_len x 1 + cont = scores.transpose(1, 2).bmm(inp).squeeze(1) + # cont: batch_size x seq_len + return cont + + +class DotAttn(nn.Module): + """ + dot-attention (or soft-attention) + """ + + def forward(self, inp, h): + # inp: batch_size x seq_len x dhid + # h: batch_size x dhid + score = self.softmax(inp, h) + # score: batch_size x seq_len x 1 + score_expanded = score.expand_as(inp) + # score_expanded: batch_size x seq_len x dhid + # output: batch_size x dhid + return score_expanded.mul(inp).sum(1), score + + def softmax(self, inp, h): + raw_score = inp.bmm(h.unsqueeze(2)) + # raw_score: batch_size x seq_len x 1 + score = F.softmax(raw_score, dim=1) + # score: batch_size x seq_len x 1 + return score diff --git a/src/teach/modeling/ET/alfred/nn/dec_object.py b/src/teach/modeling/ET/alfred/nn/dec_object.py new file mode 100644 index 0000000..c38fc20 --- /dev/null +++ b/src/teach/modeling/ET/alfred/nn/dec_object.py @@ -0,0 +1,22 @@ 
+import os + +import torch +from alfred import constants +from torch import nn + + +class ObjectClassifier(nn.Module): + """ + object classifier module (a single FF layer) + """ + + def __init__(self, input_size): + super().__init__() + vocab_obj_path = os.path.join(constants.ET_ROOT, constants.OBJ_CLS_VOCAB) + vocab_obj = torch.load(vocab_obj_path) + num_classes = len(vocab_obj) + self.linear = nn.Linear(input_size, num_classes) + + def forward(self, x): + out = self.linear(x) + return out diff --git a/src/teach/modeling/ET/alfred/nn/enc_lang.py b/src/teach/modeling/ET/alfred/nn/enc_lang.py new file mode 100644 index 0000000..182c20f --- /dev/null +++ b/src/teach/modeling/ET/alfred/nn/enc_lang.py @@ -0,0 +1,89 @@ +import torch +from alfred.nn.encodings import InstrLangEncoding, PosLangEncoding +from torch import nn + + +class EncoderLang(nn.Module): + def __init__( + self, + num_layers, + args, + embs_ann, + subgoal_token="<>", + goal_token="<>", + ): + """ + transformer encoder for language inputs + """ + super(EncoderLang, self).__init__() + self.subgoal_token = subgoal_token + self.goal_token = goal_token + + # transofmer layers + encoder_layer = nn.TransformerEncoderLayer( + args.demb, + args.encoder_heads, + args.demb, + args.dropout["transformer"]["encoder"], + ) + if args.encoder_lang["shared"]: + enc_transformer = nn.TransformerEncoder(encoder_layer, num_layers) + self.enc_transformers = nn.ModuleDict({data: enc_transformer for data in embs_ann.keys()}) + else: + self.enc_transformers = nn.ModuleDict( + {data: nn.TransformerEncoder(encoder_layer, num_layers) for data in embs_ann.keys()} + ) + + # encodings + self.enc_pos = PosLangEncoding(args.demb) if args.encoder_lang["pos_enc"] else None + self.enc_instr = InstrLangEncoding(args.demb) if args.encoder_lang["instr_enc"] else None + self.enc_layernorm = nn.LayerNorm(args.demb) + self.enc_dropout = nn.Dropout(args.dropout["lang"], inplace=True) + + def forward(self, lang_pad, embedder, vocab, pad): + """ + pass embedded inputs through embeddings and encode them using a transformer + """ + # pad the input language sequences and embed them with a linear layer + mask_pad = lang_pad == pad + emb_lang = embedder(lang_pad) + # add positional encodings + mask_token = EncoderLang.mask_token(lang_pad, vocab, {self.subgoal_token, self.goal_token}) + emb_lang = self.encode_inputs(emb_lang, mask_token, mask_pad) + # pass the inputs through the encoder + hiddens = EncoderLang.encoder(self.enc_transformers, emb_lang, mask_pad, vocab) + lengths = (lang_pad != pad).sum(dim=1) + return hiddens, lengths + + @staticmethod + def mask_token(lang_pad, vocab, tokens): + """ + returns mask of the tokens + """ + tokens_mask = torch.zeros_like(lang_pad).long() + for token in tokens: + tokens_mask += lang_pad == vocab.word2index(token) + return tokens_mask.bool() + + @staticmethod + def encoder(encoders, emb_lang, mask_pad, vocab, mask_attn=None): + """ + compute encodings for all tokens using a normal flat encoder + """ + # skip mask: mask padded words + if mask_attn is None: + # attention mask: all tokens can attend to all others + mask_attn = torch.zeros((mask_pad.shape[1], mask_pad.shape[1]), device=mask_pad.device).float() + # encode the inputs + output = encoders[vocab.name](emb_lang.transpose(0, 1), mask_attn, mask_pad).transpose(0, 1) + return output + + def encode_inputs(self, emb_lang, mask_token, mask_pad): + """ + add positional encodings, apply layernorm and dropout + """ + emb_lang = self.enc_pos(emb_lang) if self.enc_pos else emb_lang + 
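# A minimal sketch of the call pattern in EncoderLang.encoder above, with toy sizes:
# the sequence is fed time-major to nn.TransformerEncoder with an all-zero attention
# mask (every token may attend to every other) and a boolean key-padding mask that
# hides padded positions.
import torch
from torch import nn

demb, heads, seq_len, batch = 8, 2, 5, 2
layer = nn.TransformerEncoderLayer(demb, heads, demb, 0.0)
encoder = nn.TransformerEncoder(layer, num_layers=1)
emb_lang = torch.randn(batch, seq_len, demb)
mask_pad = torch.zeros(batch, seq_len, dtype=torch.bool)
mask_pad[:, -1] = True                             # pretend the last token is padding
mask_attn = torch.zeros(seq_len, seq_len)
out = encoder(emb_lang.transpose(0, 1), mask_attn, mask_pad).transpose(0, 1)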
emb_lang = self.enc_instr(emb_lang, mask_token) if self.enc_instr else emb_lang + emb_lang = self.enc_dropout(emb_lang) + emb_lang = self.enc_layernorm(emb_lang) + return emb_lang diff --git a/src/teach/modeling/ET/alfred/nn/enc_visual.py b/src/teach/modeling/ET/alfred/nn/enc_visual.py new file mode 100644 index 0000000..e9c6e79 --- /dev/null +++ b/src/teach/modeling/ET/alfred/nn/enc_visual.py @@ -0,0 +1,240 @@ +import contextlib +import logging +import os +import types + +import numpy as np +import torch +import torch.nn as nn +from alfred import constants +from alfred.nn.transforms import Transforms +from alfred.utils import data_util +from torchvision import models +from torchvision.transforms import functional as F + +from teach.logger import create_logger + +logger = create_logger(__name__, level=logging.INFO) + + +class Resnet18(nn.Module): + """ + pretrained Resnet18 from torchvision + """ + + def __init__(self, device, checkpoint_path=None, share_memory=False): + super().__init__() + self.device = device + self.model = models.resnet18(pretrained=True) + self.model = nn.Sequential(*list(self.model.children())[:-2]) + if checkpoint_path is not None: + logger.info("Loading ResNet checkpoint from {}".format(checkpoint_path)) + model_state_dict = torch.load(checkpoint_path, map_location=device) + model_state_dict = { + key: value for key, value in model_state_dict.items() if "GU_" not in key and "text_pooling" not in key + } + model_state_dict = {key: value for key, value in model_state_dict.items() if "fc." not in key} + model_state_dict = {key.replace("resnet.", ""): value for key, value in model_state_dict.items()} + self.model.load_state_dict(model_state_dict) + self.model = self.model.to(torch.device(device)) + self.model = self.model.eval() + if share_memory: + self.model.share_memory() + self._transform = Transforms.get_transform("default") + + def extract(self, x): + x = self._transform(x).to(torch.device(self.device)) + return self.model(x) + + +class RCNN(nn.Module): + """ + pretrained FasterRCNN or MaskRCNN from torchvision + """ + + def __init__( + self, + archi, + device="cuda", + checkpoint_path=None, + share_memory=False, + load_heads=False, + ): + super().__init__() + self.device = device + self.feat_layer = "3" + if archi == "maskrcnn": + self.model = models.detection.maskrcnn_resnet50_fpn( + pretrained=(checkpoint_path is None), + pretrained_backbone=(checkpoint_path is None), + min_size=800, + ) + elif archi == "fasterrcnn": + self.model = models.detection.fasterrcnn_resnet50_fpn( + pretrained=(checkpoint_path is None), + pretrained_backbone=(checkpoint_path is None), + min_size=224, + ) + else: + raise ValueError("Unknown model type = {}".format(archi)) + + if archi == "maskrcnn": + self._transform = self.model.transform + else: + self._transform = Transforms.get_transform("default") + if not load_heads: + for attr in ("backbone", "body"): + self.model = getattr(self.model, attr) + + if checkpoint_path is not None: + self.load_from_checkpoint(checkpoint_path, load_heads, device, archi, "backbone.body") + self.model = self.model.to(torch.device(device)) + self.model = self.model.eval() + if share_memory: + self.model.share_memory() + if load_heads: + # if the model is used for predictions, prepare a vocabulary + self.vocab_pred = {i: class_name for i, class_name in enumerate(constants.OBJECTS_ACTIONS)} + + def extract(self, images): + if isinstance(self._transform, models.detection.transform.GeneralizedRCNNTransform): + images_normalized = 
self._transform(torch.stack([F.to_tensor(img) for img in images]))[0].tensors + else: + images_normalized = torch.stack([self._transform(img) for img in images]) + images_normalized = images_normalized.to(torch.device(self.device)) + model_body = self.model + if hasattr(self.model, "backbone"): + model_body = self.model.backbone.body + features = model_body(images_normalized) + return features[self.feat_layer] + + def load_from_checkpoint(self, checkpoint_path, load_heads, device, archi, prefix): + logger.info("Loading RCNN checkpoint from {}".format(checkpoint_path)) + state_dict = torch.load(checkpoint_path, map_location=device) + if not load_heads: + # load only the backbone + state_dict = {k.replace(prefix + ".", ""): v for k, v in state_dict.items() if prefix + "." in k} + else: + # load a full model, replace pre-trained head(s) with (a) new one(s) + num_classes, in_features = state_dict["roi_heads.box_predictor.cls_score.weight"].shape + box_predictor = models.detection.faster_rcnn.FastRCNNPredictor(in_features, num_classes) + self.model.roi_heads.box_predictor = box_predictor + if archi == "maskrcnn": + # and replace the mask predictor with a new one + in_features_mask = self.model.roi_heads.mask_predictor.conv5_mask.in_channels + hidden_layer = 256 + mask_predictor = models.detection.mask_rcnn.MaskRCNNPredictor( + in_features_mask, hidden_layer, num_classes + ) + self.model.roi_heads.mask_predictor = mask_predictor + self.model.load_state_dict(state_dict) + + def predict_objects(self, image, confidence_threshold=0.0, verbose=False): + image = F.to_tensor(image).to(torch.device(self.device)) + output = self.model(image[None])[0] + preds = [] + for pred_idx in range(len(output["scores"])): + score = output["scores"][pred_idx].cpu().item() + if score < confidence_threshold: + continue + box = output["boxes"][pred_idx].cpu().numpy() + label = self.vocab_pred[output["labels"][pred_idx].cpu().item()] + if verbose: + logger.debug("{} at {}".format(label, box)) + pred = types.SimpleNamespace(label=label, box=box, score=score) + if "masks" in output: + pred.mask = output["masks"][pred_idx].cpu().numpy() + preds.append(pred) + return preds + + +class FeatureExtractor(nn.Module): + def __init__( + self, + archi, + device="cuda", + checkpoint=None, + share_memory=False, + compress_type=None, + load_heads=False, + ): + super().__init__() + self.feat_shape = data_util.get_feat_shape(archi, compress_type) + self.eval_mode = True + if archi == "resnet18": + assert not load_heads + self.model = Resnet18(device, checkpoint, share_memory) + else: + self.model = RCNN(archi, device, checkpoint, share_memory, load_heads=load_heads) + self.compress_type = compress_type + # load object class vocabulary + vocab_obj_path = os.path.join(constants.ET_ROOT, constants.OBJ_CLS_VOCAB) + self.vocab_obj = torch.load(vocab_obj_path) + + def featurize(self, images, batch=32): + feats = [] + with (torch.set_grad_enabled(False) if not self.model.model.training else contextlib.nullcontext()): + for i in range(0, len(images), batch): + images_batch = images[i : i + batch] + feats.append(self.model.extract(images_batch)) + feat = torch.cat(feats, dim=0) + if self.compress_type is not None: + feat = data_util.feat_compress(feat, self.compress_type) + assert self.feat_shape[1:] == feat.shape[1:] + return feat + + def predict_objects(self, image, verbose=False): + with torch.set_grad_enabled(False): + pred = self.model.predict_objects(image, verbose=verbose) + return pred + + def train(self, mode): + if self.eval_mode: + 
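# A minimal sketch of the batched extraction loop in FeatureExtractor.featurize
# above, with a toy module standing in for the ResNet/RCNN backbone; gradients are
# disabled as they would be with the backbone in eval mode.
import torch
from torch import nn

extractor = nn.Conv2d(3, 4, kernel_size=3, padding=1)   # stand-in feature extractor
images = torch.randn(20, 3, 16, 16)                     # toy batch of images
batch = 8
feats = []
with torch.set_grad_enabled(False):
    for i in range(0, len(images), batch):
        feats.append(extractor(images[i:i + batch]))
feat = torch.cat(feats, dim=0)                          # (20, 4, 16, 16)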
return + for module in self.children(): + module.train(mode) + + +class FeatureFlat(nn.Module): + """ + a few conv layers to flatten features that come out of ResNet + """ + + def __init__(self, input_shape, output_size): + super().__init__() + if input_shape[0] == -1: + input_shape = input_shape[1:] + layers, activation_shape = self.init_cnn(input_shape, channels=[256, 64], kernels=[1, 1], paddings=[0, 0]) + layers += [Flatten(), nn.Linear(np.prod(activation_shape), output_size)] + self.layers = nn.Sequential(*layers) + + def init_cnn(self, input_shape, channels, kernels, paddings): + layers = [] + planes_in, spatial = input_shape[0], input_shape[-1] + for planes_out, kernel, padding in zip(channels, kernels, paddings): + # do not use striding + stride = 1 + layers += [ + nn.Conv2d( + planes_in, + planes_out, + kernel_size=kernel, + stride=stride, + padding=padding, + ), + nn.BatchNorm2d(planes_out), + nn.ReLU(inplace=True), + ] + planes_in = planes_out + spatial = (spatial - kernel + 2 * padding) // stride + 1 + activation_shape = (planes_in, spatial, spatial) + return layers, activation_shape + + def forward(self, frames): + activation = self.layers(frames) + return activation + + +class Flatten(nn.Module): + def forward(self, x): + return x.view(x.size(0), -1) diff --git a/src/teach/modeling/ET/alfred/nn/enc_vl.py b/src/teach/modeling/ET/alfred/nn/enc_vl.py new file mode 100644 index 0000000..db26e9e --- /dev/null +++ b/src/teach/modeling/ET/alfred/nn/enc_vl.py @@ -0,0 +1,97 @@ +import torch +from alfred.nn.encodings import PosEncoding, PosLearnedEncoding, TokenLearnedEncoding +from alfred.utils import model_util +from torch import nn + + +class EncoderVL(nn.Module): + def __init__(self, args): + """ + transformer encoder for language, frames and action inputs + """ + super(EncoderVL, self).__init__() + + # transofmer layers + encoder_layer = nn.TransformerEncoderLayer( + args.demb, + args.encoder_heads, + args.demb, + args.dropout["transformer"]["encoder"], + ) + self.enc_transformer = nn.TransformerEncoder(encoder_layer, args.encoder_layers) + + # how many last actions to attend to + self.num_input_actions = args.num_input_actions + + # encodings + self.enc_pos = PosEncoding(args.demb) if args.enc["pos"] else None + self.enc_pos_learn = PosLearnedEncoding(args.demb) if args.enc["pos_learn"] else None + self.enc_token = TokenLearnedEncoding(args.demb) if args.enc["token"] else None + self.enc_layernorm = nn.LayerNorm(args.demb) + self.enc_dropout = nn.Dropout(args.dropout["emb"], inplace=True) + + def forward( + self, + emb_lang, + emb_frames, + emb_actions, + lengths_lang, + lengths_frames, + lengths_actions, + length_frames_max, + attn_masks=True, + ): + """ + pass embedded inputs through embeddings and encode them using a transformer + """ + # emb_lang is processed on each GPU separately so they size can vary + length_lang_max = lengths_lang.max().item() + emb_lang = emb_lang[:, :length_lang_max] + # create a mask for padded elements + length_mask_pad = length_lang_max + length_frames_max * (2 if lengths_actions.max() > 0 else 1) + mask_pad = torch.zeros((len(emb_lang), length_mask_pad), device=emb_lang.device).bool() + for i, (len_l, len_f, len_a) in enumerate(zip(lengths_lang, lengths_frames, lengths_actions)): + # mask padded words + mask_pad[i, len_l:length_lang_max] = True + # mask padded frames + mask_pad[i, length_lang_max + len_f : length_lang_max + length_frames_max] = True + # mask padded actions + mask_pad[i, length_lang_max + length_frames_max + len_a :] = True + + # 
encode the inputs + emb_all = self.encode_inputs(emb_lang, emb_frames, emb_actions, lengths_lang, lengths_frames, mask_pad) + + # create a mask for attention (prediction at t should not see frames at >= t+1) + if attn_masks: + mask_attn = model_util.generate_attention_mask( + length_lang_max, + length_frames_max, + emb_all.device, + self.num_input_actions, + ) + else: + # allow every token to attend to all others + mask_attn = torch.zeros((mask_pad.shape[1], mask_pad.shape[1]), device=mask_pad.device).float() + + # encode the inputs + output = self.enc_transformer(emb_all.transpose(0, 1), mask_attn, mask_pad).transpose(0, 1) + return output, mask_pad + + def encode_inputs(self, emb_lang, emb_frames, emb_actions, lengths_lang, lengths_frames, mask_pad): + """ + add encodings (positional, token and so on) + """ + if self.enc_pos is not None: + emb_lang, emb_frames, emb_actions = self.enc_pos( + emb_lang, emb_frames, emb_actions, lengths_lang, lengths_frames + ) + if self.enc_pos_learn is not None: + emb_lang, emb_frames, emb_actions = self.enc_pos_learn( + emb_lang, emb_frames, emb_actions, lengths_lang, lengths_frames + ) + if self.enc_token is not None: + emb_lang, emb_frames, emb_actions = self.enc_token(emb_lang, emb_frames, emb_actions) + emb_cat = torch.cat((emb_lang, emb_frames, emb_actions), dim=1) + emb_cat = self.enc_layernorm(emb_cat) + emb_cat = self.enc_dropout(emb_cat) + return emb_cat diff --git a/src/teach/modeling/ET/alfred/nn/encodings.py b/src/teach/modeling/ET/alfred/nn/encodings.py new file mode 100644 index 0000000..32b94b9 --- /dev/null +++ b/src/teach/modeling/ET/alfred/nn/encodings.py @@ -0,0 +1,173 @@ +import math + +import torch +from torch import nn + + +class PosEncoding(nn.Module): + """ + Transformer-style positional encoding with wavelets + """ + + def __init__(self, d_model, max_len=1250): + super().__init__() + self.d_model = d_model + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + self.register_buffer("pe", pe[None]) + + def forward(self, lang, frames, actions, lens_lang, lens_frames, pos=None): + if pos is None: + enc = self.pe[:, : lang.shape[1] + frames.shape[1]] + else: + enc = [[] for _ in range(len(lang))] + for batch_idx in range(pos.shape[0]): + for pos_idx in range(lang.shape[1] + frames.shape[1]): + enc[batch_idx].append(self.pe[0, pos[batch_idx, pos_idx]]) + enc = torch.stack([torch.stack(pos_batch) for pos_batch in enc]) + enc = enc / math.sqrt(self.d_model) + lang = lang + enc[:, : lang.shape[1]] + + for i in range(frames.shape[0]): + start_idx = lens_lang[i] + end_idx = lens_lang[i] + frames.shape[1] + if end_idx > enc.shape[1]: + end_idx = enc.shape[1] + start_idx = enc.shape[1] - frames.shape[1] + frames[i] = frames[i] + enc[0, start_idx:end_idx] + # use the same position indices for actions as for the frames + for i in range(actions.shape[0]): + start_idx = lens_lang[i] + end_idx = lens_lang[i] + actions.shape[1] + if end_idx > enc.shape[1]: + end_idx = enc.shape[1] + start_idx = enc.shape[1] - actions.shape[1] + actions[i] = actions[i] + enc[0, start_idx:end_idx] + return lang, frames, actions + + +class LearnedEncoding(nn.Module): + """ + Learned additive encoding implemented on top of nn.Embedding + """ + + def __init__(self, d_model, vocab_size, init_range=0.1): + super().__init__() + self.emb 
= nn.Embedding(vocab_size, d_model) + self.emb.weight.data.uniform_(-init_range, init_range) + + def forward(self, x, tokens): + tokens_emb = self.emb(tokens) + return x + tokens_emb + + +class PosLearnedEncoding(nn.Module): + """ + Learned additive positional encoding implemented on top of nn.Embedding + """ + + def __init__(self, d_model, max_pos=1250, init_range=0.1): + super().__init__() + self.emb = nn.Embedding(max_pos, d_model) + self.emb.weight.data.uniform_(-init_range, init_range) + + def forward(self, lang, frames, actions, lens_lang, lens_frames): + pos_lang = torch.stack([torch.arange(0, lang.shape[1])] * lang.shape[0]) + pos_frames = torch.stack([torch.arange(0, frames.shape[1]) + l for l in lens_lang]) + # use the same position indices for actions as for the frames + pos_actions = torch.stack([torch.arange(0, actions.shape[1]) + l for l in lens_lang]) + lang += self.emb(pos_lang.to(lang.device)) + frames += self.emb(pos_frames.to(frames.device)) + actions += self.emb(pos_actions.to(actions.device)) + return lang, frames, actions + + +class TokenLearnedEncoding(nn.Module): + """ + Learned additive img/word/action token encoding implemented on top of nn.Embedding + """ + + def __init__(self, d_model, vocab_size=3, init_range=0.1): + super().__init__() + self.emb = nn.Embedding(vocab_size, d_model) + self.emb.weight.data.uniform_(-init_range, init_range) + + def forward(self, lang, frames, actions): + token_lang = torch.ones(lang.shape[:2], device=lang.device, dtype=torch.long) * 0 + token_lang_emb = self.emb(token_lang) + lang += token_lang_emb + token_frames = torch.ones(frames.shape[:2], device=frames.device, dtype=torch.long) * 1 + token_frames_emb = self.emb(token_frames) + frames += token_frames_emb + token_actions = torch.ones(actions.shape[:2], device=actions.device, dtype=torch.long) * 2 + token_actions_emb = self.emb(token_actions) + actions += token_actions_emb + return lang, frames, actions + + +class PosLangEncoding(nn.Module): + """ + Transformer-style positional encoding with wavelets + """ + + def __init__(self, d_model, max_len=2000): + super().__init__() + self.d_model = d_model + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + self.register_buffer("pe", pe[None]) + + def forward(self, x, pos=None): + if pos is None: + enc = self.pe[:, : x.shape[1]] + else: + enc = [[] for _ in range(len(x))] + for batch_idx in range(pos.shape[0]): + for pos_idx in range(pos.shape[1]): + enc[batch_idx].append(self.pe[0, pos[batch_idx, pos_idx]]) + enc = torch.stack([torch.stack(pos_batch) for pos_batch in enc]) + x = x + enc / math.sqrt(self.d_model) + return x + + +class InstrLangEncoding(PosLangEncoding): + """ + Relative position in an instruction (a sentence) encoding with wavelets + """ + + def forward(self, x, tokens_mask): + counts = torch.zeros_like(tokens_mask)[:, 0].long() + instrs = torch.zeros_like(tokens_mask).long() + # offset the tokens by 1 + tokens_mask[:, 1:] = tokens_mask.clone()[:, :-1] + for i in range(tokens_mask.shape[1] - 1): + instrs[:, i] = counts + counts += tokens_mask[:, i + 1] == True + instrs[:, -1] = instrs[:, -2] + pe_tokens = self.pe[0, instrs] + x = x + pe_tokens / math.sqrt(self.d_model) + return x + + +class DatasetLearnedEncoding(nn.Module): + """ + Learned additive dataset id encoding implemented 
on top of nn.Embedding + """ + + def __init__(self, d_model, datasets, init_range=0.1): + super().__init__() + self.datasets = {dataset: i for i, dataset in enumerate(datasets)} + self.emb = nn.Embedding(len(datasets), d_model) + self.emb.weight.data.uniform_(-init_range, init_range) + + def forward(self, lang, vocab): + dataset_ids = torch.ones(lang.shape[0], device=lang.device, dtype=torch.long) + dataset_emb = self.emb(dataset_ids * self.datasets[vocab.name]) + lang_enc = lang + dataset_emb[:, None] + return lang_enc diff --git a/src/teach/modeling/ET/alfred/nn/transforms.py b/src/teach/modeling/ET/alfred/nn/transforms.py new file mode 100644 index 0000000..5f832b5 --- /dev/null +++ b/src/teach/modeling/ET/alfred/nn/transforms.py @@ -0,0 +1,90 @@ +from torchvision import transforms + + +class Transforms(object): + @staticmethod + def resize(img_size=224): + # expects a PIL Image + return transforms.Resize((img_size, img_size)) + + @staticmethod + def affine(degree=5, translate=0.04, scale=0.02): + # expects a PIL Image + return transforms.RandomAffine( + degrees=(-degree, degree), + translate=(translate, translate), + scale=(1 - scale, 1 + scale), + shear=None, + ) + + @staticmethod + def random_crop(img_size=224): + # expects a PIL Image + return transforms.RandomCrop((img_size, img_size)) + + @staticmethod + def normalize(): + # expects a PIL Image + return transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225], + ), + ] + ) + + @staticmethod + def cutout(p=0.5, scale=(0.02, 0.33), ratio=(0.3, 3.3), value=0.0): + # expects a tensor + return transforms.RandomErasing(p=p, scale=scale, ratio=ratio, value=value) + + @staticmethod + def get_transform(transform="default"): + if transform == "default": + return transforms.Compose([Transforms.resize(224), Transforms.normalize()]) + elif transform == "none": + return transforms.ToTensor() + elif transform == "crops": + return transforms.Compose( + [ + Transforms.resize(240), + Transforms.random_crop(224), + Transforms.normalize(), + ] + ) + elif transform == "cutout": + return transforms.Compose([Transforms.resize(224), Transforms.normalize(), Transforms.cutout()]) + elif transform == "affine": + return transforms.Compose([Transforms.resize(224), Transforms.affine(), Transforms.normalize()]) + elif transform == "affine_crops": + return transforms.Compose( + [ + Transforms.resize(240), + Transforms.random_crop(224), + Transforms.affine(), + Transforms.normalize(), + ] + ) + elif transform == "affine_crops_cutout": + return transforms.Compose( + [ + Transforms.resize(240), + Transforms.random_crop(224), + Transforms.affine(), + Transforms.normalize(), + Transforms.cutout(), + ] + ) + elif transform == "affine_cutout": + return transforms.Compose( + [ + Transforms.resize(224), + Transforms.affine(), + Transforms.normalize(), + Transforms.cutout(), + ] + ) + else: + raise ValueError("Image augmentation {} is not implemented".format(transform)) diff --git a/src/teach/modeling/ET/alfred/utils/data_util.py b/src/teach/modeling/ET/alfred/utils/data_util.py new file mode 100644 index 0000000..b2bff49 --- /dev/null +++ b/src/teach/modeling/ET/alfred/utils/data_util.py @@ -0,0 +1,325 @@ +import json +import logging +import os +import pickle +import re +import shutil +import string +from copy import deepcopy + +import lmdb +import torch +from alfred import constants +from alfred.utils import helper_util +from PIL import Image +from torch.nn.utils.rnn import pad_sequence +from tqdm 
import tqdm + +from teach.logger import create_logger + +logger = create_logger(__name__, level=logging.INFO) + + +def read_images(image_path_list): + images = [] + for image_path in image_path_list: + image_orig = Image.open(image_path) + images.append(image_orig.copy()) + image_orig.close() + return images + + +def read_traj_images(json_path, image_folder): + with open(json_path) as json_file: + json_dict = json.load(json_file) + + images_dir = json_path.parents[2] / image_folder / json_path.parts[-2] / json_path.parts[-1].split(".")[0] + + fimages = [images_dir / im for im in json_dict["driver_image_history"] + json_dict["driver_images_future"]] + logger.debug("Loading images from %s" % images_dir) + logger.debug("Expected image files: %s" % "\n\t".join([str(x) for x in fimages])) + + if not all([os.path.exists(path) for path in fimages]): + return None + assert len(fimages) > 0 + images = read_images(fimages) + return images + + +def extract_features(images, extractor): + if images is None: + return None + feat = extractor.featurize(images, batch=8) + return feat.cpu() + + +def process_traj(traj_orig, traj_path, r_idx, preprocessor): + # copy trajectory + traj = traj_orig.copy() + # root & split + traj["root"] = str(traj_path) + partition = traj_path.parts[-2] + traj["split"] = partition + traj["repeat_idx"] = r_idx + # numericalize actions for train/valid splits + preprocessor.process_actions(traj_orig, traj) + # numericalize language + if "test" in partition: + preprocessor.process_language(traj_orig, traj, r_idx, is_test_split=True) + else: + preprocessor.process_language(traj_orig, traj, r_idx, is_test_split=False) + return traj + + +def gather_feats(files, output_path): + if output_path.is_dir(): + shutil.rmtree(output_path) + lmdb_feats = lmdb.open(str(output_path), 700 * 1024 ** 3, writemap=True) + with lmdb_feats.begin(write=True) as txn_feats: + for idx, path in tqdm(enumerate(files)): + traj_feats = torch.load(path).numpy() + txn_feats.put("{:06}".format(idx).encode("ascii"), traj_feats.tobytes()) + lmdb_feats.close() + + +def gather_jsons(files, output_path): + if output_path.exists(): + os.remove(output_path) + jsons = {} + for idx, path in tqdm(enumerate(files)): + with open(path, "rb") as f: + jsons_idx = pickle.load(f) + jsons["{:06}".format(idx).encode("ascii")] = jsons_idx + with output_path.open("wb") as f: + pickle.dump(jsons, f) + + +def get_preprocessor(PreprocessorClass, subgoal_ann, lock, vocab_path=None, task_type="edh"): + if vocab_path is None: + init_words = ["<>", "<>", "<>", "<>"] + else: + init_words = [] + vocabs_with_lock = { + "word": helper_util.VocabWithLock(deepcopy(init_words), lock), + "action_low": helper_util.VocabWithLock(deepcopy(init_words), lock), + "action_high": helper_util.VocabWithLock(deepcopy(init_words), lock), + } + if vocab_path is not None: + vocabs_loaded = torch.load(vocab_path) + for vocab_name, vocab in vocabs_with_lock.items(): + loaded_dict = vocabs_loaded[vocab_name].to_dict() + for _i, w in enumerate(loaded_dict["index2word"]): + vocab.word2index(w, train=True) + vocab.counts[w] = loaded_dict["counts"][w] + + actions_high_init_words = [ + "Navigate", + "Pickup", + "Place", + "Open", + "Close", + "ToggleOn", + "ToggleOff", + "Slice", + "Pour", + "object", + ] + + # Reset low actions vocab to empty because Simbot vocab is different + actions_low_init_words = [ + "Stop", + "Forward", + "Backward", + "Turn Left", + "Turn Right", + "Look Up", + "Look Down", + "Pan Left", + "Pan Right", + "Navigation", + "Pickup", + "Place", + 
"Open", + "Close", + "ToggleOn", + "ToggleOff", + "Slice", + "Pour", + ] + if task_type == "tfd": + actions_low_init_words.append("Text") + + vocabs_with_lock["action_low"] = helper_util.VocabWithLock(actions_low_init_words, lock) + vocabs_with_lock["action_high"] = helper_util.VocabWithLock(actions_high_init_words, lock) + vocab_obj = torch.load(os.path.join(constants.ET_ROOT, constants.OBJ_CLS_VOCAB)).to_dict() + logger.debug("In get_preprocessor, vocab_obj = %s" % vocab_obj["index2word"]) + for _i, w in enumerate(vocab_obj["index2word"]): + vocabs_with_lock["action_high"].word2index(w, train=True) + vocabs_with_lock["action_high"].counts[w] = vocab_obj["counts"][w] + + preprocessor = PreprocessorClass(vocabs_with_lock, subgoal_ann) + return preprocessor + + +def tensorize_and_pad(batch, device, pad): + """ + cast values to torch tensors, put them to the correct device and pad sequences + """ + device = torch.device(device) + input_dict, gt_dict, feat_dict = dict(), dict(), dict() + traj_data, feat_list = list(zip(*batch)) + for key in feat_list[0].keys(): + feat_dict[key] = [el[key] for el in feat_list] + # feat_dict keys that start with these substrings will be assigned to input_dict + input_keys = {"lang", "frames"} + # the rest of the keys will be assigned to gt_dict + + for k, v in feat_dict.items(): + dict_assign = input_dict if any([k.startswith(s) for s in input_keys]) else gt_dict + if k.startswith("lang"): + # no preprocessing should be done here + seqs = [torch.tensor(vv if vv is not None else [pad, pad], device=device).long() for vv in v] + pad_seq = pad_sequence(seqs, batch_first=True, padding_value=pad) + dict_assign[k] = pad_seq + dict_assign["lengths_" + k] = torch.tensor(list(map(len, seqs))) + length_max_key = "length_" + k + "_max" + if ":" in k: + # for translated length keys (e.g. 
lang:lmdb/1x_det) we should use different names + length_max_key = "length_" + k.split(":")[0] + "_max:" + ":".join(k.split(":")[1:]) + dict_assign[length_max_key] = max(map(len, seqs)) + elif k in {"object"}: + # convert lists with object indices to tensors + seqs = [torch.tensor(vv, device=device, dtype=torch.long) for vv in v if len(vv) > 0] + dict_assign[k] = seqs + elif k in {"frames"}: + # frames features were loaded from the disk as tensors + seqs = [vv.clone().detach().to(device).type(torch.float) for vv in v] + pad_seq = pad_sequence(seqs, batch_first=True, padding_value=pad) + dict_assign[k] = pad_seq + dict_assign["lengths_" + k] = torch.tensor(list(map(len, seqs))) + dict_assign["length_" + k + "_max"] = max(map(len, seqs)) + else: + # default: tensorize and pad sequence + seqs = [torch.tensor(vv, device=device, dtype=torch.long) for vv in v] + pad_seq = pad_sequence(seqs, batch_first=True, padding_value=pad) + dict_assign[k] = pad_seq + return traj_data, input_dict, gt_dict + + +def sample_batches(iterators, device, pad, args): + """ + sample a batch from each iterator, return Nones if the iterator is empty + """ + batches_dict = {} + for dataset_id, iterator in iterators.items(): + try: + batches = next(iterator) + except StopIteration as e: + return None + dataset_name = dataset_id.split(":")[1] + traj_data, input_dict, gt_dict = tensorize_and_pad(batches, device, pad) + batches_dict[dataset_name] = (traj_data, input_dict, gt_dict) + return batches_dict + + +def load_vocab(name, ann_type="lang"): + """ + load a vocabulary from the dataset + """ + path = os.path.join(constants.ET_DATA, name, constants.VOCAB_FILENAME) + logger.info("In load_vocab, loading vocab from %s" % path) + vocab_dict = torch.load(path) + # set name and annotation types + for vocab in vocab_dict.values(): + vocab.name = name + vocab.ann_type = ann_type + return vocab_dict + + +def load_vocab_for_inference(model_dir, name, ann_type="lang"): + path = os.path.join(model_dir, constants.VOCAB_FILENAME) + logger.info("In load_vocab, loading vocab from %s" % path) + vocab_dict = torch.load(path) + # set name and annotation types + for vocab in vocab_dict.values(): + vocab.name = name + vocab.ann_type = ann_type + return vocab_dict + + +def get_feat_shape(visual_archi, compress_type=None): + """ + Get feat shape depending on the training archi and compress type + """ + if visual_archi == "fasterrcnn": + # the RCNN model should be trained with min_size=224 + feat_shape = (-1, 2048, 7, 7) + elif visual_archi == "maskrcnn": + # the RCNN model should be trained with min_size=800 + feat_shape = (-1, 2048, 10, 10) + elif visual_archi == "resnet18": + feat_shape = (-1, 512, 7, 7) + else: + raise NotImplementedError("Unknown archi {}".format(visual_archi)) + + if compress_type is not None: + if not re.match(r"\d+x", compress_type): + raise NotImplementedError("Unknown compress type {}".format(compress_type)) + compress_times = int(compress_type[:-1]) + feat_shape = ( + feat_shape[0], + feat_shape[1] // compress_times, + feat_shape[2], + feat_shape[3], + ) + return feat_shape + + +def feat_compress(feat, compress_type): + """ + Compress features by channel average pooling + """ + assert re.match(r"\d+x", compress_type) and len(feat.shape) == 4 + times = int(compress_type[:-1]) + assert feat.shape[1] % times == 0 + feat = feat.reshape((feat.shape[0], times, feat.shape[1] // times, feat.shape[2], feat.shape[3])) + feat = feat.mean(dim=1) + return feat + + +def read_dataset_info(data_name): + """ + Read dataset a feature 
shape and a feature extractor checkpoint path + """ + path = os.path.join(constants.ET_DATA, data_name, "params.json") + with open(path, "r") as f_params: + params = json.load(f_params) + return params + + +def read_dataset_info_for_inference(model_dir): + """ + Read dataset a feature shape and a feature extractor checkpoint path from file stored in model checkpoint + """ + path = os.path.join(model_dir, "params.json") + logger.info("Reading dataset info from %s for model dir %s" % (path, model_dir)) + with open(path, "r") as f_params: + params = json.load(f_params) + return params + + +def remove_spaces(s): + cs = " ".join(s.split()) + return cs + + +def remove_spaces_and_lower(s): + cs = remove_spaces(s) + cs = cs.lower() + return cs + + +def remove_punctuation(s): + cs = s.translate(str.maketrans("", "", string.punctuation)) + cs = remove_spaces_and_lower(cs) + return cs diff --git a/src/teach/modeling/ET/alfred/utils/eval_util.py b/src/teach/modeling/ET/alfred/utils/eval_util.py new file mode 100644 index 0000000..c95533f --- /dev/null +++ b/src/teach/modeling/ET/alfred/utils/eval_util.py @@ -0,0 +1,37 @@ +import logging + +from alfred.nn.enc_visual import FeatureExtractor +from alfred.utils import model_util + +from teach.logger import create_logger + +logger = create_logger(__name__, level=logging.INFO) + + +def load_agent(model_path, dataset_info, args, for_inference=False): + """ + load a pretrained agent and its feature extractor + """ + logger.info("In load_agent, model_path = %s, dataset_info = %s" % (str(model_path), str(dataset_info))) + learned_model, _ = model_util.load_model(model_path, args.device, for_inference=for_inference) + model = learned_model.model + model.eval() + model.args.device = args.device + extractor = FeatureExtractor( + archi=dataset_info["visual_archi"], + device=args.device, + checkpoint=args.visual_checkpoint, + compress_type=dataset_info["compress_type"], + ) + return model, extractor + + +def load_object_predictor(args): + if args.object_predictor is None: + return None + return FeatureExtractor( + archi="maskrcnn", + device=args.device, + checkpoint=args.object_predictor, + load_heads=True, + ) diff --git a/src/teach/modeling/ET/alfred/utils/helper_util.py b/src/teach/modeling/ET/alfred/utils/helper_util.py new file mode 100644 index 0000000..f0f2587 --- /dev/null +++ b/src/teach/modeling/ET/alfred/utils/helper_util.py @@ -0,0 +1,52 @@ +import torch +from vocab import Vocab as VocabBase + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +class DataParallel(torch.nn.DataParallel): + """ + Allow nn.DataParallel to call model's attributes. 
+ """ + + def __getattr__(self, name): + try: + return super().__getattr__(name) + except AttributeError: + return getattr(self.module, name) + + +class VocabWithLock(VocabBase): + """vocab.Vocab with a lock for parallel computations.""" + + def __init__(self, words=(), lock=None): + self.lock = lock + super().__init__(words) + + def word2index(self, word, train=False): + """Original function copy with the self.lock call.""" + if isinstance(word, (list, tuple)): + return [self.word2index(w, train=train) for w in word] + with self.lock: + self.counts[word] += train + if word in self._word2index: + return self._word2index[word] + else: + if train: + self._index2word += [word] + self._word2index[word] = len(self._word2index) + else: + return self._handle_oov_word(word) + index = self._word2index[word] + return index + + +def identity(x): + """ + pickable equivalent of lambda x: x + """ + return x diff --git a/src/teach/modeling/ET/alfred/utils/metric_util.py b/src/teach/modeling/ET/alfred/utils/metric_util.py new file mode 100755 index 0000000..96215c8 --- /dev/null +++ b/src/teach/modeling/ET/alfred/utils/metric_util.py @@ -0,0 +1,51 @@ +import collections +import re +import string + + +def normalize_answer(s): + """ + Lower text and remove punctuation, articles and extra whitespace. + """ + + def remove_articles(text): + regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) + return re.sub(regex, " ", text) + + def white_space_fix(text): + return " ".join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return "".join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def get_tokens(s): + if not s: + return [] + return normalize_answer(s).split() + + +def compute_exact(a_gold, a_pred): + return int(normalize_answer(a_gold) == normalize_answer(a_pred)) + + +def compute_f1(a_gold, a_pred): + gold_toks = get_tokens(a_gold) + pred_toks = get_tokens(a_pred) + common = collections.Counter(gold_toks) & collections.Counter(pred_toks) + num_same = sum(common.values()) + if len(gold_toks) == 0 or len(pred_toks) == 0: + # If either is no-answer, then F1 is 1 if they agree, 0 otherwise + return int(gold_toks == pred_toks) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(pred_toks) + recall = 1.0 * num_same / len(gold_toks) + f1 = (2 * precision * recall) / (precision + recall) + return f1 diff --git a/src/teach/modeling/ET/alfred/utils/model_util.py b/src/teach/modeling/ET/alfred/utils/model_util.py new file mode 100644 index 0000000..87e5205 --- /dev/null +++ b/src/teach/modeling/ET/alfred/utils/model_util.py @@ -0,0 +1,399 @@ +import collections +import copy +import json +import logging +import os +from importlib import import_module + +import numpy as np +import torch +from alfred import constants +from alfred.utils import metric_util +from torch.nn import functional as F + +from teach.logger import create_logger + +logger = create_logger(__name__, level=logging.INFO) + + +def adjust_lr(args, epoch, schedulers): + """ + adjust optimizer learning rate w.r.t the schedulers + """ + if epoch >= args.lr["warmup_epoch"]: + schedulers["base"].step() + else: + schedulers["warmup"].step() + + +def create_optimizer_and_schedulers(first_epoch, args, parameters, optimizer=None): + """ + create a scheduler for the learning rate + """ + # create an optimizer if it was not provided + init_lr = args.lr["init"] * args.lr["warmup_scale"] + if args.lr["warmup_scale"] != 1: 
+        assert args.lr["warmup_epoch"] > 0
+    if optimizer is None:
+        assert args.optimizer in ("adam", "adamw")
+        OptimizerClass = torch.optim.Adam if args.optimizer == "adam" else torch.optim.AdamW
+        optimizer = OptimizerClass(parameters, lr=init_lr, weight_decay=args.weight_decay)
+    else:
+        for param_group in optimizer.param_groups:
+            param_group["lr"] = init_lr
+
+    # create a learning rate scheduler
+    assert args.lr["profile"] in ("linear", "cosine", "triangular", "triangular2")
+    if args.lr["profile"] == "linear":
+        lr_scheduler = torch.optim.lr_scheduler.StepLR(
+            optimizer, gamma=args.lr["decay_scale"], step_size=args.lr["decay_epoch"]
+        )
+    elif args.lr["profile"] == "cosine":
+        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
+            optimizer,
+            T_max=(args.epochs - args.lr["warmup_epoch"] - 1),
+            eta_min=args.lr["final"],
+        )
+    else:
+        assert min(args.lr["cycle_epoch_up"], args.lr["cycle_epoch_down"]) > 0
+        lr_scheduler = torch.optim.lr_scheduler.CyclicLR(
+            optimizer,
+            base_lr=args.lr["init"],
+            max_lr=args.lr["final"],
+            step_size_up=args.lr["cycle_epoch_up"],
+            step_size_down=args.lr["cycle_epoch_down"],
+            mode=args.lr["profile"],
+            cycle_momentum=False,
+        )
+
+    # create a learning rate scheduler for the warmup period
+    warmup_scheduler = None
+    if args.lr["warmup_epoch"]:
+        warmup_scheduler = torch.optim.lr_scheduler.ExponentialLR(
+            optimizer,
+            gamma=(1 / args.lr["warmup_scale"] ** (1 / args.lr["warmup_epoch"])),
+        )
+
+    # if we are not starting from the first epoch, fast-forward the schedulers
+    for epoch in range(first_epoch):
+        if epoch >= args.lr["warmup_epoch"]:
+            lr_scheduler.step()
+        else:
+            warmup_scheduler.step()
+    return optimizer, {"base": lr_scheduler, "warmup": warmup_scheduler}
+
+
+def load_model(fsave, device, check_epoch=None, for_inference=False):
+    """
+    load a .pth model checkpoint from disk
+    """
+    logger.info("Loading from {} to {}".format(fsave, device))
+    save = torch.load(fsave, map_location=device)
+    LearnedModel = import_module("alfred.model.learned").LearnedModel
+    save["args"]["model_dir"] = os.path.dirname(fsave)
+    model = LearnedModel(save["args"], save["embs_ann"], save["vocab_out"], for_inference)
+    model.load_state_dict(save["model"])
+    OptimizerClass = torch.optim.Adam if save["args"].optimizer == "adam" else torch.optim.AdamW
+    optimizer = OptimizerClass(model.parameters(), lr=1e-3, weight_decay=save["args"].weight_decay)
+    optimizer.load_state_dict(save["optim"])
+    if check_epoch:
+        assert save["metric"]["epoch"] == check_epoch, "Epochs in info.json and latest.pth do not match"
+    model = model.to(torch.device(device))
+    optimizer_to(optimizer, torch.device(device))
+    return model, optimizer
+
+
+def load_model_args(fsave):
+    """
+    load the model's args from disk
+    """
+    save = torch.load(fsave, map_location=lambda storage, loc: storage)
+    return save["args"]
+
+
+def save_model(model, model_name, stats, optimizer=None, symlink=False):
+    """
+    save the model to args.dout/model_name or create a symlink from the latest model to args.dout/model_name
+    """
+    save_path = os.path.join(model.args.dout, model_name)
+    if not symlink:
+        # nn.DataParallel-related renaming
+        state_dict = {key.replace("model.module.", "model."): value for key, value in model.state_dict().items()}
+        assert optimizer is not None
+        torch.save(
+            {
+                "metric": stats,
+                "model": state_dict,
+                "optim": optimizer.state_dict(),
+                "args": model.args,
+                "vocab_out": model.vocab_out,
+                "embs_ann": model.embs_ann,
+            },
+            save_path,
+        )
+    else:
+        # create a symlink to the last saved model
+        model_path = os.path.join(model.args.dout, "model_{:02d}.pth".format(stats["epoch"]))
+        if os.path.islink(save_path):
+            os.unlink(save_path)
+        os.symlink(model_path, save_path)
+
+
+def tensorboard(writer, metrics, split, iter, frequency, batch_size):
+    if (iter // batch_size) % frequency == 0:
+        for metric_name, metric_value_list in metrics.items():
+            metric_value = np.mean(metric_value_list[-frequency:])
+            writer.add_scalar("{}/{}".format(split, metric_name), metric_value, iter)
+
+
+def save_log(dout, progress, total, stage, **kwargs):
+    """
+    append a progress entry (stage, progress, total) to info.json for job monitoring
+    """
+    info_path = os.path.join(dout, "info.json")
+    info_dicts = []
+    if os.path.exists(info_path):
+        with open(info_path, "r") as f:
+            info_dicts = json.load(f)
+    info_dict = {"stage": stage, "progress": progress, "total": total}
+    info_dict.update(kwargs)
+    info_dicts.append(info_dict)
+    with open(info_path, "w") as f:
+        json.dump(info_dicts, f)
+
+
+def load_log(dout, stage):
+    """
+    load the latest info.json entry for the given stage to continue training from the correct place
+    """
+    info_path = os.path.join(dout, "info.json")
+    if os.path.exists(info_path):
+        with open(info_path) as f:
+            info_dicts = json.load(f)
+        info_dict = [el for el in info_dicts if el["stage"] == stage][-1]
+    else:
+        info_dict = {"progress": 0, "best_loss": {}, "iters": {}}
+    if isinstance(info_dict["best_loss"], dict):
+        info_dict["best_loss"] = collections.defaultdict(lambda: 1e10, info_dict["best_loss"])
+    if isinstance(info_dict["iters"], dict):
+        info_dict["iters"] = collections.defaultdict(lambda: 0, info_dict["iters"])
+    return info_dict
+
+
+def update_log(dout, stage, update, **kwargs):
+    """
+    update the latest info.json entry for the given stage
+    """
+    assert update in ("increase", "rewrite")
+    info_path = os.path.join(dout, "info.json")
+    assert os.path.exists(info_path)
+    with open(info_path) as f:
+        info_dicts = json.load(f)
+    info_dict = copy.deepcopy([el for el in info_dicts if el["stage"] == stage][-1])
+    # update the values
+    for key, value in kwargs.items():
+        assert key in info_dict
+        new_value = value + info_dict[key] if update == "increase" else value
+        info_dict[key] = new_value
+    # decide what to do with the list with updated values
+    if info_dicts[-1]["stage"] == stage:
+        # rewrite the values
+        info_dicts[-1] = info_dict
+    else:
+        # append a new list element
+        info_dicts.append(info_dict)
+    # dump to the disk
+    with open(info_path, "w") as f:
+        json.dump(info_dicts, f)
+
+
+def triangular_mask(size, device, diagonal_shift=1):
+    """
+    generate an upper-triangular mask with -inf above the (shifted) diagonal and 0 elsewhere
+    """
+    square = torch.triu(torch.ones(size, size, device=device), diagonal=diagonal_shift)
+    square = square.masked_fill(square == 1.0, float("-inf"))
+    return square
+
+
+def generate_attention_mask(len_lang, len_frames, device, num_input_actions=0):
+    """
+    generate mask for attention (a timestep at t does not attend to timesteps after t)"""
+    # 1. language should attend only to language
+    lang_to_lang = torch.zeros((len_lang, len_lang), device=device).float()
+    lang_to_rest = torch.ones((len_lang, len_frames * 2), device=device).float() * float("-inf")
+    lang_to_all = torch.cat((lang_to_lang, lang_to_rest), dim=1)
+    # 2.1 frames should attend to all language tokens
+    frames_to_lang = torch.zeros((len_frames, len_lang), device=device).float()
+    # 2.2 frames should attend to frames with timestep <= t
+    frames_to_frames = triangular_mask(len_frames, device)
+    # 2.3 frames should attend to actions with timestep < t; first make all actions invisible
+    frames_to_actions = torch.ones((len_frames, len_frames), device=device).float() * float("-inf")
+    # 2.3 then unmask `num_input_actions` previous actions for each frame (excluding index t)
+    for a_idx in range(num_input_actions):
+        for f_idx in range(len_frames):
+            if f_idx - 1 - a_idx < 0:
+                # the index is out of bounds
+                continue
+            frames_to_actions[f_idx, f_idx - 1 - a_idx] = 0.0
+    frames_to_all = torch.cat((frames_to_lang, frames_to_frames, frames_to_actions), dim=1)
+    # 3. actions should attend to the same indices as frames
+    actions_to_all = frames_to_all.clone()
+    # 4. concatenate all the masks
+    all_to_all = torch.cat((lang_to_all, frames_to_all, actions_to_all), dim=0)
+    return all_to_all
+
+
+def process_prediction(action, objects, pad, vocab_action, clean_special_tokens, predict_object=True):
+    """
+    process a single trajectory, return it as a dict
+    """
+    # remove padding tokens
+    if pad in action:
+        pad_start_idx = action.index(pad)
+        action = action[:pad_start_idx]
+        objects = objects[:pad_start_idx]
+    if clean_special_tokens:
+        # remove everything from the stop token onwards
+        stop_token = vocab_action.word2index("Stop")
+        if stop_token in action:
+            stop_start_idx = action.index(stop_token)
+            action = action[:stop_start_idx]
+            objects = objects[:stop_start_idx]
+    # index to API actions
+    words = vocab_action.index2word(action)
+
+    if predict_object:
+        pred_object = objects[None].max(2)[1].cpu().numpy()
+    else:
+        pred_object = None
+    pred_processed = {
+        "action": " ".join(words),
+        "object": pred_object,
+    }
+    return pred_processed
+
+
+def extract_action_preds(model_out, pad, vocab_action, clean_special_tokens=True, lang_only=False):
+    """
+    output processing for a VLN agent
+    """
+    zipped_data = zip(model_out["action"].max(2)[1].tolist(), model_out["object"])
+    predict_object = not lang_only
+    preds_list = [
+        process_prediction(action, objects, pad, vocab_action, clean_special_tokens, predict_object)
+        for action, objects in zipped_data
+    ]
+    return preds_list
+
+
+def compute_f1_and_exact(metrics, preds, labels, loss_key):
+    """
+    compute F1 and exact match scores for agent output
+    """
+    m = collections.defaultdict(list)
+    for pred_str, label_str in zip(preds, labels):
+        pred_list, label_list = pred_str.lower().split(" "), label_str.lower().split(" ")
+        # compute f1 score for the full sequence of actions
+        m["{}/f1".format(loss_key)].append(metric_util.compute_f1(label_str, pred_str))
+        # compute exact matching for each timestep individually
+        for pred_action, label_action in zip(pred_list, label_list):
+            m["{}/exact".format(loss_key)].append(metric_util.compute_exact(label_action, pred_action))
+    m_averaged = {k: sum(v) / len(v) for k, v in m.items()}
+    for k, v in m_averaged.items():
+        metrics[k].append(v)
+
+
+def compute_obj_class_precision(metrics, gt_dict, classes_out, compute_train_loss_over_history):
+    """
+    compute precision of predictions for interaction object classes
+    """
+    if len(gt_dict["object"]) > 0:
+        if compute_train_loss_over_history:
+            interact_idxs = torch.nonzero(gt_dict["obj_interaction_action"])
+        else:
+            interact_idxs = torch.nonzero(gt_dict["driver_actions_pred_mask"] * gt_dict["obj_interaction_action"])
+        obj_classes_prob = classes_out[tuple(interact_idxs.T)]
+        obj_classes_pred = obj_classes_prob.max(1)[1]
+        obj_classes_gt = torch.cat(gt_dict["object"], dim=0)
+        precision = torch.sum(obj_classes_pred == obj_classes_gt) / len(obj_classes_gt)
+        metrics["action/object"].append(precision.item())
+    else:
metrics["action/object"].append(0.0) + + +def obj_classes_loss(pred_obj_cls, gt_obj_cls, interact_idxs): + """ + Compute a cross-entropy loss for the object class predictions. + """ + pred_obj_cls_inter = pred_obj_cls[interact_idxs] + # the interaction objects should be non zeros + assert not (gt_obj_cls == 0).any() + # compute the loss for interaction objects + obj_cls_loss = F.cross_entropy(pred_obj_cls_inter, gt_obj_cls, reduction="mean") + return obj_cls_loss + + +def tokens_to_lang(tokens, vocab, skip_tokens=None, join=True): + """ + convert tokens into human-readable words + """ + if skip_tokens is None: + skip_tokens = {} + + def _tokens_to_lang(seq): + if isinstance(seq, torch.Tensor): + seq = seq.tolist() + lang = [vocab.index2word(t) for t in seq if t not in skip_tokens] + lang = " ".join(lang) if join else lang + return lang + + if isinstance(tokens[0], int): + # a list of ints is provided, only one sequence + output = _tokens_to_lang(tokens) + else: + # a list of lists is provided, several sequences + output = [_tokens_to_lang(seq) for seq in tokens] + return output + + +def translate_to_vocab(tokens, vocab, vocab_translate, skip_new_tokens=False): + """ + translate tokens from orig vocab to translate vocab + """ + if vocab_translate.contains_same_content(vocab): + return tokens + lang_orig = tokens_to_lang(tokens, vocab, join=False) + tokens_new = [] + for word in lang_orig: + if skip_new_tokens and word not in vocab_translate.counts: + word = "<>" + tokens_new.append(vocab_translate.word2index(word)) + if not skip_new_tokens: + lang_new = tokens_to_lang(tokens_new, vocab_translate, join=False) + assert lang_orig == lang_new + return tokens_new + + +def last_model_path(exp_name): + """ + get path of the last model in the exp + """ + model_path = os.path.join(constants.ET_LOGS, exp_name, "latest.pth") + assert os.path.islink(model_path) + return model_path + + +def optimizer_to(optim, device): + for param in optim.state.values(): + # Not sure there are any global tensors in the state dict + if isinstance(param, torch.Tensor): + param.data = param.data.to(device) + if param._grad is not None: + param._grad.data = param._grad.data.to(device) + elif isinstance(param, dict): + for subparam in param.values(): + if isinstance(subparam, torch.Tensor): + subparam.data = subparam.data.to(device) + if subparam._grad is not None: + subparam._grad.data = subparam._grad.data.to(device) diff --git a/src/teach/modeling/ET/data/.gitkeep b/src/teach/modeling/ET/data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/teach/modeling/ET/files/human.vocab b/src/teach/modeling/ET/files/human.vocab new file mode 100644 index 0000000..55b095a Binary files /dev/null and b/src/teach/modeling/ET/files/human.vocab differ diff --git a/src/teach/modeling/ET/files/obj_cls.vocab b/src/teach/modeling/ET/files/obj_cls.vocab new file mode 100644 index 0000000..771cc98 Binary files /dev/null and b/src/teach/modeling/ET/files/obj_cls.vocab differ diff --git a/src/teach/modeling/ET/files/overview.png b/src/teach/modeling/ET/files/overview.png new file mode 100644 index 0000000..a159582 Binary files /dev/null and b/src/teach/modeling/ET/files/overview.png differ diff --git a/src/teach/modeling/ET/files/synth.vocab b/src/teach/modeling/ET/files/synth.vocab new file mode 100644 index 0000000..01339bc Binary files /dev/null and b/src/teach/modeling/ET/files/synth.vocab differ diff --git a/src/teach/modeling/ET/logs/.gitkeep b/src/teach/modeling/ET/logs/.gitkeep new file mode 100644 index 
0000000..e69de29 diff --git a/src/teach/modeling/__init__.py b/src/teach/modeling/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/teach/replay/episode_replay.py b/src/teach/replay/episode_replay.py index 9097dc7..2f0a1d5 100644 --- a/src/teach/replay/episode_replay.py +++ b/src/teach/replay/episode_replay.py @@ -363,6 +363,7 @@ def _add_interaction(self, idx, interact_oid, logged_success): def _set_up_new_episode(self, obs_dir, turn_on_lights, task=None): api_success = True + self.simulator.reset_stored_data() logger.info("Starting episode...") self.simulator.start_new_episode( world=self.episode.world, diff --git a/src/teach/simulators/simulator_THOR.py b/src/teach/simulators/simulator_THOR.py index aed9ff0..8e95fff 100644 --- a/src/teach/simulators/simulator_THOR.py +++ b/src/teach/simulators/simulator_THOR.py @@ -424,6 +424,24 @@ def start_new_episode( custom_object_metadata=self.__custom_object_metadata, ) + def save(self, file_name=None): + """ + Save the session using the current state as the final simulator state. This does not shut down the simulator. + Call done() instead if simulator should be shut down after this + :param file_name: If file_name is not None, the simulator session is saved in the same format as original games + """ + # Add final state to log. + state = self.get_scene_object_locs_and_states() + self.current_episode.final_state = Initialization( + time_start=time.time() - self.start_time, + agents=state["agents"], + objects=state["objects"], + custom_object_metadata=self.__custom_object_metadata, + ) + + # Save log file + super().save(file_name=file_name) + def done(self, file_name=None): """ Shut down the simulator and save the session with final simulator state; Should be called at end of collection/ @@ -1806,20 +1824,24 @@ def __update_custom_coffee_prop(self, event, objs_before_event=None): reliability and checks that a container just got placed in a coffee maker and the coffee maker was on """ cur_objects = self.get_objects(event) - coffee_makers = [obj for obj in cur_objects if "CoffeeMachine" in obj["objectType"]] - coffee_maker_ids = set([obj["objectId"] for obj in coffee_makers]) + coffee_maker_ids = set( + [obj["objectId"] for obj in cur_objects if "CoffeeMachine" in obj["objectType"] and obj["isToggled"]] + ) for obj in cur_objects: + prev_filled_with_liquid = False if objs_before_event is not None: prev_state = self.__get_object_by_id(objs_before_event, obj["objectId"]) - else: - prev_state = None + if prev_state: + prev_filled_with_liquid = prev_state["isFilledWithLiquid"] parent_receptacles = self.get_parent_receptacles(obj, cur_objects) + placed_in_toggled_coffee_maker = False + if parent_receptacles is not None and len(set(parent_receptacles).intersection(coffee_maker_ids)) > 0: + placed_in_toggled_coffee_maker = True if ( - parent_receptacles is not None - and len(set(parent_receptacles).intersection(coffee_maker_ids)) > 0 + placed_in_toggled_coffee_maker and obj["canFillWithLiquid"] and obj["isFilledWithLiquid"] - and (prev_state is None or not prev_state["isFilledWithLiquid"]) + and not prev_filled_with_liquid ): self.__update_custom_object_metadata(obj["objectId"], "simbotIsFilledWithCoffee", True) @@ -1852,13 +1874,15 @@ def __update_sink_interaction_outcomes(self, event): for child_obj in objs_in_sink: if child_obj["isDirty"]: - ac = dict(action="CleanObject", objectId=child_obj["objectId"]) + ac = dict(action="CleanObject", objectId=child_obj["objectId"], forceAction=True) if debug_print_all_sim_steps: 
logger.info("step %s", ac) self.controller.step(ac) if child_obj["canFillWithLiquid"]: - ac = dict(action="FillObjectWithLiquid", objectId=child_obj["objectId"], fillLiquid="water") + ac = dict( + action="FillObjectWithLiquid", objectId=child_obj["objectId"], fillLiquid="water", forceAction=True + ) if debug_print_all_sim_steps: logger.info("step %s", ac) self.controller.step(ac) diff --git a/src/teach/simulators/simulator_base.py b/src/teach/simulators/simulator_base.py index 65c9855..d6821f6 100644 --- a/src/teach/simulators/simulator_base.py +++ b/src/teach/simulators/simulator_base.py @@ -111,6 +111,17 @@ def set_task_by_id(self, task_id: int, task_params=None, comments=""): def set_task_by_name(self, task_name: str, task_params=None, comments=""): raise NotImplementedError("Derived class must implement this!") + def reset_stored_data(self): + """ + This removes data of previous tasks / episodes from the simulator object and should be used with caution + This should precede calls to start_new_episode() and set_task() to ensure that a future call to save() or done() + will save session data properly. + """ + logger.info("Resetting dataset object and removing previously stored episodes...") + task_type = self._dataset.task_type + comments = self._dataset.comments + self._dataset = Dataset(task_type=task_type, definitions=None, comments=comments, version="2.0") + def start_new_episode( self, world=None, diff --git a/src/teach/utils.py b/src/teach/utils.py index e7c4466..618ecba 100644 --- a/src/teach/utils.py +++ b/src/teach/utils.py @@ -4,9 +4,11 @@ import copy import json +import os from pathlib import Path import numpy as np +from PIL import Image from teach.dataset.task_THOR import Task_THOR from teach.logger import create_logger @@ -378,3 +380,19 @@ def dynamically_load_class(package_path, class_name): module = __import__(package_path, fromlist=[class_name]) klass = getattr(module, class_name) return klass + + +def load_images(image_dir, image_file_names): + images = list() + if not image_file_names: + return images + if not os.path.exists(image_dir): + raise Exception(f"{image_dir} doesn't exist") + for f in image_file_names: + image_file = os.path.join(image_dir, f) + if not os.path.exists(image_file): + continue + image_orig = Image.open(image_file) + images.append(image_orig.copy()) + image_orig.close() + return images