diff --git a/.gitignore b/.gitignore
index f4d759f..e8aaad7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,5 @@ build
 dist
 *.egg-info/
 src/teach/analysis/.ipynb_checkpoints/
-pip-wheel-metadata/
\ No newline at end of file
+pip-wheel-metadata/
+*.pyc
\ No newline at end of file
diff --git a/README.md b/README.md
index 7d20cd6..5d70c86 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 Aishwarya Padmakumar*, Jesse Thomason*, Ayush Shrivastava, Patrick Lange, Anjali Narayan-Chen, Spandana Gella, Robinson Piramuthu, Gokhan Tur, Dilek Hakkani-Tur
 
 TEACh is a dataset of human-human interactive dialogues to complete tasks in a simulated household environment.
-The code is licensed under the MIT License (see SOFTWARELICENSE), images are licensed under Apache 2.0
+The code and model weights are licensed under the MIT License (see SOFTWARELICENSE), images are licensed under Apache 2.0
 (see IMAGESLICENSE) and other data files are licensed under CDLA-Sharing 1.0 (see DATALICENSE).
 Please include appropriate licensing and attribution when using our data and code, and please cite our paper.
 
@@ -113,8 +113,263 @@ teach_eval \
     --inference_output_dir $OUTPUT_DIR \
     --split valid_seen \
     --metrics_file $METRICS_FILE
+```
+
+## TEACh Benchmark Challenge
+
+For participation in the challenge, you will need to submit a Docker image containing your code and model.
+Docker containers using your image will serve your model as an HTTP API following the [TEACh API Specification](#teach-api-specification).
+For your convenience, we have included the `teach_api` command, which implements this API and is compatible with models implementing `teach.inference.teach_model.TeachModel`, the same interface used by `teach_inference`.
+
+We have also included two sample Docker images, using `teach.inference.sample_model.SampleModel` and `teach.inference.et_model.ETModel` respectively, in
+[`docker/`](./docker).
+
+When evaluating a submission, the submitted container will be started with access to a single GPU and no internet access. For details see [Step 3 - Start your container](#step-3---start-your-container).
+
+The main evaluation code invoking your submission will also be run as a Docker container. It reuses the `teach_inference` CLI command together with `teach.inference.remote_model.RemoteModel` to call the HTTP API running in your container. For details on how to start it locally see [Step 4 - Start the evaluation](#step-4---start-the-evaluation).
+
+### Testing Locally
+
+The following steps assume you have [downloaded the data](#downloading-the-dataset) to `/home/ubuntu/teach-dataset` and followed [Prerequisites](#prerequisites) and [Remote Server Setup](#remote-server-setup).
+
+
+#### Step 0 - Setup Environment
+
+```buildoutcfg
+export HOST_DATA_DIR=/home/ubuntu/teach-dataset
+export HOST_IMAGES_DIR=/home/ubuntu/images
+export HOST_OUTPUT_DIR=/home/ubuntu/output
+export API_PORT=5000
+export SUBMISSION_PK=168888
+export INFERENCE_GPUS='"device=0"'
+export API_GPUS='"device=1"'
+export SPLIT=valid_seen
+export DOCKER_NETWORK=no-internet
+
+mkdir -p $HOST_IMAGES_DIR $HOST_OUTPUT_DIR
+docker network create --driver=bridge --internal $DOCKER_NETWORK
+```
+Note: If you run on a machine that only has a single GPU, set `API_GPUS='"device=0"'`.
+
+#### Step 1 - Build the `remote-inference-runner` container
+
+```buildoutcfg
+docker build -t remote-inference-runner -f docker/Dockerfile.RemoteInferenceRunner .
+```
+
+#### Step 2 - Build your container
+
+Note: When customizing the images for your own usage, do not edit the following or your submission will fail:
+- `teach_api` options: `--data_dir /data --images_dir /images --split $SPLIT`
+- `EXPOSE 5000` and don't change the port the flask API listens on
+
+For the `SampleModel` example, the corresponding command is:
+
+```buildoutcfg
+docker build -t teach-model-api-samplemodel -f docker/Dockerfile.TEAChAPI-SampleModel .
+```
+
+For the baseline models, use the corresponding command below, replacing `MODEL_VARIANT=et` with
+the desired variant, e.g. `et_plus_a`.
+
+```buildoutcfg
+mkdir -p ./models
+mv $HOST_DATA_DIR/baseline_models ./models/
+mv $HOST_DATA_DIR/et_pretrained_models ./models/
+docker build --build-arg MODEL_VARIANT=et -t teach-model-api-etmodel -f docker/Dockerfile.TEAChAPI-ETModel .
+```
+
+#### Step 3 - Start your container
+
+For the `SampleModel` example, the corresponding command is:
+
+```buildoutcfg
+docker run -d --rm \
+    --gpus $API_GPUS \
+    --name TeachModelAPI \
+    --network $DOCKER_NETWORK \
+    -e SPLIT=$SPLIT \
+    -v $HOST_DATA_DIR:/data:ro \
+    -v $HOST_IMAGES_DIR/$SUBMISSION_PK:/images:ro \
+    -t teach-model-api-samplemodel
+```
+
+For the baseline models, just replace the image name, e.g. if you followed the commands above:
+
+```buildoutcfg
+docker run -d --rm \
+    --gpus $API_GPUS \
+    --name TeachModelAPI \
+    --network $DOCKER_NETWORK \
+    -e SPLIT=$SPLIT \
+    -v $HOST_DATA_DIR:/data:ro \
+    -v $HOST_IMAGES_DIR/$SUBMISSION_PK:/images:ro \
+    -t teach-model-api-etmodel
+```
+
+Verify the API is running with:
+
+```buildoutcfg
+docker exec TeachModelAPI curl @TeachModelAPI:5000/ping
+
+Output:
+{"action":"Look Up","obj_relative_coord":[0.1,0.2]}
+```
+
+#### Step 4 - Start the evaluation
+
+```buildoutcfg
+docker run --rm \
+    --privileged \
+    -e DISPLAY=:0 \
+    -e NVIDIA_DRIVER_CAPABILITIES=all \
+    --name RemoteInferenceRunner \
+    --network $DOCKER_NETWORK \
+    --gpus $INFERENCE_GPUS \
+    -v /tmp/.X11-unix:/tmp/.X11-unix:ro \
+    -v $HOST_DATA_DIR:/data:ro \
+    -v $HOST_IMAGES_DIR/$SUBMISSION_PK:/images \
+    -v $HOST_OUTPUT_DIR/$SUBMISSION_PK:/output \
+    remote-inference-runner teach_inference \
+    --data_dir /data \
+    --output_dir /output \
+    --images_dir /images \
+    --split $SPLIT \
+    --metrics_file /output/metrics_file \
+    --model_module teach.inference.remote_model \
+    --model_class RemoteModel \
+    --model_api_host_and_port "@TeachModelAPI:$API_PORT"
+```
+
+#### Step 5 - Results
+
+The evaluation metrics will be in `$HOST_OUTPUT_DIR/$SUBMISSION_PK/metrics_file`.
+Images for each episode will be in `$HOST_IMAGES_DIR/$SUBMISSION_PK`.
+
+### Running without docker
+
+You may want to test your implementation without rebuilding Docker images. You can do so by directly calling the `teach_api` CLI command, e.g.
+
+Using the `teach.inference.sample_model.SampleModel`:
+
+```buildoutcfg
+export DATA_DIR=/home/ubuntu/teach-dataset
+export IMAGE_DIR=/tmp/images
+
+teach_api \
+    --data_dir $DATA_DIR \
+    --images_dir $IMAGE_DIR
+```
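+
+Once the API is up, you can check that it is reachable by querying the `/ping` route defined in `src/teach/cli/api.py`. A minimal sketch, assuming the API is running locally on its default port 5000 and that the `requests` package is available:
+
+```python
+import requests
+
+# The /ping route returns a dummy action, confirming the server is reachable.
+resp = requests.get("http://localhost:5000/ping")
+print(resp.status_code)  # expected: 200
+print(resp.json())       # expected: {'action': 'Look Up', 'obj_relative_coord': [0.1, 0.2]}
+```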
+
+Using `teach.inference.et_model.ETModel`, assuming you have already moved the models from the teach-dataset location to
+`./models` following the instructions in [Step 2 - Build your container](#step-2---build-your-container):
+
+```buildoutcfg
+export DATA_DIR=/home/ubuntu/teach-dataset
+export IMAGE_DIR=/tmp/images
+
+teach_api \
+    --data_dir $DATA_DIR \
+    --images_dir $IMAGE_DIR \
+    --split valid_seen \
+    --model_module teach.inference.et_model \
+    --model_class ETModel \
+    --model_dir ./models/baseline_models/et \
+    --visual_checkpoint ./models/et_pretrained_models/fasterrcnn_model.pth \
+    --object_predictor ./models/et_pretrained_models/maskrcnn_model.pth \
+    --seed 4
+```
+
+The corresponding command for running `teach_inference` against such an API
+without a container uses `teach.inference.remote_model.RemoteModel`:
+
+```buildoutcfg
+export DATA_DIR=/home/ubuntu/teach-dataset
+export OUTPUT_DIR=/home/ubuntu/output/valid_seen
+export METRICS_FILE=/home/ubuntu/output/valid_seen/metrics
+export IMAGE_DIR=/tmp/images
+
+teach_inference \
+    --data_dir $DATA_DIR \
+    --output_dir $OUTPUT_DIR \
+    --split valid_seen \
+    --metrics_file $METRICS_FILE \
+    --model_module teach.inference.remote_model \
+    --model_class RemoteModel \
+    --model_api_host_and_port 'localhost:5000' \
+    --images_dir $IMAGE_DIR
+```
+
+### Smaller split
+
+For faster turnaround times, it may be useful to locally create a smaller split in `$DATA_DIR/edh_instances/test_seen`
+with a handful of files from `$DATA_DIR/edh_instances/valid_seen`.
+
+### Runtime Checks
+
+The TEACh Benchmark Challenge places a maximum time limit of 36 hours when using all GPUs of a `p3.16xlarge` instance.
+The best way to verify that your code is likely to satisfy this requirement would be to use a script to run two Docker evaluation processes in sequence on a `p3.16xlarge` EC2 instance, one for the `valid_seen` split and one for the `valid_unseen` split.
+Note that you will need to specify `export API_GPUS='"device=1,2,3,4,5,6,7"'` (we reserve GPU 0 for `ai2thor` in our runs) to use all GPUs, and your model code will need to place different instances of the model on different GPUs for this test (see the use of `process_index` in `ETModel.set_up_model()` for an example).
+Also note that while the test splits are close in size to the validation splits, they are not identical, so your runtime estimate will necessarily be an approximation.
+
+### TEACh API Specification
+
+As mentioned above, `teach_api` already implements this API and it is usually not necessary to implement it yourself. During evaluation of submissions, EDH instances without ground truth, along with the images corresponding to the EDH instances' histories, will be available in `/data`. `/images` will contain images produced during inference at runtime. `teach_api` already handles loading these and passes them to your implementation of `teach.inference.teach_model.TeachModel`.
+
+#### Start EDH Instance
+
+This endpoint will be called once at the start of processing a new EDH instance. Currently, we ensure that the API processes only a single EDH instance from start to finish, i.e. once called, it can be assumed that the previous EDH instance has completed.
+
+URL : `/start_new_edh_instance`
+Method : `POST`
+Payload:
+
+```json
+{
+    "edh_name": "[name of the EDH instance file]"
+}
+```
+
+Responses:
+
+Status Code: `200`
+Response: `success`
+
+Status Code: `500`
+Response: `[error message]`
+
+
+#### Get next action
+
+This endpoint will be called at each timestep during inference to get the next predicted action from the model.
+ +URL : `/get_next_action` +Method : `POST` +Payload: + +```json +{ + "edh_name": "[name of the EDH instance file]", + "img_name": "[name of the image taken in the simulator after the previous action]", + "prev_action": "[JSON string representation of previous action]", // this is optional +} +``` + +Responses: + +Status Code: `200` + +```json +{ + "action": "[An action name from all_agent_actions]", + "obj_relative_coord": [0.1, 0.5] // see teach.inference.teach_model.TeachModel.get_next_action +} +``` + +Status Code: `500` +Response: `[error message]` + ## Security See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. @@ -123,3 +378,5 @@ See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more inform The code is licensed under the MIT License (see SOFTWARELICENSE), images are licensed under Apache 2.0 (see IMAGESLICENSE) and other data files are licensed under CDLA-Sharing 1.0 (see DATALICENSE). + + diff --git a/docker/Dockerfile.RemoteInferenceRunner b/docker/Dockerfile.RemoteInferenceRunner new file mode 100644 index 0000000..91aa91a --- /dev/null +++ b/docker/Dockerfile.RemoteInferenceRunner @@ -0,0 +1,36 @@ +FROM ubuntu:18.04 + +# install python3.8 +RUN apt update && \ + apt install --no-install-recommends -y build-essential software-properties-common && \ + add-apt-repository -y ppa:deadsnakes/ppa && \ + apt install --no-install-recommends -y python3.8 python3.8-dev python3.8-distutils && \ + apt clean && rm -rf /var/lib/apt/lists/* + +# register the version in alternatives and set higher priority to 3.8 +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1 +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 2 + +RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y --no-install-recommends \ + ffmpeg \ + vim \ + curl + +# upgrade pip to latest version +RUN curl -s https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ + python3 get-pip.py --force-reinstall && \ + rm get-pip.py + +COPY ./requirements.txt ./requirements.txt +RUN pip install -r requirements.txt + +# copy all source code into the image +COPY . . + +ENV PYTHONPATH /src +RUN pip install -e . + +# Download AI2Thor executable +RUN python3 -c "from teach.settings import get_settings; from teach.simulators.simulator_THOR import COMMIT_ID, TEAChController; TEAChController(base_dir=get_settings().AI2THOR_BASE_DIR, download_only=True, commit_id=COMMIT_ID);" + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/docker/Dockerfile.TEAChAPI-ETModel b/docker/Dockerfile.TEAChAPI-ETModel new file mode 100644 index 0000000..bac456b --- /dev/null +++ b/docker/Dockerfile.TEAChAPI-ETModel @@ -0,0 +1,29 @@ +FROM python:3.8 +ARG MODEL_VARIANT et + +# download model files into et_models and copy over +RUN mkdir -p et_models +COPY models/baseline_models/$MODEL_VARIANT et_models + +# Download Faster RCNN and Mask RCNN and copy over +RUN mkdir -p et_pretrained_models +COPY models/et_pretrained_models et_pretrained_models + +# upgrade pip to latest version +RUN curl -s https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ + python3 get-pip.py --force-reinstall && \ + rm get-pip.py + +COPY ./requirements.txt ./requirements.txt +RUN pip install -r requirements.txt +COPY . . +RUN pip install -e . 
+EXPOSE 5000 + +ENV ET_ROOT=/src/teach/modeling/ET/ +ENV PYTHONPATH=/src:$ET_ROOT +ENV SPLIT=valid_seen + +CMD teach_api --model_module teach.inference.et_model --model_class ETModel --data_dir /data --images_dir /images \ +--split $SPLIT --visual_checkpoint /et_pretrained_models/fasterrcnn_model.pth \ +--object_predictor /et_pretrained_models/maskrcnn_model.pth --model_dir /et_models diff --git a/docker/Dockerfile.TEAChAPI-SampleModel b/docker/Dockerfile.TEAChAPI-SampleModel new file mode 100644 index 0000000..4d10ee8 --- /dev/null +++ b/docker/Dockerfile.TEAChAPI-SampleModel @@ -0,0 +1,16 @@ +FROM python:3.8 + +# upgrade pip to latest version +RUN curl -s https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ + python3 get-pip.py --force-reinstall && \ + rm get-pip.py + +COPY ./requirements.txt ./requirements.txt +RUN pip install -r requirements.txt +COPY . . +RUN pip install -e . +EXPOSE 5000 + +ENV SPLIT=valid_seen + +CMD teach_api --model_module teach.inference.sample_model --model_class SampleModel --data_dir /data --images_dir /images --split $SPLIT --seed 4 diff --git a/requirements.txt b/requirements.txt index 50c4be8..1998244 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,27 @@ pydub==0.24.1 python-Levenshtein tqdm pillow -pydantic==1.8.2 \ No newline at end of file +pydantic==1.8.2 +numpy +pandas +opencv-python +vocab +revtok +Pillow +sacred +etaprogress +scikit-video +lmdb +gtimer +filelock +termcolor +torch==1.7.1 +torchvision==0.8.2 +tensorboardX==1.8 +Flask +flask_restful +future +pandoc +six +typing +ConfigArgParse \ No newline at end of file diff --git a/setup.py b/setup.py index 14e82e6..63584e3 100644 --- a/setup.py +++ b/setup.py @@ -26,10 +26,11 @@ "teach_eval = teach.cli.eval:main", "teach_inference = teach.cli.inference:main", "teach_replay = teach.cli.replay:main", + "teach_api = teach.cli.api:main", ] }, include_package_data=True, - python_requires=">=3.6", + python_requires=">=3.7", zip_safe=False, install_requires=[ "ai2thor==3.1.0", diff --git a/src/teach/cli/api.py b/src/teach/cli/api.py new file mode 100644 index 0000000..1a4f35b --- /dev/null +++ b/src/teach/cli/api.py @@ -0,0 +1,221 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +import json +import os +from argparse import ArgumentParser +from os.path import isfile + +from flask import Flask, jsonify, request +from flask_restful import reqparse +from PIL import Image + +from teach.utils import dynamically_load_class, load_images + +app = Flask(__name__) +app.config["JSONIFY_PRETTYPRINT_REGULAR"] = False +app.logger.info("initialize flask server") + + +def parse_args(): + arg_parser = ArgumentParser() + arg_parser.add_argument( + "--data_dir", + type=str, + required=True, + help='Base data directory containing subfolders "games" and "edh_instances', + ) + arg_parser.add_argument( + "--images_dir", + type=str, + required=True, + help="Images directory containing inference image output", + ) + arg_parser.add_argument( + "--split", + type=str, + default="valid_seen", + choices=["train", "valid_seen", "valid_unseen", "test_seen", "test_unseen"], + help="One of train, valid_seen, valid_unseen, test_seen, test_unseen", + ) + arg_parser.add_argument( + "--model_module", + type=str, + default="teach.inference.sample_model", + help="Path of the python module to load the model class from.", + ) + arg_parser.add_argument( + "--model_class", type=str, default="SampleModel", help="Name of the TeachModel class to use during inference." 
+ ) + arg_parser.add_argument( + "--use_edh_file", dest="use_edh_file", action="store_true", help="Use edh file instead of request json." + ) + arg_parser.add_argument( + "--use_img_file", dest="use_img_file", action="store_true", help="Use img file instead of request bytes." + ) + return arg_parser.parse_known_args() + + +teach_args, model_args = parse_args() +model_class = dynamically_load_class(teach_args.model_module, teach_args.model_class) +process_index, num_processes = 1, 1 +model = model_class(process_index, num_processes, model_args=model_args) + + +def _get_edh_instance(req_args): + if teach_args.use_edh_file: + if not req_args.edh_name: + return None, "request parameter edh_name does not have a value" + edh_instance_path = os.path.join(teach_args.data_dir, "edh_instances", teach_args.split, req_args.edh_name) + if not isfile(edh_instance_path): + return None, f"edh file={edh_instance_path} does not exist" + with open(edh_instance_path) as handle: + edh_instance = json.load(handle) + else: + edh_instance = json.loads(req_args.edh_instance) + return edh_instance, None + + +def _get_img(req_args): + if not req_args.img_name: + return None, "request parameter img_name does not have a value" + if teach_args.use_img_file: + img_path = os.path.join(teach_args.images_dir, req_args.img_name) + if not isfile(img_path): + return None, f"image file={img_path} does not exist" + img = Image.open(img_path) + else: + img_file = request.files.get("img") + if not img_file: + return None, f"image is not set in request with key='img'" + img = Image.open(img_file) + return img, None + + +def _get_edh_history_images(edh_name, edh_instance): + edh_history_images = [] + history_file_names = edh_instance["driver_image_history"] + if not history_file_names: + return edh_history_images, None + + try: + if not teach_args.use_img_file: + images = request.files.getlist("edh_history_images") + if images: + for img in images: + edh_history_images.append(Image.open(img)) + + if not edh_history_images: + image_dir = os.path.join(teach_args.data_dir, "images", teach_args.split, edh_instance["game_id"]) + edh_history_images = load_images(image_dir, history_file_names) + + except Exception: + err_msg = f"failed to load history images edh_name={edh_name}" + app.logger.error(err_msg, exc_info=True) + return None, err_msg + + if not edh_history_images: + err_msg = f"history images are empty for edh_name={edh_name} for history_file_names={history_file_names}" + app.logger.error(err_msg) + return None, err_msg + + return edh_history_images, None + + +@app.route("/get_next_action", methods=["POST"]) +def get_next_action(): + req_args = get_next_action_parse_args() + edh_instance, err_msg = _get_edh_instance(req_args) + if err_msg: + return err_msg, 500 + img, err_msg = _get_img(req_args) + if err_msg: + return err_msg, 500 + prev_action = json.loads(req_args.prev_action) if req_args.prev_action else None + try: + action, obj_relative_coord = model.get_next_action(img, edh_instance, prev_action) + except Exception as e: + err_msg = f"failed to get_next_action with edh_name={req_args.edh_name}" + app.logger.error(err_msg, exc_info=True) + return err_msg, 500 + app.logger.debug(f"model.get_next_action returns action={action}, obj_relative_coord={obj_relative_coord}") + resp = jsonify(action=action, obj_relative_coord=obj_relative_coord) + return resp, 200 + + +@app.route("/start_new_edh_instance", methods=["POST"]) +def start_new_edh_instance(): + req_args = start_new_edh_instance_parse_args() + 
app.logger.info(f"start_new_edh_instance with edh_name={req_args.edh_name}") + edh_instance, err_msg = _get_edh_instance(req_args) + if err_msg: + return err_msg, 500 + edh_history_images, err_msg = _get_edh_history_images(req_args.edh_name, edh_instance) + if err_msg: + return err_msg, 500 + try: + model.start_new_edh_instance(edh_instance, edh_history_images) + except Exception as e: + err_msg = f"failed to start_new_edh_instance with edh_name={req_args.edh_name}" + app.logger.error(err_msg, exc_info=True) + return err_msg, 500 + return "success", 200 + + +@app.route("/") +@app.route("/ping") +@app.route("/test") +def test(): + resp = jsonify(action="Look Up", obj_relative_coord=[0.1, 0.2]) + return resp, 200 + + +def get_next_action_parse_args(): + parser = reqparse.RequestParser() + parser.add_argument( + "img_name", + type=str, + help="Image name for PIL Image containing agent's egocentric image.", + ) + parser.add_argument( + "edh_name", + type=str, + help="EDH instance file name.", + ) + parser.add_argument( + "prev_action", + type=str, + help="One of None or a dict with keys 'action' and 'obj_relative_coord' containing returned values.", + ) + parser.add_argument( + "edh_instance", + type=str, + help="One of None or a dict with keys 'action' and 'obj_relative_coord' containing returned values.", + ) + args = parser.parse_args() + return args + + +def start_new_edh_instance_parse_args(): + parser = reqparse.RequestParser() + parser.add_argument( + "edh_name", + type=str, + help="EDH instance file name.", + ) + parser.add_argument( + "edh_instance", + type=str, + help="One of None or a dict with keys 'action' and 'obj_relative_coord' containing returned values.", + ) + args = parser.parse_args() + return args + + +def main(): + app.run(host="0.0.0.0", port=5000) + app.logger.info("started flask server") + + +if __name__ == "__main__": + main() diff --git a/src/teach/cli/download.py b/src/teach/cli/download.py index fd1dacb..0053a48 100644 --- a/src/teach/cli/download.py +++ b/src/teach/cli/download.py @@ -10,6 +10,7 @@ import boto3 from botocore import UNSIGNED from botocore.config import Config +from tqdm import tqdm DEFAULT_DATASET_BUCKET_NAME = "teach-dataset" DEFAULT_DIRECTORY = "/tmp/teach-dataset" @@ -19,9 +20,26 @@ "experiment_games.tar.gz", "images_and_states.tar.gz", "tfd_instances.tar.gz", + "baseline_models.tar.gz", + "et_pretrained_models.tar.gz", ] +def update_download_progressbar(t): + def inner(bytes_amount): + t.update(bytes_amount) + + return inner + + +def download_with_progressbar(s3_resource, bucket_name, key, directory): + file_object = s3_resource.Object(bucket_name=bucket_name, key=key) + total_file_size = file_object.content_length + bucket = s3_resource.Bucket(bucket_name) + with tqdm(total=total_file_size, unit="B", unit_scale=True, desc=key) as t: + bucket.download_file(Key=key, Filename=f"{directory}/{key}", Callback=update_download_progressbar(t)) + + def download_dataset(directory, key=None, bucket_name=DEFAULT_DATASET_BUCKET_NAME): """ Download file from the S3 bucket to the target directory. 
@@ -31,33 +49,39 @@ def download_dataset(directory, key=None, bucket_name=DEFAULT_DATASET_BUCKET_NAM if not os.path.exists(directory): os.makedirs(directory) s3_resource = boto3.resource("s3", region_name="us-east-1", config=Config(signature_version=UNSIGNED)) - bucket = s3_resource.Bucket(bucket_name) if key: print(f"Downloading s3://{bucket_name}/{key} to {directory}") - bucket.download_file(Key=key, Filename=f"{directory}/{key}") + download_with_progressbar(s3_resource, bucket_name, key, directory) else: for file_name in FILE_LIST: print(f"Downloading s3://{bucket_name}/{file_name} to {directory}") - bucket.download_file(Key=file_name, Filename=f"{directory}/{file_name}") + download_with_progressbar(s3_resource, bucket_name, file_name, directory) except Exception as e: print(f"Exception reading from: {bucket_name}") print(f"Exception: {str(e)}") +def extract_all_with_progress(archive, directory): + members = archive.getmembers() + for member in tqdm(iterable=members, total=len(members)): + archive.extract(member=member, path=directory) + + def extract_dataset(directory, file_name=None): """ Extract extract archive file(s) in the given directory. """ print(f"Extracting dataset to {directory}") if file_name: + print(f"Extracting file: {file_name}") with tarfile.open(os.path.join(directory, file_name)) as archive: - archive.extractall(directory) - print(f"Extracted file: {file_name}") + extract_all_with_progress(archive, directory) + else: for file_name in FILE_LIST: + print(f"Extracting file: {file_name}") with tarfile.open(os.path.join(directory, file_name)) as archive: - archive.extractall(directory) - print(f"Extracted file: {file_name}") + extract_all_with_progress(archive, directory) def process_arguments(): @@ -105,7 +129,7 @@ def main(): print("Input directory:", directory) print("Input skip-extract:", skip_extract) - print("Input skip-download:", skip_extract) + print("Input skip-download:", skip_download) print("Input file:", file_name) if not skip_download: diff --git a/src/teach/cli/eval.py b/src/teach/cli/eval.py index 16cf4bf..65f764a 100644 --- a/src/teach/cli/eval.py +++ b/src/teach/cli/eval.py @@ -36,8 +36,8 @@ def main(): "--split", type=str, default="valid_seen", - choices=["train", "valid_seen", "valid_unseen"], - help="One of train, valid_seen, valid_unseen", + choices=["train", "valid_seen", "valid_unseen", "test_seen", "test_unseen"], + help="One of train, valid_seen, valid_unseen, test_seen, test_unseen", ) arg_parser.add_argument( "--max_init_tries", @@ -105,7 +105,7 @@ def main(): results["traj_stats"] = traj_stats with open(args.metrics_file, "w") as h: - json.dump(traj_stats, h) + json.dump(results, h) if __name__ == "__main__": diff --git a/src/teach/cli/inference.py b/src/teach/cli/inference.py index 57fe9fd..5675889 100644 --- a/src/teach/cli/inference.py +++ b/src/teach/cli/inference.py @@ -2,11 +2,12 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
# SPDX-License-Identifier: MIT-0 - -import argparse +import glob import json +import multiprocessing as mp import os from argparse import ArgumentParser +from datetime import datetime from teach.eval.compute_metrics import aggregate_metrics from teach.inference.inference_runner import InferenceRunner, InferenceRunnerConfig @@ -24,6 +25,18 @@ def main(): required=True, help='Base data directory containing subfolders "games" and "edh_instances', ) + arg_parser.add_argument( + "--images_dir", + type=str, + required=True, + help="Images directory for episode replay output", + ) + arg_parser.add_argument( + "--use_img_file", + dest="use_img_file", + action="store_true", + help="synchronous save images with model api use the image file instead of streaming image", + ) arg_parser.add_argument( "--output_dir", type=str, @@ -34,8 +47,8 @@ def main(): "--split", type=str, default="valid_seen", - choices=["train", "valid_seen", "valid_unseen"], - help="One of train, valid_seen, valid_unseen", + choices=["train", "valid_seen", "valid_unseen", "test_seen", "test_unseen"], + help="One of train, valid_seen, valid_unseen, test_seen, test_unseen", ) arg_parser.add_argument( "--edh_instance_file", @@ -72,33 +85,49 @@ def main(): "--model_class", type=str, default="SampleModel", help="Name of the TeachModel class to use during inference." ) arg_parser.add_argument( - "model_args", nargs=argparse.REMAINDER, help="Any unknown arguments will be captured and passed to the model" + "--replay_timeout", type=int, default=500, help="The timeout for playing back the interactions in an episode." ) - args = arg_parser.parse_args() + + start_time = datetime.now() + args, model_args = arg_parser.parse_known_args() if args.edh_instance_file: edh_instance_files = [args.edh_instance_file] else: + inference_output_files = glob.glob(os.path.join(args.output_dir, "inference__*.json")) + finished_edh_instance_files = [os.path.join(fn.split("__")[1]) for fn in inference_output_files] edh_instance_files = [ os.path.join(args.data_dir, "edh_instances", args.split, f) for f in os.listdir(os.path.join(args.data_dir, "edh_instances", args.split)) + if f not in finished_edh_instance_files ] + if not edh_instance_files: + print( + f"all the edh instances have been ran for input_dir={os.path.join(args.data_dir, 'edh_instances', args.split)}" + ) + exit(1) runner_config = InferenceRunnerConfig( data_dir=args.data_dir, split=args.split, output_dir=args.output_dir, + images_dir=args.images_dir, metrics_file=args.metrics_file, num_processes=args.num_processes, max_init_tries=args.max_init_tries, max_traj_steps=args.max_traj_steps, max_api_fails=args.max_api_fails, model_class=dynamically_load_class(args.model_module, args.model_class), - model_args=args.model_args, + replay_timeout=args.replay_timeout, + model_args=model_args, + use_img_file=args.use_img_file, ) runner = InferenceRunner(edh_instance_files, runner_config) metrics = runner.run() + inference_end_time = datetime.now() + logger.info("Time for inference: %s" % str(inference_end_time - start_time)) + results = aggregate_metrics(metrics, args) print("-------------") print( @@ -121,10 +150,17 @@ def main(): print("PLW GC: %.3f" % (results["path_length_weighted_goal_condition_success_rate"])) print("-------------") - results["traj_metrics"] = metrics + results["traj_stats"] = metrics with open(args.metrics_file, "w") as h: - json.dump(metrics, h) + json.dump(results, h) + + end_time = datetime.now() + logger.info("Total time for inference and evaluation: %s" % str(end_time - 
start_time)) if __name__ == "__main__": + # Using spawn method, parent process creates a new and independent child process, + # which avoid sharing unnecessary resources. + # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods + mp.set_start_method("spawn", force=True) main() diff --git a/src/teach/eval/compute_metrics.py b/src/teach/eval/compute_metrics.py index 2818303..287c87b 100644 --- a/src/teach/eval/compute_metrics.py +++ b/src/teach/eval/compute_metrics.py @@ -14,10 +14,14 @@ def evaluate_traj(success, edh_instance, traj_len, final_gc_total, final_gc_sati edh_instance["expected_init_goal_conditions_total"], edh_instance["expected_init_goal_conditions_satisfied"] ) final_gc_satisfied = min(final_gc_total, final_gc_satisfied) - goal_condition_success_rate = 1.0 - ( - (final_gc_total - final_gc_satisfied) - / (edh_instance["expected_init_goal_conditions_total"] - init_gc_satisfied) - ) + + total_goal_conditions = edh_instance["expected_init_goal_conditions_total"] - init_gc_satisfied + # TODO: Remove this after testing and recheck EDH instances to remove any where there is nothing to do + if total_goal_conditions != 0: + unsatisfied_goal_conditions = final_gc_total - final_gc_satisfied + goal_condition_success_rate = 1.0 - (unsatisfied_goal_conditions / total_goal_conditions) + else: + goal_condition_success_rate = 1 # SPL gt_path_len = len(edh_instance["driver_actions_future"]) @@ -35,7 +39,7 @@ def evaluate_traj(success, edh_instance, traj_len, final_gc_total, final_gc_sati "path_len_weighted_success_spl": float(plw_s_spl), "goal_condition_spl": float(pc_spl), "path_len_weighted_goal_condition_spl": float(plw_pc_spl), - "path_len_weight": int(gt_path_len), + "gt_path_len": int(gt_path_len), "success": int(success), "traj_len": int(traj_len), } @@ -135,6 +139,7 @@ def load_traj_metrics(output_file, pred_actions_file, args): traj_metrics = create_new_traj_metrics(edh_instance) traj_metrics["game_id"] = edh_instance["game_id"] traj_metrics["instance_id"] = edh_instance["instance_id"] + traj_metrics["gt_path_len"] = len(edh_instance["driver_actions_future"]) traj_metrics.update( evaluate_traj( success, edh_instance, len(pred_actions), final_goal_conditions_total, final_goal_conditions_satisfied diff --git a/src/teach/inference/et_model.py b/src/teach/inference/et_model.py new file mode 100644 index 0000000..925f329 --- /dev/null +++ b/src/teach/inference/et_model.py @@ -0,0 +1,188 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 + +import argparse +import os +from pathlib import Path +from typing import List + +import numpy as np +import torch +from alfred import constants +from alfred.data import GuidesEdhDataset +from alfred.data.preprocessor import Preprocessor +from alfred.utils import data_util, eval_util, model_util + +from teach.inference.actions import obj_interaction_actions +from teach.inference.teach_model import TeachModel +from teach.logger import create_logger + +logger = create_logger(__name__) + + +class ETModel(TeachModel): + """ + Wrapper around ET Model for inference + """ + + def __init__(self, process_index: int, num_processes: int, model_args: List[str]): + """Constructor + + :param process_index: index of the eval process that launched the model + :param num_processes: total number of processes launched + :param model_args: extra CLI arguments to teach_eval will be passed along to the model + """ + parser = argparse.ArgumentParser() + parser.add_argument("--seed", type=int, default=1, help="Random seed") + parser.add_argument("--device", type=str, default="cuda", help="cpu or cuda") + parser.add_argument("--model_dir", type=str, required=True, help="Model folder name under $ET_LOGS") + parser.add_argument("--checkpoint", type=str, default="latest.pth", help="latest.pth or model_**.pth") + parser.add_argument("--object_predictor", type=str, required=True, help="Path to MaskRCNN model checkpoint") + parser.add_argument("--visual_checkpoint", type=str, required=True, help="Path to FasterRCNN model checkpoint") + parser.add_argument( + "--skip_edh_history", + action="store_true", + default=False, + help="Specify this to ignore actions and image frames in EDH history", + ) + + args = parser.parse_args(model_args) + args.dout = args.model_dir + self.args = args + + logger.info("ETModel using args %s" % str(args)) + np.random.seed(args.seed) + + self.et_model_args = None + self.object_predictor = None + self.model = None + self.extractor = None + self.vocab = None + self.preprocessor = None + self.set_up_model(process_index) + + self.input_dict = None + self.cur_edh_instance = None + + def set_up_model(self, process_index): + os.makedirs(self.args.dout, exist_ok=True) + model_path = os.path.join(self.args.model_dir, self.args.checkpoint) + logger.info("Loading model from %s" % model_path) + + self.et_model_args = model_util.load_model_args(model_path) + dataset_info = data_util.read_dataset_info_for_inference(self.args.model_dir) + train_data_name = self.et_model_args.data["train"][0] + train_vocab = data_util.load_vocab_for_inference(self.args.model_dir, train_data_name) + + self.object_predictor = eval_util.load_object_predictor(self.args) + if model_path is not None: + torch.cuda.empty_cache() + gpu_count = torch.cuda.device_count() + logger.info(f"gpu_count: {gpu_count}") + device = f"cuda:{process_index % gpu_count}" if self.args.device == "cuda" else self.args.device + self.args.device = device + logger.info(f"Loading model agent using device: {device}") + self.model, self.extractor = eval_util.load_agent(model_path, dataset_info, self.args, for_inference=True) + + self.vocab = {"word": train_vocab["word"], "action_low": self.model.vocab_out} + self.preprocessor = Preprocessor(vocab=self.vocab) + + def start_new_edh_instance(self, edh_instance, edh_history_images, edh_name=None): + self.model.reset() + + self.cur_edh_instance = data_util.process_traj( + edh_instance, Path(os.path.join("test", edh_instance["instance_id"])), 0, self.preprocessor + ) + 
feat_numpy = {"lang": GuidesEdhDataset.load_lang(self.cur_edh_instance)} + _, self.input_dict, _ = data_util.tensorize_and_pad( + [(self.cur_edh_instance, feat_numpy)], self.args.device, constants.PAD + ) + + if not self.args.skip_edh_history and edh_history_images is not None and len(edh_history_images) > 0: + img_features = self.extractor.featurize(edh_history_images, batch=32) + self.model.frames_traj = img_features + self.model.frames_traj = torch.unsqueeze(self.model.frames_traj, dim=0) + self.model.action_traj = torch.tensor( + [ + self.vocab["action_low"].word2index(action["action_name"]) + for action in edh_instance["driver_action_history"] + ], + device=self.args.device, + ) + self.model.action_traj = torch.unsqueeze(self.model.action_traj, 0) + + def get_next_action(self, img, edh_instance, prev_action, img_name=None, edh_name=None): + """ + Sample function producing random actions at every time step. When running model inference, a model should be + called in this function instead. + :param img: PIL Image containing agent's egocentric image + :param edh_instance: EDH instance + :param prev_action: One of None or a dict with keys 'action' and 'obj_relative_coord' containing returned values + from a previous call of get_next_action + :param img_name: image file name + :param edh_name: EDH instance file name + :return action: An action name from all_agent_actions + :return obj_relative_coord: A relative (x, y) coordinate (values between 0 and 1) indicating an object in the image; + The TEACh wrapper on AI2-THOR examines the ground truth segmentation mask of the agent's egocentric image, selects + an object in a 10x10 pixel patch around the pixel indicated by the coordinate if the desired action can be + performed on it, and executes the action in AI2-THOR. 
+ """ + img_feat = self.extractor.featurize([img], batch=1) + self.input_dict["frames"] = img_feat + + with torch.no_grad(): + prev_api_action = None + if prev_action is not None and "action" in prev_action: + prev_api_action = prev_action["action"] + m_out = self.model.step(self.input_dict, self.vocab, prev_action=prev_api_action) + + m_pred = model_util.extract_action_preds( + m_out, self.model.pad, self.vocab["action_low"], clean_special_tokens=False + )[0] + action = m_pred["action"] + + obj = None + if action in obj_interaction_actions and len(m_pred["object"]) > 0 and len(m_pred["object"][0]) > 0: + obj = m_pred["object"][0][0] + + predicted_click = None + if obj is not None: + predicted_click = self.get_obj_click(obj, img) + logger.debug("Predicted action: %s, obj = %s, click = %s" % (str(action), str(obj), str(predicted_click))) + + # Assume previous action succeeded if no better info available + prev_success = True + if prev_action is not None and "success" in prev_action: + prev_success = prev_action["success"] + + # remove blocking actions + action = self.obstruction_detection(action, prev_success, m_out, self.model.vocab_out) + return action, predicted_click + + def get_obj_click(self, obj_class_idx, img): + rcnn_pred = self.object_predictor.predict_objects(img) + obj_class_name = self.object_predictor.vocab_obj.index2word(obj_class_idx) + candidates = list(filter(lambda p: p.label == obj_class_name, rcnn_pred)) + if len(candidates) == 0: + return [np.random.uniform(), np.random.uniform()] + index = np.argmax([p.score for p in candidates]) + mask = candidates[index].mask[0] + predicted_click = list(np.array(mask.nonzero()).mean(axis=1)) + predicted_click = [ + predicted_click[0] / mask.shape[1], + predicted_click[1] / mask.shape[0], + ] + return predicted_click + + def obstruction_detection(self, action, prev_action_success, m_out, vocab_out): + """ + change 'MoveAhead' action to a turn in case if it has failed previously + """ + if action != "Forward" or prev_action_success: + return action + dist_action = m_out["action"][0][0].detach().cpu() + idx_rotateR = vocab_out.word2index("Turn Right") + idx_rotateL = vocab_out.word2index("Turn Left") + action = "Turn Left" if dist_action[idx_rotateL] > dist_action[idx_rotateR] else "Turn Right" + logger.debug("Blocking action is changed to: %s" % str(action)) + return action diff --git a/src/teach/inference/inference_runner.py b/src/teach/inference/inference_runner.py index dd9630c..8e34f22 100644 --- a/src/teach/inference/inference_runner.py +++ b/src/teach/inference/inference_runner.py @@ -5,7 +5,10 @@ import multiprocessing as mp import os import time +from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass +from os.path import isdir +from pathlib import Path from typing import List, Type from PIL import Image @@ -17,7 +20,12 @@ from teach.inference.teach_model import TeachModel from teach.logger import create_logger from teach.replay.episode_replay import EpisodeReplay -from teach.utils import create_task_thor_from_state_diff, save_dict_as_json, with_retry +from teach.utils import ( + create_task_thor_from_state_diff, + load_images, + save_dict_as_json, + with_retry, +) definitions = Definitions(version="2.0") action_id_to_info = definitions.map_actions_id2info @@ -29,13 +37,16 @@ class InferenceRunnerConfig: data_dir: str split: str output_dir: str + images_dir: str model_class: Type[TeachModel] model_args: List[str] metrics_file: str = "metrics.json" num_processes: int = 1 max_init_tries: int = 3 
max_traj_steps: int = 1000 - max_api_fails: int = 3 + max_api_fails: int = 30 + use_img_file: bool = False + replay_timeout: int = 500 class InferenceRunner: @@ -66,15 +77,20 @@ def _get_metrics_files(config): @staticmethod def _launch_processes(edh_instance_files, config: InferenceRunnerConfig): processes = [] + ers = [] try: for process_index in range(config.num_processes): - process = InferenceRunner._launch_process(process_index, edh_instance_files, config) + er = EpisodeReplay("thor", ["ego", "allo", "targetobject"]) + ers.append(er) + process = InferenceRunner._launch_process(process_index, edh_instance_files, config, er) processes.append(process) finally: InferenceRunner._join_processes(processes) + for er in ers: + er.simulator.shutdown_simulator() @staticmethod - def _launch_process(process_index, edh_instance_files, config: InferenceRunnerConfig): + def _launch_process(process_index, edh_instance_files, config: InferenceRunnerConfig, er: EpisodeReplay): num_files = len(edh_instance_files) num_files_per_process = InferenceRunner._get_num_files_per_process( num_files=num_files, num_processes=config.num_processes @@ -87,65 +103,109 @@ def _launch_process(process_index, edh_instance_files, config: InferenceRunnerCo files_to_process = edh_instance_files[start_index:end_index] - process = mp.Process(target=InferenceRunner._run, args=(process_index, files_to_process, config)) + process = mp.Process(target=InferenceRunner._run, args=(process_index, files_to_process, config, er)) process.start() time.sleep(0.1) return process @staticmethod - def _run(process_index, files_to_process, config: InferenceRunnerConfig): + def _run(process_index, files_to_process, config: InferenceRunnerConfig, er: EpisodeReplay): metrics_file = InferenceRunner._get_metrics_file_name_for_process(process_index, config.metrics_file) metrics = dict() model = config.model_class(process_index, config.num_processes, model_args=config.model_args) for file_index, instance_file in enumerate(files_to_process): - instance_id, instance_metrics = InferenceRunner._run_edh_instance(instance_file, config, model) - metrics[instance_id] = instance_metrics - save_dict_as_json(metrics, metrics_file) + try: + instance_id, instance_metrics = InferenceRunner._run_edh_instance(instance_file, config, model, er) + metrics[instance_id] = instance_metrics + save_dict_as_json(metrics, metrics_file) + + logger.info(f"Instance {instance_id}, metrics: {instance_metrics}") + logger.info(f"Process {process_index} completed {file_index + 1} / {len(files_to_process)} instances") + except Exception: + err_msg = f"exception happened for instance={instance_file}, continue with the rest" + logger.error(err_msg, exc_info=True) + continue - logger.info(f"Process {process_index} completed {file_index + 1} / {len(files_to_process)} instances") + @staticmethod + def _load_edh_history_images(edh_instance, config: InferenceRunnerConfig): + image_file_names = edh_instance["driver_image_history"] + image_dir = os.path.join(config.data_dir, "images", config.split, edh_instance["game_id"]) + return load_images(image_dir, image_file_names) @staticmethod - def _run_edh_instance(instance_file, config: InferenceRunnerConfig, model: TeachModel): + def _run_edh_instance(instance_file, config: InferenceRunnerConfig, model: TeachModel, er: EpisodeReplay): edh_instance = InferenceRunner._load_edh_instance(instance_file) edh_check_task = create_task_thor_from_state_diff(edh_instance["state_changes"]) game_file = InferenceRunner._get_game_file(edh_instance, config) metrics 
= create_new_traj_metrics(edh_instance) - logger.debug(f"Processing instance {edh_instance['instance_id']}") + instance_id = edh_instance["instance_id"] + logger.debug(f"Processing instance {instance_id}") try: init_success, er = with_retry( - fn=lambda: InferenceRunner._initialize_episode_replay(edh_instance, game_file, edh_check_task), + fn=lambda: InferenceRunner._initialize_episode_replay( + edh_instance, game_file, edh_check_task, config.replay_timeout, er + ), retries=config.max_init_tries - 1, check_first_return_value=True, ) except Exception: init_success = False - logger.error("Failed to initialize episode replay", exc_info=True) + logger.error(f"Failed to initialize episode replay for instance={instance_id}", exc_info=True) + + edh_history_images = None + try: + if not config.use_img_file: + edh_history_images = InferenceRunner._load_edh_history_images(edh_instance, config) + except Exception: + init_success = False + logger.error(f"Failed to load_edh_history_images for {instance_id}", exc_info=True) metrics["init_success"] = init_success if not init_success: return edh_instance["instance_id"], metrics - prev_action = None - er.simulator.is_record_mode = True - pred_actions = list() - - traj_steps_taken = 0 - for _ in range(config.max_traj_steps): - traj_steps_taken += 1 - img = InferenceRunner._get_latest_ego_image(er) - action, obj_relative_coord = model.get_next_action(img, edh_instance, prev_action) - step_success = InferenceRunner._execute_action(er.simulator, action, obj_relative_coord) - InferenceRunner._update_metrics(metrics, action, obj_relative_coord, step_success) - prev_action = {"action": action, "obj_relative_coord": obj_relative_coord} - pred_actions.append(prev_action) - if InferenceRunner._should_end_inference(action, metrics, config.max_api_fails): - break + model_started_success = False + try: + model_started_success = model.start_new_edh_instance(edh_instance, edh_history_images, instance_file) + except Exception: + model_started_success = False + metrics["error"] = 1 + logger.error(f"Failed to start_new_edh_instance for {instance_id}", exc_info=True) + + if model_started_success: + prev_action = None + er.simulator.is_record_mode = True + pred_actions = list() + + traj_steps_taken = 0 + for _ in range(config.max_traj_steps): + traj_steps_taken += 1 + try: + img = InferenceRunner._get_latest_ego_image(er) + image_name = InferenceRunner._save_image(config, edh_instance, img, traj_steps_taken) + action, obj_relative_coord = model.get_next_action( + img, edh_instance, prev_action, image_name, instance_file + ) + step_success = InferenceRunner._execute_action(er.simulator, action, obj_relative_coord) + InferenceRunner._update_metrics(metrics, action, obj_relative_coord, step_success) + prev_action = {"action": action, "obj_relative_coord": obj_relative_coord} + pred_actions.append(prev_action) + except Exception as e: + logger.error( + f"_run_edh_instance Exception: {str(e)} for instance_id={instance_id}, " + f"traj_steps_taken={traj_steps_taken}", + exc_info=True, + ) + metrics["error"] = 1 + break + if InferenceRunner._should_end_inference(action, metrics, config.max_api_fails): + break ( success, @@ -162,15 +222,16 @@ def _run_edh_instance(instance_file, config: InferenceRunnerConfig, model: Teach ) metrics.update(metrics_diff) - pred_actions_file = os.path.join(config.output_dir, "pred_actions__" + edh_instance["instance_id"] + ".json") + os.makedirs(config.output_dir, exist_ok=True) + pred_actions_file = os.path.join(config.output_dir, "pred_actions__" + 
instance_id + ".json") with open(pred_actions_file, "w") as handle: json.dump(pred_actions, handle) er.simulator.dir_out = config.output_dir - output_file = os.path.join(config.output_dir, "inference__" + edh_instance["instance_id"] + ".json") - er.simulator.done(file_name=output_file) + output_file = os.path.join(config.output_dir, "inference__" + instance_id + ".json") + er.simulator.save(file_name=output_file) - return edh_instance["instance_id"], metrics + return instance_id, metrics @staticmethod def _check_episode_progress(er, task): @@ -184,8 +245,8 @@ def _check_episode_progress(er, task): return success, final_goal_conditions_total, final_goal_conditions_satisfied @staticmethod - def _initialize_episode_replay(edh_instance, game_file, task): - er = EpisodeReplay("thor", ["ego", "allo", "targetobject"]) + def _initialize_episode_replay(edh_instance, game_file, task, replay_timeout, er: EpisodeReplay): + start_time = time.perf_counter() er.set_episode_by_fn_and_idx(game_file, 0, 0) edh_interactions = list() for interaction in edh_instance["interactions"][: edh_instance["pred_start_idx"]]: @@ -193,7 +254,14 @@ def _initialize_episode_replay(edh_instance, game_file, task): edh_interactions.append(Interaction.from_dict(interaction, action["action_type"])) er.episode.interactions = edh_interactions - init_success, _ = er.play_episode(task=task, shutdown_on_finish=False) + init_success = False + with ThreadPoolExecutor() as tp: + future = tp.submit(er.play_episode, task=task, shutdown_on_finish=False) + logger.info(f"Started episode replay with timeout: {replay_timeout} sec") + init_success, _ = future.result(timeout=replay_timeout) + + elapsed_time = time.perf_counter() - start_time + logger.info(f"Elapsed time for episode replay: {elapsed_time}") return init_success, er if init_success else None @@ -262,3 +330,26 @@ def _get_num_files_per_process(num_files, num_processes): def _join_processes(processes): for process in processes: process.join() + + @staticmethod + def _save_image(config, edh_instance, img, traj_steps_taken): + image_name = f"img__{edh_instance['instance_id']}_{traj_steps_taken}.jpeg" + if config.use_img_file: + InferenceRunner._save_image_sync(img, image_name, config) + else: + InferenceRunner._save_image_async(img, image_name, config) + return image_name + + @staticmethod + def _save_image_async(img, image_name, config: InferenceRunnerConfig): + process = mp.Process(target=InferenceRunner._save_image_sync, args=(img, image_name, config)) + process.start() + return image_name + + @staticmethod + def _save_image_sync(img, image_name, config: InferenceRunnerConfig): + if not isdir(config.images_dir): + Path(config.images_dir).mkdir(parents=True, exist_ok=True) + image_path = os.path.join(config.images_dir, image_name) + img.save(image_path) + return image_name diff --git a/src/teach/inference/remote_model.py b/src/teach/inference/remote_model.py new file mode 100644 index 0000000..2aec046 --- /dev/null +++ b/src/teach/inference/remote_model.py @@ -0,0 +1,100 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 + +import json +import logging +import sys +from argparse import ArgumentParser +from io import BytesIO +from typing import List + +import requests + +from teach.inference.teach_model import TeachModel + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) +logger.addHandler(logging.StreamHandler(sys.stdout)) + +TEACH_MODEL_API_URL_PREDICT = "http://{}/get_next_action" +TEACH_MODEL_API_URL_START_EDH = "http://{}/start_new_edh_instance" +TEACH_MODEL_API_URL_TEST = "http://{}/test" + + +class RemoteModelException(Exception): + def __init__(self, message): + super().__init__(message) + + +def assign_api_by_process_idx(host_and_ports, process_index): + splits = host_and_ports.split(",") + if process_index >= len(splits): + raise RemoteModelException(f"process_index={process_index} can't be handled by available APIs:{splits}") + return splits[process_index].strip() + + +class RemoteModel(TeachModel): + def __init__(self, process_index: int, num_processes: int, model_args: List[str]): + + parser = ArgumentParser() + parser.add_argument( + "--model_api_host_and_port", + type=str, + default="localhost:5000", + help="Teach Model API hosts and ports, E.g.:api1:5000,api2:5000", + ) + args = parser.parse_args(model_args) + + host_and_port = assign_api_by_process_idx(args.model_api_host_and_port, process_index) + self.test_url = TEACH_MODEL_API_URL_TEST.format(host_and_port) + self.predict_url = TEACH_MODEL_API_URL_PREDICT.format(host_and_port) + self.start_edh_url = TEACH_MODEL_API_URL_START_EDH.format(host_and_port) + + def get_next_action(self, img, edh_instance, prev_action, img_name=None, edh_name=None): + if not img or not edh_instance: + logger.warning("either img or edh_instance is None") + return None, None + img_in_memory = BytesIO() + img.save(img_in_memory, "jpeg") + img_in_memory.seek(0) + data = { + "img_name": img_name, + "edh_name": edh_name, + "prev_action": json.dumps(prev_action) if prev_action else None, + "edh_instance": json.dumps(edh_instance), + } + + resp = requests.post(self.predict_url, data=data, files={"img": (img_name, img_in_memory, "image/jpeg")}) + + if resp.status_code != 200: + logger.debug(f"failed sending data={data}") + raise RemoteModelException(resp.text) + + resp_json = resp.json() + action = resp_json.get("action") + obj_relative_coord = resp_json.get("obj_relative_coord") + return action, obj_relative_coord + + def test_connection(self): + resp = requests.get(self.test_url) + return resp.status_code == 200 + + def start_new_edh_instance(self, edh_instance, edh_history_images, edh_name=None): + images = [] + if edh_history_images: + idx = 0 + for image in edh_history_images: + img_in_memory = BytesIO() + image.save(img_in_memory, "jpeg") + img_in_memory.seek(0) + images.append(("edh_history_images", (f"history{idx}", img_in_memory, "image/jpeg"))) + idx += 1 + + data = {"edh_name": edh_name, "edh_instance": json.dumps(edh_instance)} + resp = requests.post(self.start_edh_url, data=data, files=images) + + if resp.status_code != 200: + logger.debug(f"failed sending data={data}") + raise RemoteModelException(resp.text) + + return True diff --git a/src/teach/inference/sample_model.py b/src/teach/inference/sample_model.py index e95b3f6..425422a 100644 --- a/src/teach/inference/sample_model.py +++ b/src/teach/inference/sample_model.py @@ -33,14 +33,15 @@ def __init__(self, process_index: int, num_processes: int, model_args: List[str] logger.info(f"SampleModel using seed {args.seed}") np.random.seed(args.seed) - def 
get_next_action(self, img, edh_instance, prev_action): + def get_next_action(self, img, edh_instance, prev_action, img_name=None, edh_name=None): """ - Sample function producing random actions at every time step. When running model inference, a model should be - called in this function instead. + This method will be called at each timestep during inference to get the next predicted action from the model. :param img: PIL Image containing agent's egocentric image :param edh_instance: EDH instance :param prev_action: One of None or a dict with keys 'action' and 'obj_relative_coord' containing returned values from a previous call of get_next_action + :param img_name: image file name + :param edh_name: EDH instance file name :return action: An action name from all_agent_actions :return obj_relative_coord: A relative (x, y) coordinate (values between 0 and 1) indicating an object in the image; The TEACh wrapper on AI2-THOR examines the ground truth segmentation mask of the agent's egocentric image, selects @@ -55,3 +56,14 @@ def get_next_action(self, img, edh_instance, prev_action): np.random.uniform(high=0.99), ] return action, obj_relative_coord + + def start_new_edh_instance(self, edh_instance, edh_history_images, edh_name=None): + """ + Since this class produces random actions at every time step, no particular setup is needed. When running model + inference, this would be a suitable place to preprocess the dialog, action and image history + :param edh_instance: EDH instance + :param edh_history_images: List of images as PIL Image objects (loaded from files in + edh_instance['driver_image_history']) + :param edh_name: EDH instance file name + """ + pass diff --git a/src/teach/inference/teach_model.py b/src/teach/inference/teach_model.py index 2a9b0e7..5528739 100644 --- a/src/teach/inference/teach_model.py +++ b/src/teach/inference/teach_model.py @@ -19,16 +19,30 @@ def __init__(self, process_index: int, num_processes: int, model_args: List[str] """ @abstractmethod - def get_next_action(self, img, edh_instance, prev_action): + def get_next_action(self, img, edh_instance, prev_action, img_name=None, edh_name=None): """ This method will be called at each timestep during inference to get the next predicted action from the model. :param img: PIL Image containing agent's egocentric image :param edh_instance: EDH instance :param prev_action: One of None or a dict with keys 'action' and 'obj_relative_coord' containing returned values from a previous call of get_next_action + :param img_name: image file name + :param edh_name: EDH instance file name :return action: An action name from all_agent_actions :return obj_relative_coord: A relative (x, y) coordinate (values between 0 and 1) indicating an object in the image; The TEACh wrapper on AI2-THOR examines the ground truth segmentation mask of the agent's egocentric image, selects an object in a 10x10 pixel patch around the pixel indicated by the coordinate if the desired action can be performed on it, and executes the action in AI2-THOR. 
""" + + @abstractmethod + def start_new_edh_instance(self, edh_instance, edh_history_images, edh_name=None): + """ + This method will be called at the start of each EDH instance after the environment has been set to the + initial state by replaying history actions but before any actions are requested from the model by calling + get_next_action + :param edh_instance: EDH instance + :param edh_history_images: List of images as PIL Image objects (loaded from files in + edh_instance['driver_image_history']) + :param edh_name: EDH instance file name + """ diff --git a/src/teach/logger.py b/src/teach/logger.py index 8c45c93..9d00805 100644 --- a/src/teach/logger.py +++ b/src/teach/logger.py @@ -3,6 +3,7 @@ import logging +import sys from teach.settings import get_settings @@ -18,4 +19,8 @@ def create_logger(name: str = None, level=logging.DEBUG): logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(name if name else __name__) logger.setLevel(level) + handler = logging.StreamHandler(sys.stdout) + formatter = logging.Formatter("[%(threadName)s-%(process)s-%(levelname)s] %(name)s: %(message)s") + handler.setFormatter(formatter) + logger.addHandler(handler) return logger diff --git a/src/teach/modeling/ET/LICENSE b/src/teach/modeling/ET/LICENSE new file mode 100644 index 0000000..65554f2 --- /dev/null +++ b/src/teach/modeling/ET/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 ALFRED + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/src/teach/modeling/ET/README.md b/src/teach/modeling/ET/README.md new file mode 100644 index 0000000..5d34ff6 --- /dev/null +++ b/src/teach/modeling/ET/README.md @@ -0,0 +1,81 @@ +# Episodic Transformer based EDH Baseline Model + +This subdirectory is based on the [Episodic Transformer (E.T.) repository](https://github.com/alexpashevich/E.T.) which builds on the [ALFRED repository](https://github.com/askforalfred/alfred). +The E.T. model is adapted here for the TEACh EDH benchmark. +Note that we have removed files not used when running E.T. on TEACh, and many files have been significantly modified. + +The following instructions to train and evaluate an E.T. model on TEACh assume that you have the TEACh dataset downloaded. +If running on a laptop, it might be desirable to mimic the folder structure of the TEACh dataset, but using only a small number of games from each split, and their corresponding images and EDH instances. 
+
+Set some useful environment variables
+```buildoutcfg
+export ET_DATA=/tmp/teach-dataset
+export TEACH_ROOT_DIR=/path/to/teach/repo
+export ET_LOGS=/path/to/store/checkpoints
+export VENV_DIR=/path/to/folder/to/store/venv
+export TEACH_SRC_DIR=$TEACH_ROOT_DIR/src
+export ET_ROOT=$TEACH_SRC_DIR/teach/modeling/ET
+export INFERENCE_OUTPUT_PATH=/path/to/store/inference/execution/files
+```
+Create a virtual environment
+
+```buildoutcfg
+python3 -m venv $VENV_DIR/teach_env
+source $VENV_DIR/teach_env/bin/activate
+cd $TEACH_ROOT_DIR
+pip install --upgrade pip
+pip install -r requirements.txt
+export PYTHONPATH=$TEACH_SRC_DIR:$ET_ROOT:$PYTHONPATH
+```
+
+Download the ET pretrained checkpoints for the Faster RCNN and Mask RCNN models
+```buildoutcfg
+wget http://pascal.inrialpes.fr/data2/apashevi/et_checkpoints.zip
+unzip et_checkpoints.zip
+mv pretrained $ET_LOGS/
+rm et_checkpoints.zip
+```
+
+Perform ET preprocessing (this extracts image features and does some processing of EDH jsons)
+```buildoutcfg
+python -m alfred.data.create_lmdb \
+    with args.visual_checkpoint=$ET_LOGS/pretrained/fasterrcnn_model.pth \
+    args.data_input=edh_instances \
+    args.task_type=edh \
+    args.data_output=lmdb_edh \
+    args.vocab_path=None
+```
+Note: If running on a laptop on a small subset of the data, use `args.vocab_path=$ET_ROOT/files/human.vocab` and add `args.device=cpu`.
+
+
+Train a model (adjust the `train.epochs` value in this command to specify the number of desired train epochs)
+```buildoutcfg
+python -m alfred.model.train with exp.model=transformer \
+    exp.name=teach_et_trial \
+    exp.data.train=lmdb_edh \
+    train.epochs=20 \
+    train.seed=2
+```
+Note: If running on a laptop on a small subset of the data, add `exp.device=cpu` and `exp.num_workers=1`.
+
+Copy certain necessary files to the model folder so that we do not have to access training info at inference time.
+```buildoutcfg
+cp $ET_DATA/lmdb_edh/data.vocab $ET_LOGS/teach_et_trial
+cp $ET_DATA/lmdb_edh/params.json $ET_LOGS/teach_et_trial
+```
+
+Evaluate the trained model
+```buildoutcfg
+cd $TEACH_ROOT_DIR
+python src/teach/cli/inference.py \
+    --model_module teach.inference.et_model \
+    --model_class ETModel \
+    --data_dir $ET_DATA \
+    --output_dir $INFERENCE_OUTPUT_PATH/inference__teach_et_trial \
+    --split valid_seen \
+    --metrics_file $INFERENCE_OUTPUT_PATH/metrics__teach_et_trial.json \
+    --seed 4 \
+    --model_dir teach_et_trial \
+    --object_predictor $ET_LOGS/pretrained/maskrcnn_model.pth \
+    --device cpu
+```
\ No newline at end of file
diff --git a/src/teach/modeling/ET/alfred/README.md b/src/teach/modeling/ET/alfred/README.md
new file mode 100644
index 0000000..53b39b5
--- /dev/null
+++ b/src/teach/modeling/ET/alfred/README.md
@@ -0,0 +1,46 @@
+# Files Structure
+
+```
+/data
+    create_lmdb.py   (script to create an LMDB dataset out of trajectory files)
+    preprocessor.py  (class to preprocess trajectories annotations and actions)
+    process_tests.py (script to process test splits for leaderboard evaluation)
+    zoo/base.py      (base class for LMDB dataset loading using multiple threads)
+    zoo/alfred.py    (class to load an LMDB dataset for an E.T. training)
+    zoo/speaker.py   (class to load an LMDB dataset for a translation pretraining)
+/env
+    reward.py        (rewards definitions)
+    tasks.py         (tasks definitions)
+    thor_env.py      (interface between AI2Thor and E.T.
code) +/eval + eval_agent.py (script to evaluate an agent on full tasks or subgoals) + eval_master.py (class for multi-process evaluation) + eval_subgoals.py (functions for subgoal evaluation) + eval_task.py (functions for full task evaluation) + leaderboard.py (script to evaluate an agent on test splits) +/gen + constants.py (list of constants) + generate_trajs.py (script to generate new trajectories) + goal_library.py (library defining goals using PDDL) + render_trajs.py (script to render existing trajectories) +/model + train.py (script for models training) + base.py (base class for E.T. and translator models) + learned.py (class with main train routines) + speaker.py (translator model) + transformer.py (E.T. model) +/nn + attention.py (basic attention mechanisms) + dec_object.py (object decoder class) + enc_lang.py (language encoder class) + enc_visual.py (visual observations encoder class) + enc_vl.py (multimodal encoder class) + encodings.py (positional and temporal encodings) + transforms.py (visual observations transformations) +/utils + data_util.py (data handling utils) + eval_util.py (evaluation utils) + helper_util.py (help utils) + metric_util.py (utils to compute scores) + model_util.py (utils for E.T. and translation models) +``` diff --git a/src/teach/modeling/ET/alfred/config.py b/src/teach/modeling/ET/alfred/config.py new file mode 100644 index 0000000..844f136 --- /dev/null +++ b/src/teach/modeling/ET/alfred/config.py @@ -0,0 +1,214 @@ +from sacred import Ingredient +from sacred.settings import SETTINGS + +exp_ingredient = Ingredient("exp") +train_ingredient = Ingredient("train") +eval_ingredient = Ingredient("eval") +dagger_ingredient = Ingredient("dagger") + +SETTINGS.CONFIG.READ_ONLY_CONFIG = False + + +@exp_ingredient.config +def cfg_exp(): + # HIGH-LEVEL MODEL SETTINGS + # where to save model and/or logs + name = "default" + # model to use + model = "transformer" + # which device to use + device = "cuda" + # number of data loading workers or evaluation processes (0 for main thread) + num_workers = 12 + # we can fine-tune a pre-trained model + pretrained_path = None + # run the code on a small chunk of data + fast_epoch = False + + # Set this to 1 if running on a Mac and to large numbers like 250 if running on EC2 + lmdb_max_readers = 1 + + # DATA SETTINGS + data = { + # dataset name(s) for training and validation + "train": None, + # additional dataset name(s) can be specified for validation only + "valid": "", + # specify the length of each dataset + "length": 30000, + # what to use as annotations: {'lang', 'lang_frames', 'frames'} + "ann_type": "lang", + # Train dataloader type - sample or shuffle ("sample" results in sampling length points per epoch with + # replacement and "shuffle" results in iterating through the train dataset in random order per epoch + "train_load_type": "shuffle", + } + + lang_pretrain_over_history_subgoals = False + + +@eval_ingredient.config +def cfg_eval(): + # which experiment to evaluate (required) + exp = None + # which checkpoint to load ('latest.pth', 'model_**.pth') + checkpoint = "latest.pth" + # which split to use ('train', 'valid_seen', 'valid_unseen') + split = "valid_seen" + use_sample_for_train = True + use_random_actions = False + no_lang = False + no_vision = False + + # shuffle the trajectories + shuffle = False + # max steps before episode termination + max_steps = 1000 + # max API execution failures before episode termination + max_fails = 10 + # subgoals to evaluate independently, eg:all or GotoLocation,PickupObject or 
0,1 + subgoals = "" + # smooth nav actions (might be required based on training data) + smooth_nav = False + # forward model with expert actions (only for subgoals) + no_model_unroll = False + # no teacher forcing with expert (only for subgoals) + no_teacher_force = False + # run in the debug mode + debug = False + # X server number + x_display = "0" + # range of checkpoints to evaluate, (9, 20, 2) means epochs 9, 11, 13, 15, 17, 19 + # if None, only 'latest.pth' will be evaluated + eval_range = (9, 20, 1) + # object predictor path + object_predictor = None + + # Is this evaluation for EDH instances or TFD instances? + eval_type = "edh" + + # Set this to 1 if running on a Mac and to large numbers like 250 if running on EC2 + # lmdb_max_readers = 1 + + # Set this to true if the model was trained (and should for inference try to get a wide view) + wide_view = False + + force_retry = False + + +@train_ingredient.config +def cfg_train(): + # GENERAL TRANING SETTINGS + # random seed + seed = 1 + # load a checkpoint from a previous epoch (if available) + resume = True + # whether to print execution time for different parts of the code + profile = False + + # For ablations + no_lang = False + no_vision = False + + # HYPER PARAMETERS + # batch size + batch = 8 + # number of epochs + epochs = 20 + # optimizer type, must be in ('adam', 'adamw') + optimizer = "adamw" + # L2 regularization weight + weight_decay = 0.33 + # learning rate settings + lr = { + # learning rate initial value + "init": 1e-4, + # lr scheduler type: {'linear', 'cosine', 'triangular', 'triangular2'} + "profile": "linear", + # (LINEAR PROFILE) num epoch to adjust learning rate + "decay_epoch": 10, + # (LINEAR PROFILE) scaling multiplier at each milestone + "decay_scale": 0.1, + # (COSINE & TRIANGULAR PROFILE) learning rate final value + "final": 1e-5, + # (TRIANGULAR PROFILE) period of the cycle to increase the learning rate + "cycle_epoch_up": 0, + # (TRIANGULAR PROFILE) period of the cycle to decrease the learning rate + "cycle_epoch_down": 0, + # warm up period length in epochs + "warmup_epoch": 0, + # initial learning rate will be divided by this value + "warmup_scale": 1, + } + # weight of action loss + action_loss_wt = 1.0 + # weight of object loss + object_loss_wt = 1.0 + # weight of subgoal completion predictor + # subgoal_aux_loss_wt = 0.1 + subgoal_aux_loss_wt = 0 + # weight of progress monitor + # progress_aux_loss_wt = 0.1 + progress_aux_loss_wt = 0 + # maximizing entropy loss (by default it is off) + entropy_wt = 0.0 + + # Should train loss be computed over history actions? 
(default False) + compute_train_loss_over_history = False + + # TRANSFORMER settings + # size of transformer embeddings + demb = 768 + # number of heads in multi-head attention + encoder_heads = 12 + # number of layers in transformer encoder + encoder_layers = 2 + # how many previous actions to use as input + num_input_actions = 1 + # which encoder to use for language encoder (by default no encoder) + encoder_lang = { + "shared": True, + "layers": 2, + "pos_enc": True, + "instr_enc": False, + } + # which decoder to use for the speaker model + decoder_lang = { + "layers": 2, + "heads": 12, + "demb": 768, + "dropout": 0.1, + "pos_enc": True, + } + # do not propagate gradients to the look-up table and the language encoder + detach_lang_emb = False + + # DROPOUTS + dropout = { + # dropout rate for language (goal + instr) + "lang": 0.0, + # dropout rate for Resnet feats + "vis": 0.3, + # dropout rate for processed lang and visual embeddings + "emb": 0.0, + # transformer model specific dropouts + "transformer": { + # dropout for transformer encoder + "encoder": 0.1, + # remove previous actions + "action": 0.0, + }, + } + + # ENCODINGS + enc = { + # use positional encoding + "pos": True, + # use learned positional encoding + "pos_learn": False, + # use learned token ([WORD] or [IMG]) encoding + "token": False, + # dataset id learned encoding + "dataset": False, + } + + use_alfred_weights = False diff --git a/src/teach/modeling/ET/alfred/constants.py b/src/teach/modeling/ET/alfred/constants.py new file mode 100644 index 0000000..6a72af8 --- /dev/null +++ b/src/teach/modeling/ET/alfred/constants.py @@ -0,0 +1,114 @@ +import os + +######################################################################################################################## +# General Settings + +ET_ROOT = os.environ["ET_ROOT"] +ET_DATA = os.environ["ET_DATA"] if "ET_DATA" in os.environ else None +ET_LOGS = os.environ["ET_LOGS"] if "ET_LOGS" in os.environ else None + +PAD = 0 + +######################################################################################################################## + +# TRAIN AND EVAL SETTINGS +# evaluation on multiple GPUs +NUM_EVAL_WORKERS_PER_GPU = 3 +# vocabulary file name +VOCAB_FILENAME = "data.vocab" +# vocabulary with object classes +OBJ_CLS_VOCAB = "files/obj_cls.vocab" + +############################# + +OBJECTS_ACTIONS = [ + "None", + "AlarmClock", + "Apple", + "AppleSliced", + "ArmChair", + "BaseballBat", + "BasketBall", + "Bathtub", + "BathtubBasin", + "Bed", + "Book", + "Bowl", + "Box", + "Bread", + "BreadSliced", + "ButterKnife", + "CD", + "Cabinet", + "Candle", + "Cart", + "CellPhone", + "Cloth", + "CoffeeMachine", + "CoffeeTable", + "CounterTop", + "CreditCard", + "Cup", + "Desk", + "DeskLamp", + "DiningTable", + "DishSponge", + "Drawer", + "Dresser", + "Egg", + "Faucet", + "FloorLamp", + "Fork", + "Fridge", + "GarbageCan", + "Glassbottle", + "HandTowel", + "Kettle", + "KeyChain", + "Knife", + "Ladle", + "Laptop", + "Lettuce", + "LettuceSliced", + "Microwave", + "Mug", + "Newspaper", + "Ottoman", + "Pan", + "Pen", + "Pencil", + "PepperShaker", + "Pillow", + "Plate", + "Plunger", + "Pot", + "Potato", + "PotatoSliced", + "RemoteControl", + "Safe", + "SaltShaker", + "Shelf", + "SideTable", + "Sink", + "SinkBasin", + "SoapBar", + "SoapBottle", + "Sofa", + "Spatula", + "Spoon", + "SprayBottle", + "Statue", + "StoveBurner", + "TVStand", + "TennisRacket", + "TissueBox", + "Toilet", + "ToiletPaper", + "ToiletPaperHanger", + "Tomato", + "TomatoSliced", + "Vase", + "Watch", + 
"WateringCan", + "WineBottle", +] diff --git a/src/teach/modeling/ET/alfred/data/__init__.py b/src/teach/modeling/ET/alfred/data/__init__.py new file mode 100644 index 0000000..ea8b64a --- /dev/null +++ b/src/teach/modeling/ET/alfred/data/__init__.py @@ -0,0 +1,2 @@ +from alfred.data.zoo.guides_edh import GuidesEdhDataset +from alfred.data.zoo.guides_speaker import GuidesSpeakerDataset diff --git a/src/teach/modeling/ET/alfred/data/create_lmdb.py b/src/teach/modeling/ET/alfred/data/create_lmdb.py new file mode 100644 index 0000000..c62a171 --- /dev/null +++ b/src/teach/modeling/ET/alfred/data/create_lmdb.py @@ -0,0 +1,305 @@ +import copy +import json +import logging +import os +import pickle +import re +import shutil +import threading +from pathlib import Path + +import torch +from alfred import constants +from alfred.data.preprocessor import Preprocessor +from alfred.nn.enc_visual import FeatureExtractor +from alfred.utils import data_util, helper_util, model_util +from progressbar import ProgressBar +from sacred import Experiment, Ingredient +from vocab import Vocab + +from teach.logger import create_logger + +logger = create_logger(__name__, level=logging.INFO) + +args_ingredient = Ingredient("args") +ex = Experiment("create_data", ingredients=[args_ingredient]) + + +@args_ingredient.config +def cfg_args(): + # name of the output dataset + data_output = "lmdb_teach_edh" + # where to load the original ALFRED dataset images and jsons from + data_input = "edh_instances" + task_type = "edh" + # whether to overwrite old data in case it exists + overwrite = False + # number of processes to run the data processing in (0 for main thread) + num_workers = 4 + # debug run with only 16 entries + fast_epoch = False + + # VISUAL FEATURES SETTINGS + # visual archi (resnet18, fasterrcnn, maskrcnn) + visual_archi = "fasterrcnn" + # where to load a pretrained model from + visual_checkpoint = None + # which images to use (by default: RGBs) + image_folder = "images" + # feature compression + compress_type = "4x" + # which device to use + device = "cuda" + + # LANGUAGE ANNOTATIONS SETTINGS + # generate dataset with subgoal annotations instead of human annotations + subgoal_ann = False + # use an existing vocabulary if specified (None for starting from scratch) + vocab_path = "files/base.vocab" + + +def process_feats(traj_paths, extractor, lock, image_folder, save_path): + (save_path / "feats").mkdir(exist_ok=True) + if str(save_path).endswith("/worker00"): + with lock: + progressbar = ProgressBar(max_value=traj_paths.qsize()) + progressbar.start() + while True: + with lock: + if traj_paths.qsize() == 0: + break + traj_path = Path(traj_paths.get()) + filename_new = "{}:{}".format(traj_path.parts[-2], re.sub(".json", ".pt", traj_path.name)) + # extract features with th extractor + images = data_util.read_traj_images(traj_path, image_folder) + if images is None or len(images) == 0: + raise RuntimeError( + "Failed to find images with image_folder =", + image_folder, + ", traj_path =", + traj_path.parts, + ) + feat = data_util.extract_features(images, extractor) + if feat is not None: + torch.save(feat, save_path / "feats" / filename_new) + with lock: + with open(save_path.parents[0] / "processed_feats.txt", "a") as f: + f.write(str(traj_path) + "\n") + model_util.update_log(save_path.parents[0], stage="feats", update="increase", progress=1) + if str(save_path).endswith("/worker00"): + progressbar.update(progressbar.max_value - traj_paths.qsize()) + if str(save_path).endswith("/worker00"): + progressbar.finish() 
+ + +def process_jsons(traj_paths, preprocessor, lock, save_path): + save_path.mkdir(exist_ok=True) + (save_path / "masks").mkdir(exist_ok=True) + (save_path / "jsons").mkdir(exist_ok=True) + if str(save_path).endswith("/worker00"): + with lock: + progressbar = ProgressBar(max_value=len(traj_paths)) + progressbar.start() + while True: + with lock: + if len(traj_paths) == 0: + break + traj_path = Path(traj_paths.pop()) + with traj_path.open() as f: + traj_orig = json.load(f) + + trajs = [data_util.process_traj(traj_orig, traj_path, 0, preprocessor)] + + # save masks and traj jsons + filename = "{}:{}".format(traj_path.parts[-2], re.sub(".json", ".pkl", traj_path.name)) + with (save_path / "jsons" / filename).open("wb") as f: + pickle.dump(trajs, f) + # report the progress + with lock: + model_util.update_log(save_path.parents[0], stage="jsons", update="increase", progress=1) + if str(save_path).endswith("/worker00"): + progressbar.update(progressbar.max_value - len(traj_paths)) + if str(save_path).endswith("/worker00"): + progressbar.finish() + + +def get_traj_paths(input_path, processed_files_path, fast_epoch): + if (input_path / "processed.txt").exists(): + # the dataset was generated locally + with (input_path / "processed.txt").open() as f: + traj_paths = [line.strip() for line in f.readlines()] + traj_paths = [line.split(";")[0] for line in traj_paths if line.split(";")[1] == "1"] + traj_paths = [str(input_path / line) for line in traj_paths] + else: + # the dataset was downloaded from ALFRED servers + traj_paths_all = sorted([str(path) for path in input_path.glob("*/*.json")]) + traj_paths = traj_paths_all + if fast_epoch: + traj_paths = traj_paths[::20] + num_files = len(traj_paths) + if processed_files_path is not None and processed_files_path.exists(): + if str(processed_files_path).endswith(constants.VOCAB_FILENAME): + traj_paths = [] + else: + with processed_files_path.open() as f: + processed_files = set([line.strip() for line in f.readlines()]) + traj_paths = [traj for traj in traj_paths if traj not in processed_files] + traj_paths = [Path(path) for path in traj_paths] + return traj_paths, num_files + + +def run_in_parallel(func, num_workers, output_path, args, use_processes=False): + if num_workers == 0: + args.append(output_path / "worker00") + func(*args) + else: + threads = [] + for idx in range(num_workers): + args_worker = copy.copy(args) + [output_path / "worker{:02d}".format(idx)] + if not use_processes: + ThreadClass = threading.Thread + else: + ThreadClass = torch.multiprocessing.Process + thread = ThreadClass(target=func, args=args_worker) + thread.start() + threads.append(thread) + for thread in threads: + thread.join() + + +def gather_data(output_path, num_workers): + for dirname in ("feats", "masks", "jsons"): + if (output_path / dirname).is_dir(): + shutil.rmtree(output_path / dirname) + (output_path / dirname).mkdir() + for dirname in ("feats", "masks", "jsons"): + for path_file in output_path.glob("worker*/{}/*".format(dirname)): + if path_file.stat().st_size == 0: + continue + path_symlink = output_path / dirname / path_file.name + link_file = True + if path_symlink.is_symlink(): + # this file was already linked + if path_file.stat().st_size > path_symlink.stat().st_size: + # we should replace the previously linked file with a new one + link_file = True + path_symlink.unlink() + else: + # we should keep the previously linked file + link_file = False + if link_file: + path_symlink.symlink_to(path_file) + + partitions = ("train", "valid_seen", "valid_unseen", 
"test_seen", "test_unseen") + if not (output_path / ".deleting_worker_dirs").exists(): + for partition in partitions: + logger.info("Processing %s trajectories" % partition) + feats_files = output_path.glob("feats/{}:*.pt".format(partition)) + feats_files = sorted([str(path) for path in feats_files]) + jsons_files = [p.replace("/feats/", "/jsons/").replace(".pt", ".pkl") for p in feats_files] + (output_path / partition).mkdir(exist_ok=True) + data_util.gather_feats(feats_files, output_path / partition / "feats") + data_util.gather_jsons(jsons_files, output_path / partition / "jsons.pkl") + + logger.info("Removing worker directories") + (output_path / ".deleting_worker_dirs").touch() + for worker_idx in range(max(num_workers, 1)): + worker_dir = output_path / "worker{:02d}".format(worker_idx) + shutil.rmtree(worker_dir) + for dirname in ("feats", "masks", "jsons"): + shutil.rmtree(output_path / dirname) + os.remove(output_path / ".deleting_worker_dirs") + os.remove(output_path / "processed_feats.txt") + + +@ex.automain +def main(args): + torch.multiprocessing.set_start_method("spawn") + args = helper_util.AttrDict(**args) + if args.data_output is None: + raise RuntimeError("Please, specify the name of output dataset") + + # set up the paths + output_path = Path(constants.ET_DATA) / args.data_output + input_path = Path(constants.ET_DATA) / args.data_input + logger.info("Creating a dataset {} using data from {}".format(args.data_output, input_path)) + if not input_path.is_dir(): + raise RuntimeError("The input dataset {} does not exist".format(input_path)) + if output_path.is_dir() and args.overwrite: + logger.info("Erasing the old directory") + shutil.rmtree(output_path) + output_path.mkdir(exist_ok=True) + + # read which files need to be processed + trajs_list, num_files = get_traj_paths(input_path, output_path / constants.VOCAB_FILENAME, args.fast_epoch) + model_util.save_log( + output_path, + progress=num_files - len(trajs_list), + total=num_files, + stage="jsons", + ) + logger.info("Creating a dataset with {} trajectories using {} workers".format(num_files, args.num_workers)) + logger.info("Processing JSONs and masks ({} were already processed)".format(num_files - len(trajs_list))) + + # first process jsons and masks + if len(trajs_list) > 0: + lock = threading.Lock() + preprocessor = data_util.get_preprocessor(Preprocessor, args.subgoal_ann, lock, args.vocab_path, args.task_type) + run_in_parallel( + process_jsons, + args.num_workers, + output_path, + args=[trajs_list, preprocessor, lock], + ) + vocab_copy = {} + for key, vocab in preprocessor.vocab.items(): + vocab_copy[key] = Vocab.from_dict(vocab.to_dict()) + torch.save(vocab_copy, output_path / constants.VOCAB_FILENAME) + + # read which features need to be extracted + trajs_list, num_files_again = get_traj_paths(input_path, output_path / "processed_feats.txt", args.fast_epoch) + assert num_files == num_files_again + model_util.save_log( + output_path, + progress=num_files - len(trajs_list), + total=num_files, + stage="feats", + ) + logger.info("Extracting features ({} were already processed)".format(num_files - len(trajs_list))) + + # then extract features + extractor = FeatureExtractor( + args.visual_archi, + args.device, + args.visual_checkpoint, + share_memory=True, + compress_type=args.compress_type, + ) + if len(trajs_list) > 0: + manager = torch.multiprocessing.Manager() + lock = manager.Lock() + trajs_queue = manager.Queue() + for path in trajs_list: + trajs_queue.put(path) + args_process_feats = [trajs_queue, extractor, 
lock, args.image_folder] + run_in_parallel( + process_feats, + args.num_workers, + output_path, + args=args_process_feats, + use_processes=True, + ) + + # finally, gather all the data + gather_data(output_path, args.num_workers) + # save dataset info to a file + feat_shape = extractor.feat_shape + params = { + "feat_shape": feat_shape, + "visual_checkpoint": args.visual_checkpoint, + "visual_archi": args.visual_archi, + "compress_type": args.compress_type, + } + with (output_path / "params.json").open("w") as f: + json.dump(params, f, sort_keys=True, indent=4) + logger.info("The dataset was saved to {}".format(output_path)) diff --git a/src/teach/modeling/ET/alfred/data/preprocessor.py b/src/teach/modeling/ET/alfred/data/preprocessor.py new file mode 100644 index 0000000..b315a9b --- /dev/null +++ b/src/teach/modeling/ET/alfred/data/preprocessor.py @@ -0,0 +1,84 @@ +import copy + +import revtok +from alfred.utils import data_util +from vocab import Vocab + + +class Preprocessor(object): + def __init__(self, vocab, subgoal_ann=False, is_test_split=False, frame_size=300): + self.subgoal_ann = subgoal_ann + self.is_test_split = is_test_split + self.frame_size = frame_size + + if vocab is None: + self.vocab = { + "word": Vocab(["<>", "<>", "<>", "<>"]), + "action_low": Vocab(["<>", "<>", "<>", "<>"]), + "action_high": Vocab(["<>", "<>", "<>", "<>"]), + } + else: + self.vocab = vocab + + self.word_seg = self.vocab["word"].word2index("<>", train=False) + + @staticmethod + def numericalize(vocab, words, train=True): + """ + converts words to unique integers + """ + if not train: + new_words = set(words) - set(vocab.counts.keys()) + if new_words: + # replace unknown words with <> + words = [w if w not in new_words else "<>" for w in words] + return vocab.word2index(words, train=train) + + def process_language(self, ex, traj, r_idx, is_test_split=False): + if self.is_test_split: + is_test_split = True + + instr_anns = [utterance for (speaker, utterance) in ex["dialog_history"]] + instr_anns = [revtok.tokenize(data_util.remove_spaces_and_lower(instr_ann)) for instr_ann in instr_anns] + instr_anns = [[w.strip().lower() for w in instr_ann] for instr_ann in instr_anns] + traj["ann"] = { + "instr": [instr_ann + ["<>"] for instr_ann in instr_anns], + } + traj["ann"]["instr"] += [["<>"]] + if "num" not in traj: + traj["num"] = {} + traj["num"]["lang_instr"] = [ + self.numericalize(self.vocab["word"], x, train=not is_test_split) for x in traj["ann"]["instr"] + ] + + def tokenize_and_numericalize(self, dialog_history, numericalize=True, train=False): + instr_anns = [utterance for (speaker, utterance) in dialog_history] + + # tokenize annotations + instr_anns = [revtok.tokenize(data_util.remove_spaces_and_lower(instr_ann)) for instr_ann in instr_anns] + + instr_anns = [[w.strip().lower() for w in instr_ann] for instr_ann in instr_anns] + instr = [instr_ann + ["<>"] for instr_ann in instr_anns] + + instr += [["<>"]] + + if numericalize: + instr = [self.numericalize(self.vocab["word"], word, train=train) for word in instr] + instr = sum(instr, []) # flatten + return instr + + def process_actions(self, ex, traj): + if "num" not in traj: + traj["num"] = {} + traj["num"]["driver_actions_low"] = list() + traj["num"]["driver_actions_pred_mask"] = list() + for action in ex["driver_action_history"]: + action_dict_with_idx = copy.deepcopy(action) + action_dict_with_idx["action"] = (self.vocab["action_low"].word2index(action["action_name"], train=True),) + traj["num"]["driver_actions_low"].append(action_dict_with_idx) 
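+            # history actions are tagged with pred_mask 0 and future actions (appended below) with
+            # pred_mask 1; downstream code uses this mask to restrict the training loss to future
+            # actions unless compute_train_loss_over_history is enabled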
+ traj["num"]["driver_actions_pred_mask"].append(0) + for action in ex["driver_actions_future"]: + action_dict_with_idx = copy.deepcopy(action) + action_dict_with_idx["action"] = (self.vocab["action_low"].word2index(action["action_name"], train=True),) + traj["num"]["driver_actions_low"].append(action_dict_with_idx) + traj["num"]["driver_actions_pred_mask"].append(1) diff --git a/src/teach/modeling/ET/alfred/data/zoo/base.py b/src/teach/modeling/ET/alfred/data/zoo/base.py new file mode 100644 index 0000000..69eae9b --- /dev/null +++ b/src/teach/modeling/ET/alfred/data/zoo/base.py @@ -0,0 +1,138 @@ +import logging +import os +import pickle +import warnings + +import lmdb +import numpy as np +import torch +from alfred import constants +from alfred.utils import data_util +from torch.utils.data import Dataset as TorchDataset + +from teach.logger import create_logger + +logger = create_logger(__name__, level=logging.INFO) + + +class BaseDataset(TorchDataset): + def __init__(self, name, partition, args, ann_type): + logger.debug("Dataset __init__ with args %s" % str(args)) + path = os.path.join(constants.ET_DATA, name) + self.partition = partition + self.name = name + self.args = args + if ann_type not in ("lang", "frames", "lang_frames"): + raise ValueError("Unknown annotation type: {}".format(ann_type)) + self.ann_type = ann_type + self.test_mode = False + self.pad = constants.PAD + + # read information about the dataset + self.dataset_info = data_util.read_dataset_info(name) + if self.dataset_info["visual_checkpoint"]: + logger.info("Visual checkpoint for data preprocessing: %s" % str(self.dataset_info["visual_checkpoint"])) + + # load data + self._length = self.load_data(path) + if self.args.fast_epoch: + self._length = 16 + logger.info("%s dataset size = %d" % (partition, self._length)) + + # load vocabularies for input language and output actions + vocab = data_util.load_vocab(name, ann_type) + self.vocab_in = vocab["word"] + out_type = "action_low" if args.model == "transformer" else "action_high" + self.vocab_out = vocab[out_type] + logger.debug("Loaded vocab_out: %s" % str(self.vocab_out.to_dict()["index2word"])) + # if several datasets are used, we will translate outputs to this vocab later + self.vocab_translate = None + + def load_data(self, path, feats=True, jsons=True): + """ + load data + """ + # do not open the lmdb database open in the main process, do it in each thread + if feats: + self.feats_lmdb_path = os.path.join(path, self.partition, "feats") + + # load jsons with pickle and parse them + if jsons: + with open(os.path.join(path, self.partition, "jsons.pkl"), "rb") as jsons_file: + jsons = pickle.load(jsons_file) + self.jsons_and_keys = [] + for idx in range(len(jsons)): + key = "{:06}".format(idx).encode("ascii") + if key in jsons: + task_jsons = jsons[key] + for json in task_jsons: + # compatibility with the evaluation + if "task" in json and isinstance(json["task"], str): + pass + else: + json["task"] = "/".join(json["root"].split("/")[-3:-1]) + # add dataset idx and partition into the json + json["dataset_name"] = self.name + self.jsons_and_keys.append((json, key)) + # if the dataset has script annotations, do not add identical data + if len(set([str(j["ann"]["instr"]) for j in task_jsons])) == 1: + break + + # return the true length of the loaded data + return len(self.jsons_and_keys) if jsons else None + + def load_frames(self, key): + """ + load image features from the disk + """ + if not hasattr(self, "feats_lmdb"): + self.feats_lmdb, self.feats = 
self.load_lmdb(self.feats_lmdb_path) + feats_bytes = self.feats.get(key) + feats_numpy = np.frombuffer(feats_bytes, dtype=np.float32).reshape(self.dataset_info["feat_shape"]) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + frames = torch.tensor(feats_numpy) + return frames + + def load_lmdb(self, lmdb_path): + """ + load lmdb (should be executed in each worker on demand) + """ + database = lmdb.open( + lmdb_path, + readonly=True, + lock=False, + readahead=False, + meminit=False, + max_readers=self.args.lmdb_max_readers, + ) + cursor = database.begin(write=False) + return database, cursor + + def __len__(self): + """ + return dataset length + """ + return self._length + + def __getitem__(self, idx): + """ + get item at index idx + """ + raise NotImplementedError + + @property + def id(self): + return self.partition + ":" + self.name + ";" + self.ann_type + + def __del__(self): + """ + close the dataset + """ + if hasattr(self, "feats_lmdb"): + self.feats_lmdb.close() + if hasattr(self, "masks_lmdb"): + self.masks_lmdb.close() + + def __repr__(self): + return "{}({})".format(type(self).__name__, self.id) diff --git a/src/teach/modeling/ET/alfred/data/zoo/guides_edh.py b/src/teach/modeling/ET/alfred/data/zoo/guides_edh.py new file mode 100644 index 0000000..52dca95 --- /dev/null +++ b/src/teach/modeling/ET/alfred/data/zoo/guides_edh.py @@ -0,0 +1,107 @@ +import logging +import os + +import torch +from alfred import constants +from alfred.data.zoo.base import BaseDataset + +from teach.logger import create_logger + +logger = create_logger(__name__, level=logging.INFO) + + +class GuidesEdhDataset(BaseDataset): + def __init__(self, name, partition, args, ann_type): + super().__init__(name, partition, args, ann_type) + # preset values + self._load_features = True + self._load_frames = True + # load the vocabulary for object classes + vocab_obj_file = os.path.join(constants.ET_ROOT, constants.OBJ_CLS_VOCAB) + logger.info("Loading object vocab from %s" % vocab_obj_file) + self.vocab_obj = torch.load(vocab_obj_file) + + def load_data(self, path): + return super().load_data(path, feats=True, jsons=True) + + def __getitem__(self, idx): + task_json, key = self.jsons_and_keys[idx] + feat_dict = {} + if self._load_features: + feat_dict = self.load_features(task_json) + if self._load_frames: + feat_dict["frames"] = self.load_frames(key) + + # Add a stop action and duplicate the last frame + feat_dict["action"].append(self.vocab_out.word2index("Stop")) + feat_dict["frames"] = torch.cat((feat_dict["frames"], torch.unsqueeze(feat_dict["frames"][-1, :], 0)), 0) + feat_dict["obj_interaction_action"].append(0) + feat_dict["driver_actions_pred_mask"].append(0) + + if self.args.no_lang: + feat_dict["lang"] = [self.vocab_in.word2index("<>")] + elif self.args.no_vision: + feat_dict["frames"] = torch.rand(feat_dict["frames"].shape) + + return task_json, feat_dict + + def load_features(self, task_json): + """ + load features from task_json + """ + feat = dict() + # language inputs + feat["lang"] = GuidesEdhDataset.load_lang(task_json) + + # action outputs + if not self.test_mode: + # low-level action + feat["action"] = GuidesEdhDataset.load_action(task_json, self.vocab_out) + feat["obj_interaction_action"] = [ + a["obj_interaction_action"] for a in task_json["num"]["driver_actions_low"] + ] + feat["driver_actions_pred_mask"] = task_json["num"]["driver_actions_pred_mask"] + feat["object"] = self.load_object_classes(task_json, self.vocab_obj) + + return feat + + @staticmethod + def 
load_lang(task_json): + """ + load numericalized language from task_json + """ + return sum(task_json["num"]["lang_instr"], []) + + @staticmethod + def load_action(task_json, vocab_orig, action_type="action_low"): + """ + load action as a list of tokens from task_json + """ + if action_type == "action_low": + # load low actions + lang_action = [[vocab_orig.word2index(a["action_name"]) for a in task_json["num"]["driver_actions_low"]]] + lang_action = sum(lang_action, []) + elif action_type == "action_high_future": + if "future_subgoals" in task_json: + lang_action = [vocab_orig.word2index(w) for w in task_json["future_subgoals"]] + else: + lang_action = [0] + elif action_type == "action_high_all": + lang_action = [ + vocab_orig.word2index(w) for w in task_json["history_subgoals"] + task_json["future_subgoals"] + ] + else: + raise NotImplementedError("Unknown action_type {}".format(action_type)) + return lang_action + + def load_object_classes(self, task_json, vocab=None): + """ + load object classes for interactive actions + """ + object_classes = [] + for idx, action in enumerate(task_json["num"]["driver_actions_low"]): + if self.args.compute_train_loss_over_history or task_json["num"]["driver_actions_pred_mask"][idx] == 1: + if action["oid"] is not None: + object_class = action["oid"].split("|")[0] + object_classes.append(object_class if vocab is None else vocab.word2index(object_class)) + return object_classes diff --git a/src/teach/modeling/ET/alfred/data/zoo/guides_speaker.py b/src/teach/modeling/ET/alfred/data/zoo/guides_speaker.py new file mode 100644 index 0000000..0e4c456 --- /dev/null +++ b/src/teach/modeling/ET/alfred/data/zoo/guides_speaker.py @@ -0,0 +1,32 @@ +from alfred.data.zoo.base import BaseDataset +from alfred.data.zoo.guides_edh import GuidesEdhDataset + + +class GuidesSpeakerDataset(BaseDataset): + def load_data(self, path): + return super(GuidesSpeakerDataset, self).load_data(path, feats=True, masks=False, jsons=True) + + def __getitem__(self, idx): + task_json, key = self.jsons_and_keys[idx] + # load language and frames if asked first + feat_dict = {} + feat_dict["lang"] = GuidesEdhDataset.load_lang(task_json) + if "frames" in self.ann_type: + feat_dict["frames"] = self.load_frames(key) + + # load output actions + if self.args.lang_pretrain_over_history_subgoals: + feat_dict["action"] = GuidesEdhDataset.load_action( + task_json, self.vocab_out, self.vocab_translate, "action_high_all" + ) + else: + feat_dict["action"] = GuidesEdhDataset.load_action( + task_json, self.vocab_out, self.vocab_translate, "action_high_future" + ) + + # remove all the lang key/value pairs if only frames are used as input + if self.ann_type == "frames": + keys_lang = [key for key in feat_dict if key.startswith("lang")] + for key in keys_lang: + feat_dict.pop(key) + return task_json, feat_dict diff --git a/src/teach/modeling/ET/alfred/model/base.py b/src/teach/modeling/ET/alfred/model/base.py new file mode 100644 index 0000000..11525cb --- /dev/null +++ b/src/teach/modeling/ET/alfred/model/base.py @@ -0,0 +1,63 @@ +from alfred.utils import data_util +from torch import nn + + +class Model(nn.Module): + def __init__(self, args, embs_ann, vocab_out, pad, seg, for_inference=False): + """ + Abstract model + """ + nn.Module.__init__(self) + self.args = args + self.vocab_out = vocab_out + self.pad, self.seg = pad, seg + if for_inference: + model_dir = args["model_dir"] + dataset_info = data_util.read_dataset_info_for_inference(model_dir) + else: + dataset_info = 
data_util.read_dataset_info(args.data["train"][0]) + self.visual_tensor_shape = dataset_info["feat_shape"][1:] + + # create language and action embeddings + self.embs_ann = nn.ModuleDict({}) + for emb_name, emb_size in embs_ann.items(): + self.embs_ann[emb_name] = nn.Embedding(emb_size, args.demb) + + # dropouts + self.dropout_vis = nn.Dropout(args.dropout["vis"], inplace=True) + self.dropout_lang = nn.Dropout2d(args.dropout["lang"]) + + def init_weights(self, init_range=0.1): + """ + init linear layers in embeddings + """ + for emb_ann in self.embs_ann.values(): + emb_ann.weight.data.uniform_(-init_range, init_range) + + def compute_metrics(self, model_out, gt_dict, metrics_dict, verbose): + """ + compute model-specific metrics and put it to metrics dict + """ + raise NotImplementedError + + def forward(self, vocab, **inputs): + """ + forward the model for multiple time-steps (used for training) + """ + raise NotImplementedError() + + def compute_batch_loss(self, model_out, gt_dict): + """ + compute the loss function for a single batch + """ + raise NotImplementedError() + + def compute_loss(self, model_outs, gt_dicts): + """ + compute the loss function for several batches + """ + # compute losses for each batch + losses = {} + for dataset_key in model_outs.keys(): + losses[dataset_key] = self.compute_batch_loss(model_outs[dataset_key], gt_dicts[dataset_key]) + return losses diff --git a/src/teach/modeling/ET/alfred/model/learned.py b/src/teach/modeling/ET/alfred/model/learned.py new file mode 100644 index 0000000..78c81d8 --- /dev/null +++ b/src/teach/modeling/ET/alfred/model/learned.py @@ -0,0 +1,161 @@ +import collections +import json +import logging +import os +from importlib import import_module + +import gtimer as gt +from alfred.utils import data_util, model_util +from tensorboardX import SummaryWriter +from torch import nn +from tqdm import tqdm + +from teach.logger import create_logger + +logger = create_logger(__name__, level=logging.INFO) + + +class LearnedModel(nn.Module): + def __init__(self, args, embs_ann, vocab_out, for_inference=False): + """ + Abstract model + """ + nn.Module.__init__(self) + self.args = args + self.embs_ann = embs_ann + self.vocab_out = vocab_out + # sentinel tokens + self.pad, self.seg = 0, 1 + # summary self.writer + self.summary_writer = None + # create the model to be trained + ModelClass = import_module("alfred.model.{}".format(args.model)).Model + self.model = ModelClass(args, embs_ann, vocab_out, self.pad, self.seg, for_inference) + + def run_train(self, loaders, info, optimizer=None): + """ + training loop + """ + # prepare dictionaries + loaders_train = dict(filter(lambda x: "train" in x[0], loaders.items())) + assert len(set([len(loader) for loader in loaders_train.values()])) == 1 + vocabs_in = { + "{};{}".format(loader.dataset.name, loader.dataset.ann_type): loader.dataset.vocab_in + for loader in loaders.values() + } + epoch_length = len(next(iter(loaders_train.values()))) + logger.debug("In LearnedModel.run_train, epoch_length = %d" % epoch_length) + # initialize summary writer for tensorboardX + self.summary_writer = SummaryWriter(log_dir=self.args.dout) + # dump config + with open(os.path.join(self.args.dout, "config.json"), "wt") as f: + json.dump(vars(self.args), f, indent=2) + # optimizer + optimizer, schedulers = model_util.create_optimizer_and_schedulers( + info["progress"], self.args, self.parameters(), optimizer + ) + # make sure that all train loaders have the same length + assert len(set([len(loader) for loader in 
loaders_train.values()])) == 1 + model_util.save_log( + self.args.dout, + progress=info["progress"], + total=self.args.epochs, + stage="train", + best_loss=info["best_loss"], + iters=info["iters"], + ) + + # display dout + logger.info("Saving to: %s" % self.args.dout) + for epoch in range(info["progress"], self.args.epochs): + logger.info("Epoch {}/{}".format(epoch, self.args.epochs)) + self.train() + train_iterators = {key: iter(loader) for key, loader in loaders_train.items()} + metrics = {key: collections.defaultdict(list) for key in loaders_train} + gt.reset() + + for _ in tqdm(range(epoch_length), desc="train"): + # sample batches + batches = data_util.sample_batches(train_iterators, self.args.device, self.pad, self.args) + gt.stamp("data fetching", unique=False) + + # do the forward passes + model_outs, losses_train = {}, {} + for batch_name, (traj_data, input_dict, gt_dict) in batches.items(): + if "lang" not in input_dict: + raise RuntimeError("In learned.run_train, lang not in input_dict") + model_outs[batch_name] = self.model.forward( + vocabs_in[batch_name.split(":")[-1]], action=gt_dict["action"], **input_dict + ) + info["iters"]["train"] += len(traj_data) if ":" not in batch_name else 0 + gt.stamp("forward pass", unique=False) + # compute losses + losses_train = self.model.compute_loss( + model_outs, + {key: gt_dict for key, (_, _, gt_dict) in batches.items()}, + ) + + # do the gradient step + optimizer.zero_grad() + sum_loss = sum([sum(loss.values()) for name, loss in losses_train.items()]) + sum_loss.backward() + optimizer.step() + gt.stamp("optimizer", unique=False) + + # compute metrics + for dataset_name in losses_train.keys(): + self.model.compute_metrics( + model_outs[dataset_name], + batches[dataset_name][2], + metrics["train:" + dataset_name], + self.args.compute_train_loss_over_history, + ) + for key, value in losses_train[dataset_name].items(): + metrics["train:" + dataset_name]["loss/" + key].append(value.item()) + metrics["train:" + dataset_name]["loss/total"].append(sum_loss.detach().cpu().item()) + gt.stamp("metrics", unique=False) + if self.args.profile: + logger.info(gt.report(include_itrs=False, include_stats=False)) + + # save the checkpoint + logger.info("Saving models...") + stats = {"epoch": epoch} + model_util.save_model(self, "model_{:02d}.pth".format(epoch), stats, optimizer=optimizer) + model_util.save_model(self, "latest.pth", stats, symlink=True) + + # compute metrics for train + logger.info("Computing train metrics...") + metrics = {data: {k: sum(v) / len(v) for k, v in metr.items()} for data, metr in metrics.items()} + stats = { + "epoch": epoch, + "general": {"learning_rate": optimizer.param_groups[0]["lr"]}, + **metrics, + } + + # save the checkpoint + logger.info("Saving models...") + model_util.save_model(self, "model_{:02d}.pth".format(epoch), stats, optimizer=optimizer) + model_util.save_model(self, "latest.pth", stats, symlink=True) + # write averaged stats + for loader_id in stats.keys(): + if isinstance(stats[loader_id], dict): + for stat_key, stat_value in stats[loader_id].items(): + # for comparison with old epxs, maybe remove later + summary_key = "{}/{}".format( + loader_id.replace(":", "/").replace("lmdb/", "").replace(";lang", "").replace(";", "_"), + stat_key.replace(":", "/").replace("lmdb/", ""), + ) + self.summary_writer.add_scalar(summary_key, stat_value, info["iters"]["train"]) + # dump the training info + model_util.save_log( + self.args.dout, + progress=epoch + 1, + total=self.args.epochs, + stage="train", + 
best_loss=info["best_loss"], + iters=info["iters"], + ) + model_util.adjust_lr(self.args, epoch, schedulers) + logger.info( + "{} epochs are completed, all the models were saved to: {}".format(self.args.epochs, self.args.dout) + ) diff --git a/src/teach/modeling/ET/alfred/model/speaker.py b/src/teach/modeling/ET/alfred/model/speaker.py new file mode 100644 index 0000000..3c2e809 --- /dev/null +++ b/src/teach/modeling/ET/alfred/model/speaker.py @@ -0,0 +1,234 @@ +import logging + +import numpy as np +import torch +from alfred.model import base +from alfred.nn.enc_lang import EncoderLang +from alfred.nn.enc_visual import FeatureFlat +from alfred.nn.enc_vl import EncoderVL +from alfred.nn.encodings import PosLangEncoding +from alfred.utils import model_util +from torch import nn +from torch.nn import functional as F + +from teach.logger import create_logger + +logger = create_logger(__name__, level=logging.INFO) + + +class Model(base.Model): + def __init__(self, args, embs_ann, vocab_out, pad, seg, for_inference=False): + """ + speaker model + """ + super().__init__(args, embs_ann, vocab_out, pad, seg, for_inference) + + # encoder and visual embeddings + self.encoder_vl, self.encoder_lang = None, None + if any("frames" in ann_type for ann_type in args.data["ann_type"]): + # create a multi-modal encoder + self.encoder_vl = EncoderVL(args) + # create feature embeddings + self.vis_feat = FeatureFlat(input_shape=self.visual_tensor_shape, output_size=args.demb) + else: + # create an encoder for language only + self.encoder_lang = EncoderLang(args.encoder_layers, args, embs_ann) + + # decoder parts + decoder_layer = nn.TransformerDecoderLayer( + args.demb, + args.decoder_lang["heads"], + args.decoder_lang["demb"], + args.decoder_lang["dropout"], + ) + self.decoder = nn.TransformerDecoder(decoder_layer, args.decoder_lang["layers"]) + self.enc_pos = PosLangEncoding(args.demb) if args.decoder_lang["pos_enc"] else None + self.emb_subgoal = nn.Embedding(len(vocab_out), args.demb) + + # final touch + self.init_weights() + + def encode_vl(self, vocab, **inputs): + """ + apply the VL encoder to the inputs + """ + lang = inputs["lang"] if "lang" in inputs else None + frames = inputs["frames"] if "frames" in inputs else None + device = lang.device if lang is not None else frames.device + assert inputs is not None or frames is not None + batch_size = len(lang if lang is not None else frames) + # embed language if the model should see them + if lang is not None: + emb_lang = self.embed_lang(lang, self.embs_ann[vocab.name]) + lengths_lang = inputs["lengths_lang"] + else: + emb_lang = torch.zeros([batch_size, 0, self.args.demb]).to(device) + lengths_lang = torch.tensor([0] * batch_size) + + # embed frames if the model should see them + if frames is not None: + emb_frames = self.embed_frames(frames) + lengths_frames = inputs["lengths_frames"] + length_frames_max = inputs["length_frames_max"] + else: + emb_frames = torch.zeros([batch_size, 0, self.args.demb]).to(device) + lengths_frames, length_frames_max = torch.tensor([0] * batch_size), 0 + # speaker does not use the actions + emb_actions = torch.zeros([batch_size, 0, self.args.demb]).to(device) + lengths_actions = torch.tensor([0] * batch_size) + # encode inputs + hiddens, hiddens_padding = self.encoder_vl( + emb_lang, + emb_frames, + emb_actions, + lengths_lang, + lengths_frames, + lengths_actions, + length_frames_max, + attn_masks=False, + ) + return hiddens, hiddens_padding + + def encode_lang(self, vocab, lang_pad): + """ + apply the language encoder to the 
inputs + """ + embedder_lang = self.embs_ann[vocab.name] + emb_lang, lengths_lang = self.encoder_lang(lang_pad, embedder_lang, vocab, self.pad) + emb_padding = torch.zeros(emb_lang.shape[:2], device=emb_lang.device).bool() + for i, len_l in enumerate(lengths_lang): + emb_padding[i, len_l:] = True + return emb_lang, emb_padding + + def encode_inputs(self, vocab, **inputs): + """ + apply the VL or language encoder to the inputs + """ + if self.encoder_vl is not None: + hiddens, hiddens_padding = self.encode_vl(vocab, **inputs) + else: + hiddens, hiddens_padding = self.encode_lang(vocab, inputs["lang"]) + return hiddens, hiddens_padding + + def forward(self, vocab, **inputs): + """ + forward the model for multiple time-steps (used for training) + """ + # pass inputs to the encoder + hiddens, hiddens_padding = self.encode_inputs(vocab, **inputs) + hiddens = self.enc_pos(hiddens) if self.enc_pos else hiddens + # generate masks + lang_target = inputs["action"] + target_mask = model_util.triangular_mask(lang_target.size(1), lang_target.device) + # right shift the targets + lang_target = lang_target.clone().detach() + lang_target = torch.roll(lang_target, 1, 1) + lang_target[:, 0] = self.seg + # embed targets and add position encodings + target = self.embed_lang(lang_target, self.emb_subgoal) + target = self.enc_pos(target) if self.enc_pos else target + + # decode the outputs with transformer + decoder_out = self.decoder( + tgt=target.transpose(0, 1), + memory=hiddens.transpose(0, 1), + # to avoid looking at the future tokens (the ones on the right) + tgt_mask=target_mask, + # avoid looking on padding of the src + memory_key_padding_mask=hiddens_padding, + ).transpose(0, 1) + # apply a linear layer + decoder_out_flat = decoder_out.reshape(-1, self.args.demb) + lang_out_flat = decoder_out_flat.mm(self.emb_subgoal.weight.t()) + output = {"lang": lang_out_flat.view(len(decoder_out), -1, lang_out_flat.shape[-1])} + return output + + def embed_frames(self, frames_pad): + """ + take a list of frames tensors, pad it, apply dropout and extract embeddings + """ + self.dropout_vis(frames_pad) + frames_4d = frames_pad.view(-1, *frames_pad.shape[2:]) + frames_pad_emb = self.vis_feat(frames_4d).view(*frames_pad.shape[:2], -1) + return frames_pad_emb + + def embed_lang(self, lang_pad, embedder): + """ + embed goal+instr language + """ + lang_pad_emb = embedder(lang_pad) + lang_pad_emb = self.dropout_lang(lang_pad_emb) + return lang_pad_emb + + def compute_batch_loss(self, model_out, gt_dict): + """ + language translation loss function + """ + p_lang = model_out["lang"].view(-1, model_out["lang"].shape[-1]) + l_lang = gt_dict["action"].view(-1) + loss_lang = F.cross_entropy(p_lang, l_lang, reduction="none").mean() + return {"lang": loss_lang} + + def init_weights(self, init_range=0.1): + """ + init embeddings uniformly + """ + super().init_weights(init_range) + self.emb_subgoal.weight.data.uniform_(-init_range, init_range) + + def compute_metrics(self, model_out, gt_dict, metrics_dict, verbose=False): + """ + compute exact matching and f1 score for action predictions + """ + pred_tokens = model_out["lang"].max(2)[1].tolist() + pred_lang = model_util.tokens_to_lang(pred_tokens, self.vocab_out, {self.pad}, join=False) + gt_lang = model_util.tokens_to_lang(gt_dict["action"], self.vocab_out, {self.pad}, join=False) + pred_lang_strs = [" ".join(s) for s in pred_lang] + gt_lang_strs = [" ".join(s) for s in gt_lang] + model_util.compute_f1_and_exact(metrics_dict, pred_lang_strs, gt_lang_strs, "lang") + if verbose: + 
logger.debug("Lang GT:\n{}".format(gt_lang_strs[0])) + logger.debug("Lang predictions:\n{}".format(pred_lang_strs[0])) + logger.debug("EM = {}, F1 = {}".format(metrics_dict["lang/exact"][-1], metrics_dict["lang/f1"][-1])) + + def translate(self, vocab_in, max_decode=300, num_pad_stop=3, **inputs): + """ + lang and frames has shapes [1, LEN] + """ + # prepare + batch_size = len(inputs["lang"] if "lang" in inputs else inputs["frames"]) + device = (inputs["lang"] if "lang" in inputs else inputs["frames"]).device + # pass inputs to the encoder + hiddens, hiddens_padding = self.encode_inputs(vocab_in, **inputs) + assert len(hiddens) == batch_size + + # start the decoding + lang_cur = [[self.seg] for _ in range(batch_size)] + for i in range(max_decode): + tensor_cur = torch.tensor(lang_cur).to(device) + emb_cur = self.embed_lang(tensor_cur, self.emb_subgoal) + if self.enc_pos: + emb_cur = self.enc_pos(emb_cur) + mask_cur = model_util.triangular_mask(i + 1, device) + + decoder_out = self.decoder( + tgt=emb_cur.transpose(0, 1), + memory=hiddens.transpose(0, 1), + tgt_mask=mask_cur, + # avoid looking on padding of the src + memory_key_padding_mask=hiddens_padding, + ).transpose(0, 1) + + # apply a linear layer + decoder_out_flat = decoder_out.reshape(-1, self.args.demb) + lang_out_flat = decoder_out_flat.mm(self.emb_subgoal.weight.t()) + lang_out = lang_out_flat.view(batch_size, -1, lang_out_flat.shape[-1]) + tokens_out = lang_out.max(2)[1] + for j in range(batch_size): + lang_cur[j].append(tokens_out[i, -1].item()) + if len(tokens_out[0]) > num_pad_stop and (np.array(lang_cur)[:, -num_pad_stop:] == self.pad).all(): + break + + lang_result = [l[1:] for l in lang_cur] + lang_result = [[t for t in tokens if t != self.pad] for tokens in lang_result] + return lang_result diff --git a/src/teach/modeling/ET/alfred/model/train.py b/src/teach/modeling/ET/alfred/model/train.py new file mode 100755 index 0000000..dfc26f6 --- /dev/null +++ b/src/teach/modeling/ET/alfred/model/train.py @@ -0,0 +1,205 @@ +import logging +import os +import random +import shutil + +import numpy as np +import torch +from alfred import constants +from alfred.config import exp_ingredient, train_ingredient +from alfred.data import GuidesEdhDataset, GuidesSpeakerDataset +from alfred.model.learned import LearnedModel +from alfred.utils import data_util, helper_util, model_util +from sacred import Experiment + +from teach.logger import create_logger + +ex = Experiment("train", ingredients=[train_ingredient, exp_ingredient]) + +logger = create_logger(__name__, level=logging.INFO) + + +def prepare(train, exp): + """ + create logdirs, check dataset, seed pseudo-random generators + """ + # args and init + args = helper_util.AttrDict(**train, **exp) + args.dout = os.path.join(constants.ET_LOGS, args.name) + args.data["train"] = args.data["train"].split(",") + args.data["valid"] = args.data["valid"].split(",") if args.data["valid"] else [] + num_datas = len(args.data["train"]) + len(args.data["valid"]) + for key in ("ann_type",): + args.data[key] = args.data[key].split(",") + if len(args.data[key]) == 1: + args.data[key] = args.data[key] * num_datas + if len(args.data[key]) != num_datas: + raise ValueError("Provide either 1 {} or {} separated by commas".format(key, num_datas)) + # set seeds + torch.manual_seed(args.seed) + random.seed(a=args.seed) + np.random.seed(args.seed) + # make output dir + logger.info("Train args: %s" % str(args)) + if not os.path.isdir(args.dout): + os.makedirs(args.dout) + return args + + +def 
load_only_matching_layers(model, pretrained_model, train_lmdb_name): + pretrained_dict = {} + model_dict = model.state_dict() + + logger.debug("Pretrained Model keys: %s" % str(pretrained_model["model"].keys())) + logger.debug("Model state dict keys: %s" % str(model_dict.keys())) + + for name, param in pretrained_model["model"].items(): + model_name = name + if name not in model_dict.keys(): + model_name = name.replace("lmdb_human", train_lmdb_name) + if model_name not in model_dict.keys(): + logger.debug("No matching key ignoring %s" % model_name) + continue + + if param.size() == model_dict[model_name].size(): + logger.debug( + "Matched name and size: %s %s %s" % (name, str(param.size()), str(model_dict[model_name].size())) + ) + pretrained_dict[model_name] = param + else: + logger.debug("Mismatched size: %s %s %s" % (name, str(param.size()), str(model_dict[model_name].size()))) + logger.debug("Matched keys: %s" % str(pretrained_dict.keys())) + return pretrained_dict + + +def create_model(args, embs_ann, vocab_out): + """ + load a model and its optimizer + """ + prev_train_info = model_util.load_log(args.dout, stage="train") + if args.resume and os.path.exists(os.path.join(args.dout, "latest.pth")): + # load a saved model + loadpath = os.path.join(args.dout, "latest.pth") + model, optimizer = model_util.load_model(loadpath, args.device, prev_train_info["progress"] - 1) + assert model.vocab_out.contains_same_content(vocab_out) + model.args = args + else: + # create a new model + if not args.resume and os.path.isdir(args.dout): + shutil.rmtree(args.dout) + model = LearnedModel(args, embs_ann, vocab_out) + model = model.to(torch.device(args.device)) + optimizer = None + if args.pretrained_path: + if "/" not in args.pretrained_path: + # a relative path at the logdir was specified + args.pretrained_path = model_util.last_model_path(args.pretrained_path) + logger.info("Loading pretrained model from {}".format(args.pretrained_path)) + pretrained_model = torch.load(args.pretrained_path, map_location=torch.device(args.device)) + if args.use_alfred_weights: + pretrained_dict = load_only_matching_layers(model, pretrained_model, args.data["train"][0]) + model_dict = model.state_dict() + model_dict.update(pretrained_dict) + model.load_state_dict(model_dict) + loaded_keys = pretrained_dict.keys() + else: + model.load_state_dict(pretrained_model["model"], strict=False) + loaded_keys = set(model.state_dict().keys()).intersection(set(pretrained_model["model"].keys())) + assert len(loaded_keys) + logger.debug("Loaded keys: %s", str(loaded_keys)) + # put encoder on several GPUs if asked + if torch.cuda.device_count() > 1: + logger.info("Parallelizing the model") + model.model = helper_util.DataParallel(model.model) + return model, optimizer, prev_train_info + + +def load_data(name, args, ann_type, valid_only=False): + """ + load dataset and wrap them into torch loaders + """ + partitions = ([] if valid_only else ["train"]) + ["valid_seen", "valid_unseen"] + datasets = [] + for partition in partitions: + if args.model == "speaker": + dataset = GuidesSpeakerDataset(name, partition, args, ann_type) + elif args.model == "transformer": + dataset = GuidesEdhDataset(name, partition, args, ann_type) + else: + raise ValueError("Unknown model: {}".format(args.model)) + datasets.append(dataset) + return datasets + + +def wrap_datasets(datasets, args): + """ + wrap datasets with torch loaders + """ + batch_size = args.batch // len(args.data["train"]) + loader_args = { + "num_workers": args.num_workers, + "drop_last": 
(torch.cuda.device_count() > 1), + "collate_fn": helper_util.identity, + } + if args.num_workers > 0: + # do not prefetch samples, this may speed up data loading + loader_args["prefetch_factor"] = 1 + + loaders = {} + for dataset in datasets: + if dataset.partition == "train": + if args.data["train_load_type"] == "sample": + weights = [1 / len(dataset)] * len(dataset) + num_samples = 16 if args.fast_epoch else (args.data["length"] or len(dataset)) + num_samples = num_samples // len(args.data["train"]) + sampler = torch.utils.data.WeightedRandomSampler(weights, num_samples=num_samples, replacement=True) + loader = torch.utils.data.DataLoader(dataset, batch_size, sampler=sampler, **loader_args) + else: + loader = torch.utils.data.DataLoader(dataset, args.batch, shuffle=True, **loader_args) + else: + loader = torch.utils.data.DataLoader(dataset, args.batch, shuffle=(not args.fast_epoch), **loader_args) + loaders[dataset.id] = loader + return loaders + + +def process_vocabs(datasets, args): + """ + assign the largest output vocab to all datasets, compute embedding sizes + """ + # find the longest vocabulary for outputs among all datasets + for dataset in datasets: + logger.debug("dataset.id = %s, vocab_out = %s" % (dataset.id, str(dataset.vocab_out))) + vocab_out = sorted(datasets, key=lambda x: len(x.vocab_out))[-1].vocab_out + # make all datasets to use this vocabulary for outputs translation + for dataset in datasets: + dataset.vocab_translate = vocab_out + # prepare a dictionary for embeddings initialization: vocab names and their sizes + embs_ann = {} + for dataset in datasets: + embs_ann[dataset.name] = len(dataset.vocab_in) + return embs_ann, vocab_out + + +@ex.automain +def main(train, exp): + """ + train a network using an lmdb dataset + """ + # parse args + args = prepare(train, exp) + # load dataset(s) and process vocabs + datasets = [] + ann_types = iter(args.data["ann_type"]) + for name, ann_type in zip(args.data["train"], ann_types): + datasets.extend(load_data(name, args, ann_type)) + for name, ann_type in zip(args.data["valid"], ann_types): + datasets.extend(load_data(name, args, ann_type, valid_only=True)) + # assign vocabs to datasets and check their sizes for nn.Embeding inits + embs_ann, vocab_out = process_vocabs(datasets, args) + logger.debug("In train.main, vocab_out = %s" % str(vocab_out)) + # wrap datasets with loaders + loaders = wrap_datasets(datasets, args) + # create the model + model, optimizer, prev_train_info = create_model(args, embs_ann, vocab_out) + # start train loop + model.run_train(loaders, prev_train_info, optimizer=optimizer) diff --git a/src/teach/modeling/ET/alfred/model/transformer.py b/src/teach/modeling/ET/alfred/model/transformer.py new file mode 100644 index 0000000..a38ebec --- /dev/null +++ b/src/teach/modeling/ET/alfred/model/transformer.py @@ -0,0 +1,256 @@ +import torch +from alfred.model import base +from alfred.nn.dec_object import ObjectClassifier +from alfred.nn.enc_lang import EncoderLang +from alfred.nn.enc_visual import FeatureFlat +from alfred.nn.enc_vl import EncoderVL +from alfred.nn.encodings import DatasetLearnedEncoding +from alfred.utils import model_util +from torch import nn +from torch.nn import functional as F + + +class Model(base.Model): + def __init__(self, args, embs_ann, vocab_out, pad, seg, for_inference=False): + """ + transformer agent + """ + super().__init__(args, embs_ann, vocab_out, pad, seg, for_inference) + + # encoder and visual embeddings + self.encoder_vl = EncoderVL(args) + # pre-encoder for language 
tokens + self.encoder_lang = EncoderLang(args.encoder_lang["layers"], args, embs_ann) + # feature embeddings + self.vis_feat = FeatureFlat(input_shape=self.visual_tensor_shape, output_size=args.demb) + # dataset id learned encoding (applied after the encoder_lang) + self.dataset_enc = None + if args.enc["dataset"]: + self.dataset_enc = DatasetLearnedEncoding(args.demb, args.data["train"]) + # embeddings for actions + self.emb_action = nn.Embedding(len(vocab_out), args.demb) + # dropouts + self.dropout_action = nn.Dropout2d(args.dropout["transformer"]["action"]) + + # decoder parts + encoder_output_size = args.demb + self.dec_action = nn.Linear(encoder_output_size, args.demb) + self.dec_object = ObjectClassifier(encoder_output_size) + + # skip connection for object predictions + self.object_feat = FeatureFlat(input_shape=self.visual_tensor_shape, output_size=args.demb) + + # progress monitoring heads + if self.args.progress_aux_loss_wt > 0: + self.dec_progress = nn.Linear(encoder_output_size, 1) + if self.args.subgoal_aux_loss_wt > 0: + self.dec_subgoal = nn.Linear(encoder_output_size, 1) + + # final touch + self.init_weights() + self.reset() + + def forward(self, vocab, **inputs): + """ + forward the model for multiple time-steps (used for training) + """ + # embed language + output = {} + emb_lang, lengths_lang = self.embed_lang(inputs["lang"], vocab) + emb_lang = self.dataset_enc(emb_lang, vocab) if self.dataset_enc else emb_lang + + # embed frames and actions + emb_frames, emb_object = self.embed_frames(inputs["frames"]) + lengths_frames = inputs["lengths_frames"] + emb_actions = self.embed_actions(inputs["action"]) + assert emb_frames.shape == emb_actions.shape + lengths_actions = lengths_frames.clone() + length_frames_max = inputs["length_frames_max"] + + # concatenate language, frames and actions and add encodings + encoder_out, _ = self.encoder_vl( + emb_lang, + emb_frames, + emb_actions, + lengths_lang, + lengths_frames, + lengths_actions, + length_frames_max, + ) + # use outputs corresponding to visual frames for prediction only + encoder_out_visual = encoder_out[:, lengths_lang.max().item() : lengths_lang.max().item() + length_frames_max] + + # get the output actions + decoder_input = encoder_out_visual.reshape(-1, self.args.demb) + action_emb_flat = self.dec_action(decoder_input) + action_flat = action_emb_flat.mm(self.emb_action.weight.t()) + action = action_flat.view(*encoder_out_visual.shape[:2], *action_flat.shape[1:]) + + # get the output objects + emb_object_flat = emb_object.view(-1, self.args.demb) + decoder_input = decoder_input + emb_object_flat + object_flat = self.dec_object(decoder_input) + objects = object_flat.view(*encoder_out_visual.shape[:2], *object_flat.shape[1:]) + output.update({"action": action, "object": objects}) + + # (optionally) get progress monitor predictions + if self.args.progress_aux_loss_wt > 0: + progress = torch.sigmoid(self.dec_progress(encoder_out_visual)) + output["progress"] = progress + if self.args.subgoal_aux_loss_wt > 0: + subgoal = torch.sigmoid(self.dec_subgoal(encoder_out_visual)) + output["subgoal"] = subgoal + return output + + def embed_lang(self, lang_pad, vocab): + """ + take a list of annotation tokens and extract embeddings with EncoderLang + """ + assert lang_pad.max().item() < len(vocab) + embedder_lang = self.embs_ann[vocab.name] + emb_lang, lengths_lang = self.encoder_lang(lang_pad, embedder_lang, vocab, self.pad) + if self.args.detach_lang_emb: + emb_lang = emb_lang.clone().detach() + return emb_lang, lengths_lang + + 
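# A minimal, self-contained sketch of the weight-tied action head used in
# Model.forward above (demb, vocabulary size and batch shape are made-up toy values):
# decoder features are mapped back to demb and scored against the same embedding
# table that encodes previous actions.
import torch
from torch import nn

demb, num_actions, batch, num_frames = 8, 10, 2, 5
emb_action = nn.Embedding(num_actions, demb)       # also embeds previous actions
dec_action = nn.Linear(demb, demb)
encoder_out_visual = torch.randn(batch, num_frames, demb)
flat = dec_action(encoder_out_visual.reshape(-1, demb))
logits = flat.mm(emb_action.weight.t())            # scores over the action vocabulary
action = logits.view(batch, num_frames, num_actions)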
def embed_frames(self, frames_pad): + """ + take a list of frames tensors, pad it, apply dropout and extract embeddings + """ + self.dropout_vis(frames_pad) + frames_4d = frames_pad.view(-1, *frames_pad.shape[2:]) + frames_pad_emb = self.vis_feat(frames_4d).view(*frames_pad.shape[:2], -1) + frames_pad_emb_skip = self.object_feat(frames_4d).view(*frames_pad.shape[:2], -1) + return frames_pad_emb, frames_pad_emb_skip + + def embed_actions(self, actions): + """ + embed previous actions + """ + emb_actions = self.emb_action(actions) + emb_actions = self.dropout_action(emb_actions) + return emb_actions + + def reset(self): + """ + reset internal states (used for real-time execution during eval) + """ + self.frames_traj = torch.zeros(1, 0, *self.visual_tensor_shape) + self.action_traj = torch.zeros(1, 0).long() + + def step(self, input_dict, vocab, prev_action=None): + """ + forward the model for a single time-step (used for real-time execution during eval) + """ + frames = input_dict["frames"] + device = frames.device + if prev_action is not None: + prev_action_int = vocab["action_low"].word2index(prev_action) + prev_action_tensor = torch.tensor(prev_action_int)[None, None].to(device) + self.action_traj = torch.cat((self.action_traj.to(device), prev_action_tensor), dim=1) + self.frames_traj = torch.cat((self.frames_traj.to(device), frames[None]), dim=1) + # at timestep t we have t-1 prev actions so we should pad them + action_traj_pad = torch.cat((self.action_traj.to(device), torch.zeros((1, 1)).to(device).long()), dim=1) + model_out = self.forward( + vocab=vocab["word"], + lang=input_dict["lang"], + lengths_lang=input_dict["lengths_lang"], + length_lang_max=input_dict["length_lang_max"], + frames=self.frames_traj.clone(), + lengths_frames=torch.tensor([self.frames_traj.size(1)]), + length_frames_max=self.frames_traj.size(1), + action=action_traj_pad, + ) + step_out = {} + for key, value in model_out.items(): + # return only the last actions, ignore the rest + step_out[key] = value[:, -1:] + return step_out + + def compute_batch_loss(self, model_out, gt_dict): + """ + loss function for Seq2Seq agent + """ + losses = dict() + + # action loss + action_pred = model_out["action"].view(-1, model_out["action"].shape[-1]) + action_gt = gt_dict["action"].view(-1) + pad_mask = action_gt != self.pad + + # Calculate loss only over future actions + action_pred_mask = gt_dict["driver_actions_pred_mask"].view(-1) + + action_loss = F.cross_entropy(action_pred, action_gt, reduction="none") + action_loss *= pad_mask.float() + if not self.args.compute_train_loss_over_history: + action_loss *= action_pred_mask.float() + action_loss = action_loss.mean() + losses["action"] = action_loss * self.args.action_loss_wt + + # object classes loss + if len(gt_dict["object"]) > 0: + object_pred = model_out["object"] + object_gt = torch.cat(gt_dict["object"], dim=0) + + if self.args.compute_train_loss_over_history: + interact_idxs = gt_dict["obj_interaction_action"].view(-1).nonzero(as_tuple=False).view(-1) + else: + interact_idxs = ( + (gt_dict["driver_actions_pred_mask"] * gt_dict["obj_interaction_action"]) + .view(-1) + .nonzero(as_tuple=False) + .view(-1) + ) + if interact_idxs.nelement() > 0: + object_pred = object_pred.view(object_pred.shape[0] * object_pred.shape[1], *object_pred.shape[2:]) + object_loss = model_util.obj_classes_loss(object_pred, object_gt, interact_idxs) + losses["object"] = object_loss * self.args.object_loss_wt + + # subgoal completion loss + if self.args.subgoal_aux_loss_wt > 0: + subgoal_pred = 
model_out["subgoal"].squeeze(2) + subgoal_gt = gt_dict["subgoals_completed"] + subgoal_loss = F.mse_loss(subgoal_pred, subgoal_gt, reduction="none") + subgoal_loss = subgoal_loss.view(-1) * pad_mask.float() + subgoal_loss = subgoal_loss.mean() + losses["subgoal_aux"] = self.args.subgoal_aux_loss_wt * subgoal_loss + + # progress monitoring loss + if self.args.progress_aux_loss_wt > 0: + progress_pred = model_out["progress"].squeeze(2) + progress_gt = gt_dict["goal_progress"] + progress_loss = F.mse_loss(progress_pred, progress_gt, reduction="none") + progress_loss = progress_loss.view(-1) * pad_mask.float() + progress_loss = progress_loss.mean() + losses["progress_aux"] = self.args.progress_aux_loss_wt * progress_loss + + # maximize entropy of the policy if asked + if self.args.entropy_wt > 0.0: + policy_entropy = -F.softmax(action_pred, dim=1) * F.log_softmax(action_pred, dim=1) + policy_entropy = policy_entropy.mean(dim=1) + policy_entropy *= pad_mask.float() + losses["entropy"] = -policy_entropy.mean() * self.args.entropy_wt + + return losses + + def init_weights(self, init_range=0.1): + """ + init embeddings uniformly + """ + super().init_weights(init_range) + self.dec_action.bias.data.zero_() + self.dec_action.weight.data.uniform_(-init_range, init_range) + self.emb_action.weight.data.uniform_(-init_range, init_range) + + def compute_metrics(self, model_out, gt_dict, metrics_dict, compute_train_loss_over_history): + """ + compute exact matching and f1 score for action predictions + """ + preds = model_util.extract_action_preds(model_out, self.pad, self.vocab_out, lang_only=True) + stop_token = self.vocab_out.word2index("Stop") + gt_actions = model_util.tokens_to_lang(gt_dict["action"], self.vocab_out, {self.pad, stop_token}) + model_util.compute_f1_and_exact(metrics_dict, [p["action"] for p in preds], gt_actions, "action") + model_util.compute_obj_class_precision( + metrics_dict, gt_dict, model_out["object"], compute_train_loss_over_history + ) diff --git a/src/teach/modeling/ET/alfred/nn/attention.py b/src/teach/modeling/ET/alfred/nn/attention.py new file mode 100644 index 0000000..b73b5b5 --- /dev/null +++ b/src/teach/modeling/ET/alfred/nn/attention.py @@ -0,0 +1,44 @@ +from torch import nn +from torch.nn import functional as F + + +class SelfAttn(nn.Module): + """ + self-attention with learnable parameters + """ + + def __init__(self, dhid): + super().__init__() + self.scorer = nn.Linear(dhid, 1) + # scorer: dhid x 1 + + def forward(self, inp): + # inp: batch_size x seq_len x dhid + scores = F.softmax(self.scorer(inp), dim=1) + # scores: batch_size x seq_len x 1 + cont = scores.transpose(1, 2).bmm(inp).squeeze(1) + # cont: batch_size x seq_len + return cont + + +class DotAttn(nn.Module): + """ + dot-attention (or soft-attention) + """ + + def forward(self, inp, h): + # inp: batch_size x seq_len x dhid + # h: batch_size x dhid + score = self.softmax(inp, h) + # score: batch_size x seq_len x 1 + score_expanded = score.expand_as(inp) + # score_expanded: batch_size x seq_len x dhid + # output: batch_size x dhid + return score_expanded.mul(inp).sum(1), score + + def softmax(self, inp, h): + raw_score = inp.bmm(h.unsqueeze(2)) + # raw_score: batch_size x seq_len x 1 + score = F.softmax(raw_score, dim=1) + # score: batch_size x seq_len x 1 + return score diff --git a/src/teach/modeling/ET/alfred/nn/dec_object.py b/src/teach/modeling/ET/alfred/nn/dec_object.py new file mode 100644 index 0000000..c38fc20 --- /dev/null +++ b/src/teach/modeling/ET/alfred/nn/dec_object.py @@ -0,0 +1,22 @@ 
+import os + +import torch +from alfred import constants +from torch import nn + + +class ObjectClassifier(nn.Module): + """ + object classifier module (a single FF layer) + """ + + def __init__(self, input_size): + super().__init__() + vocab_obj_path = os.path.join(constants.ET_ROOT, constants.OBJ_CLS_VOCAB) + vocab_obj = torch.load(vocab_obj_path) + num_classes = len(vocab_obj) + self.linear = nn.Linear(input_size, num_classes) + + def forward(self, x): + out = self.linear(x) + return out diff --git a/src/teach/modeling/ET/alfred/nn/enc_lang.py b/src/teach/modeling/ET/alfred/nn/enc_lang.py new file mode 100644 index 0000000..182c20f --- /dev/null +++ b/src/teach/modeling/ET/alfred/nn/enc_lang.py @@ -0,0 +1,89 @@ +import torch +from alfred.nn.encodings import InstrLangEncoding, PosLangEncoding +from torch import nn + + +class EncoderLang(nn.Module): + def __init__( + self, + num_layers, + args, + embs_ann, + subgoal_token="<>", + goal_token="<>", + ): + """ + transformer encoder for language inputs + """ + super(EncoderLang, self).__init__() + self.subgoal_token = subgoal_token + self.goal_token = goal_token + + # transofmer layers + encoder_layer = nn.TransformerEncoderLayer( + args.demb, + args.encoder_heads, + args.demb, + args.dropout["transformer"]["encoder"], + ) + if args.encoder_lang["shared"]: + enc_transformer = nn.TransformerEncoder(encoder_layer, num_layers) + self.enc_transformers = nn.ModuleDict({data: enc_transformer for data in embs_ann.keys()}) + else: + self.enc_transformers = nn.ModuleDict( + {data: nn.TransformerEncoder(encoder_layer, num_layers) for data in embs_ann.keys()} + ) + + # encodings + self.enc_pos = PosLangEncoding(args.demb) if args.encoder_lang["pos_enc"] else None + self.enc_instr = InstrLangEncoding(args.demb) if args.encoder_lang["instr_enc"] else None + self.enc_layernorm = nn.LayerNorm(args.demb) + self.enc_dropout = nn.Dropout(args.dropout["lang"], inplace=True) + + def forward(self, lang_pad, embedder, vocab, pad): + """ + pass embedded inputs through embeddings and encode them using a transformer + """ + # pad the input language sequences and embed them with a linear layer + mask_pad = lang_pad == pad + emb_lang = embedder(lang_pad) + # add positional encodings + mask_token = EncoderLang.mask_token(lang_pad, vocab, {self.subgoal_token, self.goal_token}) + emb_lang = self.encode_inputs(emb_lang, mask_token, mask_pad) + # pass the inputs through the encoder + hiddens = EncoderLang.encoder(self.enc_transformers, emb_lang, mask_pad, vocab) + lengths = (lang_pad != pad).sum(dim=1) + return hiddens, lengths + + @staticmethod + def mask_token(lang_pad, vocab, tokens): + """ + returns mask of the tokens + """ + tokens_mask = torch.zeros_like(lang_pad).long() + for token in tokens: + tokens_mask += lang_pad == vocab.word2index(token) + return tokens_mask.bool() + + @staticmethod + def encoder(encoders, emb_lang, mask_pad, vocab, mask_attn=None): + """ + compute encodings for all tokens using a normal flat encoder + """ + # skip mask: mask padded words + if mask_attn is None: + # attention mask: all tokens can attend to all others + mask_attn = torch.zeros((mask_pad.shape[1], mask_pad.shape[1]), device=mask_pad.device).float() + # encode the inputs + output = encoders[vocab.name](emb_lang.transpose(0, 1), mask_attn, mask_pad).transpose(0, 1) + return output + + def encode_inputs(self, emb_lang, mask_token, mask_pad): + """ + add positional encodings, apply layernorm and dropout + """ + emb_lang = self.enc_pos(emb_lang) if self.enc_pos else emb_lang + 
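# A minimal sketch of the call pattern in EncoderLang.encoder above, with toy sizes:
# the sequence is fed time-major to nn.TransformerEncoder with an all-zero attention
# mask (every token may attend to every other) and a boolean key-padding mask that
# hides padded positions.
import torch
from torch import nn

demb, heads, seq_len, batch = 8, 2, 5, 2
layer = nn.TransformerEncoderLayer(demb, heads, demb, 0.0)
encoder = nn.TransformerEncoder(layer, num_layers=1)
emb_lang = torch.randn(batch, seq_len, demb)
mask_pad = torch.zeros(batch, seq_len, dtype=torch.bool)
mask_pad[:, -1] = True                             # pretend the last token is padding
mask_attn = torch.zeros(seq_len, seq_len)
out = encoder(emb_lang.transpose(0, 1), mask_attn, mask_pad).transpose(0, 1)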
emb_lang = self.enc_instr(emb_lang, mask_token) if self.enc_instr else emb_lang + emb_lang = self.enc_dropout(emb_lang) + emb_lang = self.enc_layernorm(emb_lang) + return emb_lang diff --git a/src/teach/modeling/ET/alfred/nn/enc_visual.py b/src/teach/modeling/ET/alfred/nn/enc_visual.py new file mode 100644 index 0000000..e9c6e79 --- /dev/null +++ b/src/teach/modeling/ET/alfred/nn/enc_visual.py @@ -0,0 +1,240 @@ +import contextlib +import logging +import os +import types + +import numpy as np +import torch +import torch.nn as nn +from alfred import constants +from alfred.nn.transforms import Transforms +from alfred.utils import data_util +from torchvision import models +from torchvision.transforms import functional as F + +from teach.logger import create_logger + +logger = create_logger(__name__, level=logging.INFO) + + +class Resnet18(nn.Module): + """ + pretrained Resnet18 from torchvision + """ + + def __init__(self, device, checkpoint_path=None, share_memory=False): + super().__init__() + self.device = device + self.model = models.resnet18(pretrained=True) + self.model = nn.Sequential(*list(self.model.children())[:-2]) + if checkpoint_path is not None: + logger.info("Loading ResNet checkpoint from {}".format(checkpoint_path)) + model_state_dict = torch.load(checkpoint_path, map_location=device) + model_state_dict = { + key: value for key, value in model_state_dict.items() if "GU_" not in key and "text_pooling" not in key + } + model_state_dict = {key: value for key, value in model_state_dict.items() if "fc." not in key} + model_state_dict = {key.replace("resnet.", ""): value for key, value in model_state_dict.items()} + self.model.load_state_dict(model_state_dict) + self.model = self.model.to(torch.device(device)) + self.model = self.model.eval() + if share_memory: + self.model.share_memory() + self._transform = Transforms.get_transform("default") + + def extract(self, x): + x = self._transform(x).to(torch.device(self.device)) + return self.model(x) + + +class RCNN(nn.Module): + """ + pretrained FasterRCNN or MaskRCNN from torchvision + """ + + def __init__( + self, + archi, + device="cuda", + checkpoint_path=None, + share_memory=False, + load_heads=False, + ): + super().__init__() + self.device = device + self.feat_layer = "3" + if archi == "maskrcnn": + self.model = models.detection.maskrcnn_resnet50_fpn( + pretrained=(checkpoint_path is None), + pretrained_backbone=(checkpoint_path is None), + min_size=800, + ) + elif archi == "fasterrcnn": + self.model = models.detection.fasterrcnn_resnet50_fpn( + pretrained=(checkpoint_path is None), + pretrained_backbone=(checkpoint_path is None), + min_size=224, + ) + else: + raise ValueError("Unknown model type = {}".format(archi)) + + if archi == "maskrcnn": + self._transform = self.model.transform + else: + self._transform = Transforms.get_transform("default") + if not load_heads: + for attr in ("backbone", "body"): + self.model = getattr(self.model, attr) + + if checkpoint_path is not None: + self.load_from_checkpoint(checkpoint_path, load_heads, device, archi, "backbone.body") + self.model = self.model.to(torch.device(device)) + self.model = self.model.eval() + if share_memory: + self.model.share_memory() + if load_heads: + # if the model is used for predictions, prepare a vocabulary + self.vocab_pred = {i: class_name for i, class_name in enumerate(constants.OBJECTS_ACTIONS)} + + def extract(self, images): + if isinstance(self._transform, models.detection.transform.GeneralizedRCNNTransform): + images_normalized = 
self._transform(torch.stack([F.to_tensor(img) for img in images]))[0].tensors + else: + images_normalized = torch.stack([self._transform(img) for img in images]) + images_normalized = images_normalized.to(torch.device(self.device)) + model_body = self.model + if hasattr(self.model, "backbone"): + model_body = self.model.backbone.body + features = model_body(images_normalized) + return features[self.feat_layer] + + def load_from_checkpoint(self, checkpoint_path, load_heads, device, archi, prefix): + logger.info("Loading RCNN checkpoint from {}".format(checkpoint_path)) + state_dict = torch.load(checkpoint_path, map_location=device) + if not load_heads: + # load only the backbone + state_dict = {k.replace(prefix + ".", ""): v for k, v in state_dict.items() if prefix + "." in k} + else: + # load a full model, replace pre-trained head(s) with (a) new one(s) + num_classes, in_features = state_dict["roi_heads.box_predictor.cls_score.weight"].shape + box_predictor = models.detection.faster_rcnn.FastRCNNPredictor(in_features, num_classes) + self.model.roi_heads.box_predictor = box_predictor + if archi == "maskrcnn": + # and replace the mask predictor with a new one + in_features_mask = self.model.roi_heads.mask_predictor.conv5_mask.in_channels + hidden_layer = 256 + mask_predictor = models.detection.mask_rcnn.MaskRCNNPredictor( + in_features_mask, hidden_layer, num_classes + ) + self.model.roi_heads.mask_predictor = mask_predictor + self.model.load_state_dict(state_dict) + + def predict_objects(self, image, confidence_threshold=0.0, verbose=False): + image = F.to_tensor(image).to(torch.device(self.device)) + output = self.model(image[None])[0] + preds = [] + for pred_idx in range(len(output["scores"])): + score = output["scores"][pred_idx].cpu().item() + if score < confidence_threshold: + continue + box = output["boxes"][pred_idx].cpu().numpy() + label = self.vocab_pred[output["labels"][pred_idx].cpu().item()] + if verbose: + logger.debug("{} at {}".format(label, box)) + pred = types.SimpleNamespace(label=label, box=box, score=score) + if "masks" in output: + pred.mask = output["masks"][pred_idx].cpu().numpy() + preds.append(pred) + return preds + + +class FeatureExtractor(nn.Module): + def __init__( + self, + archi, + device="cuda", + checkpoint=None, + share_memory=False, + compress_type=None, + load_heads=False, + ): + super().__init__() + self.feat_shape = data_util.get_feat_shape(archi, compress_type) + self.eval_mode = True + if archi == "resnet18": + assert not load_heads + self.model = Resnet18(device, checkpoint, share_memory) + else: + self.model = RCNN(archi, device, checkpoint, share_memory, load_heads=load_heads) + self.compress_type = compress_type + # load object class vocabulary + vocab_obj_path = os.path.join(constants.ET_ROOT, constants.OBJ_CLS_VOCAB) + self.vocab_obj = torch.load(vocab_obj_path) + + def featurize(self, images, batch=32): + feats = [] + with (torch.set_grad_enabled(False) if not self.model.model.training else contextlib.nullcontext()): + for i in range(0, len(images), batch): + images_batch = images[i : i + batch] + feats.append(self.model.extract(images_batch)) + feat = torch.cat(feats, dim=0) + if self.compress_type is not None: + feat = data_util.feat_compress(feat, self.compress_type) + assert self.feat_shape[1:] == feat.shape[1:] + return feat + + def predict_objects(self, image, verbose=False): + with torch.set_grad_enabled(False): + pred = self.model.predict_objects(image, verbose=verbose) + return pred + + def train(self, mode): + if self.eval_mode: + 
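# A minimal sketch of the batched extraction loop in FeatureExtractor.featurize
# above, with a toy module standing in for the ResNet/RCNN backbone; gradients are
# disabled as they would be with the backbone in eval mode.
import torch
from torch import nn

extractor = nn.Conv2d(3, 4, kernel_size=3, padding=1)   # stand-in feature extractor
images = torch.randn(20, 3, 16, 16)                     # toy batch of images
batch = 8
feats = []
with torch.set_grad_enabled(False):
    for i in range(0, len(images), batch):
        feats.append(extractor(images[i:i + batch]))
feat = torch.cat(feats, dim=0)                          # (20, 4, 16, 16)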
return + for module in self.children(): + module.train(mode) + + +class FeatureFlat(nn.Module): + """ + a few conv layers to flatten features that come out of ResNet + """ + + def __init__(self, input_shape, output_size): + super().__init__() + if input_shape[0] == -1: + input_shape = input_shape[1:] + layers, activation_shape = self.init_cnn(input_shape, channels=[256, 64], kernels=[1, 1], paddings=[0, 0]) + layers += [Flatten(), nn.Linear(np.prod(activation_shape), output_size)] + self.layers = nn.Sequential(*layers) + + def init_cnn(self, input_shape, channels, kernels, paddings): + layers = [] + planes_in, spatial = input_shape[0], input_shape[-1] + for planes_out, kernel, padding in zip(channels, kernels, paddings): + # do not use striding + stride = 1 + layers += [ + nn.Conv2d( + planes_in, + planes_out, + kernel_size=kernel, + stride=stride, + padding=padding, + ), + nn.BatchNorm2d(planes_out), + nn.ReLU(inplace=True), + ] + planes_in = planes_out + spatial = (spatial - kernel + 2 * padding) // stride + 1 + activation_shape = (planes_in, spatial, spatial) + return layers, activation_shape + + def forward(self, frames): + activation = self.layers(frames) + return activation + + +class Flatten(nn.Module): + def forward(self, x): + return x.view(x.size(0), -1) diff --git a/src/teach/modeling/ET/alfred/nn/enc_vl.py b/src/teach/modeling/ET/alfred/nn/enc_vl.py new file mode 100644 index 0000000..db26e9e --- /dev/null +++ b/src/teach/modeling/ET/alfred/nn/enc_vl.py @@ -0,0 +1,97 @@ +import torch +from alfred.nn.encodings import PosEncoding, PosLearnedEncoding, TokenLearnedEncoding +from alfred.utils import model_util +from torch import nn + + +class EncoderVL(nn.Module): + def __init__(self, args): + """ + transformer encoder for language, frames and action inputs + """ + super(EncoderVL, self).__init__() + + # transofmer layers + encoder_layer = nn.TransformerEncoderLayer( + args.demb, + args.encoder_heads, + args.demb, + args.dropout["transformer"]["encoder"], + ) + self.enc_transformer = nn.TransformerEncoder(encoder_layer, args.encoder_layers) + + # how many last actions to attend to + self.num_input_actions = args.num_input_actions + + # encodings + self.enc_pos = PosEncoding(args.demb) if args.enc["pos"] else None + self.enc_pos_learn = PosLearnedEncoding(args.demb) if args.enc["pos_learn"] else None + self.enc_token = TokenLearnedEncoding(args.demb) if args.enc["token"] else None + self.enc_layernorm = nn.LayerNorm(args.demb) + self.enc_dropout = nn.Dropout(args.dropout["emb"], inplace=True) + + def forward( + self, + emb_lang, + emb_frames, + emb_actions, + lengths_lang, + lengths_frames, + lengths_actions, + length_frames_max, + attn_masks=True, + ): + """ + pass embedded inputs through embeddings and encode them using a transformer + """ + # emb_lang is processed on each GPU separately so they size can vary + length_lang_max = lengths_lang.max().item() + emb_lang = emb_lang[:, :length_lang_max] + # create a mask for padded elements + length_mask_pad = length_lang_max + length_frames_max * (2 if lengths_actions.max() > 0 else 1) + mask_pad = torch.zeros((len(emb_lang), length_mask_pad), device=emb_lang.device).bool() + for i, (len_l, len_f, len_a) in enumerate(zip(lengths_lang, lengths_frames, lengths_actions)): + # mask padded words + mask_pad[i, len_l:length_lang_max] = True + # mask padded frames + mask_pad[i, length_lang_max + len_f : length_lang_max + length_frames_max] = True + # mask padded actions + mask_pad[i, length_lang_max + length_frames_max + len_a :] = True + + # 
encode the inputs + emb_all = self.encode_inputs(emb_lang, emb_frames, emb_actions, lengths_lang, lengths_frames, mask_pad) + + # create a mask for attention (prediction at t should not see frames at >= t+1) + if attn_masks: + mask_attn = model_util.generate_attention_mask( + length_lang_max, + length_frames_max, + emb_all.device, + self.num_input_actions, + ) + else: + # allow every token to attend to all others + mask_attn = torch.zeros((mask_pad.shape[1], mask_pad.shape[1]), device=mask_pad.device).float() + + # encode the inputs + output = self.enc_transformer(emb_all.transpose(0, 1), mask_attn, mask_pad).transpose(0, 1) + return output, mask_pad + + def encode_inputs(self, emb_lang, emb_frames, emb_actions, lengths_lang, lengths_frames, mask_pad): + """ + add encodings (positional, token and so on) + """ + if self.enc_pos is not None: + emb_lang, emb_frames, emb_actions = self.enc_pos( + emb_lang, emb_frames, emb_actions, lengths_lang, lengths_frames + ) + if self.enc_pos_learn is not None: + emb_lang, emb_frames, emb_actions = self.enc_pos_learn( + emb_lang, emb_frames, emb_actions, lengths_lang, lengths_frames + ) + if self.enc_token is not None: + emb_lang, emb_frames, emb_actions = self.enc_token(emb_lang, emb_frames, emb_actions) + emb_cat = torch.cat((emb_lang, emb_frames, emb_actions), dim=1) + emb_cat = self.enc_layernorm(emb_cat) + emb_cat = self.enc_dropout(emb_cat) + return emb_cat diff --git a/src/teach/modeling/ET/alfred/nn/encodings.py b/src/teach/modeling/ET/alfred/nn/encodings.py new file mode 100644 index 0000000..32b94b9 --- /dev/null +++ b/src/teach/modeling/ET/alfred/nn/encodings.py @@ -0,0 +1,173 @@ +import math + +import torch +from torch import nn + + +class PosEncoding(nn.Module): + """ + Transformer-style positional encoding with wavelets + """ + + def __init__(self, d_model, max_len=1250): + super().__init__() + self.d_model = d_model + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + self.register_buffer("pe", pe[None]) + + def forward(self, lang, frames, actions, lens_lang, lens_frames, pos=None): + if pos is None: + enc = self.pe[:, : lang.shape[1] + frames.shape[1]] + else: + enc = [[] for _ in range(len(lang))] + for batch_idx in range(pos.shape[0]): + for pos_idx in range(lang.shape[1] + frames.shape[1]): + enc[batch_idx].append(self.pe[0, pos[batch_idx, pos_idx]]) + enc = torch.stack([torch.stack(pos_batch) for pos_batch in enc]) + enc = enc / math.sqrt(self.d_model) + lang = lang + enc[:, : lang.shape[1]] + + for i in range(frames.shape[0]): + start_idx = lens_lang[i] + end_idx = lens_lang[i] + frames.shape[1] + if end_idx > enc.shape[1]: + end_idx = enc.shape[1] + start_idx = enc.shape[1] - frames.shape[1] + frames[i] = frames[i] + enc[0, start_idx:end_idx] + # use the same position indices for actions as for the frames + for i in range(actions.shape[0]): + start_idx = lens_lang[i] + end_idx = lens_lang[i] + actions.shape[1] + if end_idx > enc.shape[1]: + end_idx = enc.shape[1] + start_idx = enc.shape[1] - actions.shape[1] + actions[i] = actions[i] + enc[0, start_idx:end_idx] + return lang, frames, actions + + +class LearnedEncoding(nn.Module): + """ + Learned additive encoding implemented on top of nn.Embedding + """ + + def __init__(self, d_model, vocab_size, init_range=0.1): + super().__init__() + self.emb 
= nn.Embedding(vocab_size, d_model) + self.emb.weight.data.uniform_(-init_range, init_range) + + def forward(self, x, tokens): + tokens_emb = self.emb(tokens) + return x + tokens_emb + + +class PosLearnedEncoding(nn.Module): + """ + Learned additive positional encoding implemented on top of nn.Embedding + """ + + def __init__(self, d_model, max_pos=1250, init_range=0.1): + super().__init__() + self.emb = nn.Embedding(max_pos, d_model) + self.emb.weight.data.uniform_(-init_range, init_range) + + def forward(self, lang, frames, actions, lens_lang, lens_frames): + pos_lang = torch.stack([torch.arange(0, lang.shape[1])] * lang.shape[0]) + pos_frames = torch.stack([torch.arange(0, frames.shape[1]) + l for l in lens_lang]) + # use the same position indices for actions as for the frames + pos_actions = torch.stack([torch.arange(0, actions.shape[1]) + l for l in lens_lang]) + lang += self.emb(pos_lang.to(lang.device)) + frames += self.emb(pos_frames.to(frames.device)) + actions += self.emb(pos_actions.to(actions.device)) + return lang, frames, actions + + +class TokenLearnedEncoding(nn.Module): + """ + Learned additive img/word/action token encoding implemented on top of nn.Embedding + """ + + def __init__(self, d_model, vocab_size=3, init_range=0.1): + super().__init__() + self.emb = nn.Embedding(vocab_size, d_model) + self.emb.weight.data.uniform_(-init_range, init_range) + + def forward(self, lang, frames, actions): + token_lang = torch.ones(lang.shape[:2], device=lang.device, dtype=torch.long) * 0 + token_lang_emb = self.emb(token_lang) + lang += token_lang_emb + token_frames = torch.ones(frames.shape[:2], device=frames.device, dtype=torch.long) * 1 + token_frames_emb = self.emb(token_frames) + frames += token_frames_emb + token_actions = torch.ones(actions.shape[:2], device=actions.device, dtype=torch.long) * 2 + token_actions_emb = self.emb(token_actions) + actions += token_actions_emb + return lang, frames, actions + + +class PosLangEncoding(nn.Module): + """ + Transformer-style positional encoding with wavelets + """ + + def __init__(self, d_model, max_len=2000): + super().__init__() + self.d_model = d_model + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + self.register_buffer("pe", pe[None]) + + def forward(self, x, pos=None): + if pos is None: + enc = self.pe[:, : x.shape[1]] + else: + enc = [[] for _ in range(len(x))] + for batch_idx in range(pos.shape[0]): + for pos_idx in range(pos.shape[1]): + enc[batch_idx].append(self.pe[0, pos[batch_idx, pos_idx]]) + enc = torch.stack([torch.stack(pos_batch) for pos_batch in enc]) + x = x + enc / math.sqrt(self.d_model) + return x + + +class InstrLangEncoding(PosLangEncoding): + """ + Relative position in an instruction (a sentence) encoding with wavelets + """ + + def forward(self, x, tokens_mask): + counts = torch.zeros_like(tokens_mask)[:, 0].long() + instrs = torch.zeros_like(tokens_mask).long() + # offset the tokens by 1 + tokens_mask[:, 1:] = tokens_mask.clone()[:, :-1] + for i in range(tokens_mask.shape[1] - 1): + instrs[:, i] = counts + counts += tokens_mask[:, i + 1] == True + instrs[:, -1] = instrs[:, -2] + pe_tokens = self.pe[0, instrs] + x = x + pe_tokens / math.sqrt(self.d_model) + return x + + +class DatasetLearnedEncoding(nn.Module): + """ + Learned additive dataset id encoding implemented 
on top of nn.Embedding + """ + + def __init__(self, d_model, datasets, init_range=0.1): + super().__init__() + self.datasets = {dataset: i for i, dataset in enumerate(datasets)} + self.emb = nn.Embedding(len(datasets), d_model) + self.emb.weight.data.uniform_(-init_range, init_range) + + def forward(self, lang, vocab): + dataset_ids = torch.ones(lang.shape[0], device=lang.device, dtype=torch.long) + dataset_emb = self.emb(dataset_ids * self.datasets[vocab.name]) + lang_enc = lang + dataset_emb[:, None] + return lang_enc diff --git a/src/teach/modeling/ET/alfred/nn/transforms.py b/src/teach/modeling/ET/alfred/nn/transforms.py new file mode 100644 index 0000000..5f832b5 --- /dev/null +++ b/src/teach/modeling/ET/alfred/nn/transforms.py @@ -0,0 +1,90 @@ +from torchvision import transforms + + +class Transforms(object): + @staticmethod + def resize(img_size=224): + # expects a PIL Image + return transforms.Resize((img_size, img_size)) + + @staticmethod + def affine(degree=5, translate=0.04, scale=0.02): + # expects a PIL Image + return transforms.RandomAffine( + degrees=(-degree, degree), + translate=(translate, translate), + scale=(1 - scale, 1 + scale), + shear=None, + ) + + @staticmethod + def random_crop(img_size=224): + # expects a PIL Image + return transforms.RandomCrop((img_size, img_size)) + + @staticmethod + def normalize(): + # expects a PIL Image + return transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225], + ), + ] + ) + + @staticmethod + def cutout(p=0.5, scale=(0.02, 0.33), ratio=(0.3, 3.3), value=0.0): + # expects a tensor + return transforms.RandomErasing(p=p, scale=scale, ratio=ratio, value=value) + + @staticmethod + def get_transform(transform="default"): + if transform == "default": + return transforms.Compose([Transforms.resize(224), Transforms.normalize()]) + elif transform == "none": + return transforms.ToTensor() + elif transform == "crops": + return transforms.Compose( + [ + Transforms.resize(240), + Transforms.random_crop(224), + Transforms.normalize(), + ] + ) + elif transform == "cutout": + return transforms.Compose([Transforms.resize(224), Transforms.normalize(), Transforms.cutout()]) + elif transform == "affine": + return transforms.Compose([Transforms.resize(224), Transforms.affine(), Transforms.normalize()]) + elif transform == "affine_crops": + return transforms.Compose( + [ + Transforms.resize(240), + Transforms.random_crop(224), + Transforms.affine(), + Transforms.normalize(), + ] + ) + elif transform == "affine_crops_cutout": + return transforms.Compose( + [ + Transforms.resize(240), + Transforms.random_crop(224), + Transforms.affine(), + Transforms.normalize(), + Transforms.cutout(), + ] + ) + elif transform == "affine_cutout": + return transforms.Compose( + [ + Transforms.resize(224), + Transforms.affine(), + Transforms.normalize(), + Transforms.cutout(), + ] + ) + else: + raise ValueError("Image augmentation {} is not implemented".format(transform)) diff --git a/src/teach/modeling/ET/alfred/utils/data_util.py b/src/teach/modeling/ET/alfred/utils/data_util.py new file mode 100644 index 0000000..b2bff49 --- /dev/null +++ b/src/teach/modeling/ET/alfred/utils/data_util.py @@ -0,0 +1,325 @@ +import json +import logging +import os +import pickle +import re +import shutil +import string +from copy import deepcopy + +import lmdb +import torch +from alfred import constants +from alfred.utils import helper_util +from PIL import Image +from torch.nn.utils.rnn import pad_sequence +from tqdm 
import tqdm + +from teach.logger import create_logger + +logger = create_logger(__name__, level=logging.INFO) + + +def read_images(image_path_list): + images = [] + for image_path in image_path_list: + image_orig = Image.open(image_path) + images.append(image_orig.copy()) + image_orig.close() + return images + + +def read_traj_images(json_path, image_folder): + with open(json_path) as json_file: + json_dict = json.load(json_file) + + images_dir = json_path.parents[2] / image_folder / json_path.parts[-2] / json_path.parts[-1].split(".")[0] + + fimages = [images_dir / im for im in json_dict["driver_image_history"] + json_dict["driver_images_future"]] + logger.debug("Loading images from %s" % images_dir) + logger.debug("Expected image files: %s" % "\n\t".join([str(x) for x in fimages])) + + if not all([os.path.exists(path) for path in fimages]): + return None + assert len(fimages) > 0 + images = read_images(fimages) + return images + + +def extract_features(images, extractor): + if images is None: + return None + feat = extractor.featurize(images, batch=8) + return feat.cpu() + + +def process_traj(traj_orig, traj_path, r_idx, preprocessor): + # copy trajectory + traj = traj_orig.copy() + # root & split + traj["root"] = str(traj_path) + partition = traj_path.parts[-2] + traj["split"] = partition + traj["repeat_idx"] = r_idx + # numericalize actions for train/valid splits + preprocessor.process_actions(traj_orig, traj) + # numericalize language + if "test" in partition: + preprocessor.process_language(traj_orig, traj, r_idx, is_test_split=True) + else: + preprocessor.process_language(traj_orig, traj, r_idx, is_test_split=False) + return traj + + +def gather_feats(files, output_path): + if output_path.is_dir(): + shutil.rmtree(output_path) + lmdb_feats = lmdb.open(str(output_path), 700 * 1024 ** 3, writemap=True) + with lmdb_feats.begin(write=True) as txn_feats: + for idx, path in tqdm(enumerate(files)): + traj_feats = torch.load(path).numpy() + txn_feats.put("{:06}".format(idx).encode("ascii"), traj_feats.tobytes()) + lmdb_feats.close() + + +def gather_jsons(files, output_path): + if output_path.exists(): + os.remove(output_path) + jsons = {} + for idx, path in tqdm(enumerate(files)): + with open(path, "rb") as f: + jsons_idx = pickle.load(f) + jsons["{:06}".format(idx).encode("ascii")] = jsons_idx + with output_path.open("wb") as f: + pickle.dump(jsons, f) + + +def get_preprocessor(PreprocessorClass, subgoal_ann, lock, vocab_path=None, task_type="edh"): + if vocab_path is None: + init_words = ["<>", "<>", "<>", "<>"] + else: + init_words = [] + vocabs_with_lock = { + "word": helper_util.VocabWithLock(deepcopy(init_words), lock), + "action_low": helper_util.VocabWithLock(deepcopy(init_words), lock), + "action_high": helper_util.VocabWithLock(deepcopy(init_words), lock), + } + if vocab_path is not None: + vocabs_loaded = torch.load(vocab_path) + for vocab_name, vocab in vocabs_with_lock.items(): + loaded_dict = vocabs_loaded[vocab_name].to_dict() + for _i, w in enumerate(loaded_dict["index2word"]): + vocab.word2index(w, train=True) + vocab.counts[w] = loaded_dict["counts"][w] + + actions_high_init_words = [ + "Navigate", + "Pickup", + "Place", + "Open", + "Close", + "ToggleOn", + "ToggleOff", + "Slice", + "Pour", + "object", + ] + + # Reset low actions vocab to empty because Simbot vocab is different + actions_low_init_words = [ + "Stop", + "Forward", + "Backward", + "Turn Left", + "Turn Right", + "Look Up", + "Look Down", + "Pan Left", + "Pan Right", + "Navigation", + "Pickup", + "Place", + 
"Open", + "Close", + "ToggleOn", + "ToggleOff", + "Slice", + "Pour", + ] + if task_type == "tfd": + actions_low_init_words.append("Text") + + vocabs_with_lock["action_low"] = helper_util.VocabWithLock(actions_low_init_words, lock) + vocabs_with_lock["action_high"] = helper_util.VocabWithLock(actions_high_init_words, lock) + vocab_obj = torch.load(os.path.join(constants.ET_ROOT, constants.OBJ_CLS_VOCAB)).to_dict() + logger.debug("In get_preprocessor, vocab_obj = %s" % vocab_obj["index2word"]) + for _i, w in enumerate(vocab_obj["index2word"]): + vocabs_with_lock["action_high"].word2index(w, train=True) + vocabs_with_lock["action_high"].counts[w] = vocab_obj["counts"][w] + + preprocessor = PreprocessorClass(vocabs_with_lock, subgoal_ann) + return preprocessor + + +def tensorize_and_pad(batch, device, pad): + """ + cast values to torch tensors, put them to the correct device and pad sequences + """ + device = torch.device(device) + input_dict, gt_dict, feat_dict = dict(), dict(), dict() + traj_data, feat_list = list(zip(*batch)) + for key in feat_list[0].keys(): + feat_dict[key] = [el[key] for el in feat_list] + # feat_dict keys that start with these substrings will be assigned to input_dict + input_keys = {"lang", "frames"} + # the rest of the keys will be assigned to gt_dict + + for k, v in feat_dict.items(): + dict_assign = input_dict if any([k.startswith(s) for s in input_keys]) else gt_dict + if k.startswith("lang"): + # no preprocessing should be done here + seqs = [torch.tensor(vv if vv is not None else [pad, pad], device=device).long() for vv in v] + pad_seq = pad_sequence(seqs, batch_first=True, padding_value=pad) + dict_assign[k] = pad_seq + dict_assign["lengths_" + k] = torch.tensor(list(map(len, seqs))) + length_max_key = "length_" + k + "_max" + if ":" in k: + # for translated length keys (e.g. 
lang:lmdb/1x_det) we should use different names + length_max_key = "length_" + k.split(":")[0] + "_max:" + ":".join(k.split(":")[1:]) + dict_assign[length_max_key] = max(map(len, seqs)) + elif k in {"object"}: + # convert lists with object indices to tensors + seqs = [torch.tensor(vv, device=device, dtype=torch.long) for vv in v if len(vv) > 0] + dict_assign[k] = seqs + elif k in {"frames"}: + # frames features were loaded from the disk as tensors + seqs = [vv.clone().detach().to(device).type(torch.float) for vv in v] + pad_seq = pad_sequence(seqs, batch_first=True, padding_value=pad) + dict_assign[k] = pad_seq + dict_assign["lengths_" + k] = torch.tensor(list(map(len, seqs))) + dict_assign["length_" + k + "_max"] = max(map(len, seqs)) + else: + # default: tensorize and pad sequence + seqs = [torch.tensor(vv, device=device, dtype=torch.long) for vv in v] + pad_seq = pad_sequence(seqs, batch_first=True, padding_value=pad) + dict_assign[k] = pad_seq + return traj_data, input_dict, gt_dict + + +def sample_batches(iterators, device, pad, args): + """ + sample a batch from each iterator, return Nones if the iterator is empty + """ + batches_dict = {} + for dataset_id, iterator in iterators.items(): + try: + batches = next(iterator) + except StopIteration as e: + return None + dataset_name = dataset_id.split(":")[1] + traj_data, input_dict, gt_dict = tensorize_and_pad(batches, device, pad) + batches_dict[dataset_name] = (traj_data, input_dict, gt_dict) + return batches_dict + + +def load_vocab(name, ann_type="lang"): + """ + load a vocabulary from the dataset + """ + path = os.path.join(constants.ET_DATA, name, constants.VOCAB_FILENAME) + logger.info("In load_vocab, loading vocab from %s" % path) + vocab_dict = torch.load(path) + # set name and annotation types + for vocab in vocab_dict.values(): + vocab.name = name + vocab.ann_type = ann_type + return vocab_dict + + +def load_vocab_for_inference(model_dir, name, ann_type="lang"): + path = os.path.join(model_dir, constants.VOCAB_FILENAME) + logger.info("In load_vocab, loading vocab from %s" % path) + vocab_dict = torch.load(path) + # set name and annotation types + for vocab in vocab_dict.values(): + vocab.name = name + vocab.ann_type = ann_type + return vocab_dict + + +def get_feat_shape(visual_archi, compress_type=None): + """ + Get feat shape depending on the training archi and compress type + """ + if visual_archi == "fasterrcnn": + # the RCNN model should be trained with min_size=224 + feat_shape = (-1, 2048, 7, 7) + elif visual_archi == "maskrcnn": + # the RCNN model should be trained with min_size=800 + feat_shape = (-1, 2048, 10, 10) + elif visual_archi == "resnet18": + feat_shape = (-1, 512, 7, 7) + else: + raise NotImplementedError("Unknown archi {}".format(visual_archi)) + + if compress_type is not None: + if not re.match(r"\d+x", compress_type): + raise NotImplementedError("Unknown compress type {}".format(compress_type)) + compress_times = int(compress_type[:-1]) + feat_shape = ( + feat_shape[0], + feat_shape[1] // compress_times, + feat_shape[2], + feat_shape[3], + ) + return feat_shape + + +def feat_compress(feat, compress_type): + """ + Compress features by channel average pooling + """ + assert re.match(r"\d+x", compress_type) and len(feat.shape) == 4 + times = int(compress_type[:-1]) + assert feat.shape[1] % times == 0 + feat = feat.reshape((feat.shape[0], times, feat.shape[1] // times, feat.shape[2], feat.shape[3])) + feat = feat.mean(dim=1) + return feat + + +def read_dataset_info(data_name): + """ + Read dataset a feature 
shape and a feature extractor checkpoint path + """ + path = os.path.join(constants.ET_DATA, data_name, "params.json") + with open(path, "r") as f_params: + params = json.load(f_params) + return params + + +def read_dataset_info_for_inference(model_dir): + """ + Read dataset a feature shape and a feature extractor checkpoint path from file stored in model checkpoint + """ + path = os.path.join(model_dir, "params.json") + logger.info("Reading dataset info from %s for model dir %s" % (path, model_dir)) + with open(path, "r") as f_params: + params = json.load(f_params) + return params + + +def remove_spaces(s): + cs = " ".join(s.split()) + return cs + + +def remove_spaces_and_lower(s): + cs = remove_spaces(s) + cs = cs.lower() + return cs + + +def remove_punctuation(s): + cs = s.translate(str.maketrans("", "", string.punctuation)) + cs = remove_spaces_and_lower(cs) + return cs diff --git a/src/teach/modeling/ET/alfred/utils/eval_util.py b/src/teach/modeling/ET/alfred/utils/eval_util.py new file mode 100644 index 0000000..c95533f --- /dev/null +++ b/src/teach/modeling/ET/alfred/utils/eval_util.py @@ -0,0 +1,37 @@ +import logging + +from alfred.nn.enc_visual import FeatureExtractor +from alfred.utils import model_util + +from teach.logger import create_logger + +logger = create_logger(__name__, level=logging.INFO) + + +def load_agent(model_path, dataset_info, args, for_inference=False): + """ + load a pretrained agent and its feature extractor + """ + logger.info("In load_agent, model_path = %s, dataset_info = %s" % (str(model_path), str(dataset_info))) + learned_model, _ = model_util.load_model(model_path, args.device, for_inference=for_inference) + model = learned_model.model + model.eval() + model.args.device = args.device + extractor = FeatureExtractor( + archi=dataset_info["visual_archi"], + device=args.device, + checkpoint=args.visual_checkpoint, + compress_type=dataset_info["compress_type"], + ) + return model, extractor + + +def load_object_predictor(args): + if args.object_predictor is None: + return None + return FeatureExtractor( + archi="maskrcnn", + device=args.device, + checkpoint=args.object_predictor, + load_heads=True, + ) diff --git a/src/teach/modeling/ET/alfred/utils/helper_util.py b/src/teach/modeling/ET/alfred/utils/helper_util.py new file mode 100644 index 0000000..f0f2587 --- /dev/null +++ b/src/teach/modeling/ET/alfred/utils/helper_util.py @@ -0,0 +1,52 @@ +import torch +from vocab import Vocab as VocabBase + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +class DataParallel(torch.nn.DataParallel): + """ + Allow nn.DataParallel to call model's attributes. 
+ """ + + def __getattr__(self, name): + try: + return super().__getattr__(name) + except AttributeError: + return getattr(self.module, name) + + +class VocabWithLock(VocabBase): + """vocab.Vocab with a lock for parallel computations.""" + + def __init__(self, words=(), lock=None): + self.lock = lock + super().__init__(words) + + def word2index(self, word, train=False): + """Original function copy with the self.lock call.""" + if isinstance(word, (list, tuple)): + return [self.word2index(w, train=train) for w in word] + with self.lock: + self.counts[word] += train + if word in self._word2index: + return self._word2index[word] + else: + if train: + self._index2word += [word] + self._word2index[word] = len(self._word2index) + else: + return self._handle_oov_word(word) + index = self._word2index[word] + return index + + +def identity(x): + """ + pickable equivalent of lambda x: x + """ + return x diff --git a/src/teach/modeling/ET/alfred/utils/metric_util.py b/src/teach/modeling/ET/alfred/utils/metric_util.py new file mode 100755 index 0000000..96215c8 --- /dev/null +++ b/src/teach/modeling/ET/alfred/utils/metric_util.py @@ -0,0 +1,51 @@ +import collections +import re +import string + + +def normalize_answer(s): + """ + Lower text and remove punctuation, articles and extra whitespace. + """ + + def remove_articles(text): + regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) + return re.sub(regex, " ", text) + + def white_space_fix(text): + return " ".join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return "".join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def get_tokens(s): + if not s: + return [] + return normalize_answer(s).split() + + +def compute_exact(a_gold, a_pred): + return int(normalize_answer(a_gold) == normalize_answer(a_pred)) + + +def compute_f1(a_gold, a_pred): + gold_toks = get_tokens(a_gold) + pred_toks = get_tokens(a_pred) + common = collections.Counter(gold_toks) & collections.Counter(pred_toks) + num_same = sum(common.values()) + if len(gold_toks) == 0 or len(pred_toks) == 0: + # If either is no-answer, then F1 is 1 if they agree, 0 otherwise + return int(gold_toks == pred_toks) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(pred_toks) + recall = 1.0 * num_same / len(gold_toks) + f1 = (2 * precision * recall) / (precision + recall) + return f1 diff --git a/src/teach/modeling/ET/alfred/utils/model_util.py b/src/teach/modeling/ET/alfred/utils/model_util.py new file mode 100644 index 0000000..87e5205 --- /dev/null +++ b/src/teach/modeling/ET/alfred/utils/model_util.py @@ -0,0 +1,399 @@ +import collections +import copy +import json +import logging +import os +from importlib import import_module + +import numpy as np +import torch +from alfred import constants +from alfred.utils import metric_util +from torch.nn import functional as F + +from teach.logger import create_logger + +logger = create_logger(__name__, level=logging.INFO) + + +def adjust_lr(args, epoch, schedulers): + """ + adjust optimizer learning rate w.r.t the schedulers + """ + if epoch >= args.lr["warmup_epoch"]: + schedulers["base"].step() + else: + schedulers["warmup"].step() + + +def create_optimizer_and_schedulers(first_epoch, args, parameters, optimizer=None): + """ + create a scheduler for the learning rate + """ + # create an optimizer if it was not provided + init_lr = args.lr["init"] * args.lr["warmup_scale"] + if args.lr["warmup_scale"] != 1: 
+        assert args.lr["warmup_epoch"] > 0
+    if optimizer is None:
+        assert args.optimizer in ("adam", "adamw")
+        OptimizerClass = torch.optim.Adam if args.optimizer == "adam" else torch.optim.AdamW
+        optimizer = OptimizerClass(parameters, lr=init_lr, weight_decay=args.weight_decay)
+    else:
+        for param_group in optimizer.param_groups:
+            param_group["lr"] = init_lr
+
+    # create a learning rate scheduler
+    assert args.lr["profile"] in ("linear", "cosine", "triangular", "triangular2")
+    if args.lr["profile"] == "linear":
+        lr_scheduler = torch.optim.lr_scheduler.StepLR(
+            optimizer, gamma=args.lr["decay_scale"], step_size=args.lr["decay_epoch"]
+        )
+    elif args.lr["profile"] == "cosine":
+        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
+            optimizer,
+            T_max=(args.epochs - args.lr["warmup_epoch"] - 1),
+            eta_min=args.lr["final"],
+        )
+    else:
+        assert min(args.lr["cycle_epoch_up"], args.lr["cycle_epoch_down"]) > 0
+        lr_scheduler = torch.optim.lr_scheduler.CyclicLR(
+            optimizer,
+            base_lr=args.lr["init"],
+            max_lr=args.lr["final"],
+            step_size_up=args.lr["cycle_epoch_up"],
+            step_size_down=args.lr["cycle_epoch_down"],
+            mode=args.lr["profile"],
+            cycle_momentum=False,
+        )
+
+    # create a learning rate scheduler for the warmup period
+    warmup_scheduler = None
+    if args.lr["warmup_epoch"]:
+        warmup_scheduler = torch.optim.lr_scheduler.ExponentialLR(
+            optimizer,
+            gamma=(1 / args.lr["warmup_scale"] ** (1 / args.lr["warmup_epoch"])),
+        )
+
+    # if we are not starting from the first epoch, fast-forward the schedulers
+    for epoch in range(first_epoch):
+        if epoch >= args.lr["warmup_epoch"]:
+            lr_scheduler.step()
+        else:
+            warmup_scheduler.step()
+    return optimizer, {"base": lr_scheduler, "warmup": warmup_scheduler}
+
+
+def load_model(fsave, device, check_epoch=None, for_inference=False):
+    """
+    load a .pth model checkpoint from disk
+    """
+    logger.info("Loading from {} to {}".format(fsave, device))
+    save = torch.load(fsave, map_location=device)
+    LearnedModel = import_module("alfred.model.learned").LearnedModel
+    save["args"]["model_dir"] = os.path.dirname(fsave)
+    model = LearnedModel(save["args"], save["embs_ann"], save["vocab_out"], for_inference)
+    model.load_state_dict(save["model"])
+    OptimizerClass = torch.optim.Adam if save["args"].optimizer == "adam" else torch.optim.AdamW
+    optimizer = OptimizerClass(model.parameters(), lr=1e-3, weight_decay=save["args"].weight_decay)
+    optimizer.load_state_dict(save["optim"])
+    if check_epoch:
+        assert save["metric"]["epoch"] == check_epoch, "Epochs in info.json and latest.pth do not match"
+    model = model.to(torch.device(device))
+    optimizer_to(optimizer, torch.device(device))
+    return model, optimizer
+
+
+def load_model_args(fsave):
+    """
+    load the model's args from disk
+    """
+    save = torch.load(fsave, map_location=lambda storage, loc: storage)
+    return save["args"]
+
+
+def save_model(model, model_name, stats, optimizer=None, symlink=False):
+    """
+    save the model to args.dout/model_name or create a symlink from the latest model to args.dout/model_name
+    """
+    save_path = os.path.join(model.args.dout, model_name)
+    if not symlink:
+        # nn.DataParallel-related renaming
+        state_dict = {key.replace("model.module.", "model."): value for key, value in model.state_dict().items()}
+        assert optimizer is not None
+        torch.save(
+            {
+                "metric": stats,
+                "model": state_dict,
+                "optim": optimizer.state_dict(),
+                "args": model.args,
+                "vocab_out": model.vocab_out,
+                "embs_ann": model.embs_ann,
+            },
+            save_path,
+        )
+    else:
+        # create a symlink to the last saved model
+        model_path = os.path.join(model.args.dout, "model_{:02d}.pth".format(stats["epoch"]))
+        if os.path.islink(save_path):
+            os.unlink(save_path)
+        os.symlink(model_path, save_path)
+
+
+def tensorboard(writer, metrics, split, iter, frequency, batch_size):
+    if (iter // batch_size) % frequency == 0:
+        for metric_name, metric_value_list in metrics.items():
+            metric_value = np.mean(metric_value_list[-frequency:])
+            writer.add_scalar("{}/{}".format(split, metric_name), metric_value, iter)
+
+
+def save_log(dout, progress, total, stage, **kwargs):
+    """
+    append a progress entry (stage, progress, total) to info.json for job monitoring
+    """
+    info_path = os.path.join(dout, "info.json")
+    info_dicts = []
+    if os.path.exists(info_path):
+        with open(info_path, "r") as f:
+            info_dicts = json.load(f)
+    info_dict = {"stage": stage, "progress": progress, "total": total}
+    info_dict.update(kwargs)
+    info_dicts.append(info_dict)
+    with open(info_path, "w") as f:
+        json.dump(info_dicts, f)
+
+
+def load_log(dout, stage):
+    """
+    load the latest info.json entry for the given stage to continue training from the correct place
+    """
+    info_path = os.path.join(dout, "info.json")
+    if os.path.exists(info_path):
+        with open(info_path) as f:
+            info_dicts = json.load(f)
+        info_dict = [el for el in info_dicts if el["stage"] == stage][-1]
+    else:
+        info_dict = {"progress": 0, "best_loss": {}, "iters": {}}
+    if isinstance(info_dict["best_loss"], dict):
+        info_dict["best_loss"] = collections.defaultdict(lambda: 1e10, info_dict["best_loss"])
+    if isinstance(info_dict["iters"], dict):
+        info_dict["iters"] = collections.defaultdict(lambda: 0, info_dict["iters"])
+    return info_dict
+
+
+def update_log(dout, stage, update, **kwargs):
+    """
+    update the latest info.json entry for the given stage
+    """
+    assert update in ("increase", "rewrite")
+    info_path = os.path.join(dout, "info.json")
+    assert os.path.exists(info_path)
+    with open(info_path) as f:
+        info_dicts = json.load(f)
+    info_dict = copy.deepcopy([el for el in info_dicts if el["stage"] == stage][-1])
+    # update the values
+    for key, value in kwargs.items():
+        assert key in info_dict
+        new_value = value + info_dict[key] if update == "increase" else value
+        info_dict[key] = new_value
+    # decide what to do with the list with updated values
+    if info_dicts[-1]["stage"] == stage:
+        # rewrite the values
+        info_dicts[-1] = info_dict
+    else:
+        # append a new list element
+        info_dicts.append(info_dict)
+    # dump to the disk
+    with open(info_path, "w") as f:
+        json.dump(info_dicts, f)
+
+
+def triangular_mask(size, device, diagonal_shift=1):
+    """
+    generate an upper-triangular mask with -inf above the (shifted) diagonal and 0 elsewhere
+    """
+    square = torch.triu(torch.ones(size, size, device=device), diagonal=diagonal_shift)
+    square = square.masked_fill(square == 1.0, float("-inf"))
+    return square
+
+
+def generate_attention_mask(len_lang, len_frames, device, num_input_actions=0):
+    """
+    generate mask for attention (a timestep at t does not attend to timesteps after t)"""
+    # 1. language should attend only to language
+    lang_to_lang = torch.zeros((len_lang, len_lang), device=device).float()
+    lang_to_rest = torch.ones((len_lang, len_frames * 2), device=device).float() * float("-inf")
+    lang_to_all = torch.cat((lang_to_lang, lang_to_rest), dim=1)
+    # 2.1 frames should attend to all language tokens
+    frames_to_lang = torch.zeros((len_frames, len_lang), device=device).float()
+    # 2.2 frames should attend to frames with timestep <= t
+    frames_to_frames = triangular_mask(len_frames, device)
+    # 2.3 frames should attend to actions with timestep < t; first make all actions invisible
+    frames_to_actions = torch.ones((len_frames, len_frames), device=device).float() * float("-inf")
+    # 2.3 then unmask `num_input_actions` previous actions for each frame (excluding index t)
+    for a_idx in range(num_input_actions):
+        for f_idx in range(len_frames):
+            if f_idx - 1 - a_idx < 0:
+                # the index is out of bounds
+                continue
+            frames_to_actions[f_idx, f_idx - 1 - a_idx] = 0.0
+    frames_to_all = torch.cat((frames_to_lang, frames_to_frames, frames_to_actions), dim=1)
+    # 3. actions should attend to the same indices as frames
+    actions_to_all = frames_to_all.clone()
+    # 4. concatenate all the masks
+    all_to_all = torch.cat((lang_to_all, frames_to_all, actions_to_all), dim=0)
+    return all_to_all
+
+
+def process_prediction(action, objects, pad, vocab_action, clean_special_tokens, predict_object=True):
+    """
+    process a single trajectory, return it as a dict
+    """
+    # remove padding tokens
+    if pad in action:
+        pad_start_idx = action.index(pad)
+        action = action[:pad_start_idx]
+        objects = objects[:pad_start_idx]
+    if clean_special_tokens:
+        # remove everything from the stop token onwards
+        stop_token = vocab_action.word2index("Stop")
+        if stop_token in action:
+            stop_start_idx = action.index(stop_token)
+            action = action[:stop_start_idx]
+            objects = objects[:stop_start_idx]
+    # index to API actions
+    words = vocab_action.index2word(action)
+
+    if predict_object:
+        pred_object = objects[None].max(2)[1].cpu().numpy()
+    else:
+        pred_object = None
+    pred_processed = {
+        "action": " ".join(words),
+        "object": pred_object,
+    }
+    return pred_processed
+
+
+def extract_action_preds(model_out, pad, vocab_action, clean_special_tokens=True, lang_only=False):
+    """
+    output processing for a VLN agent
+    """
+    zipped_data = zip(model_out["action"].max(2)[1].tolist(), model_out["object"])
+    predict_object = not lang_only
+    preds_list = [
+        process_prediction(action, objects, pad, vocab_action, clean_special_tokens, predict_object)
+        for action, objects in zipped_data
+    ]
+    return preds_list
+
+
+def compute_f1_and_exact(metrics, preds, labels, loss_key):
+    """
+    compute F1 and exact match scores for agent output
+    """
+    m = collections.defaultdict(list)
+    for pred_str, label_str in zip(preds, labels):
+        pred_list, label_list = pred_str.lower().split(" "), label_str.lower().split(" ")
+        # compute f1 score for the full sequence of actions
+        m["{}/f1".format(loss_key)].append(metric_util.compute_f1(label_str, pred_str))
+        # compute exact matching for each timestep individually
+        for pred_action, label_action in zip(pred_list, label_list):
+            m["{}/exact".format(loss_key)].append(metric_util.compute_exact(label_action, pred_action))
+    m_averaged = {k: sum(v) / len(v) for k, v in m.items()}
+    for k, v in m_averaged.items():
+        metrics[k].append(v)
+
+
+def compute_obj_class_precision(metrics, gt_dict, classes_out, compute_train_loss_over_history):
+    """
+    compute precision of predictions for interaction object classes
+    """
+    if len(gt_dict["object"]) > 0:
+        if compute_train_loss_over_history:
+            interact_idxs = torch.nonzero(gt_dict["obj_interaction_action"])
+        else:
+            interact_idxs = torch.nonzero(gt_dict["driver_actions_pred_mask"] * gt_dict["obj_interaction_action"])
+        obj_classes_prob = classes_out[tuple(interact_idxs.T)]
+        obj_classes_pred = obj_classes_prob.max(1)[1]
+        obj_classes_gt = torch.cat(gt_dict["object"], dim=0)
+        precision = torch.sum(obj_classes_pred == obj_classes_gt) / len(obj_classes_gt)
+        metrics["action/object"].append(precision.item())
+    else:
metrics["action/object"].append(0.0) + + +def obj_classes_loss(pred_obj_cls, gt_obj_cls, interact_idxs): + """ + Compute a cross-entropy loss for the object class predictions. + """ + pred_obj_cls_inter = pred_obj_cls[interact_idxs] + # the interaction objects should be non zeros + assert not (gt_obj_cls == 0).any() + # compute the loss for interaction objects + obj_cls_loss = F.cross_entropy(pred_obj_cls_inter, gt_obj_cls, reduction="mean") + return obj_cls_loss + + +def tokens_to_lang(tokens, vocab, skip_tokens=None, join=True): + """ + convert tokens into human-readable words + """ + if skip_tokens is None: + skip_tokens = {} + + def _tokens_to_lang(seq): + if isinstance(seq, torch.Tensor): + seq = seq.tolist() + lang = [vocab.index2word(t) for t in seq if t not in skip_tokens] + lang = " ".join(lang) if join else lang + return lang + + if isinstance(tokens[0], int): + # a list of ints is provided, only one sequence + output = _tokens_to_lang(tokens) + else: + # a list of lists is provided, several sequences + output = [_tokens_to_lang(seq) for seq in tokens] + return output + + +def translate_to_vocab(tokens, vocab, vocab_translate, skip_new_tokens=False): + """ + translate tokens from orig vocab to translate vocab + """ + if vocab_translate.contains_same_content(vocab): + return tokens + lang_orig = tokens_to_lang(tokens, vocab, join=False) + tokens_new = [] + for word in lang_orig: + if skip_new_tokens and word not in vocab_translate.counts: + word = "<>" + tokens_new.append(vocab_translate.word2index(word)) + if not skip_new_tokens: + lang_new = tokens_to_lang(tokens_new, vocab_translate, join=False) + assert lang_orig == lang_new + return tokens_new + + +def last_model_path(exp_name): + """ + get path of the last model in the exp + """ + model_path = os.path.join(constants.ET_LOGS, exp_name, "latest.pth") + assert os.path.islink(model_path) + return model_path + + +def optimizer_to(optim, device): + for param in optim.state.values(): + # Not sure there are any global tensors in the state dict + if isinstance(param, torch.Tensor): + param.data = param.data.to(device) + if param._grad is not None: + param._grad.data = param._grad.data.to(device) + elif isinstance(param, dict): + for subparam in param.values(): + if isinstance(subparam, torch.Tensor): + subparam.data = subparam.data.to(device) + if subparam._grad is not None: + subparam._grad.data = subparam._grad.data.to(device) diff --git a/src/teach/modeling/ET/data/.gitkeep b/src/teach/modeling/ET/data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/teach/modeling/ET/files/human.vocab b/src/teach/modeling/ET/files/human.vocab new file mode 100644 index 0000000..55b095a Binary files /dev/null and b/src/teach/modeling/ET/files/human.vocab differ diff --git a/src/teach/modeling/ET/files/obj_cls.vocab b/src/teach/modeling/ET/files/obj_cls.vocab new file mode 100644 index 0000000..771cc98 Binary files /dev/null and b/src/teach/modeling/ET/files/obj_cls.vocab differ diff --git a/src/teach/modeling/ET/files/overview.png b/src/teach/modeling/ET/files/overview.png new file mode 100644 index 0000000..a159582 Binary files /dev/null and b/src/teach/modeling/ET/files/overview.png differ diff --git a/src/teach/modeling/ET/files/synth.vocab b/src/teach/modeling/ET/files/synth.vocab new file mode 100644 index 0000000..01339bc Binary files /dev/null and b/src/teach/modeling/ET/files/synth.vocab differ diff --git a/src/teach/modeling/ET/logs/.gitkeep b/src/teach/modeling/ET/logs/.gitkeep new file mode 100644 index 
0000000..e69de29 diff --git a/src/teach/modeling/__init__.py b/src/teach/modeling/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/teach/replay/episode_replay.py b/src/teach/replay/episode_replay.py index 9097dc7..2f0a1d5 100644 --- a/src/teach/replay/episode_replay.py +++ b/src/teach/replay/episode_replay.py @@ -363,6 +363,7 @@ def _add_interaction(self, idx, interact_oid, logged_success): def _set_up_new_episode(self, obs_dir, turn_on_lights, task=None): api_success = True + self.simulator.reset_stored_data() logger.info("Starting episode...") self.simulator.start_new_episode( world=self.episode.world, diff --git a/src/teach/simulators/simulator_THOR.py b/src/teach/simulators/simulator_THOR.py index aed9ff0..8e95fff 100644 --- a/src/teach/simulators/simulator_THOR.py +++ b/src/teach/simulators/simulator_THOR.py @@ -424,6 +424,24 @@ def start_new_episode( custom_object_metadata=self.__custom_object_metadata, ) + def save(self, file_name=None): + """ + Save the session using the current state as the final simulator state. This does not shut down the simulator. + Call done() instead if simulator should be shut down after this + :param file_name: If file_name is not None, the simulator session is saved in the same format as original games + """ + # Add final state to log. + state = self.get_scene_object_locs_and_states() + self.current_episode.final_state = Initialization( + time_start=time.time() - self.start_time, + agents=state["agents"], + objects=state["objects"], + custom_object_metadata=self.__custom_object_metadata, + ) + + # Save log file + super().save(file_name=file_name) + def done(self, file_name=None): """ Shut down the simulator and save the session with final simulator state; Should be called at end of collection/ @@ -1806,20 +1824,24 @@ def __update_custom_coffee_prop(self, event, objs_before_event=None): reliability and checks that a container just got placed in a coffee maker and the coffee maker was on """ cur_objects = self.get_objects(event) - coffee_makers = [obj for obj in cur_objects if "CoffeeMachine" in obj["objectType"]] - coffee_maker_ids = set([obj["objectId"] for obj in coffee_makers]) + coffee_maker_ids = set( + [obj["objectId"] for obj in cur_objects if "CoffeeMachine" in obj["objectType"] and obj["isToggled"]] + ) for obj in cur_objects: + prev_filled_with_liquid = False if objs_before_event is not None: prev_state = self.__get_object_by_id(objs_before_event, obj["objectId"]) - else: - prev_state = None + if prev_state: + prev_filled_with_liquid = prev_state["isFilledWithLiquid"] parent_receptacles = self.get_parent_receptacles(obj, cur_objects) + placed_in_toggled_coffee_maker = False + if parent_receptacles is not None and len(set(parent_receptacles).intersection(coffee_maker_ids)) > 0: + placed_in_toggled_coffee_maker = True if ( - parent_receptacles is not None - and len(set(parent_receptacles).intersection(coffee_maker_ids)) > 0 + placed_in_toggled_coffee_maker and obj["canFillWithLiquid"] and obj["isFilledWithLiquid"] - and (prev_state is None or not prev_state["isFilledWithLiquid"]) + and not prev_filled_with_liquid ): self.__update_custom_object_metadata(obj["objectId"], "simbotIsFilledWithCoffee", True) @@ -1852,13 +1874,15 @@ def __update_sink_interaction_outcomes(self, event): for child_obj in objs_in_sink: if child_obj["isDirty"]: - ac = dict(action="CleanObject", objectId=child_obj["objectId"]) + ac = dict(action="CleanObject", objectId=child_obj["objectId"], forceAction=True) if debug_print_all_sim_steps: 
logger.info("step %s", ac) self.controller.step(ac) if child_obj["canFillWithLiquid"]: - ac = dict(action="FillObjectWithLiquid", objectId=child_obj["objectId"], fillLiquid="water") + ac = dict( + action="FillObjectWithLiquid", objectId=child_obj["objectId"], fillLiquid="water", forceAction=True + ) if debug_print_all_sim_steps: logger.info("step %s", ac) self.controller.step(ac) diff --git a/src/teach/simulators/simulator_base.py b/src/teach/simulators/simulator_base.py index 65c9855..d6821f6 100644 --- a/src/teach/simulators/simulator_base.py +++ b/src/teach/simulators/simulator_base.py @@ -111,6 +111,17 @@ def set_task_by_id(self, task_id: int, task_params=None, comments=""): def set_task_by_name(self, task_name: str, task_params=None, comments=""): raise NotImplementedError("Derived class must implement this!") + def reset_stored_data(self): + """ + This removes data of previous tasks / episodes from the simulator object and should be used with caution + This should precede calls to start_new_episode() and set_task() to ensure that a future call to save() or done() + will save session data properly. + """ + logger.info("Resetting dataset object and removing previously stored episodes...") + task_type = self._dataset.task_type + comments = self._dataset.comments + self._dataset = Dataset(task_type=task_type, definitions=None, comments=comments, version="2.0") + def start_new_episode( self, world=None, diff --git a/src/teach/utils.py b/src/teach/utils.py index e7c4466..618ecba 100644 --- a/src/teach/utils.py +++ b/src/teach/utils.py @@ -4,9 +4,11 @@ import copy import json +import os from pathlib import Path import numpy as np +from PIL import Image from teach.dataset.task_THOR import Task_THOR from teach.logger import create_logger @@ -378,3 +380,19 @@ def dynamically_load_class(package_path, class_name): module = __import__(package_path, fromlist=[class_name]) klass = getattr(module, class_name) return klass + + +def load_images(image_dir, image_file_names): + images = list() + if not image_file_names: + return images + if not os.path.exists(image_dir): + raise Exception(f"{image_dir} doesn't exist") + for f in image_file_names: + image_file = os.path.join(image_dir, f) + if not os.path.exists(image_file): + continue + image_orig = Image.open(image_file) + images.append(image_orig.copy()) + image_orig.close() + return images