In [None]:
!pip install peft
!pip install pycocoevalcap
!pip install pycocotools

Collecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/251.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m245.8/251.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/302.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.13.0->

In [None]:
# Work from the DriveLM project directory.
# NOTE(review): the path assumes Google Drive is already mounted at /content/drive — no mount call is visible here; confirm.
%cd '/content/drive/MyDrive/ColabNotebooks/EM-VLM4AD/DriveLM/'

/content/drive/MyDrive/ColabNotebooks/EM-VLM4AD/DriveLM


In [None]:
from torch.utils.data import Dataset
from torchvision.io import read_image
from torchvision import transforms
import torch
import json
import os
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap
import os
from collections import namedtuple
from tqdm import tqdm as progress_bar
from transformers import T5Tokenizer, T5ForConditionalGeneration
from peft import LoraConfig, get_peft_model, LoftQConfig
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.models import vit_b_32
import json
import pandas as pd

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

class MultiFrameDataset(Dataset):
    """
    Question-answering dataset whose samples carry several images each.

    The JSON file is expected to hold a list of ``[qa, img_path]`` pairs:
    ``qa`` is a mapping with the keys ``'Q'`` and ``'A'`` and ``img_path`` is a
    mapping whose values are image file paths (presumably one per camera view
    -- confirm against the data file).
    """

    def __init__(self, input_file, tokenizer, transform=None):
        """
        :param input_file: path of the JSON file holding the samples
        :param tokenizer: callable used by the collate functions; it is called as
                          ``tokenizer(texts, padding=True, return_tensors='pt')`` and
                          must return an object exposing ``input_ids``
        :param transform: optional callable applied to every float image tensor;
                          when None the images are used exactly as read
        """
        with open(input_file) as f:
            self.data = json.load(f)

        self.tokenizer = tokenizer
        self.transform = transform

    def __len__(self):
        """Number of question/answer samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """
        :param idx: index of the sample
        :return: (prompt text, stacked image tensor, answer text, sorted image paths)
        """
        # Get the question and answer at the idx
        qa, img_path = self.data[idx]
        img_paths = list(img_path.values())

        q_text = f"Question: {qa['Q']} Answer:"
        a_text = qa['A']

        # Load every view, in the order stored in the file, and stack them
        # along a new leading dimension
        imgs = []
        for path in img_paths:
            img = read_image(path).float()
            # The transform is optional (its default is None), so only apply
            # it when one was supplied instead of calling None
            if self.transform is not None:
                img = self.transform(img)
            imgs.append(img.to(device))
        imgs = torch.stack(imgs, dim=0)

        # The returned paths are sorted, the images above are not reordered
        return q_text, imgs, a_text, sorted(img_paths)

    def _collate(self, batch):
        """
        Shared batching logic of both public collate functions.

        :param batch: list of tuples produced by ``__getitem__``
        :return: (question texts, question ids, images, answer ids, image paths)
        """
        q_texts, imgs, a_texts, img_paths = zip(*batch)

        imgs = torch.stack(list(imgs), dim=0)
        encodings = self.tokenizer(q_texts, padding=True, return_tensors="pt").input_ids.to(device)
        labels = self.tokenizer(a_texts, padding=True, return_tensors='pt').input_ids.to(device)

        return q_texts, encodings, imgs, labels, list(img_paths)

    def collate_fn(self, batch):
        """
        Collate function for training.

        :return: (question ids, images, answer ids)
        """
        _, encodings, imgs, labels, _ = self._collate(batch)
        return encodings, imgs, labels

    def collate_fn_test(self, batch):
        """
        Collate function for evaluation; additionally returns the raw question
        texts and the image paths of every sample.

        :return: (question texts, question ids, images, answer ids, image paths)
        """
        return self._collate(batch)

In [None]:
# Sizes of the ViT token grid; their product is the input width of the
# gated-pooling layers defined further down in this cell.
# Presumably 768 is the per-token embedding width and 49 the number of patch
# tokens of torchvision's vit_b_32 for 224x224 inputs (7 x 7 patches) — TODO confirm.
VIT_HIDDEN_STATE = 768
VIT_SEQ_LENGTH = 49

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model

    :param model: object exposing ``named_parameters()``, which yields
                  ``(name, param)`` pairs where ``param`` has ``numel()`` and
                  ``requires_grad``
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # A model without any parameter would otherwise raise ZeroDivisionError
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0

    print(
        f"Trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )



class DriveVLMT5(nn.Module):
    """
    T5 text model combined with a multi-view image front end.

    ``forward`` and ``generate`` both build a joint text + image embedding
    sequence with ``self.mvp`` and hand it to the T5 model as ``inputs_embeds``.
    """

    def __init__(self, config):
        """
        :param config: object providing ``lm`` (``'T5-Base'`` selects t5-base, any
                       other value selects t5-large wrapped with LoRA) and
                       ``gpa_hidden_size`` (width of the gated-pooling layers)
        """

        super().__init__()

        # Load the pretrained T5 text model (the tokenizer is created by the caller)
        if config.lm == 'T5-Base':
            self.model = T5ForConditionalGeneration.from_pretrained('google-t5/t5-base')
        else:
            self.model = T5ForConditionalGeneration.from_pretrained('google-t5/t5-large')

            # For quantization
            # NOTE(review): peft documents loftq_config as taking effect only together with
            # init_lora_weights='loftq', which is not set below — confirm whether this is intended.
            loftq_config = LoftQConfig(loftq_bits=8)
            # Create LoRA model (adapters on the 'q' and 'v' modules)
            lora_config = LoraConfig(
                r=64,
                lora_alpha=32,
                loftq_config=loftq_config,
                lora_dropout=0.05,
                bias='none',
                target_modules=['q', 'v']
            )
            self.model = get_peft_model(self.model, lora_config)

        # Embedding width of the text model; image embeddings must end up with the same width
        hidden_size = self.model.config.d_model

        print('Trainable Parameters for LM model:')
        print_trainable_parameters(self.model)

        # Create instance for multi-view processor (image encoder weights are frozen)
        self.mvp = self.MultiViewProcessor(config.gpa_hidden_size, hidden_size, config.lm, freeze=True)

    class MultiViewProcessor(nn.Module):
        """
        Turns the images of one sample into a single sequence of image tokens
        (gated-pooling attention across the views) and concatenates it with the
        text token embeddings.
        """

        def __init__(self, gpa_hidden_size, hidden_size, lm, freeze=False):
            """
            :param gpa_hidden_size: output width of the Z and G gating layers
            :param hidden_size: embedding width of the text model
            :param lm: language model name; any value other than 'T5-Base' adds a
                       projection from VIT_HIDDEN_STATE to hidden_size
            :param freeze: when True, the ViT parameters do not receive gradients
            """

            super().__init__()

            # Use ViT for image embeddings
            self.img_model = vit_b_32(weights='DEFAULT')
            self.lm = lm

            # Modal embedding to distinguish between image and text
            # (index 0 is added to text tokens, index 1 to image tokens)
            self.modal_embeddings = nn.Embedding(2, hidden_size)
            self.modal_embeddings.weight.data.normal_(mean=0.0, std=0.02)

            # Optionally freeze the ViT image encoder
            if freeze:
                for param in self.img_model.parameters():
                    param.requires_grad = False

            # Set matrices based on MIVC paper
            # Z (tanh) and G (sigmoid) act on one flattened view; w maps their product to a scalar score
            self.w = nn.Linear(in_features=gpa_hidden_size, out_features=1)
            self.Z = nn.Sequential(
                nn.Linear(in_features=VIT_HIDDEN_STATE * VIT_SEQ_LENGTH, out_features=gpa_hidden_size, bias=False),
                nn.Tanh()
            )
            self.G = nn.Sequential(
                nn.Linear(in_features=VIT_HIDDEN_STATE * VIT_SEQ_LENGTH, out_features=gpa_hidden_size, bias=False),
                nn.Sigmoid()
            )

            # Only created for non-'T5-Base' models
            # (presumably because t5-base already has d_model == VIT_HIDDEN_STATE — confirm)
            if self.lm != 'T5-Base':
              self.img_projection_layer = nn.Linear(in_features=VIT_HIDDEN_STATE, out_features=hidden_size)


        def gpa(self, img_embeddings):

            """
            Fuses the views of one sample with gated-pooling attention
            :param img_embeddings: 2D tensor with one row per view; every row is a flattened
                                   embedding of length VIT_SEQ_LENGTH * VIT_HIDDEN_STATE
            :return single fused embedding of length VIT_SEQ_LENGTH * VIT_HIDDEN_STATE
            """

            # Get weights for gated pooling attention (softmax over the view dimension)
            gpa_weights = torch.softmax(self.w(self.Z(img_embeddings) * self.G(img_embeddings)), dim=0  )

            # Take a linear combination of all the image embeddings
            fused_embeddings = torch.sum(gpa_weights * img_embeddings, dim=0)

            return fused_embeddings

        def get_img_embedding(self, imgs):
            """
            Builds the image token sequence for a batch.

            Only the patch projection, class token and positional embedding of the
            ViT are used here; its transformer encoder layers are not called.

            :param imgs: batch of stacked views; assumes (N, V, C, height, width) as
                         produced by the dataset's collate functions — TODO confirm
            :return: tensor of shape (N, VIT_SEQ_LENGTH, H)
            """

            N = imgs.shape[0]

            # Process into patches (N x V x 49 x 768), one sample (V views) at a time
            merged_embedding = torch.stack([self.img_model._process_input(img) for img in imgs], dim=0)

            # Concatenate the batch class tokens -> (N, V, 50, 768)
            batch_class_tokens = self.img_model.class_token.expand(merged_embedding.shape[1], -1, -1).repeat(N, 1, 1, 1)
            merged_embedding = torch.cat([batch_class_tokens, merged_embedding], dim=2)

            # Add positional embeddings and remove class token -> (N, V, 49, 768)
            merged_embedding += self.img_model.encoder.pos_embedding.repeat(N, 1, 1, 1)
            merged_embedding = merged_embedding[:, :, 1:]

            # Fuse the V views of every sample and reshape back to a token sequence -> (N, 49, 768)
            merged_embedding = torch.stack([self.gpa(embedding.flatten(start_dim=1)).reshape(VIT_SEQ_LENGTH,
                                            VIT_HIDDEN_STATE) for embedding in merged_embedding], dim=0)

            # Project from the ViT width to the text model width -> (N, 49, H); skipped for 'T5-Base'
            if self.lm != 'T5-Base':
              merged_embedding = self.img_projection_layer(merged_embedding)

            # Add modal type embedding (index 1 = image) to merged embedding
            merged_embedding += self.modal_embeddings(
                torch.ones((1, merged_embedding.shape[1]), dtype=torch.int, device=device))

            return merged_embedding

        def forward(self, text_enc, imgs, text_model):
            """
            :param text_enc: token ids of the questions (N x S)
            :param imgs: batch of stacked views, see get_img_embedding
            :param text_model: model whose input embedding layer embeds text_enc
            :return: text embeddings followed by image embeddings, joined along dim 1
            """

            # Get the image embeddings (N x 49 x H)
            imgs_embedding = self.get_img_embedding(imgs)

            # Get the text embeddings (N x S x H)
            text_embeddings = text_model.get_input_embeddings()(text_enc)

            # Add modal embeddings (index 0 = text) to text
            text_embeddings += self.modal_embeddings(torch.zeros((1, text_embeddings.shape[1]), dtype=torch.int,
                                                                 device=device))

            # Concatenate embeddings along the sequence dimension -> (N x (S + 49) x H)
            merged_embedding = torch.cat([text_embeddings, imgs_embedding], dim=1)

            return merged_embedding

    def forward(self, text_enc, imgs, labels=None):
        """
        :param text_enc: token ids of the questions
        :param imgs: batch of stacked views
        :param labels: optional answer token ids, forwarded unchanged to the T5 model
        :return: the output of the T5 model
        """

        # Get the merged embeddings
        merged_embedding = self.mvp(text_enc, imgs, self.model)

        # If training include the labels
        # NOTE(review): no attention_mask is passed here, so padded text positions are not masked — confirm this is intended.
        return self.model(inputs_embeds=merged_embedding, labels=labels)

    def generate(self, text_enc, imgs, lidar=None):
        """
        :param text_enc: token ids of the questions
        :param imgs: batch of stacked views
        :param lidar: accepted but not used
        :return: generated token ids, one row per sample
        """

        merged_embedding = self.mvp(text_enc, imgs, self.model)

        # Attend to every text and image position; decoding starts from the model's decoder start token
        attention_mask = torch.ones(merged_embedding.shape[:2], dtype=torch.long, device=device)
        decoder_input_ids = torch.ones((merged_embedding.shape[0], 1), dtype=torch.long, device=device)*self.model.config.decoder_start_token_id
        output_ids = self.model.generate(attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, inputs_embeds=merged_embedding, max_length=512, early_stopping=True)

        return output_ids

In [None]:
# Evaluation settings:
#   batch_size      - batch size handed to the DataLoader
#   gpa_hidden_size - width of the gated-pooling layers of the model
#   model_name      - name of the results sub-directory (checkpoint, predictions, metrics)
#   lm              - 'T5-Base' selects t5-base, any other value t5-large
Config = namedtuple('Instance', 'batch_size gpa_hidden_size model_name lm')

config = Config(batch_size=16, gpa_hidden_size=128, model_name='T5-Large-Q', lm='T5-Large')

In [None]:
def val_model(dloader):
    """
    Generates an answer for every batch of the dataloader and writes the
    de-duplicated predictions to predictions.json in the results directory

    Relies on the globals ``model``, ``processor``, ``image_id_dict`` and ``config``.

    :param dloader: dataloader yielding (q_texts, encodings, imgs, labels, img_paths) batches
    """

    model.eval()
    seen_ids = set()
    predictions = []

    with torch.no_grad():
        for batch_idx, batch in progress_bar(enumerate(dloader), total=len(dloader)):
            q_texts, encodings, imgs, labels, img_paths = batch

            # Generate the answer token ids and decode them to text
            generated = model.generate(encodings, imgs)
            text_outputs = [processor.decode(ids, skip_special_tokens=True) for ids in generated]

            # Show a sample of questions and answers every 100 batches
            if batch_idx % 100 == 0:
                print(q_texts)
                print(text_outputs)

            for paths, question, answer in zip(img_paths, q_texts, text_outputs):
                # The id is looked up by the first image path plus the question text
                image_id = image_id_dict[paths[0] + ' ' + question][0]

                # Keep only the first answer for every id
                if image_id in seen_ids:
                    continue

                seen_ids.add(image_id)
                predictions.append({'image_id': image_id, 'caption': answer})

    # Save test output to file
    out_path = os.path.join('/content/drive/MyDrive/ColabNotebooks/EM-VLM4AD', 'DriveLM', 'multi_frame_results', config.model_name, 'predictions.json')
    with open(out_path, 'w') as f:
        json.dump(predictions, f)


def save_experiment():
    """
    Saves the evaluation metrics to a csv

    Reads the scores from the global ``coco_eval.eval`` and writes them as a
    single row (one column per metric) to metrics.csv in the results directory
    of the global ``config.model_name``.
    """

    # One single-element column per metric
    metrics_row = {metric: [score] for metric, score in coco_eval.eval.items()}

    csv_path = os.path.join('/content/drive/MyDrive/ColabNotebooks/EM-VLM4AD', 'DriveLM', 'multi_frame_results', config.model_name, 'metrics.csv')
    pd.DataFrame(metrics_row).to_csv(csv_path, index=False, header=True)

# Load processors and models
model = DriveVLMT5(config)
model.to(device)

if config.lm == 'T5-Base':
    processor = T5Tokenizer.from_pretrained('google-t5/t5-base')
else:
    processor = T5Tokenizer.from_pretrained('google-t5/t5-large')

processor.add_tokens('<')

# map_location keeps the load working when the checkpoint was saved on a GPU
# but this runtime fell back to the CPU
model.load_state_dict(torch.load(os.path.join('/content/drive/MyDrive/ColabNotebooks/EM-VLM4AD', 'DriveLM', 'multi_frame_results', config.model_name,
                                              'latest_model.pth'), map_location=device))

# Load dataset and dataloader
test_dset = MultiFrameDataset(
    input_file=os.path.join('/content/drive/MyDrive/ColabNotebooks/EM-VLM4AD', 'DriveLM',
                            'data', 'multi_frame',
                            'multi_frame_test.json'),
    tokenizer=processor,
    transform=transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.Normalize((127.5, 127.5, 127.5), (127.5, 127.5, 127.5))
    ])
)
# Evaluation must see every sample exactly once and in a fixed order:
# drop_last=True silently skipped the final partial batch (leaving annotated
# images without a prediction) and shuffle=True made the de-duplication in
# val_model depend on the random order.
test_dloader = DataLoader(test_dset, shuffle=False, batch_size=config.batch_size, drop_last=False, collate_fn=test_dset.collate_fn_test)

# Load in image ids
with open(os.path.join('/content/drive/MyDrive/ColabNotebooks/EM-VLM4AD', 'DriveLM', 'data', 'multi_frame', 'image_id.json')) as f:
    image_id_dict = json.load(f)

# Generate the predictions and write them to predictions.json
val_model(test_dloader)

annotation_file = os.path.join('/content/drive/MyDrive/ColabNotebooks/EM-VLM4AD', 'DriveLM', 'data', 'multi_frame', 'multi_frame_test_coco.json')
results_file = os.path.join('/content/drive/MyDrive/ColabNotebooks/EM-VLM4AD', 'DriveLM', 'multi_frame_results', config.model_name, 'predictions.json')

# create coco object and coco_result object
coco = COCO(annotation_file)
coco_result = coco.loadRes(results_file)

# create coco_eval object by taking coco and coco_result
coco_eval = COCOEvalCap(coco, coco_result)

# Score only the images that actually have a prediction. By default
# COCOEvalCap evaluates every annotated image id; ids without a prediction
# make the reference and result key sets differ, which is the likely cause of
# the AssertionError previously raised while computing the Bleu score.
predicted_ids = coco_result.getImgIds()
missing_ids = set(coco.getImgIds()) - set(predicted_ids)
if missing_ids:
    print(f'Warning: {len(missing_ids)} annotated image ids have no prediction and are excluded from scoring.')
coco_eval.params['image_id'] = predicted_ids

# evaluate results
# SPICE will take a few minutes the first time, but speeds up due to caching
coco_eval.evaluate()

# Save the experiment results
save_experiment()

Trainable Parameters for LM model:
Trainable params: 18874368 || all params: 756542464 || trainable%: 2.494819378704432


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  0%|          | 1/1051 [00:02<48:19,  2.76s/it]

('Question: Are <c1,CAM_BACK,1060.0,525.8> and <c3,CAM_FRONT_RIGHT,238.3,496.7> traffic signs? Answer:', 'Question: What does <c3,CAM_FRONT,805.8,750.9> mean? Answer:', 'Question: Are there moving pedestrians to the front left of the ego car? Answer:', 'Question: What is the observed status of object <c3,CAM_FRONT,872.3,518.1>? Answer:', 'Question: What is the future state of <c1,CAM_FRONT,896.7,496.7>? Answer:', 'Question: What is the status of the cars that are to the back left of the ego car? Answer:', 'Question: Is <c1,CAM_BACK,868.3,630.8> an object that the ego vehicle should consider in the current scene? Answer:', 'Question: Is <c1,CAM_FRONT_LEFT,1402.6,615.8> a traffic sign or a road barrier? Answer:', 'Question: Based on the observations of <c1,CAM_BACK,1148.3,522.5>, what are possible actions to be taken by <c2,CAM_BACK_LEFT,805.0,512.5>? What is the reason? Answer:', 'Question: Will <c2,CAM_FRONT_RIGHT,1203.3,598.3> be in the moving direction of <c5,CAM_BACK,569.2,468.3>? A

 10%|▉         | 101/1051 [12:44<1:04:20,  4.06s/it]

('Question: Predict the behavior of the ego vehicle. Answer:', 'Question: What is the observed status of object <c1,CAM_FRONT_RIGHT,654.2,298.1>? Answer:', 'Question: Would <c1,CAM_FRONT,745.8,460.0> be in the moving direction of the ego vehicle? Answer:', 'Question: What is the status of the motorcycle that is to the back of the ego car? Answer:', 'Question: In this scenario, what object is most likely to consider <c4,CAM_FRONT,836.8,645.0>? Answer:', 'Question: Is it necessary for the ego vehicle to take <c5,CAM_FRONT,765.8,302.3> into account? Answer:', 'Question: What is the status of the motorcycle that is to the back left of the ego car? Answer:', 'Question: What is the future state of <c1,CAM_BACK_RIGHT,965.8,564.2>? Answer:', 'Question: What is the future state of <c2,CAM_BACK,547.5,511.7>? Answer:', "Question: What's your comment on this scene? Answer:", "Question: What actions could the ego vehicle take based on <c1,CAM_FRONT,1583.3,499.2>? Why take this action and what's the

 19%|█▉        | 201/1051 [24:16<2:21:38, 10.00s/it]

('Question: What are the important objects in the current scene? Those objects will be considered for the future reasoning and driving decision. Answer:', 'Question: Would <c2,CAM_FRONT,727.5,461.7> be in the moving direction of the ego vehicle? Answer:', 'Question: Are there motorcycles with riders to the back left of the ego car? Answer:', 'Question: What object would consider <c2,CAM_FRONT_LEFT,887.6,417.9> to be most relevant to its decision? Answer:', 'Question: Will <c3,CAM_FRONT,951.2,694.2> be in the moving direction of <c1,CAM_FRONT,280.5,521.3>? Answer:', 'Question: What are objects to the front of the ego car? Answer:', 'Question: What is the future state of <c2,CAM_BACK_RIGHT,750.0,426.7>? Answer:', 'Question: What is the relative positioning of the important objects in the current scene? Answer:', 'Question: What is the probability of colliding with <c6,CAM_FRONT_RIGHT,539.2,545.0> after the ego vehicle decelerates and goes straight? Answer:', 'Question: What is the traffi

 29%|██▊       | 301/1051 [38:38<2:35:13, 12.42s/it]

('Question: Are <c2,CAM_FRONT_RIGHT,575.8,574.2> and <c1,CAM_BACK,855.0,522.5> traffic signs? Answer:', 'Question: What is the observed status of object <c3,CAM_FRONT,849.2,499.2>? Answer:', 'Question: What actions taken by the ego vehicle can lead to a collision with <c4,CAM_FRONT_LEFT,789.9,563.9>? Answer:', 'Question: Based on the observations of <c4,CAM_FRONT_RIGHT,370.8,553.3>, what are possible actions to be taken by <c1,CAM_FRONT_LEFT,918.3,546.7>? What is the reason? Answer:', "Question: What actions could the ego vehicle take based on <c3,CAM_FRONT,1291.7,602.5>? Why take this action and what's the probability? Answer:", 'Question: What actions taken by the ego vehicle can lead to a collision with <c4,CAM_FRONT,892.5,520.0>? Answer:', 'Question: What object would consider <c4,CAM_BACK,905.8,641.7> to be most relevant to its decision? Answer:', 'Question: What actions taken by the ego vehicle can lead to a collision with <c2,CAM_FRONT,860.8,479.2>? Answer:', 'Question: Would <c

 38%|███▊      | 401/1051 [53:57<1:26:48,  8.01s/it]

('Question: Based on the observations of <c2,CAM_FRONT,400.8,595.0>, what are possible actions to be taken by <c1,CAM_BACK,973.3,514.2>? What is the reason? Answer:', 'Question: Are there parked trucks to the back of the ego car? Answer:', 'Question: Will <c2,CAM_BACK,309.2,505.8> change its motion state based on <c4,CAM_BACK,807.5,525.8>? Answer:', 'Question: What is the status of the car that is to the back left of the ego car? Answer:', 'Question: Is there any traffic element in the front view? Answer:', 'Question: Is <c2,CAM_FRONT_LEFT,1088.3,558.3> a traffic sign or a road barrier? Answer:', 'Question: Is <c3,CAM_FRONT_RIGHT,1022.5,540.0> an object that the ego vehicle should consider in the current scene? Answer:', 'Question: Is <c5,CAM_FRONT,813.3,489.2> a traffic sign or a road barrier? Answer:', 'Question: What is the status of the pedestrians that are to the back of the ego car? Answer:', 'Question: Is <c4,CAM_FRONT,1036.7,671.7> an object that the ego vehicle should consider

 48%|████▊     | 501/1051 [1:08:13<1:09:52,  7.62s/it]

("Question: What actions could the ego vehicle take based on <c2,CAM_FRONT,1372.5,497.5>? Why take this action and what's the probability? Answer:", 'Question: What is the visual description of <c1,CAM_BACK,821.7,543.3>? Answer:', 'Question: What is the priority of the objects that the ego vehicle should consider?(in descending order) Answer:', 'Question: Which object is most likely to be occluded by <c3,CAM_BACK_RIGHT,79.9,632.8>? Would this object affect the ego vehicle? Based on this object, what action of the ego vehicle is dangerous? Answer:', 'Question: Are there parked cars to the front of the ego car? Answer:', 'Question: What are the important objects in the current scene? Those objects will be considered for the future reasoning and driving decision. Answer:', 'Question: What is the future state of <c2,CAM_FRONT_LEFT,992.6,596.9>? Answer:', "Question: What actions could the ego vehicle take based on <c3,CAM_FRONT,885.8,540.0>? Why take this action and what's the probability? 

 57%|█████▋    | 601/1051 [1:20:36<1:25:31, 11.40s/it]

("Question: What's your comment on this scene? Answer:", 'Question: What is the status of the car that is to the back right of the ego car? Answer:', 'Question: What is the future state of <c1,CAM_BACK,750.8,541.7>? Answer:', 'Question: What are objects to the back right of the ego car? Answer:', 'Question: What is the observed status of object <c2,CAM_BACK,474.2,535.8>? Answer:', 'Question: What is the moving status of object <c1,CAM_BACK,932.5,575.0>? Answer:', 'Question: Is <c2,CAM_BACK,173.2,580.0> an object that the ego vehicle should consider in the current scene? Answer:', 'Question: Are there moving trailers to the front of the ego car? Answer:', 'Question: Are there bicycles without riders to the front left of the ego car? Answer:', 'Question: What is the future state of <c3,CAM_FRONT_LEFT,1282.5,452.5>? Answer:', 'Question: Will <c3,CAM_FRONT,877.5,465.8> be in the moving direction of <c1,CAM_FRONT_LEFT,264.2,606.7>? Answer:', 'Question: Please describe the current scene. Ans

 67%|██████▋   | 701/1051 [1:35:34<53:10,  9.12s/it]  

('Question: Are there moving pedestrians to the back of the ego car? Answer:', 'Question: What are objects to the back left of the ego car? Answer:', 'Question: What is the probability of colliding with <c2,CAM_FRONT_LEFT,1295.8,620.8> after the ego vehicle steps on the brakes? Answer:', 'Question: What object would consider <c3,CAM_FRONT,989.7,357.9> to be most relevant to its decision? Answer:', 'Question: What is the probability of colliding with <c4,CAM_FRONT_RIGHT,426.7,499.2> after the ego vehicle slows down and turns right? Answer:', 'Question: Is <c2,CAM_FRONT,727.5,461.7> a traffic sign or a road barrier? Answer:', 'Question: Are there parked cars to the front of the ego car? Answer:', 'Question: What is the visual description of <c1,CAM_FRONT,1216.7,499.2>? Answer:', 'Question: What are objects to the back of the ego car? Answer:', 'Question: What is the status of the trucks that are to the back left of the ego car? Answer:', 'Question: Are there parked cars to the front of t

 76%|███████▌  | 801/1051 [1:49:36<51:56, 12.47s/it]

('Question: Are there moving cars to the front of the ego car? Answer:', 'Question: What is the status of the car that is to the back of the ego car? Answer:', "Question: What actions could the ego vehicle take based on <c5,CAM_FRONT,683.3,480.0>? Why take this action and what's the probability? Answer:", 'Question: What is the status of the truck that is to the back of the ego car? Answer:', 'Question: What is the target action of the ego vehicle? Answer:', 'Question: Are there traffic cones to the back of the ego car? Answer:', 'Question: In this scenario, what are dangerous actions to take for the ego vehicle? Answer:', 'Question: What is the status of the cars that are to the front right of the ego car? Answer:', 'Question: What is the future state of <c5,CAM_FRONT,813.3,489.2>? Answer:', 'Question: Which object is most likely to be occluded by <c2,CAM_FRONT,794.8,425.5>? Would this object affect the ego vehicle? Based on this object, what action of the ego vehicle is dangerous? An

 86%|████████▌ | 901/1051 [2:06:41<23:20,  9.33s/it]

('Question: What object would consider <c1,CAM_FRONT,642.5,488.3> to be most relevant to its decision? Answer:', 'Question: What are objects to the front left of the ego car? Answer:', 'Question: Are there moving buses to the back of the ego car? Answer:', 'Question: What is the status of the pedestrians that are to the back left of the ego car? Answer:', 'Question: What object would consider <c3,CAM_BACK_LEFT,665.0,641.7> to be most relevant to its decision? Answer:', 'Question: Are there bicycles with riders to the back left of the ego car? Answer:', 'Question: Which object is most likely to be occluded by <c4,CAM_FRONT,1020.0,455.0>? Would this object affect the ego vehicle? Based on this object, what action of the ego vehicle is dangerous? Answer:', 'Question: What is the observed status of object <c1,CAM_FRONT_LEFT,929.6,460.3>? Answer:', 'Question: Is <c1,CAM_FRONT,795.8,500.0> a traffic sign or a road barrier? Answer:', 'Question: Will <c3,CAM_FRONT,1338.6,584.4> be in the movin

 95%|█████████▌| 1001/1051 [2:19:39<11:11, 13.43s/it]

('Question: In this scenario, what are dangerous actions to take for the ego vehicle? Answer:', 'Question: Is <c3,CAM_FRONT_LEFT,1122.3,639.8> a traffic sign or a road barrier? Answer:', 'Question: What is the visual description of <c1,CAM_BACK_LEFT,639.2,403.3>? Answer:', 'Question: What is the status of the pedestrians that are to the back of the ego car? Answer:', 'Question: What is the future state of <c1,CAM_FRONT_LEFT,812.9,470.8>? Answer:', 'Question: What is the observed status of object <c2,CAM_BACK,520.8,573.3>? Answer:', 'Question: What are the important objects in the current scene? Those objects will be considered for the future reasoning and driving decision. Answer:', 'Question: What is the status of the pedestrian that is to the front of the ego car? Answer:', 'Question: What object would consider <c1,CAM_FRONT,429.2,633.3> to be most relevant to its decision? Answer:', 'Question: What are objects to the back of the ego car? Answer:', 'Question: What kind of traffic sig

100%|██████████| 1051/1051 [2:26:33<00:00,  8.37s/it]


loading annotations into memory...
Done (t=0.04s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
Downloading stanford-corenlp-3.6.0 for SPICE ...
Progress: 384.5M / 384.5M (100.0%)
Extracting stanford-corenlp-3.6.0 ...
Done.
computing Bleu score...


AssertionError: 