<a href="https://colab.research.google.com/github/alierenc/di725-transformers-and-attention-based-deep-networks-term-project/blob/main/Phase%20III/2.1.%20SigLIP-GPT2%20Custom%20VLM%20-%20Image%20Captioning%20Fine-tuning%20and%20Inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from huggingface_hub import login
hf_token = " " # Huggingface token
login(token = hf_token)

In [None]:
# Access google drive to save the model
from google.colab import drive
drive.mount('/content/drive')

# Import and log in wandb
import wandb

wandb.login()
# Initialize W&B run
wandb.init(project="term-project-vision-language-model", name="siglip-gpt2")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[34m[1mwandb[0m: Currently logged in as: [33maeren[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
!pip install -U datasets
!pip install bitsandbytes --upgrade



In [None]:
from datasets import load_dataset, DatasetDict

# Load the dataset of full riscm
ds = load_dataset('caglarmert/full_riscm')

full = ds["train"]

# test   = indices [0, 3150)
test_ds = full.select(range(3150))

# validation = indices [3150, 6300)
val_ds = full.select(range(3150, 6300))

# train  = indices [6300, end)
train_ds = full.select(range(6300, len(full)))

# bundle into a DatasetDict
ds = DatasetDict({
    "test": test_ds,
    "val": val_ds,
    "train": train_ds,
})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
ds["test"][0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=224x224>,
 'caption_1': 'A gray plane on the runway and the lawn beside .',
 'caption_2': 'A grey plane is on the runway by the lawn .',
 'caption_3': 'There is an airplane on the runway with a large lawn by the runway .',
 'caption_4': 'A plane is parked on the runway next to the grass .',
 'caption_5': 'There is a plane on the runway beside the grass .'}

In [None]:
import torch
import torch.nn as nn

class CustomVLM(nn.Module):
    def __init__(self, vision_model, language_model, vision_hidden_size, language_hidden_size):
        super(CustomVLM, self).__init__()
        self.vision_model = vision_model
        self.language_model = language_model
        self.vision_proj = nn.Linear(vision_hidden_size, language_hidden_size)

    def forward(self, image, input_ids=None, attention_mask=None, labels=None):
        # Encode image
        vision_output = self.vision_model(pixel_values=image).last_hidden_state
        vision_embedding = torch.mean(vision_output, dim=1)
        projected_embedding = self.vision_proj(vision_embedding)  # [B, D]
        prefix = projected_embedding.unsqueeze(1)  # [B, 1, D]

        # Get text embeddings
        inputs_embeds = self.language_model.transformer.wte(input_ids)  # [B, T, D]
        inputs_embeds = torch.cat([prefix, inputs_embeds], dim=1)

        # Create or update attention mask
        batch_size = inputs_embeds.size(0)
        seq_len = inputs_embeds.size(1)

        if attention_mask is None:
            # No mask was passed -> assume no padding in input
            attention_mask = torch.ones((batch_size, input_ids.size(1)), dtype=torch.long, device=input_ids.device)

        # Prefix mask for vision token
        prefix_mask = torch.ones((batch_size, 1), dtype=attention_mask.dtype, device=attention_mask.device)
        attention_mask = torch.cat([prefix_mask, attention_mask], dim=1)

        return self.language_model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            labels=labels
        )


In [None]:
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    SiglipVisionModel,
    AutoImageProcessor,
    BitsAndBytesConfig
)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import bitsandbytes

# Quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load quantized GPT-2
language_model = GPT2LMHeadModel.from_pretrained(
    "gpt2",
    quantization_config=bnb_config,
    device_map="auto"
)

# Prepare for QLoRA
language_model = prepare_model_for_kbit_training(language_model)

# LoRA config
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
language_model = get_peft_model(language_model, lora_config)
language_hidden_size = language_model.config.n_embd  # Usually 768

# Load tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Load SigLIP
vision_model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")
vision_model.requires_grad_(False)
image_processor = AutoImageProcessor.from_pretrained("google/siglip-base-patch16-224")
vision_hidden_size = vision_model.config.hidden_size

# Final model
model = CustomVLM(
    vision_model=vision_model,
    language_model=language_model,
    vision_hidden_size=vision_hidden_size,
    language_hidden_size=language_hidden_size
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
print("Language model device:", next(model.language_model.parameters()).device)
print("Vision model device:", next(model.vision_model.parameters()).device)
print("Vision projection layer device:", next(model.vision_proj.parameters()).device)

Language model device: cuda:0
Vision model device: cuda:0
Vision projection layer device: cuda:0


In [None]:
# We count the number of parameters
def count_parameters(model):
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return {"Total": total, "Trainable": trainable}

print("Printing the total number of parameters and the number of trainable parameters:")
print("Vision Encoder (SigLIP):", count_parameters(model.vision_model))
print("Language Decoder (GPT-2):", count_parameters(model.language_model))
print("Vision Projection Layer:", count_parameters(model.vision_proj))
print("Total CustomVLM:", count_parameters(model))

Printing the total number of parameters and the number of trainable parameters:
Vision Encoder (SigLIP): {'Total': 92884224, 'Trainable': 0}
Language Decoder (GPT-2): {'Total': 82783488, 'Trainable': 811008}
Vision Projection Layer: {'Total': 590592, 'Trainable': 590592}
Total CustomVLM: {'Total': 176258304, 'Trainable': 1401600}


In [None]:
# Check whether the model can generate captions at all
# Set model to eval mode
model.eval()

for i in range(10):
    print(f"Generating caption for sample {i + 1}")

    # Load and preprocess image
    image = ds["test"][i]["image"]
    pixel_values = image_processor(image, return_tensors="pt")["pixel_values"]
    pixel_values = pixel_values.to(model.language_model.device)

    with torch.no_grad():
        # Vision encoding
        vision_output = model.vision_model(pixel_values=pixel_values).last_hidden_state
        vision_embedding = vision_output.mean(dim=1)
        vision_proj = model.vision_proj(vision_embedding).unsqueeze(1)  # [B, 1, D]

        # Prepare prompt
        prompt = "caption en"
        tokenized = tokenizer(prompt, return_tensors="pt", padding=False)
        input_ids = tokenized["input_ids"].to(model.language_model.device)
        attention_mask = tokenized["attention_mask"].to(model.language_model.device)

        # Text embedding
        input_embeds = model.language_model.transformer.wte(input_ids)
        input_embeds = torch.cat([vision_proj, input_embeds], dim=1)

        # Extend attention mask with prefix mask
        prefix_mask = torch.ones((attention_mask.shape[0], 1), dtype=attention_mask.dtype).to(attention_mask.device)
        attention_mask = torch.cat([prefix_mask, attention_mask], dim=1)

        # Generate text
        generated_ids = model.language_model.generate(
            inputs_embeds=input_embeds,
            attention_mask=attention_mask,
            max_new_tokens=30,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=False
        )

        # Decode the caption
        caption = tokenizer.decode(generated_ids[0], skip_special_tokens=True)[len(prompt):]
        print("Generated caption:", repr(caption))
        print()


Generating caption for sample 1
Generated caption: 'T-S-S-S-S-S-S-S-S-S-S-S'

Generating caption for sample 2
Generated caption: 'T-S-S-S-S-S-S-S-S-S-S-S'

Generating caption for sample 3
Generated caption: 'm a big, big, big, big, big, big, big, big, big, big, big,'

Generating caption for sample 4
Generated caption: 'I-I-I-I-I-I-I-I-I-I-I-I'

Generating caption for sample 5
Generated caption: 'rst thing that I do is I go to the local, the local, the local, the local, the local, the'

Generating caption for sample 6
Generated caption: 'st of the two, the "The Great and the Great" is a "The Great and the Great" and the "The'

Generating caption for sample 7
Generated caption: 'he other.\n\nThe first time I saw the video, I was in the middle of the night, and I was'

Generating caption for sample 8
Generated caption: 'T-S-S-S-S-S-S-S-S-S-S-S'

Generating caption for sample 9
Generated caption: 'st of the two, the "The Great and the Great" is a "The Great and the Great" and the "The'

Ge

In [None]:
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm

# Prefix for all captions
caption_prefix = "caption en"

# Custom collate function
def collate_fn(batch):
    images = [image_processor(example["image"], return_tensors="pt")["pixel_values"].squeeze(0) for example in batch]
    captions = [caption_prefix + example["caption_3"] for example in batch]
    pixel_values = torch.stack(images)

    tokenized = tokenizer(
        captions,
        padding=True,
        truncation=True,
        return_tensors="pt",
        return_attention_mask=True
    )

    return {
        "pixel_values": pixel_values,
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        "labels": tokenized["input_ids"].clone()
    }

# DataLoaders
train_loader = DataLoader(ds["train"], batch_size=256, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(ds["val"], batch_size=256, shuffle=False, collate_fn=collate_fn)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    total_samples = 0

    for step, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}")):
        pixel_values = batch["pixel_values"].to(device)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Pad labels for prefix position with -100
        prefix_ignore = torch.full((labels.size(0), 1), -100, dtype=labels.dtype, device=labels.device)
        labels = torch.cat([prefix_ignore, labels], dim=1)

        # Add prefix to attention mask
        prefix_mask = torch.ones((attention_mask.size(0), 1), dtype=attention_mask.dtype, device=attention_mask.device)
        extended_attention_mask = torch.cat([prefix_mask, attention_mask], dim=1)

        optimizer.zero_grad()

        outputs = model(
            image=pixel_values,
            input_ids=input_ids,
            attention_mask=extended_attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        batch_size = input_ids.size(0)
        total_loss += loss.item() * batch_size
        total_samples += batch_size

        wandb.log({
            "train/loss": loss.item(),
            "train/step": epoch * len(train_loader) + step
        })

    avg_train_loss = total_loss / total_samples
    print(f"Epoch {epoch+1} completed. Average Train Loss: {avg_train_loss:.4f}")
    wandb.log({"train/avg_epoch_loss": avg_train_loss, "epoch": epoch + 1})

    # Validation loop
    model.eval()
    val_loss = 0.0
    val_samples = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            pixel_values = batch["pixel_values"].to(device)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Pad labels for prefix position with -100
            prefix_ignore = torch.full((labels.size(0), 1), -100, dtype=labels.dtype, device=labels.device)
            labels = torch.cat([prefix_ignore, labels], dim=1)

            prefix_mask = torch.ones((attention_mask.size(0), 1), dtype=attention_mask.dtype, device=attention_mask.device)
            extended_attention_mask = torch.cat([prefix_mask, attention_mask], dim=1)

            outputs = model(
                image=pixel_values,
                input_ids=input_ids,
                attention_mask=extended_attention_mask,
                labels=labels
            )

            batch_size = input_ids.size(0)
            val_loss += outputs.loss.item() * batch_size
            val_samples += batch_size

    avg_val_loss = val_loss / val_samples
    print(f"Average Validation Loss: {avg_val_loss:.4f}")
    wandb.log({"val/loss": avg_val_loss, "epoch": epoch + 1})

Epoch 1/10:   0%|          | 0/150 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Epoch 1/10: 100%|██████████| 150/150 [04:42<00:00,  1.89s/it]


Epoch 1 completed. Average Train Loss: 3.2993


Validating: 100%|██████████| 13/13 [00:17<00:00,  1.37s/it]


Average Validation Loss: 2.3526


Epoch 2/10: 100%|██████████| 150/150 [04:42<00:00,  1.88s/it]


Epoch 2 completed. Average Train Loss: 1.4317


Validating: 100%|██████████| 13/13 [00:17<00:00,  1.37s/it]


Average Validation Loss: 1.5089


Epoch 3/10: 100%|██████████| 150/150 [04:42<00:00,  1.89s/it]


Epoch 3 completed. Average Train Loss: 1.1175


Validating: 100%|██████████| 13/13 [00:17<00:00,  1.38s/it]


Average Validation Loss: 1.2788


Epoch 4/10: 100%|██████████| 150/150 [04:41<00:00,  1.88s/it]


Epoch 4 completed. Average Train Loss: 0.9998


Validating: 100%|██████████| 13/13 [00:17<00:00,  1.37s/it]


Average Validation Loss: 1.1197


Epoch 5/10: 100%|██████████| 150/150 [04:42<00:00,  1.88s/it]


Epoch 5 completed. Average Train Loss: 0.9091


Validating: 100%|██████████| 13/13 [00:17<00:00,  1.36s/it]


Average Validation Loss: 0.9938


Epoch 6/10: 100%|██████████| 150/150 [04:41<00:00,  1.88s/it]


Epoch 6 completed. Average Train Loss: 0.8470


Validating: 100%|██████████| 13/13 [00:17<00:00,  1.37s/it]


Average Validation Loss: 0.8784


Epoch 7/10: 100%|██████████| 150/150 [04:41<00:00,  1.88s/it]


Epoch 7 completed. Average Train Loss: 0.7864


Validating: 100%|██████████| 13/13 [00:17<00:00,  1.36s/it]


Average Validation Loss: 0.7822


Epoch 8/10: 100%|██████████| 150/150 [04:42<00:00,  1.88s/it]


Epoch 8 completed. Average Train Loss: 0.7367


Validating: 100%|██████████| 13/13 [00:17<00:00,  1.37s/it]


Average Validation Loss: 0.7038


Epoch 9/10: 100%|██████████| 150/150 [04:41<00:00,  1.87s/it]


Epoch 9 completed. Average Train Loss: 0.7071


Validating: 100%|██████████| 13/13 [00:17<00:00,  1.37s/it]


Average Validation Loss: 0.6536


Epoch 10/10: 100%|██████████| 150/150 [04:41<00:00,  1.88s/it]


Epoch 10 completed. Average Train Loss: 0.6684


Validating: 100%|██████████| 13/13 [00:17<00:00,  1.37s/it]

Average Validation Loss: 0.6095





In [None]:
# Set model to evaluation mode
model.eval()
predictions = []

# Loop through the full test dataset
for i in tqdm(range(len(ds["test"])), desc="Generating captions"):
    image = ds["test"][i]["image"]
    pixel_values = image_processor(image, return_tensors="pt")["pixel_values"].to(model.language_model.device)

    with torch.no_grad():
        # Vision encoding
        vision_output = model.vision_model(pixel_values=pixel_values).last_hidden_state
        vision_embedding = vision_output.mean(dim=1)
        vision_proj = model.vision_proj(vision_embedding).unsqueeze(1)  # [B, 1, D]

        # Prepare prompt
        prompt = "caption en"
        tokenized = tokenizer(prompt, return_tensors="pt", padding=False)
        input_ids = tokenized["input_ids"].to(model.language_model.device)
        attention_mask = tokenized["attention_mask"].to(model.language_model.device)

        # Text embedding
        input_embeds = model.language_model.transformer.wte(input_ids)
        input_embeds = torch.cat([vision_proj, input_embeds], dim=1)

        # Extend attention mask with prefix mask
        prefix_mask = torch.ones((attention_mask.shape[0], 1), dtype=attention_mask.dtype).to(attention_mask.device)
        attention_mask = torch.cat([prefix_mask, attention_mask], dim=1)

        # Generate text
        generated_ids = model.language_model.generate(
            inputs_embeds=input_embeds,
            attention_mask=attention_mask,
            max_new_tokens=30,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=False
        )

        # Decode caption (removing prompt part)
        caption = tokenizer.decode(generated_ids[0], skip_special_tokens=True)[len(prompt):].strip()
        predictions.append(caption)

Generating captions: 100%|██████████| 3150/3150 [16:12<00:00,  3.24it/s]


In [None]:
# Get the references
# Define a variable to store the reference captions
all_references = []
for i in tqdm(range(len(ds["test"])), desc="Collecting reference captions"):
    # Get the reference
    reference_per_sample = []
    for j in range(1,6):
        reference = ds["test"][i][f"caption_{j}"]
        reference_per_sample.append(reference)
        print(f"The reference caption_{j}:")
        print(repr(reference))

    print()
    all_references.append(reference_per_sample)

Collecting reference captions:   1%|          | 19/3150 [00:00<00:16, 189.70it/s]

The reference caption_1:
'A gray plane on the runway and the lawn beside .'
The reference caption_2:
'A grey plane is on the runway by the lawn .'
The reference caption_3:
'There is an airplane on the runway with a large lawn by the runway .'
The reference caption_4:
'A plane is parked on the runway next to the grass .'
The reference caption_5:
'There is a plane on the runway beside the grass .'

The reference caption_1:
'Three small planes parked in a line on the airport and a big plane behind them .'
The reference caption_2:
'There are four aircraft on the open ground, The largest of which is three times as large as the smallest one .'
The reference caption_3:
'There are many planes of different sizes in a clearing .'
The reference caption_4:
'Four planes are parked on the runway .'
The reference caption_5:
'Four planes of different sizes were on the marked ground .'

The reference caption_1:
'A plane parked in a line on the airport with some marks .'
The reference caption_2:
'A whit

Collecting reference captions:   2%|▏         | 59/3150 [00:00<00:15, 194.66it/s]

The reference caption_1:
'Two tail-to-tail planes parked on the airport .'
The reference caption_2:
'Here are two planes heading in opposite directions .'
The reference caption_3:
'There are two planes of different sizes on a clearing .'
The reference caption_4:
'Two planes are parked on the open space between the terminal and the runway .'
The reference caption_5:
'Two planes are on the marked ground .'

The reference caption_1:
'Two planes with the purple marks parked next to boarding bridges .'
The reference caption_2:
'Two blue-tailed planes parked at the boarding gate .'
The reference caption_3:
'Two planes of the same size parked neatly next to the buildings inside the airport .'
The reference caption_4:
'Two planes are parked next to the terminal .'
The reference caption_5:
'Two planes are in the parking lot next to the building .'

The reference caption_1:
'Two planes parked one after another on the airport and the lawn beside .'
The reference caption_2:
'There are two white pl

Collecting reference captions:   3%|▎         | 100/3150 [00:00<00:15, 193.68it/s]

The reference caption_1:
'An airport with some parallel runways and an open area on the lawn .'
The reference caption_2:
'The lawn is divided into many rectangular areas by runways .'
The reference caption_3:
'A huge complex airport with many runways and buildings and many planes parked next to the brown mountains .'
The reference caption_4:
'There is an airport between the mountain and many houses .'
The reference caption_5:
'The two roads of the airport are parallel to each other .'

The reference caption_1:
'An airport with two parallel runways and an open area at the foot of the mountain .'
The reference caption_2:
'This airport has only one runway .'
The reference caption_3:
'There are many buildings along the runway and apron of the airport and there are many planes parked  On the other side of the runway is a brown mountain range .'
The reference caption_4:
'Airports built at the foot of the mountain next to the tenantless land .'
The reference caption_5:
'The two roads of the a

Collecting reference captions:   4%|▍         | 140/3150 [00:00<00:15, 190.14it/s]

The reference caption_5:
'The two roads of the airport are parallel to each other .'

The reference caption_1:
'An airport with two parallel runways and an open area on the farmland and some buildings beside .'
The reference caption_2:
'The airport is built by the sea and the sea water is light blue .'
The reference caption_3:
'A complex airport with many runways and buildings is surrounded by a brown and green farmland .'
The reference caption_4:
'An airport is surrounded by the field next to many houses .'
The reference caption_5:
'There are many buildings of different sizes and shapes at the airport .'

The reference caption_1:
'An airport with a runway and an open area on the farmland with different plants and some villages beside .'
The reference caption_2:
'This airport has only one runway, And its color is grey .'
The reference caption_3:
'An airport with a runway and many buildings is surrounded by a brown and green farmland .'
The reference caption_4:
'There is an airport betw

Collecting reference captions:   6%|▌         | 181/3150 [00:00<00:15, 191.01it/s]

'A road is next to a baseball diamond .'

The reference caption_1:
'A baseball diamond next to a parking lot and the medium residential area beside .'
The reference caption_2:
'There are 11 baseball courts in the picture, Four of which are double the others .'
The reference caption_3:
'There is a baseball diamond in the grass surrounded by many trees, Houses and parking lot .'
The reference caption_4:
'A baseball diamond is in the middle of grass surrounded by many buildings and trees .'
The reference caption_5:
'Many buildings are next to  the  baseball diamond .'

The reference caption_1:
'Two baseball diamonds next to a parking lot and some trees beside .'
The reference caption_2:
'In the picture there is a baseball field and a parking lot .'
The reference caption_3:
'There are two baseball diamonds in the grass surrounded by many trees, Houses and parking lot .'
The reference caption_4:
'A big baseball diamond and a small baseball diamond are next to some cars .'
The reference capt

Collecting reference captions:   7%|▋         | 222/3150 [00:01<00:15, 193.96it/s]

The reference caption_2:
'There is a muddy path by the baseball field .'
The reference caption_3:
'There are three baseball diamonds in the grass surrounded by many trees .'
The reference caption_4:
'There are three baseball diamonds in the middle of grass next to an open place .'
The reference caption_5:
'One vacant lot is next to three baseball diamonds .'

The reference caption_1:
'Four baseball diamonds on the lawn and the open area beside .'
The reference caption_2:
'There is a trail leading to this baseball field .'
The reference caption_3:
'There are four baseball diamonds in the grass surrounded by many trees and houses .'
The reference caption_4:
'There are four baseball diamonds on the open place .'
The reference caption_5:
'One vacant lot is next to four baseball diamonds .'

The reference caption_1:
'A baseball diamond on the lawn and the residential area around it .'
The reference caption_2:
'There are neat rows of houses near the baseball field .'
The reference caption_3:

Collecting reference captions:   8%|▊         | 262/3150 [00:01<00:15, 191.74it/s]

The reference caption_5:
'The buildings are next to the basketball court .'

The reference caption_1:
'Two basketball courts next to a parking lot and the residential area beside .'
The reference caption_2:
'There are two basketball courts in the upper right corner .'
The reference caption_3:
'The basketball courts and parking lots are one street away from the residential area,With many trees nearby .'
The reference caption_4:
'There are two basketball courts in the middle of grass next to a parking lot with a road pass by .'
The reference caption_5:
'Many trees are near the basketball courts .'

The reference caption_1:
'Two basketball courts next to the open area and a road beside .'
The reference caption_2:
'The basketball court is surrounded by trees .'
The reference caption_3:
'The basketball court on the bare land is next to the road,With trees nearby .'
The reference caption_4:
'There are two basketball courts on the open place next to a road .'
The reference caption_5:
'Many tr

Collecting reference captions:  10%|▉         | 302/3150 [00:01<00:14, 193.16it/s]

The reference caption_2:
'There is a big basketball court beside the alley .'
The reference caption_3:
'The basketball court next to other facilities is on the lawn .'
The reference caption_4:
'There is a basketball court in the middle of grass .'
The reference caption_5:
'The grass is beside the basketball court .'

The reference caption_1:
'Two basketball courts on the lawn with some trees and some roads beside .'
The reference caption_2:
'Three basketball courts flank the house .'
The reference caption_3:
'A row of basketball courts separates the lawn and buildings .'
The reference caption_4:
'There are two basketball courts between a road and some trees .'
The reference caption_5:
'Many trees are near the basketball courts .'

The reference caption_1:
'A basketball court next to a parking lot and the residential area beside .'
The reference caption_2:
'Opposite the basketball court is the residence .'
The reference caption_3:
'The basketball court,A piece of lawn and parking lots a

Collecting reference captions:  11%|█         | 342/3150 [00:01<00:14, 191.20it/s]


'The waves are heavy around the beach .'

The reference caption_1:
'The beach with khaki sand and some small dirt slopes on the beach .'
The reference caption_2:
'The waves were blowing by the beach .'
The reference caption_3:
'White holey beach and green ocean .'
The reference caption_4:
'Huge waves of deep green seawater beat against the beach with sparse plants .'
The reference caption_5:
'Many green plants are on the beach .'

The reference caption_1:
'The beach with brown sand and the color of the seawater is dark blue .'
The reference caption_2:
'There were large white waves near the beach .'
The reference caption_3:
'Brown traced beach and deep green ocean .'
The reference caption_4:
'Waves of deep blue water wash the black beach .'
The reference caption_5:
'There are water marks on the beach .'

The reference caption_1:
'The beach with brown sand and the residential area on the beach .'
The reference caption_2:
'There are white waves near the beach .'
The reference caption_3:


Collecting reference captions:  12%|█▏        | 382/3150 [00:01<00:14, 193.58it/s]

The reference caption_2:
'The waves were blowing by the beach .'
The reference caption_3:
'Grey-green waters and grey-green beaches and meadows .'
The reference caption_4:
'The deep blue seawater washes the beach .'
The reference caption_5:
'Many green plants are on the beach .'

The reference caption_1:
'A bridge built on a dark green river and the residential area on both sides of the river .'
The reference caption_2:
'The bridge is bi-directional, And the water under the bridge appears dark blue .'
The reference caption_3:
'A bridge connects the land on both sides of the turquoise water, And there are many buildings on the land .'
The reference caption_4:
'A bridge is over the river with some buildings beside .'
The reference caption_5:
'There are many buildings beside the bridge .'

The reference caption_1:
'Two bridges built on a dark green river and the vacant lot on both sides of the river .'
The reference caption_2:
'On one side of the river are vegetation and on the other side

Collecting reference captions:  13%|█▎        | 423/3150 [00:02<00:13, 195.64it/s]

The reference caption_1:
'Two bridges built on a dark green river and some cars on one bridge .'
The reference caption_2:
'It was a two-way bridge, And the water in the river was black .'
The reference caption_3:
'Two parallel bridges connect the land on both sides of the blue river  There are many cars on the bridge .'
The reference caption_4:
'There are two bridges over the river with some cars on one of it .'
The reference caption_5:
'There are many cars on the bridges .'

The reference caption_1:
'Some bridges built on a dark green river and the residential area on both sides of the river .'
The reference caption_2:
'The water under the bridge is green .'
The reference caption_3:
'The three intersecting bridges connect the land on both sides of the turquoise water, One of which is curved and has many buildings and roads on the land .'
The reference caption_4:
'There are two straight bridge and a winding bridge over the river between many buildings .'
The reference caption_5:
'There

Collecting reference captions:  15%|█▍        | 463/3150 [00:02<00:14, 188.27it/s]


'Chaparral is distributed cluster by cluster on the sand .'
The reference caption_4:
'There are some different kinds of bushes make up the chaparral on the desert .'
The reference caption_5:
'The chaparral is not evenly distributed .'

The reference caption_1:
'A lot of chaparrals of different sizes grow in the desert .'
The reference caption_2:
'Many traces of cars driving in this desert can be seen .'
The reference caption_3:
'Chaparral is evenly distributed on the sand .'
The reference caption_4:
'The chaparral is consist of sparse bushes on the desert .'
The reference caption_5:
'The chaparral is not evenly distributed .'

The reference caption_1:
'A lot of small chaparrals grow in the desert .'
The reference caption_2:
'In this jungle, Only the little vegetation can be seen .'
The reference caption_3:
'Chaparral is scattered in the sand from dense to sparse .'
The reference caption_4:
'There are many bushes make up the chaparral on the desert .'
The reference caption_5:
'The midd

Collecting reference captions:  16%|█▌        | 501/3150 [00:02<00:14, 184.62it/s]

The reference caption_3:
'Chaparral is densely distributed on the sand .'
The reference caption_4:
'The chaparral is consist of some green bushes on the desert .'
The reference caption_5:
'The chaparral is thicker on the left than on the right .'

The reference caption_1:
'Some chaparrals of different sizes grow in the desert .'
The reference caption_2:
'Only scattered weeds can be seen in this desert .'
The reference caption_3:
'Chaparral is evenly distributed on the sand .'
The reference caption_4:
'There are some dry bushes make up the chaparral on the desert .'
The reference caption_5:
'The chaparral is not evenly distributed .'

The reference caption_1:
'Some chaparrals of different kinds and sizes grow on the wasteland .'
The reference caption_2:
'There are dozens of trees in this desert .'
The reference caption_3:
'Chaparral is densely distributed on the sand .'
The reference caption_4:
'There are some different kinds of bushes make up the chaparral in the bare land .'
The refer

Collecting reference captions:  17%|█▋        | 520/3150 [00:02<00:14, 183.86it/s]

The reference caption_1:
'The church with two green circular pointed towers and the rest of the church has some orange roofs .'
The reference caption_2:
'This is a church with two meadows in front of it .'
The reference caption_3:
'A church with brown and cyan roofs and several turquoise towers on the main house .'
The reference caption_4:
'There is a red church with blue circular roof next to the grass .'
The reference caption_5:
'A church with blue and orange roofs .'

The reference caption_1:
'The church with an octagon tower and the rest of the church has a brown cross-shaped roof .'
The reference caption_2:
'This is a church, And there is a highway in front of it .'
The reference caption_3:
'A tan cross-shaped roof with a domed church on the main house, Surrounded by buildings, Roads and cars .'
The reference caption_4:
'There is a red church with a circular roof .'
The reference caption_5:
'A church with a gray dome and brown roofs is surrounded by many brown buildings .'

The re

Collecting reference captions:  18%|█▊        | 559/3150 [00:02<00:14, 182.36it/s]

The reference caption_5:
'A church is beside a lawn by the urban roadside .'

The reference caption_1:
'The church with a tower and the rest of the church has a cross-shaped roof .'
The reference caption_2:
'This is a church, Which is mostly made of brown and yellow brick .'
The reference caption_3:
'A silver-gray cross-shaped roof, A main church with a gray domed roof, Surrounded by buildings, Trees, Roads and cars .'
The reference caption_4:
'The church is next to a road with some cars .'
The reference caption_5:
'A church with gray roofs is by the roadside and beside some trees .'

The reference caption_1:
'The church with an octagon tower is built on the corner .'
The reference caption_2:
'This is a church  There are many trees around it .'
The reference caption_3:
'A tan cross-shaped roof with a gray domed church on the main house, Surrounded by many buildings with roads and cars .'
The reference caption_4:
'The church is on the open place next to some buildings .'
The reference c

Collecting reference captions:  19%|█▉        | 599/3150 [00:03<00:13, 190.59it/s]


'Many dense green, Light yellow and tan mixed circular farms of different sizes .'
The reference caption_4:
'There are some bare circle farmland in the green land .'
The reference caption_5:
'There are many circular farmlands of different sizes distributed irregularly .'

The reference caption_1:
'Several khaki circular farmlands are on the ground .'
The reference caption_2:
'Round fields are not green .'
The reference caption_3:
'Four neatly arranged light green and tan mixed circular farmland .'
The reference caption_4:
'There are some bare circle farmland in the green land .'
The reference caption_5:
'There are four circular farmlands of different sizes next to each other .'

The reference caption_1:
'Many circular farmlands with different colors are neatly arranged on the wasteland and some roads go through the wasteland .'
The reference caption_2:
'Only the right side has circular farmland .'
The reference caption_3:
'Many densely arranged circular farmland with a mix of green, L

Collecting reference captions:  20%|██        | 640/3150 [00:03<00:12, 193.85it/s]

The reference caption_1:
'Several smaller dark green circular farmlands and a larger circular farmlands are next to some rectangular farmlands .'
The reference caption_2:
'There is a large circular field in the upper left corner .'
The reference caption_3:
'Many neatly arranged circular green fields of different sizes .'
The reference caption_4:
'Some green circle farmland are in the bare land .'
The reference caption_5:
'There are many green circular farmlands of the same size and a larger one .'

The reference caption_1:
'Several khaki circular farmlands are neatly arranged on the wasteland and a river goes through the wasteland .'
The reference caption_2:
'There are many round blue fields .'
The reference caption_3:
'Many neatly arranged circular farmlands of light green and light earth .'
The reference caption_4:
'There are some bare circle farmland in the green land .'
The reference caption_5:
'There are some circular farmlands of the same size .'

The reference caption_1:
'Severa

Collecting reference captions:  22%|██▏       | 681/3150 [00:03<00:12, 196.57it/s]

The reference caption_5:
'There are many gauzy clouds over the sea .'

The reference caption_1:
'The stratus clouds are located above the surface of the sea and some islands are at sea .'
The reference caption_2:
'A white cloud floats above the forest and river .'
The reference caption_3:
'Large white clouds on green land and blue sea .'
The reference caption_4:
'The white cloud is above blue sea and green island .'
The reference caption_5:
'There are many white clouds over some green islands .'

The reference caption_1:
'The cumulus clouds are located above the land covered by spare vegetation while some ups and downs are on the land .'
The reference caption_2:
'A thin cloud of thin clouds floated in the air .'
The reference caption_3:
'The clouds are in the air and the clouds are cast on the ground where the green and the yellow-brown intersect .'
The reference caption_4:
'The white cloud is above the mountain .'
The reference caption_5:
'A few pieces of white clouds over the mountai

Collecting reference captions:  23%|██▎       | 721/3150 [00:03<00:12, 190.69it/s]


'The white cloud is above bare farmland .'
The reference caption_5:
'There are some white clouds over the city .'

The reference caption_1:
'The stratus clouds and cumulus clouds are located above the sea and the land covered by dense vegetation .'
The reference caption_2:
'A white cloud floats above the forest and river .'
The reference caption_3:
'Large fluffy white clouds on green land and blue sea .'
The reference caption_4:
'The white cloud is above blue sea and green island .'
The reference caption_5:
'There are some gauzy clouds over a green land near the sea .'

The reference caption_1:
'The stratus clouds and cumulus clouds are located above the surface of the sea .'
The reference caption_2:
'A thin cloud of thin clouds floated in the air .'
The reference caption_3:
'Large fluffy white clouds on blue sea .'
The reference caption_4:
'The white cloud is above blue sea .'
The reference caption_5:
'There are thick clouds over the sea .'

The reference caption_1:
'The stratus clou

Collecting reference captions:  24%|██▍       | 760/3150 [00:03<00:13, 182.73it/s]


'The buildings are of different sizes and colors .'
The reference caption_3:
'In a business district there are several tall buildings of different shapes and several roads and many trees .'
The reference caption_4:
'The commercial area is near the road with many cars .'
The reference caption_5:
'A building with gray roofs stands on a commercial area .'

The reference caption_1:
'A commercial area has a rectangular building with a circular dome .'
The reference caption_2:
'There are two rows of tall buildings in the picture .'
The reference caption_3:
'In a commercial district there are several tall buildings of different shapes and several roads with many cars .'
The reference caption_4:
'The commercial area is near the road with many cars .'
The reference caption_5:
'A white building at the corner on a commercial area .'

The reference caption_1:
'A commercial area has some connected buildings of different heights .'
The reference caption_2:
'There are two black cars on the road next

Collecting reference captions:  25%|██▌       | 798/3150 [00:04<00:13, 177.50it/s]

The reference caption_2:
'The houses were arranged in disorder .'
The reference caption_3:
'There is a tall building and a road in a business district .'
The reference caption_4:
'The commercial area is near the road .'
The reference caption_5:
'A building with gray roofs by the roadside on a commercial area .'

The reference caption_1:
'A commercial area has lots of buildings of different heights and shapes and some roads go through the commercial area .'
The reference caption_2:
'The rooms are spaced differently .'
The reference caption_3:
'In a business district there are many buildings of different heights and heights and several roads with many cars and some trees .'
The reference caption_4:
'The commercial area is consist of many tall buildings .'
The reference caption_5:
'There are some buildings by the roadside on a commercial area .'

The reference caption_1:
'A commercial area with lots of buildings of different heights and shapes is surrounded by some roads and trees .'
The 

Collecting reference captions:  26%|██▋       | 834/3150 [00:04<00:13, 175.55it/s]

The reference caption_5:
'There are many black buildings arranged neatly on dense residential area .'

The reference caption_1:
'A dense residential area has some vacant lots and lots of medium residences arranged neatly while a lake is surrounded by houses .'
The reference caption_2:
'There is a winding road to the left of the residential area .'
The reference caption_3:
'In densely populated areas there are many roads and neatly arranged trees as well as large lawns and circularly arranged houses around a lake .'
The reference caption_4:
'The dense residential is on the grass next to some trees and lakes .'
The reference caption_5:
'There are many buildings around the lake on dense residential area .'

The reference caption_1:
'A dense residential area with lots of houses next to each other is surrounded by some roads .'
The reference caption_2:
'The houses in the residential area are next to each other .'
The reference caption_3:
'There are many roads and neatly arranged houses and 

Collecting reference captions:  28%|██▊       | 873/3150 [00:04<00:12, 182.17it/s]

The reference caption_1:
'A dense residential area has lots of houses of the same style in the lawn while some roads go through the residential area .'
The reference caption_2:
'Most of the houses in the residential area are square .'
The reference caption_3:
'There are many roads and neatly arranged houses and trees and large lawns in densely populated areas .'
The reference caption_4:
'The dense residential is on the grass next to the road .'
The reference caption_5:
'There are many gray buildings arranged neatly on dense residential area .'

The reference caption_1:
'A dense residential area has lots of neatly arranged houses of different sizes .'
The reference caption_2:
'Residential areas are sparsely spaced .'
The reference caption_3:
'There are many roads and neatly arranged houses and trees and many vehicles in densely populated areas .'
The reference caption_4:
'The dense residential is next to the road .'
The reference caption_5:
'There are some brown buildings and gray build

Collecting reference captions:  29%|██▉       | 912/3150 [00:04<00:11, 187.72it/s]

The reference caption_2:
'There is some weed on the edge of this desert .'
The reference caption_3:
'It is a large piece of desert mixed with brown and a little green with many sand dunes and brown gullys .'
The reference caption_4:
'The yellow desert is next to the bare land .'
The reference caption_5:
'A brown desert covered with bare land .'

The reference caption_1:
'The desert has deep yellow sand and many fixed trellis dunes of different sizes .'
The reference caption_2:
'There are some ravines in this desert .'
The reference caption_3:
'This is a yellow desert with wavy stripes .'
The reference caption_4:
'The desert is full of white sand .'
The reference caption_5:
'A brown desert with many mounds of dirt .'

The reference caption_1:
'The desert has deep yellow sand and many trellis dunes of different sizes .'
The reference caption_2:
'There are many lines in the desert .'
The reference caption_3:
'This is a yellow desert with wavy stripes .'
The reference caption_4:
'The deser

Collecting reference captions:  30%|███       | 951/3150 [00:05<00:11, 188.67it/s]

The reference caption_5:
'A forest with many green trees .'

The reference caption_1:
'The forest has a lot of dense green trees of different heights .'
The reference caption_2:
'This is a dense forest .'
The reference caption_3:
'Many green trees are in a forest and the forest is so dense that it looks like sea waves .'
The reference caption_4:
'The forest is full of green trees .'
The reference caption_5:
'A dense forest with many green trees .'

The reference caption_1:
'The forest has a lot of dense yellow trees .'
The reference caption_2:
'This is a dense forest .'
The reference caption_3:
'Many withered trees are in a forest .'
The reference caption_4:
'The forest is consist of yellow and green trees .'
The reference caption_5:
'A dense forest with many brown trees and withered trees .'

The reference caption_1:
'The forest has some sage green trees, An open areas and a parking lot and some roads go through the forest .'
The reference caption_2:
'This is a dense forest .'
The ref

Collecting reference captions:  31%|███▏      | 989/3150 [00:05<00:11, 187.07it/s]

The reference caption_5:
'A dense forest with many green trees and some brown trees .'

The reference caption_1:
'The forest has a lot of dense dark green trees of different types .'
The reference caption_2:
'There are some crisscross paths on this land .'
The reference caption_3:
'A large dense mixed dark green and light green forest .'
The reference caption_4:
'The forest is on the grass .'
The reference caption_5:
'A dense forest with many green trees .'

The reference caption_1:
'The forest has a lot of dense dark green trees and some clearings .'
The reference caption_2:
'This is a dense forest .'
The reference caption_3:
'There are many yellow land in the forest with many green trees .'
The reference caption_4:
'The grass is in the middle of the forest .'
The reference caption_5:
'A dense green forest with some lawns around .'

The reference caption_1:
'The forest has some sparse green trees randomly arranged and the forest soil is deep yellow .'
The reference caption_2:
'This is

Collecting reference captions:  32%|███▏      | 1008/3150 [00:05<00:11, 185.49it/s]

The reference caption_4:
'The freeway goes through the bare land and some trees .'
The reference caption_5:
'There are some brown bare land and some bushes beside the freeway .'

The reference caption_1:
'A freeway with a bend is a farmland on one side and on the other side is a forest .'
The reference caption_2:
'There are no cars on the motorway .'
The reference caption_3:
'There is a large green lawn around the freeway, There are several cars on the freeway, And there is a building next to the freeway .'
The reference caption_4:
'The freeway is between the grass and some trees .'
The reference caption_5:
'There are green lawns and a green farmland beside the freeway .'

The reference caption_1:
'A straight freeway is a forest on one side and on the other side is a lawn with some trees .'
The reference caption_2:
'There are few cars on the motorway .'
The reference caption_3:
'There is a large green lawn around the freeway, There are two cars on the freeway, And there is a building n

Collecting reference captions:  33%|███▎      | 1046/3150 [00:05<00:11, 186.15it/s]

The reference caption_5:
'There are some green lawns and brown lawns beside the freeway .'

The reference caption_1:
'A straight freeway goes through the lawn and some cars are driving on the freeway .'
The reference caption_2:
'The trees cast their shadows on the highway .'
The reference caption_3:
'There are large green lawns around the freeway and there are three cars on the freeway .'
The reference caption_4:
'The freeway goes through the grass with some trees .'
The reference caption_5:
'There are green lawns beside the freeway .'

The reference caption_1:
'A straight freeway goes through the block and many trees are on both sides of the freeway .'
The reference caption_2:
'Traffic was sparse on the motorway .'
The reference caption_3:
'There are many buildings and roads and green plants on both sides of the freeway, And there are many cars on the freeway .'
The reference caption_4:
'The freeway is next to the houses and some trees .'
The reference caption_5:
'There are many green

Collecting reference captions:  34%|███▍      | 1086/3150 [00:05<00:10, 191.69it/s]

'There are many trees near the golf course .'
The reference caption_5:
'There are many green trees and some bunkers on the golf course .'

The reference caption_1:
'The golf course has two putting greens, A clearing, A fairway, Some barrier trees and three sandpits .'
The reference caption_2:
'There are three large bunkers and two paths .'
The reference caption_3:
'There are a few bunkers and a few paths on the large green lawn on the golf course and many trees .'
The reference caption_4:
'There are several roads on the golf course .'
The reference caption_5:
'There are some bunkers and green trees on the golf course .'

The reference caption_1:
'The golf course has two putting greens, A fairway, Some roads, Barrier trees and sandpits and some barrier trees are on both sides of the fairways .'
The reference caption_2:
'There are two parallel paths from top left to bottom right .'
The reference caption_3:
'There are a few bunkers and a few paths on the large green lawn on the golf cours

Collecting reference captions:  36%|███▌      | 1128/3150 [00:05<00:10, 196.75it/s]

The reference caption_1:
'The golf course has some lakes, Fairways, Barrier trees and sandpits and some houses arranged neatly are next to the golf course .'
The reference caption_2:
'There are six bunkers scattered along the road .'
The reference caption_3:
'On the large turf of the golf course On the large green turf of the golf course there are several bunkers and lakes and several paths and many trees and buildings .'
The reference caption_4:
'Many trees are near the golf course .'
The reference caption_5:
'There are many green trees and some lakes on the golf course .'

The reference caption_1:
'The golf course has a fairway, Some barrier trees and two sandpits .'
The reference caption_2:
'Two similar-sized bunkers next to each other .'
The reference caption_3:
'There are a few bunkers and a few paths on the large green lawn on the golf course and many trees .'
The reference caption_4:
'There are many trees near the golf course .'
The reference caption_5:
'There are two bunkers an

Collecting reference captions:  37%|███▋      | 1168/3150 [00:06<00:10, 188.58it/s]


"There's a little track in the upper left corner ."
The reference caption_3:
'The track and field is built on the bare lawn next to the building .'
The reference caption_4:
'There are many buildings next to the ground track field .'
The reference caption_5:
'There are some buildings around the ground track field with a red track .'

The reference caption_1:
'The ground track field is surrounded by some trees and lots of houses arranged neatly .'
The reference caption_2:
'There is a track and field on the green field .'
The reference caption_3:
'A track and field is built in the middle of a residential area, Surrounded by a lot of vegetation .'
The reference caption_4:
'There are many trees near the ground track field .'
The reference caption_5:
'There are many buildings and trees around the ground track field with a white track .'

The reference caption_1:
'The ground track field is next to some trees and buildings .'
The reference caption_2:
'There is a big track and field on the gre

Collecting reference captions:  38%|███▊      | 1207/3150 [00:06<00:10, 188.07it/s]


'The ground track field is next to lots of houses arranged neatly and some lawns are next to the ground track field .'
The reference caption_2:
'The green space inside the track is worn .'
The reference caption_3:
'The track and field is built on the lawn, A road away from the residential area, And there is a lot of vegetation around it .'
The reference caption_4:
'There are many buildings next to the ground track field .'
The reference caption_5:
'There are many buildings around the ground track field with a red track .'

The reference caption_1:
'The ground track field is next to lots of houses arranged neatly and some lawns are next to the ground track field .'
The reference caption_2:
'There is a track and field at the bottom of the picture .'
The reference caption_3:
'The track and field is built on the lawn, A road away from the residential area, And there is a lot of vegetation around it .'
The reference caption_4:
'There are a lot of  buildings near the ground track field .'
T

Collecting reference captions:  40%|███▉      | 1245/3150 [00:06<00:10, 186.02it/s]

The reference caption_3:
'There are a few yachts lined up on the sea next to the harbor next to the parking lot, And there are many empty seats .'
The reference caption_4:
'There are many ships in the harbor .'
The reference caption_5:
'There are some ships at the harbor with many vacancies .'

The reference caption_1:
'The harbor with lots of neatly docked boats is surrounded by some neatly arranged houses .'
The reference caption_2:
'Several cars were parked on the harbor shore .'
The reference caption_3:
'There are some yachts lined up on the sea next to the harbor next to the building and the parking lot, And some empty seats .'
The reference caption_4:
'Many ships are in the harbor .'
The reference caption_5:
'There are many ships and some buildings at the harbor .'

The reference caption_1:
'The harbor has lots of neatly docked boats and the color of the waters is green .'
The reference caption_2:
'The shape of the harbor is nearly circular .'
The reference caption_3:
'There are 

Collecting reference captions:  41%|████      | 1283/3150 [00:06<00:10, 183.55it/s]

The reference caption_4:
'Many ships are in the harbor .'
The reference caption_5:
'There are many ships docked neatly at the harbor .'

The reference caption_1:
'The harbor has lots of neatly docked boats and the color of the waters is green .'
The reference caption_2:
'There are different Numbers of ships on either side of the harbor .'
The reference caption_3:
'Many yachts are neatly arranged on the sea side of the harbor .'
The reference caption_4:
'There are many ships in the harbor .'
The reference caption_5:
'There are many ships docked neatly at the harbor .'

The reference caption_1:
'The harbor has lots of neatly docked boats and the color of the waters is green .'
The reference caption_2:
'There are more ships in the middle of the harbor .'
The reference caption_3:
'Many yachts are neatly arranged on the sea side of the harbor .'
The reference caption_4:
'Numerous ships are in the harbor .'
The reference caption_5:
'There are many buildings and some ships at the harbor .'

T

Collecting reference captions:  42%|████▏     | 1321/3150 [00:07<00:09, 183.13it/s]

The reference caption_3:
'In an industrial area, There are neatly planned factories and roads, Green belts and green spaces, And parking lots for cars and trucks .'
The reference caption_4:
'Many houses of different shapes are in the industrial area .'
The reference caption_5:
'There are some brown buildings and white buildings on the industrial area .'

The reference caption_1:
'The industrial area has some big blue workshops and lots of small workshops of different colors .'
The reference caption_2:
'Three buildings in the industrial area have blue roofs .'
The reference caption_3:
'In an industrial area, There are factories and roads, Green belts, And open-air equipment .'
The reference caption_4:
'There are many houses of different shapes and sizes in the industrial zone .'
The reference caption_5:
'There are some blue buildings and red buildings on the industrial area .'

The reference caption_1:
'The industrial area has lots of industrial equipment and some workshops .'
The refer

Collecting reference captions:  43%|████▎     | 1359/3150 [00:07<00:09, 180.57it/s]

The reference caption_5:
'There are many containers and some gray buildings on the industrial area .'

The reference caption_1:
'An intersection with sparse traffic located in a residential area with some green areas .'
The reference caption_2:
'There is no zebra crossing at the intersection .'
The reference caption_3:
'There are many buildings and many trees around the intersection .'
The reference caption_4:
'There are many buildings of different shapes and sizes beside the intersection .'
The reference caption_5:
'There are some green trees and lawns around the intersection .'

The reference caption_1:
'An intersection with normal traffic is between some buildings and parking lots .'
The reference caption_2:
'There is a blue building at the top left corner of the intersection .'
The reference caption_3:
'There are several moving vehicles on the road at the intersection, There are many buildings and some trees around the intersection, And many cars are parked in the open space beside

Collecting reference captions:  44%|████▍     | 1399/3150 [00:07<00:09, 187.66it/s]

The reference caption_2:
'The zebra crossing at the intersection is yellow .'
The reference caption_3:
'There are several moving vehicles on the road at the intersection, With many buildings and some trees around the intersection .'
The reference caption_4:
'There are many trees near the intersection .'
The reference caption_5:
'There are some green lawns around the intersection .'

The reference caption_1:
'An intersection with dense traffic is between some buildings and green areas and a river is next to the intersection .'
The reference caption_2:
"There aren't many cars at the intersection ."
The reference caption_3:
'There are many moving and waiting vehicles on the road at the intersection, And there are many neatly arranged tall buildings and some trees around the intersection .'
The reference caption_4:
'There are many buildings of different shapes and sizes beside the intersection .'
The reference caption_5:
'There are some green lawns beside the intersection .'

The reference

Collecting reference captions:  46%|████▌     | 1442/3150 [00:07<00:08, 196.78it/s]

The reference caption_3:
'There are a group of yellow-brown islands on the blue sea .'
The reference caption_4:
'There are many islands of different shapes and sizes .'
The reference caption_5:
'There are group of brown islands on a blue sea .'

The reference caption_1:
'The archipelago with lots of islands of different sizes is surrounded by light blue waters .'
The reference caption_2:
'The island is cool like a girl in a skirt .'
The reference caption_3:
'There are a group of green islands on the blue sea, And there are mountains with green vegetation on the island .'
The reference caption_4:
'The island on the right is the largest .'
The reference caption_5:
'There are some green islands surrounded by green water .'

The reference caption_1:
'The island with dense vegetation is surrounded by green waters .'
The reference caption_2:
'The green island, Which is approximately rectangular in shape, Is surrounded by pale blue water, And the island’s green plants flourish .'
The referenc

Collecting reference captions:  47%|████▋     | 1484/3150 [00:07<00:08, 201.99it/s]

The reference caption_3:
'There are a group of yellow-brown islands on the blue sea .'
The reference caption_4:
'The island on the left is the largest .'
The reference caption_5:
'There are three brown islands surrounded by green water .'

The reference caption_1:
'The stone island is surrounded by deep blue waters .'
The reference caption_2:
'The island is like a sock .'
The reference caption_3:
'Irregularly shaped khaki island surrounded by blue sea water .'
The reference caption_4:
'The land on the island is uneven .'
The reference caption_5:
'The white island is on a deep blue sea .'

The reference caption_1:
'The island with a large reef is surrounded by deep blue waters .'
The reference caption_2:
'The shape of the island is almost rectangular .'
The reference caption_3:
'The long strip of green islands is surrounded by blue waters with mountains with green vegetation .'
The reference caption_4:
'There are many islands of different shapes and sizes .'
The reference caption_5:
'Th

Collecting reference captions:  48%|████▊     | 1525/3150 [00:08<00:08, 196.92it/s]

The reference caption_2:
'The lake blends with the green space .'
The reference caption_3:
'There is an irregular blue-blue lake on the yellow-brown mountain, And the green vegetation covers more than half of the mountain .'
The reference caption_4:
'Many trees are around the lake .'
The reference caption_5:
'There are brown bare land and green meadows around the lake .'

The reference caption_1:
'The lake with a winding bank lies on a flat ground covered with vegetation and the water is dark blue .'
The reference caption_2:
'There are paths on both sides of the lake .'
The reference caption_3:
'An irregularly shaped blue lake on the ground with lush green vegetation .'
The reference caption_4:
'There are many trees around the lake .'
The reference caption_5:
'There are green land around the lake .'

The reference caption_1:
'Some lakes of different sizes lie in a wasteland and the water is deep green .'
The reference caption_2:
'The lake is irregular in shape .'
The reference caption_

Collecting reference captions:  50%|████▉     | 1566/3150 [00:08<00:07, 198.29it/s]

The reference caption_2:
'The lake was long and thin .'
The reference caption_3:
'On the ground with green vegetation, There is an irregular shape and an almost elliptical blue lake with some small lakes next to it .'
The reference caption_4:
'There are many trees around the lake .'
The reference caption_5:
'There are green forests and meadows around the lake .'

The reference caption_1:
'The lake lies in a rolling wasteland and the water is blue .'
The reference caption_2:
'The lake in the picture is small .'
The reference caption_3:
'An irregularly shaped pale blue lake on a tan mountain .'
The reference caption_4:
'The ground is uneven around the lake .'
The reference caption_5:
'The blue lake is in a brown desert .'

The reference caption_1:
'The lake lies in a mountainous area and the water is green .'
The reference caption_2:
'The lake in the picture is surrounded by green fields .'
The reference caption_3:
'An irregularly shaped blue lake surrounded by yellow-brown mountains .'


Collecting reference captions:  51%|█████     | 1607/3150 [00:08<00:07, 195.44it/s]


'The grass is flanked by lush woods .'
The reference caption_3:
'There are lines of trees next to the meadow .'
The reference caption_4:
'Some trees are on the meadow .'
The reference caption_5:
'There are some green trees on the sparse meadow .'

The reference caption_1:
'This meadow has dense grasses of different colors .'
The reference caption_2:
'The entire image is dominated by grass .'
The reference caption_3:
'The meadow is full of green grass .'
The reference caption_4:
'The meadow is dense but uneven .'
The reference caption_5:
'A dense meadow with many green grass .'

The reference caption_1:
'This meadow has dense grasses and some trees .'
The reference caption_2:
'The entire image is dominated by grass .'
The reference caption_3:
'There are some trees next to the meadow .'
The reference caption_4:
'Some trees are on the meadow .'
The reference caption_5:
'There are some green trees on the meadow .'

The reference caption_1:
'This meadow has dense lower grasses and some tal

Collecting reference captions:  52%|█████▏    | 1627/3150 [00:08<00:07, 192.14it/s]

The reference caption_2:
'The entire image is dominated by grass .'
The reference caption_3:
'There are some trees on the meadow .'
The reference caption_4:
'The meadow is dense but uneven .'
The reference caption_5:
'There are some green bushes on the green meadow .'

The reference caption_1:
'This meadow has dense grasses and a bare tree .'
The reference caption_2:
'The entire image is dominated by grass .'
The reference caption_3:
'The meadow is consist of green grass and dry grass .'
The reference caption_4:
'There is no grass in some parts of the meadow .'
The reference caption_5:
'A dense meadow with many green grass .'

The reference caption_1:
'This meadow has evenly distributed dense grasses .'
The reference caption_2:
'The grass is flanked by lush woods .'
The reference caption_3:
'The meadow is full of green grass .'
The reference caption_4:
'The meadow is dense but uneven .'
The reference caption_5:
'A dense meadow with many green grass .'

The reference caption_1:
'The med

Collecting reference captions:  53%|█████▎    | 1666/3150 [00:08<00:08, 183.08it/s]

The reference caption_3:
'The medium residential is on the grass next to the road .'
The reference caption_4:
'There are many buildings of different shapes and sizes in medium residential area .'
The reference caption_5:
'There are some buildings on green lawns on the medium residential area .'

The reference caption_1:
'The medium residential area has lots of trees, Some houses, A parking lot and a swimming pool .'
The reference caption_2:
'Medium-sized houses are built on green land .'
The reference caption_3:
'The medium residential is on the grass next to some trees .'
The reference caption_4:
'There are many trees in the middle residential area .'
The reference caption_5:
'There are many green trees and some buildings on the medium residential area .'

The reference caption_1:
'The medium residential area has some houses along the roads .'
The reference caption_2:
'There are a lot of trees in the medium-sized residential area .'
The reference caption_3:
'The medium residential is 

Collecting reference captions:  54%|█████▍    | 1704/3150 [00:09<00:07, 183.00it/s]

The reference caption_1:
'The mobile home park has some dense white mobile homes and some trees are among these mobile homes .'
The reference caption_2:
'There are three rows of mobile homes .'
The reference caption_3:
'The mobile home park is on the open place next to some trees .'
The reference caption_4:
'There are many buildings in the mobile home park .'
The reference caption_5:
'There are some white buildings and green trees in the mobile home park .'

The reference caption_1:
'The mobile home park has some mobile homes arranged in lines and bare trees .'
The reference caption_2:
'There are moving homes in this rectangular area .'
The reference caption_3:
'The mobile home park is on the grass next to the road .'
The reference caption_4:
'Many trees are in the mobile home park .'
The reference caption_5:
'There are some buildings and withered trees in the mobile home park .'

The reference caption_1:
'The mobile home park has some mobile homes arranged in lines and trees and a roa

Collecting reference captions:  55%|█████▌    | 1742/3150 [00:09<00:07, 178.26it/s]

The reference caption_3:
'The mobile home park is next to the road .'
The reference caption_4:
'There are many buildings in the mobile home park .'
The reference caption_5:
'There are some buildings and green trees in the mobile home park .'

The reference caption_1:
'The mobile home park has some dense white mobile homes .'
The reference caption_2:
'There are three rows of mobile homes .'
The reference caption_3:
'The mobile home park is on the open place next to the road .'
The reference caption_4:
'Many buildings are in the mobile home park .'
The reference caption_5:
'There are many white buildings arranged neatly in the mobile home park .'

The reference caption_1:
'The mobile home park has lots of neatly arranged white mobile homes and some roads go through the mobile home park .'
The reference caption_2:
'There are moving homes in this rectangular area .'
The reference caption_3:
'The mobile home park is on the open place next to the road .'
The reference caption_4:
'Many buildi

Collecting reference captions:  57%|█████▋    | 1780/3150 [00:09<00:07, 181.21it/s]

The reference caption_4:
'The mountains are winding and undulating .'
The reference caption_5:
'There are many ridges on the brown mountain .'

The reference caption_1:
'The folded mountain has a fault and its rocks are brown and yellow .'
The reference caption_2:
'You can see the veins of the mountain clearly .'
The reference caption_3:
'The mountain is consist of half of white soil and half of yellow soil .'
The reference caption_4:
'The mountain is uneven and rough .'
The reference caption_5:
'There are some cracks on the brown mountain .'

The reference caption_1:
'The folded mountain consists of a number of ridges and valleys .'
The reference caption_2:
'The mountain is covered with brown mud .'
The reference caption_3:
'The mountain is full of bare land with lines of green plants .'
The reference caption_4:
'The mountain is uneven and rough .'
The reference caption_5:
'There are many ridges on the brown mountain .'

The reference caption_1:
'The folded mountain consists of a numb

Collecting reference captions:  58%|█████▊    | 1818/3150 [00:09<00:07, 179.99it/s]

The reference caption_3:
'The mountain is consist of the bare land .'
The reference caption_4:
'The mountain is uneven and rough .'
The reference caption_5:
'There are many cracks on the brown mountain .'

The reference caption_1:
'The folded mountain consists of a number of ridges and valleys and is covered with some vegetation .'
The reference caption_2:
'The mountains are covered with yellow dirt .'
The reference caption_3:
'The mountain is full of green plants with lines of bare land .'
The reference caption_4:
'There is not the even mountain .'
The reference caption_5:
'There are some ridges on the green and brown mountain .'

The reference caption_1:
'The folded mountain consists of some ridges and valleys and is covered with dense vegetation .'
The reference caption_2:
'Thirty percent of the area on this mountain is covered by vegetation .'
The reference caption_3:
'The mountain is full of green plants .'
The reference caption_4:
'There are many trees on the mountain .'
The refe

Collecting reference captions:  59%|█████▉    | 1857/3150 [00:09<00:07, 184.49it/s]

The reference caption_1:
'The intricated overpasses is over the roads on the green space and some buildings are next to the overpass .'
The reference caption_2:
'The two overpasses in the picture form a heart shape .'
The reference caption_3:
'The overpass is on the grass next to some houses and trees .'
The reference caption_4:
'There are many overpasses of different lengths .'
The reference caption_5:
'There are green meadows around the overpass .'

The reference caption_1:
'The intricated overpasses is over the roads and lots of houses are next to the overpass .'
The reference caption_2:
'The perimeter of the overpasses is completely covered by vegetation .'
The reference caption_3:
'The overpass is next to some houses .'
The reference caption_4:
'Many buildings of different shapes and sizes are beside the overpass .'
The reference caption_5:
'There are many buildings and some green lawns beside the overpass .'

The reference caption_1:
'The cross-line overpass is over the roads on 

Collecting reference captions:  60%|██████    | 1897/3150 [00:10<00:06, 188.27it/s]

The reference caption_3:
'The overpass is next to the bare land with some cars .'
The reference caption_4:
'There is a car on the overpass .'
The reference caption_5:
'There are roads beneath the overpass .'

The reference caption_1:
'The overpass is over the roads and some cars are driving on the roads .'
The reference caption_2:
'The perimeter of the overpasses is completely covered by vegetation .'
The reference caption_3:
'The overpass is next to the grass with some cars .'
The reference caption_4:
'Several cars are on the road under the overpass .'
The reference caption_5:
'There are green meadows beside the overpass .'

The reference caption_1:
'The cross-line overpass is over the road and some cars are driving on the overpass .'
The reference caption_2:
'An overpass on top of two roads .'
The reference caption_3:
'The overpass is next to the grass and some trees .'
The reference caption_4:
'There are some cars on the overpass .'
The reference caption_5:
'There are some brown tre

Collecting reference captions:  61%|██████▏   | 1936/3150 [00:10<00:06, 188.85it/s]

The reference caption_4:
'Some trees are beside the palace .'
The reference caption_5:
'There are some green lawns and trees beside the palace .'

The reference caption_1:
'The palace has a courtyard and some cars parked in the courtyard .'
The reference caption_2:
'The central part of the palace is a circular building .'
The reference caption_3:
'The palace is on the grass next to some trees .'
The reference caption_4:
'The grass is beside the palace .'
The reference caption_5:
'There are some green lawns and trees beside the palace .'

The reference caption_1:
'The green palace with some palaces is surrounded by a grove of trees .'
The reference caption_2:
'There are many round lawns near the palace .'
The reference caption_3:
'The palace is surrounded by trees .'
The reference caption_4:
'There are some trees near the palace .'
The reference caption_5:
'There are some green trees beside the cyan palace .'

The reference caption_1:
'The palace has some palaces of different sizes colo

Collecting reference captions:  63%|██████▎   | 1975/3150 [00:10<00:06, 189.06it/s]


The reference caption_1:
'The green square palace with a courtyard is surrounded by roads .'
The reference caption_2:
'There is a wide road in front of the palace .'
The reference caption_3:
'The palace is on the open place next to the grass .'
The reference caption_4:
'There are many cars on the road beside the palace .'
The reference caption_5:
'There are roads around the green palace .'

The reference caption_1:
'The cross-shaped palace has a tower .'
The reference caption_2:
'The roofs of palaces are all red .'
The reference caption_3:
'The palace is on the open place next to some buildings .'
The reference caption_4:
'Several buildings of different shapes are beside the palace .'
The reference caption_5:
'A black palace with a dome beside a brown building .'

The reference caption_1:
'The palace has some palaces of different colors, A square and a park .'
The reference caption_2:
'The roofs of palaces are all black, And the whole looks like a rectangle .'
The reference caption_3:

Collecting reference captions:  64%|██████▍   | 2013/3150 [00:10<00:06, 185.64it/s]

The reference caption_5:
'There are some parking lots with many cars .'

The reference caption_1:
'The parking lot has some cars arranged neatly in lines .'
The reference caption_2:
'Six rows of cars were parked in the parking lot .'
The reference caption_3:
'The parking lot is full of cars .'
The reference caption_4:
'Many cars of different styles are on the parking lot .'
The reference caption_5:
'There are many cars on the parking lot .'

The reference caption_1:
'The parking lot has lots of neatly arranged cars of different colors but some parking spaces are still empty .'
The reference caption_2:
'There are several trees beside the parking lot .'
The reference caption_3:
'The parking lot is full of cars .'
The reference caption_4:
'Many cars are in the parking lot .'
The reference caption_5:
'There are many cars and some empty space on the parking lot .'

The reference caption_1:
'The parking lot with some cars arranged neatly in lines is between a green area and a building .'
The

Collecting reference captions:  65%|██████▌   | 2051/3150 [00:10<00:05, 185.26it/s]

The reference caption_5:
'There are some green trees and many cars on the parking lot .'

The reference caption_1:
'The parking lot has lots of cars arranged neatly in lines and some trees are among cars .'
The reference caption_2:
'There is a white building by the parking lot .'
The reference caption_3:
'The parking lot is full of cars .'
The reference caption_4:
'There are many cars in the parking lot .'
The reference caption_5:
'There are some green trees and many cars on the parking lot .'

The reference caption_1:
'The parking lot has lots of cars arranged neatly in lines and a building is next to the parking lot .'
The reference caption_2:
'There are a dozen cars in the parking lot .'
The reference caption_3:
'The parking lot is full of cars between buildings and trees .'
The reference caption_4:
'There are many different colored cars in the parking lot .'
The reference caption_5:
'There are many cars on the parking lot beside a gray building .'

The reference caption_1:
'The par

Collecting reference captions:  66%|██████▋   | 2090/3150 [00:11<00:05, 187.88it/s]


'These railways go through the wasteland and a parking lot is on one side of the railways .'
The reference caption_2:
'One side of the railway, there are lush trees  And the other side, there is a bare ground .'
The reference caption_3:
'The railway is on the bare land next to some trees .'
The reference caption_4:
'There are many trees beside the railway .'
The reference caption_5:
'There are brown bare land and brown trees beside the railway .'

The reference caption_1:
'These staggered railways have some fields on one side of the railways .'
The reference caption_2:
'There are dozens of parallel railway lines here .'
The reference caption_3:
'The railway is on the bare land next to some trees .'
The reference caption_4:
'Many trees are beside the railway .'
The reference caption_5:
'There are some green bushes beside the railway .'

The reference caption_1:
'These railways have some trains of different colors on the tracks .'
The reference caption_2:
'There are several towering tre

Collecting reference captions:  67%|██████▋   | 2110/3150 [00:11<00:05, 188.97it/s]

The reference caption_3:
'The railway is next to some houses .'
The reference caption_4:
'There are several buildings beside the railway .'
The reference caption_5:
'There are some buildings with gray roofs beside the railway .'

The reference caption_1:
'These railways are flanked by some buildings and a lake is next to the railways .'
The reference caption_2:
'There are two parallel railway lines here .'
The reference caption_3:
'The railway is next to some trees and houses .'
The reference caption_4:
'Many buildings of different shapes and sizes are beside the railway .'
The reference caption_5:
'There are some green bushes and buildings beside the railway .'

The reference caption_1:
'These railways have a green area on one side of the railways .'
The reference caption_2:
'There are two trains passing by in the picture .'
The reference caption_3:
'The railway is next to some trees and grass .'
The reference caption_4:
'A lot of trees are beside the railway .'
The reference caption_

Collecting reference captions:  68%|██████▊   | 2149/3150 [00:11<00:05, 187.00it/s]

The reference caption_1:
'The railway station has lots of railways and some buildings and some other buildings are next to the railway station .'
The reference caption_2:
'There is a fully equipped railway station in the city .'
The reference caption_3:
'The railway station is next to some buildings and trees .'
The reference caption_4:
'Many buildings of different shapes and sizes are beside the railway station .'
The reference caption_5:
'There are some buildings near the railway station .'

The reference caption_1:
'The railway station with lots of railways is surrounded by a residential area and some trains are on the railways .'
The reference caption_2:
'A well-equipped railway station in a crowded city .'
The reference caption_3:
'The railway station is next to some buildings and trees .'
The reference caption_4:
'There are many buildings beside the railway station .'
The reference caption_5:
'There are many brown buildings around the railway station .'

The reference caption_1:


Collecting reference captions:  69%|██████▉   | 2188/3150 [00:11<00:05, 189.43it/s]


'There are some buildings near the white railway station .'

The reference caption_1:
'The railway station has lots of railways and some white striped buildings and the railway station is surrounded by lots of other buildings .'
The reference caption_2:
'A well-equipped railway station in a crowded city .'
The reference caption_3:
'The railway station is next to some buildings .'
The reference caption_4:
'Many buildings are beside the railway station .'
The reference caption_5:
'There are many buildings around the railway station .'

The reference caption_1:
'The railway station has lots of railways and an orange building and some trains are on the railways .'
The reference caption_2:
'There is a large railway station under construction in the city .'
The reference caption_3:
'The railway station is next to some buildings .'
The reference caption_4:
'Many red buildings are beside the railway station .'
The reference caption_5:
'There are some red buildings near the red railway station

Collecting reference captions:  71%|███████   | 2228/3150 [00:11<00:04, 190.03it/s]

The reference caption_3:
'There are some light green and bare rectangular farmland .'
The reference caption_4:
'There are many rectangular farmlands of different sizes .'
The reference caption_5:
'There are many brown rectangular farmlands .'

The reference caption_1:
'Some green rectangular farmlands are on the ground and a shed is next to the farmlands .'
The reference caption_2:
'Many neatly arranged green, light blue and tan mixed rectangular farmlands of different sizes .'
The reference caption_3:
'There are some green rectangular farmland .'
The reference caption_4:
'There is a buildings beside the rectangular farmlands .'
The reference caption_5:
'There are some green rectangular farmlands .'

The reference caption_1:
'Some green rectangular farmlands of different sizes are arranged in neat rows and a channel goes through the farmlands .'
The reference caption_2:
'Many green and tan mixed rectangular farm fields of different sizes .'
The reference caption_3:
'There are some ligh

Collecting reference captions:  72%|███████▏  | 2269/3150 [00:12<00:04, 194.00it/s]

The reference caption_2:
'A curvy turquoise river with tributaries, Green and tan mixed land and bustling cities on both sides of the bank .'
The reference caption_3:
'The river goes through farmland and houses .'
The reference caption_4:
'Many trees are around the river .'
The reference caption_5:
'There are many buildings and farmlands near the river .'

The reference caption_1:
'The dark green river with a channel bar goes through the residential area and the farmland .'
The reference caption_2:
'A curvy green river with small islands, Green and tan mixed fields on both sides of the bank, And a bustling city .'
The reference caption_3:
'The river goes through houses and trees .'
The reference caption_4:
'There are many trees around the river .'
The reference caption_5:
'There are many buildings and trees near the river with a green island .'

The reference caption_1:
'The blue river with a wide riverbed has some buildings on one side .'
The reference caption_2:
'A curved light green

Collecting reference captions:  73%|███████▎  | 2310/3150 [00:12<00:04, 197.43it/s]

The reference caption_2:
'A curvy turquoise river with small islands, Green and tan mixed land on both sides, And buildings .'
The reference caption_3:
'The river goes through bare and green farmland .'
The reference caption_4:
'Many neat places are near the river .'
The reference caption_5:
'There are many farmlands and buildings near the river .'

The reference caption_1:
'The dark green river with a big channel bar is flanked by some fields and residential areas .'
The reference caption_2:
'A curvy green river with small islands, Green and tan mixed land on both sides, And buildings .'
The reference caption_3:
'The river goes through farmland and houses .'
The reference caption_4:
'There are many trees around the river .'
The reference caption_5:
'There are many farmlands and buildings near the river .'

The reference caption_1:
'The dark green river with a winding bank goes through the farmland in the mountainous area .'
The reference caption_2:
'A curved green river with approxima

Collecting reference captions:  75%|███████▍  | 2350/3150 [00:12<00:04, 189.03it/s]

The reference caption_4:
'Many cars are beside the roundabout .'
The reference caption_5:
'There are many cars on the parking lot beside the roundabout .'

The reference caption_1:
'The roundabout with five exits and entrances is surrounded by some buildings .'
The reference caption_2:
'There are several moving vehicles on the road at the roundabout, Many buildings and some trees and a few lawns around the roundabout .'
The reference caption_3:
'The roundabout is next to buildings and trees .'
The reference caption_4:
'Several buildings of different shapes are beside the roundabout .'
The reference caption_5:
'There are some brown buildings beside the roundabout .'

The reference caption_1:
'The roundabout with three exits and entrances is between the residential area and the lawn .'
The reference caption_2:
'There are several moving vehicles on the road at the roundabout, And there are some buildings and a large area of \u200b\u200bfarmland around the roundabout .'
The reference capti

Collecting reference captions:  76%|███████▌  | 2388/3150 [00:12<00:04, 183.16it/s]


'Several buildings of different shapes are beside the roundabout .'
The reference caption_5:
'There are some buildings and green trees beside the roundabout .'

The reference caption_1:
'The roundabout with five exits and entrances is between some buildings and a clearing .'
The reference caption_2:
'There are many moving and waiting vehicles at the roundabout, And there are some buildings and some trees and a few lawns around the roundabout .'
The reference caption_3:
'The roundabout is on the bare land next to buildings .'
The reference caption_4:
'Many buildings of different shapes are around the roundabout .'
The reference caption_5:
'There are many brown buildings beside the roundabout .'

The reference caption_1:
'The roundabout connects three roads and some buildings are next to the roundabout .'
The reference caption_2:
'There are several moving vehicles on the road at the roundabout, Many buildings and some trees and a few lawns around the roundabout .'
The reference caption_

Collecting reference captions:  77%|███████▋  | 2427/3150 [00:12<00:03, 187.95it/s]

The reference caption_5:
'The runway with white marks is beside a green lawn .'

The reference caption_1:
'The runway has a threshold marking and the designation marking is 21 .'
The reference caption_2:
"Next to the runway is a green lawn with white '12' and dash and arrow signs ."
The reference caption_3:
'The runway is on the grass next to the trees .'
The reference caption_4:
'The grass is beside the runway .'
The reference caption_5:
'There are green lawns beside the runway with white marks .'

The reference caption_1:
'The runway goes through the lawn and several touchdown zone markings and aiming point markings are on the runway .'
The reference caption_2:
'There is a green lawn next to the runway, With white dashes of varying thickness on the runway .'
The reference caption_3:
'The runway is on the grass .'
The reference caption_4:
'The runway is surrounded by grass .'
The reference caption_5:
'There are green lawns beside the runway .'

The reference caption_1:
'The runway wit

Collecting reference captions:  78%|███████▊  | 2469/3150 [00:13<00:03, 197.53it/s]

The reference caption_1:
'The two runways are next to each other and the designation marking is 28L and 28R .'
The reference caption_2:
'Next to the two parallel runways is brown bare ground  There is an airplane on the runway, And there are white short and thick line signs on the runway .'
The reference caption_3:
'The runway is on the bare land with an airplane .'
The reference caption_4:
'The runway is surrounded by grass .'
The reference caption_5:
'There are brown bare land near the runway with white marks .'

The reference caption_1:
'The runway with some markings goes through the lawn .'
The reference caption_2:
'There is a green lawn next to the runway, With white dashes of varying thickness on the runway .'
The reference caption_3:
'The runway is on the grass next to the bare land .'
The reference caption_4:
'The grass is around the runway .'
The reference caption_5:
'There are green lawns beside the runway .'

The reference caption_1:
'The runway connects some other roads and

Collecting reference captions:  80%|███████▉  | 2510/3150 [00:13<00:03, 199.13it/s]

The reference caption_1:
'A lot of sea ice of different sizes and thickness floats in the deep blue sea .'
The reference caption_2:
'Countless ice cubes of various sizes are connected into a loose, Large piece floating on the blue sea .'
The reference caption_3:
'There are many ice on the black sea .'
The reference caption_4:
'The sea is surrounded by sea ice .'
The reference caption_5:
'There are many pieces of sea ice .'

The reference caption_1:
'A large piece of sea ice and several small pieces of sea ice float in the deep blue sea .'
The reference caption_2:
'A large block of ice and the sparse small blocks around it float on the deep blue sea .'
The reference caption_3:
'There is a big ice and many small ice on the dark blue sea .'
The reference caption_4:
'The sea ice in the middle is larger .'
The reference caption_5:
'There are some small pieces of sea ice and a larger one .'

The reference caption_1:
'A lot of sea ice of different sizes and thickness floats in the deep blue s

Collecting reference captions:  81%|████████  | 2550/3150 [00:13<00:03, 192.93it/s]

The reference caption_3:
'There are many ice on the blue sea next to the land .'
The reference caption_4:
'There are different shapes of sea ice .'
The reference caption_5:
'There are many pieces of sea ice .'

The reference caption_1:
'Several pieces of sea ice of different sizes float in the deep blue sea .'
The reference caption_2:
'Many ice cubes of various sizes are scattered floating on the blue sea .'
The reference caption_3:
'There are many ice on the black sea .'
The reference caption_4:
'There are sea ice of different sizes .'
The reference caption_5:
'There are some pieces of sea ice .'

The reference caption_1:
'A lot of small pieces of sea ice float in the deep blue sea .'
The reference caption_2:
'Countless ice cubes of various sizes are connected into a loose, Large piece floating on the blue sea .'
The reference caption_3:
'There are many ice on the black sea .'
The reference caption_4:
'There are sea ice of different shapes and sizes .'
The reference caption_5:
'There 

Collecting reference captions:  82%|████████▏ | 2590/3150 [00:13<00:02, 192.03it/s]

The reference caption_5:
'There are several ships at the harbor .'

The reference caption_1:
'The cargo ship is docked at the dock and the water is deep blue .'
The reference caption_2:
'An orange-red bulk carrier docked by the dock with powdery materials on the dock .'
The reference caption_3:
'There is a ship in the blue water on the shore .'
The reference caption_4:
'There are many cars beside the ship on the dock .'
The reference caption_5:
'The ship is at the harbor .'

The reference caption_1:
'Several ships of different sizes are docked at the dock and some buildings are on the bank .'
The reference caption_2:
'Several off-white bulk carriers docked at the dock with powdery materials on the dock .'
The reference caption_3:
'There are two ships in the black water on the shore .'
The reference caption_4:
'Many buildings of different shapes are beside the ship .'
The reference caption_5:
'The ship is at the harbor with some buildings .'

The reference caption_1:
'The orange cargo s

Collecting reference captions:  83%|████████▎ | 2610/3150 [00:13<00:02, 185.55it/s]

The reference caption_4:
'There are many plants on the snowberg .'
The reference caption_5:
'There are some snow and green land on the snowberg .'

The reference caption_1:
'The snowberg is partly covered with snow and ice and the color of the mountain is brown .'
The reference caption_2:
'A snow-covered mountain covered in large areas .'
The reference caption_3:
'The snow berg is consist of bare land and white snow .'
The reference caption_4:
'There is snow on the part of the snowberg .'
The reference caption_5:
'There are much snow on the snowberg .'

The reference caption_1:
'The snowberg is covered with glaciers and vegetation and the snowberg also is partly exposed .'
The reference caption_2:
'The snow on the snowy mountains is distributed at the junction of brown and green mountains .'
The reference caption_3:
'The snow berg is consist of green plants,Bare land and white snow .'
The reference caption_4:
'The snowberg is covered with snow .'
The reference caption_5:
'There are muc

Collecting reference captions:  84%|████████▍ | 2648/3150 [00:14<00:02, 180.96it/s]

The reference caption_4:
'The ground on the snowberg is uneven .'
The reference caption_5:
'There are some snow on the snowberg .'

The reference caption_1:
'The snowberg is partly covered with snow and ice and partly covered with vegetation .'
The reference caption_2:
'The snow on the snowy mountains is distributed at the junction of brown and green mountains .'
The reference caption_3:
'The snow berg is consist of green plants and white snow .'
The reference caption_4:
'There are many plants on the snowberg .'
The reference caption_5:
'There are some snow and green land on the snowberg .'

The reference caption_1:
'The snowberg is covered with snow and ice .'
The reference caption_2:
'A snow-covered mountain covered in large areas .'
The reference caption_3:
'The snow berg is consist of bare land and white snow .'
The reference caption_4:
'The snowberg is covered with snow .'
The reference caption_5:
'There are full of snow on the snowberg .'

The reference caption_1:
'The snowberg i

Collecting reference captions:  85%|████████▌ | 2686/3150 [00:14<00:02, 180.25it/s]

The reference caption_4:
'A unique  building in a sparse residential .'
The reference caption_5:
'A sparse residential area with a gray building on the green meadow .'

The reference caption_1:
'A path leads to this house with black roof .'
The reference caption_2:
'In a sparse residential area there is a house and many withered trees, Surrounded by meadows and a road .'
The reference caption_3:
'The sparse residential is on the grass next to some trees and bare land .'
The reference caption_4:
'One road is next to the sparse residential .'
The reference caption_5:
'A sparse residential area with many brown trees and a brown building .'

The reference caption_1:
'There are circular swimming pools beside this house .'
The reference caption_2:
'In a sparse residential area there is a house and several withered trees surrounded by grass .'
The reference caption_3:
'The sparse residential is on the grass next to some trees and bare land .'
The reference caption_4:
'Many trees are beside th

Collecting reference captions:  86%|████████▋ | 2724/3150 [00:14<00:02, 180.08it/s]


'In a sparse residential area there is a gray house and several green trees, Surrounded by farmland, And a road .'
The reference caption_3:
'The sparse residential is on the grass next to some trees and a lake .'
The reference caption_4:
'Many trees are beside the sparse residential .'
The reference caption_5:
'A sparse residential area with a black building on the green meadow .'

The reference caption_1:
'There are several houses among the lush woods .'
The reference caption_2:
'In a sparse residential area there is a gray house and many green trees, And a road .'
The reference caption_3:
'The sparse residential is on the grass next to some trees and a road .'
The reference caption_4:
'There are many trees around the sparse residential .'
The reference caption_5:
'A sparse residential area with some green trees and a black building .'

The reference caption_1:
'There is a house by the road, With a few trees behind it .'
The reference caption_2:
'A grey house in a sparse residential 

Collecting reference captions:  88%|████████▊ | 2762/3150 [00:14<00:02, 181.34it/s]

The reference caption_3:
'The stadium is on the grass next to some buildings and trees .'
The reference caption_4:
'There are many trees around the stadium .'
The reference caption_5:
'There are many green trees near the stadium with a white roof .'

The reference caption_1:
'This stadium has a white podium and a blue runway .'
The reference caption_2:
'Oval stadium surrounded by road .'
The reference caption_3:
'The stadium is next to some buildings and roads .'
The reference caption_4:
'There are numerous buildings beside the stadium .'
The reference caption_5:
'There are some buildings around the stadium with a white roof .'

The reference caption_1:
'This stadium has a row of white signs on each side .'
The reference caption_2:
'The circular stadium is surrounded by lawns, Next to the parking lot .'
The reference caption_3:
'The stadium is next to the grass and parking lot .'
The reference caption_4:
'There are many cars in the parking lot next to the stadium .'
The reference capti

Collecting reference captions:  89%|████████▉ | 2799/3150 [00:14<00:01, 179.04it/s]

The reference caption_4:
'There are many buildings beside the stadium .'
The reference caption_5:
'There are many green trees around the stadium with a white roof .'

The reference caption_1:
'The stadium has a blue roof on three sides and a red roof on the other side .'
The reference caption_2:
'Oval gymnasium is surrounded by residential areas, Surrounded by vegetation .'
The reference caption_3:
'The stadium is next to some buildings and trees .'
The reference caption_4:
'Many buildings of different shapes are beside the stadium .'
The reference caption_5:
'There are many green trees and buildings around the stadium .'

The reference caption_1:
'This stadium has seats on two sides only .'
The reference caption_2:
'The square stadium is built on the lawn, Next to the residential area and the woods .'
The reference caption_3:
'The stadium is on the bare land next to some buildings and trees .'
The reference caption_4:
'There are many buildings of different shapes and sizes next to the

Collecting reference captions:  90%|█████████ | 2836/3150 [00:15<00:01, 179.77it/s]

The reference caption_5:
'There are some storage tanks of different sizes .'

The reference caption_1:
'There are several rows of buildings next to these storage tanks .'
The reference caption_2:
'There are many neatly arranged storage tanks and many cars in the factory .'
The reference caption_3:
'There are some storage tanks on the bare land .'
The reference caption_4:
'There are several roads of different lengths beside the storage tanks .'
The reference caption_5:
'There are some storage tanks of the same size .'

The reference caption_1:
'There are five storage tanks on the side of the white house .'
The reference caption_2:
'There are many neatly arranged storage tanks and many cars in the factory, And there are large lawns in the factory .'
The reference caption_3:
'There are some storage tanks on the grass .'
The reference caption_4:
'There are many trees beside the storage tanks .'
The reference caption_5:
'There are several storage tanks beside many green trees .'

The refere

Collecting reference captions:  91%|█████████ | 2874/3150 [00:15<00:01, 182.06it/s]

The reference caption_2:
'There are many different sized storage tanks and many buildings in the factory .'
The reference caption_3:
'There are some storage tanks on the bare land .'
The reference caption_4:
'The storage tanks are surrounded by roads .'
The reference caption_5:
'There are some storage tanks of the same size .'

The reference caption_1:
'There is a black house beside the storage tanks .'
The reference caption_2:
'There are many neatly arranged storage tanks and many cars in the factory .'
The reference caption_3:
'There are some white and silver storage tanks on the bare land .'
The reference caption_4:
'There are several buildings of different sizes near the storage tanks .'
The reference caption_5:
'There are some storage tanks of different sizes .'

The reference caption_1:
'There are two identical storage tanks beside a white house .'
The reference caption_2:
'There are many neatly arranged storage tanks and many cars in the factory .'
The reference caption_3:
'Ther

Collecting reference captions:  92%|█████████▏| 2912/3150 [00:15<00:01, 183.01it/s]


The reference caption_5:
'The tennis courts are on a green meadow beside some withered trees .'

The reference caption_1:
'There are two tennis courts of the same size in red and green .'
The reference caption_2:
'The tennis court on the lawn is a road away from the residential area .'
The reference caption_3:
'The tennis court is on the bare land next to some buildings and roads .'
The reference caption_4:
'There are many buildings of different shapes beside the tennis courts .'
The reference caption_5:
'There are some buildings near the tennis courts .'

The reference caption_1:
'There is a small tennis court in the upper left corner .'
The reference caption_2:
'Tennis court and swimming pool in the middle of a residential area surrounded by trees .'
The reference caption_3:
'The tennis court is on the grass next to some buildings and trees .'
The reference caption_4:
'Many buildings are beside the tennis courts .'
The reference caption_5:
'There are some buildings and green trees b

Collecting reference captions:  94%|█████████▎| 2950/3150 [00:15<00:01, 180.36it/s]

The reference caption_2:
'Tennis court on lawn next to residential area .'
The reference caption_3:
'The tennis court is on the grass next to some buildings and trees .'
The reference caption_4:
'There are many different shapes and sizes of buildings next to the tennis courts .'
The reference caption_5:
'There are some buildings beside the tennis court .'

The reference caption_1:
'There are two tennis courts of the same size in red and green .'
The reference caption_2:
'Tennis court on bare ground next to the building .'
The reference caption_3:
'The tennis court is on the bare land next to some buildings .'
The reference caption_4:
'Many buildings are beside the tennis courts .'
The reference caption_5:
'There are some white buildings near the tennis courts .'

The reference caption_1:
'There is a small tennis court in the upper left corner .'
The reference caption_2:
'Many trees and lawns near the tennis court in the middle of the residential area .'
The reference caption_3:
'The te

Collecting reference captions:  95%|█████████▍| 2988/3150 [00:15<00:00, 180.18it/s]

The reference caption_2:
'A large layer of brown and green mixed terraces .'
The reference caption_3:
'The bare and green terrace is next to some trees .'
The reference caption_4:
'There are many plants on the terrace .'
The reference caption_5:
'There are many yellow terraces and green ones .'

The reference caption_1:
'There is a winding path between the terraces .'
The reference caption_2:
'There are several houses in a large layer of brown and green mixed terraces .'
The reference caption_3:
'The bare and green terrace is next to some trees .'
The reference caption_4:
'The green plants are on the terrace .'
The reference caption_5:
'There are some green trees beside the terraces .'

The reference caption_1:
'There is a path between the terraces leading to a building .'
The reference caption_2:
'Many irregularly shaped brown and green mixed farm fields .'
The reference caption_3:
'There are bare and green terrace .'
The reference caption_4:
'There are many plants on the terrace .'
T

Collecting reference captions:  96%|█████████▌| 3028/3150 [00:16<00:00, 188.39it/s]



The reference caption_1:
'Many trees are planted in the terraces which are in the middle of the image .'
The reference caption_2:
'A large layer of brown and green mixed terraces .'
The reference caption_3:
'The bare terrace is next to some trees .'
The reference caption_4:
'The green plants are on the terrace .'
The reference caption_5:
'There are many yellow terraces of different sizes .'

The reference caption_1:
'Half of the terraces are full of crops, and the other half are empty .'
The reference caption_2:
'A large layer of brown and green mixed terraces with several winding narrow paths .'
The reference caption_3:
'The bare and green terrace is next to some trees .'
The reference caption_4:
'Many green plants are on the terrace .'
The reference caption_5:
'There are green meadows beside the terraces .'

The reference caption_1:
'There is a path between the terraces leading to a building .'
The reference caption_2:
'There are paths and houses in a patch of brown and green terra

Collecting reference captions:  97%|█████████▋| 3066/3150 [00:16<00:00, 185.33it/s]

The reference caption_1:
'There are two thermal power stations at the top right of the picture, and the left one is emitting smoke .'
The reference caption_2:
'In the thermal power plant there are plant buildings and machinery, two chimneys are emitting smoke .'
The reference caption_3:
'The thermal power station is next to some buildings with white mist above .'
The reference caption_4:
'There are many buildings of different shapes next to the thermal power station .'
The reference caption_5:
'There are some buildings and three chimneys on the thermal power station .'

The reference caption_1:
'Here is a smoking thermal power station with a row of red houses next to it .'
The reference caption_2:
'There are plant and machinery in the thermal power station, and two cooling towers emit steam .'
The reference caption_3:
'The thermal power station is next to some buildings and trees with white mist above .'
The reference caption_4:
'Many buildings of different shapes are next to the therm

Collecting reference captions:  99%|█████████▊| 3104/3150 [00:16<00:00, 182.96it/s]

The reference caption_1:
'There are seven thermal power stations here, two of which are thinner and taller than others .'
The reference caption_2:
'In the thermal power station there are neatly planned plants and equipment, there is a cooling tower with steam, and a chimney .'
The reference caption_3:
'The thermal power station is on the grass next to some buildings with white mist above .'
The reference caption_4:
'There are many buildings next to the thermal power station .'
The reference caption_5:
'There are many buildings and a chimney on the thermal power station beside a green meadow .'

The reference caption_1:
'Here are three thermal power stations, two of which are emitting smoke .'
The reference caption_2:
'There is a cooling tower in a thermal power station .'
The reference caption_3:
'The thermal power station is on the grass next to some trees .'
The reference caption_4:
'Numerous buildings of different shapes are next to the thermal power station .'
The reference caption

Collecting reference captions:  99%|█████████▉| 3124/3150 [00:16<00:00, 184.78it/s]

The reference caption_3:
'The wetland consists of dry and green plants and water .'
The reference caption_4:
'There are many green plants in the wetland .'
The reference caption_5:
'There are many buildings near the wetland .'

The reference caption_1:
'Except for the waters, this wetland is covered by vegetation .'
The reference caption_2:
'There is a large land mixed with green and brown in the river in a wetland .'
The reference caption_3:
'The wetland consists of bare land, Green plants and water .'
The reference caption_4:
'There are many different shapes and sizes of water around the wetlands .'
The reference caption_5:
'There are much bare land and many water areas on the wetland .'

The reference caption_1:
'There are several scattered trees in the middle of this wetland .'
The reference caption_2:
'There is a large green area in a lake in a wetland with lush green plants .'
The reference caption_3:
'The wetland consists of green plants and water .'
The reference caption_4:
'Th

Collecting reference captions: 100%|██████████| 3150/3150 [00:16<00:00, 187.66it/s]

The reference caption_4:
'Many different shapes of wetlands are surrounded by water .'
The reference caption_5:
'There are many green islands on the wetland .'

The reference caption_1:
'Except for the waters, this wetland is covered by vegetation .'
The reference caption_2:
'There is a large area of \u200b\u200bdense brown bare land in a lake in a wetland .'
The reference caption_3:
'The wetland consists of dry and green plants and water .'
The reference caption_4:
'Many waters of different sizes are surrounded by wetlands .'
The reference caption_5:
'There are much bare land and many water areas on the wetland .'

The reference caption_1:
'There are several small islands in the waters of this wetland .'
The reference caption_2:
'There are a few brown bare lands in the river in a wetland with sparse green plants on the bare lands .'
The reference caption_3:
'The wetland consists of dry and green plants and water .'
The reference caption_4:
'The green plants on the left of the wetland 




In [None]:
# Check the format of the reference captions
print(all_references[:5])

[['A gray plane on the runway and the lawn beside .', 'A grey plane is on the runway by the lawn .', 'There is an airplane on the runway with a large lawn by the runway .', 'A plane is parked on the runway next to the grass .', 'There is a plane on the runway beside the grass .'], ['Three small planes parked in a line on the airport and a big plane behind them .', 'There are four aircraft on the open ground, The largest of which is three times as large as the smallest one .', 'There are many planes of different sizes in a clearing .', 'Four planes are parked on the runway .', 'Four planes of different sizes were on the marked ground .'], ['A plane parked in a line on the airport with some marks .', 'A white plane was parked on the instruction line .', 'An airplane parked in an open area with many containers next to it .', 'A plane is parked on the open space .', 'There is 1 plane on the ground marked .'], ['A small plane and a big plane parked next to boarding bridges .', 'A white plan

In [None]:
# Check the format of the predicted captions. Each sample starts with a new line
print(predictions[:5])

['is on the grass next to some trees .', 'is on the grass next to some buildings .', 'is on the grass next to some buildings .', 'many planes on the runway at the airport, There are many cars on the runway, And there are many buildings on the runway .', 'is on the grass next to some trees .']


In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction

nltk.download('punkt_tab')
nltk.download('punkt')

# Tokenize references and predictions:
tokenized_refs = [
    [nltk.word_tokenize(ref.lower()) for ref in refs]
    for refs in all_references
]

tokenized_hyps = [nltk.word_tokenize(pred.lower()) for pred in predictions]

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
tokenized_refs[0]

[['a',
  'gray',
  'plane',
  'on',
  'the',
  'runway',
  'and',
  'the',
  'lawn',
  'beside',
  '.'],
 ['a', 'grey', 'plane', 'is', 'on', 'the', 'runway', 'by', 'the', 'lawn', '.'],
 ['there',
  'is',
  'an',
  'airplane',
  'on',
  'the',
  'runway',
  'with',
  'a',
  'large',
  'lawn',
  'by',
  'the',
  'runway',
  '.'],
 ['a',
  'plane',
  'is',
  'parked',
  'on',
  'the',
  'runway',
  'next',
  'to',
  'the',
  'grass',
  '.'],
 ['there',
  'is',
  'a',
  'plane',
  'on',
  'the',
  'runway',
  'beside',
  'the',
  'grass',
  '.']]

In [None]:
# Sentence-level BLEU-2
smooth = SmoothingFunction().method1
for i, (refs_per_sample, hyp_tok) in enumerate(zip(tokenized_refs, tokenized_hyps)):
    scores = []
    for refs_tok in refs_per_sample:
        score = sentence_bleu(
            [refs_tok],
            hyp_tok,
            weights=(1/2, 1/2),
            smoothing_function=smooth
        )
        scores.append(score)
    max_score = max(scores)
    print(f"Example {i+1:2d} BLEU-2: {max_score*100:.2f}")

Example  1 BLEU-2: 38.70
Example  2 BLEU-2: 20.41
Example  3 BLEU-2: 23.57
Example  4 BLEU-2: 10.38
Example  5 BLEU-2: 23.57
Example  6 BLEU-2: 16.89
Example  7 BLEU-2: 9.69
Example  8 BLEU-2: 20.41
Example  9 BLEU-2: 25.68
Example 10 BLEU-2: 14.63
Example 11 BLEU-2: 21.38
Example 12 BLEU-2: 20.41
Example 13 BLEU-2: 20.41
Example 14 BLEU-2: 21.08
Example 15 BLEU-2: 36.55
Example 16 BLEU-2: 20.41
Example 17 BLEU-2: 20.41
Example 18 BLEU-2: 29.84
Example 19 BLEU-2: 35.83
Example 20 BLEU-2: 35.83
Example 21 BLEU-2: 24.76
Example 22 BLEU-2: 30.61
Example 23 BLEU-2: 33.83
Example 24 BLEU-2: 20.41
Example 25 BLEU-2: 24.69
Example 26 BLEU-2: 33.83
Example 27 BLEU-2: 15.32
Example 28 BLEU-2: 18.26
Example 29 BLEU-2: 49.93
Example 30 BLEU-2: 18.26
Example 31 BLEU-2: 21.08
Example 32 BLEU-2: 21.10
Example 33 BLEU-2: 13.09
Example 34 BLEU-2: 18.87
Example 35 BLEU-2: 3.38
Example 36 BLEU-2: 40.84
Example 37 BLEU-2: 25.14
Example 38 BLEU-2: 15.19
Example 39 BLEU-2: 20.41
Example 40 BLEU-2: 21.08
Ex

In [None]:
# Corpus-level BLEU-2
# corpus_bleu expects list-of-list-of-tokens refs, and list-of-tokens hyps
corpus_score = corpus_bleu(
    tokenized_refs,
    tokenized_hyps,
    weights=(1/2, 1/2),
    smoothing_function=smooth
)
print(f"\nCorpus BLEU-2: {corpus_score*100:.2f}")


Corpus BLEU-2: 46.94


In [None]:
# Sentence-level BLEU-3
smooth = SmoothingFunction().method1
for i, (refs_per_sample, hyp_tok) in enumerate(zip(tokenized_refs, tokenized_hyps)):
    scores = []
    for refs_tok in refs_per_sample:
        score = sentence_bleu(
            [refs_tok],
            hyp_tok,
            weights=(1/3, 1/3, 1/3),
            smoothing_function=smooth
        )
        scores.append(score)
    max_score = max(scores)
    print(f"Example {i+1:2d} BLEU-3: {max_score*100:.2f}")

Example  1 BLEU-3: 20.12
Example  2 BLEU-3: 8.41
Example  3 BLEU-3: 9.26
Example  4 BLEU-3: 3.55
Example  5 BLEU-3: 9.26
Example  6 BLEU-3: 6.63
Example  7 BLEU-3: 3.81
Example  8 BLEU-3: 8.41
Example  9 BLEU-3: 16.69
Example 10 BLEU-3: 6.03
Example 11 BLEU-3: 8.41
Example 12 BLEU-3: 8.41
Example 13 BLEU-3: 8.41
Example 14 BLEU-3: 8.22
Example 15 BLEU-3: 24.81
Example 16 BLEU-3: 8.41
Example 17 BLEU-3: 8.41
Example 18 BLEU-3: 10.06
Example 19 BLEU-3: 10.95
Example 20 BLEU-3: 10.95
Example 21 BLEU-3: 16.69
Example 22 BLEU-3: 18.59
Example 23 BLEU-3: 25.89
Example 24 BLEU-3: 8.41
Example 25 BLEU-3: 8.27
Example 26 BLEU-3: 25.89
Example 27 BLEU-3: 6.03
Example 28 BLEU-3: 7.47
Example 29 BLEU-3: 30.55
Example 30 BLEU-3: 7.47
Example 31 BLEU-3: 8.22
Example 32 BLEU-3: 7.99
Example 33 BLEU-3: 5.39
Example 34 BLEU-3: 7.41
Example 35 BLEU-3: 2.42
Example 36 BLEU-3: 27.73
Example 37 BLEU-3: 8.37
Example 38 BLEU-3: 9.87
Example 39 BLEU-3: 8.41
Example 40 BLEU-3: 8.22
Example 41 BLEU-3: 10.95
Exa

In [None]:
# Corpus-level BLEU-3
# corpus_bleu expects list-of-list-of-tokens refs, and list-of-tokens hyps
corpus_score = corpus_bleu(
    tokenized_refs,
    tokenized_hyps,
    weights=(1/3, 1/3, 1/3),
    smoothing_function=smooth
)
print(f"\nCorpus BLEU-3: {corpus_score*100:.2f}")


Corpus BLEU-3: 35.25


In [None]:
# Sentence-level BLEU-4
smooth = SmoothingFunction().method1
for i, (refs_per_sample, hyp_tok) in enumerate(zip(tokenized_refs, tokenized_hyps)):
    scores = []
    for refs_tok in refs_per_sample:
        score = sentence_bleu(
            [refs_tok],
            hyp_tok,
            weights=(1/4, 1/4, 1/4, 1/4),
            smoothing_function=smooth
        )
        scores.append(score)
    max_score = max(scores)
    print(f"Example {i+1:2d} BLEU-4: {max_score*100:.2f}")

Example  1 BLEU-4: 10.21
Example  2 BLEU-4: 5.61
Example  3 BLEU-4: 6.03
Example  4 BLEU-4: 2.10
Example  5 BLEU-4: 6.03
Example  6 BLEU-4: 4.32
Example  7 BLEU-4: 2.48
Example  8 BLEU-4: 5.61
Example  9 BLEU-4: 7.79
Example 10 BLEU-4: 4.02
Example 11 BLEU-4: 5.61
Example 12 BLEU-4: 5.61
Example 13 BLEU-4: 5.61
Example 14 BLEU-4: 5.31
Example 15 BLEU-4: 11.95
Example 16 BLEU-4: 5.61
Example 17 BLEU-4: 5.61
Example 18 BLEU-4: 6.07
Example 19 BLEU-4: 6.29
Example 20 BLEU-4: 6.29
Example 21 BLEU-4: 7.97
Example 22 BLEU-4: 8.43
Example 23 BLEU-4: 19.69
Example 24 BLEU-4: 5.61
Example 25 BLEU-4: 4.94
Example 26 BLEU-4: 19.69
Example 27 BLEU-4: 4.02
Example 28 BLEU-4: 4.94
Example 29 BLEU-4: 13.97
Example 30 BLEU-4: 4.94
Example 31 BLEU-4: 5.31
Example 32 BLEU-4: 5.11
Example 33 BLEU-4: 3.60
Example 34 BLEU-4: 4.83
Example 35 BLEU-4: 2.15
Example 36 BLEU-4: 13.35
Example 37 BLEU-4: 4.97
Example 38 BLEU-4: 4.52
Example 39 BLEU-4: 5.61
Example 40 BLEU-4: 5.31
Example 41 BLEU-4: 6.29
Example 42

In [None]:
# Corpus-level BLEU-4
# corpus_bleu expects list-of-list-of-tokens refs, and list-of-tokens hyps
corpus_score = corpus_bleu(
    tokenized_refs,
    tokenized_hyps,
    weights=(1/4, 1/4, 1/4, 1/4),
    smoothing_function=smooth
)
print(f"\nCorpus BLEU-4: {corpus_score*100:.2f}")


Corpus BLEU-4: 28.16


In [None]:
# Go on to calculate ROUGE scores
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=76d23b64cabd404df8a95d97c87a4f671b86bc90731d1bfea2da7f7f0cf63bf2
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
import nltk
from collections import Counter

# Ensure tokenizer
nltk.download('punkt', quiet=True)

def rouge_n(ref: str, hyp: str, n: int = 4):
    ref_toks = nltk.word_tokenize(ref.lower())
    hyp_toks = nltk.word_tokenize(hyp.lower())
    ref_ngrams = list(nltk.ngrams(ref_toks, n))
    hyp_ngrams = list(nltk.ngrams(hyp_toks, n))
    ref_counts = Counter(ref_ngrams)
    hyp_counts = Counter(hyp_ngrams)
    overlap = sum(min(ref_counts[ng], hyp_counts[ng]) for ng in ref_counts)
    recall = overlap / max(len(ref_ngrams), 1)
    precision = overlap / max(len(hyp_ngrams), 1)
    f1 = 2 * recall * precision / (recall + precision + 1e-8)
    return (recall, precision, f1)

# Compute ROUGE-2
all_recalls, all_precisions, all_f1s = [], [], []
for refs, pred in zip(all_references, predictions):
    recalls_per_sample, precisions_per_sample, f1s_per_sample = [], [], []
    for ref in refs:
        r, p, f = rouge_n(ref, pred, n=2)
        recalls_per_sample.append(r)
        precisions_per_sample.append(p)
        f1s_per_sample.append(f)

    max_score = max(f1s_per_sample)
    max_index = f1s_per_sample.index(max_score)
    all_recalls.append(recalls_per_sample[max_index])
    all_precisions.append(precisions_per_sample[max_index])
    all_f1s.append(f1s_per_sample[max_index])
    print(f"REF:  {refs[max_index]!r}")
    print(f"HYP:  {pred!r}")
    print(f"   ROUGE-2 Recall:    {recalls_per_sample[max_index] * 100:.2f}%")
    print(f"   ROUGE-2 Precision: {precisions_per_sample[max_index] * 100:.2f}%")
    print(f"   ROUGE-2 F1:        {f1s_per_sample[max_index] * 100:.2f}%\n")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
   ROUGE-2 F1:        66.67%

REF:  'There are several vehicles on the road at the roundabout, And many cars are parked on both sides of the road next to the roundabout .'
HYP:  'bout is on the grass next to some trees .'
   ROUGE-2 Recall:    7.69%
   ROUGE-2 Precision: 22.22%
   ROUGE-2 F1:        11.43%

REF:  'The roundabout is on the grass next to trees and buildings .'
HYP:  'bout is next to some trees and grass .'
   ROUGE-2 Recall:    18.18%
   ROUGE-2 Precision: 25.00%
   ROUGE-2 F1:        21.05%

REF:  'The roundabout is on the bare land next to buildings and trees .'
HYP:  'bout is on the grass next to some trees .'
   ROUGE-2 Recall:    33.33%
   ROUGE-2 Precision: 44.44%
   ROUGE-2 F1:        38.10%

REF:  'The roundabout is on the grass next to trees and buildings .'
HYP:  'y is next to some trees and buildings .'
   ROUGE-2 Recall:    36.36%
   ROUGE-2 Precision: 50.00%
   ROUGE-2 F1:        42.11%

REF:  

In [None]:
# Report overall averages
avg_r = sum(all_recalls) / len(all_recalls)
avg_p = sum(all_precisions) / len(all_precisions)
avg_f = sum(all_f1s) / len(all_f1s)
print("=== AVERAGE ROUGE-2 METRICS ===")
print(f"Recall:    {avg_r*100:.2f}")
print(f"Precision: {avg_p*100:.2f}")
print(f"F1:        {avg_f*100:.2f}")

=== AVERAGE ROUGE-2 METRICS ===
Recall:    23.28
Precision: 28.88
F1:        25.38


In [None]:
# Compute ROUGE-3
all_recalls, all_precisions, all_f1s = [], [], []
for refs, pred in zip(all_references, predictions):
    recalls_per_sample, precisions_per_sample, f1s_per_sample = [], [], []
    for ref in refs:
        r, p, f = rouge_n(ref, pred, n=3)
        recalls_per_sample.append(r)
        precisions_per_sample.append(p)
        f1s_per_sample.append(f)

    max_score = max(f1s_per_sample)
    max_index = f1s_per_sample.index(max_score)
    all_recalls.append(recalls_per_sample[max_index])
    all_precisions.append(precisions_per_sample[max_index])
    all_f1s.append(f1s_per_sample[max_index])
    print(f"REF:  {refs[max_index]!r}")
    print(f"HYP:  {pred!r}")
    print(f"   ROUGE-3 Recall:    {recalls_per_sample[max_index] * 100:.2f}%")
    print(f"   ROUGE-3 Precision: {precisions_per_sample[max_index] * 100:.2f}%")
    print(f"   ROUGE-3 F1:        {f1s_per_sample[max_index] * 100:.2f}%\n")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
   ROUGE-3 F1:        50.00%

REF:  'The roundabout connects three roads and a sculpture is in the middle of the roundabout .'
HYP:  'bout is on the grass next to some trees .'
   ROUGE-3 Recall:    0.00%
   ROUGE-3 Precision: 0.00%
   ROUGE-3 F1:        0.00%

REF:  'There are several moving vehicles on the road at the roundabout, Many buildings and some trees and a few lawns around the roundabout .'
HYP:  'bout is next to some trees and grass .'
   ROUGE-3 Recall:    4.35%
   ROUGE-3 Precision: 14.29%
   ROUGE-3 F1:        6.67%

REF:  'The roundabout is on the bare land next to buildings and trees .'
HYP:  'bout is on the grass next to some trees .'
   ROUGE-3 Recall:    9.09%
   ROUGE-3 Precision: 12.50%
   ROUGE-3 F1:        10.53%

REF:  'The roundabout is on the grass next to trees and buildings .'
HYP:  'y is next to some trees and buildings .'
   ROUGE-3 Recall:    20.00%
   ROUGE-3 Precision: 28.57%
   ROUGE-3 F

In [None]:
# Report overall averages
avg_r = sum(all_recalls) / len(all_recalls)
avg_p = sum(all_precisions) / len(all_precisions)
avg_f = sum(all_f1s) / len(all_f1s)
print("=== AVERAGE ROUGE-3 METRICS ===")
print(f"Recall:    {avg_r*100:.2f}")
print(f"Precision: {avg_p*100:.2f}")
print(f"F1:        {avg_f*100:.2f}")

=== AVERAGE ROUGE-3 METRICS ===
Recall:    15.11
Precision: 19.06
F1:        16.62


In [None]:
# Compute ROUGE-4
all_recalls, all_precisions, all_f1s = [], [], []
for refs, pred in zip(all_references, predictions):
    recalls_per_sample, precisions_per_sample, f1s_per_sample = [], [], []
    for ref in refs:
        r, p, f = rouge_n(ref, pred, n=4)
        recalls_per_sample.append(r)
        precisions_per_sample.append(p)
        f1s_per_sample.append(f)

    max_score = max(f1s_per_sample)
    max_index = f1s_per_sample.index(max_score)
    all_recalls.append(recalls_per_sample[max_index])
    all_precisions.append(precisions_per_sample[max_index])
    all_f1s.append(f1s_per_sample[max_index])
    print(f"REF:  {refs[max_index]!r}")
    print(f"HYP:  {pred!r}")
    print(f"   ROUGE-4 Recall:    {recalls_per_sample[max_index] * 100:.2f}%")
    print(f"   ROUGE-4 Precision: {precisions_per_sample[max_index] * 100:.2f}%")
    print(f"   ROUGE-4 F1:        {f1s_per_sample[max_index] * 100:.2f}%\n")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
   ROUGE-4 F1:        42.86%

REF:  'The roundabout connects three roads and a sculpture is in the middle of the roundabout .'
HYP:  'bout is on the grass next to some trees .'
   ROUGE-4 Recall:    0.00%
   ROUGE-4 Precision: 0.00%
   ROUGE-4 F1:        0.00%

REF:  'The roundabout with three exits and entrances is in the residential area .'
HYP:  'bout is next to some trees and grass .'
   ROUGE-4 Recall:    0.00%
   ROUGE-4 Precision: 0.00%
   ROUGE-4 F1:        0.00%

REF:  'The roundabout connects four roads and a building is next to the roundabout .'
HYP:  'bout is on the grass next to some trees .'
   ROUGE-4 Recall:    0.00%
   ROUGE-4 Precision: 0.00%
   ROUGE-4 F1:        0.00%

REF:  'The roundabout is on the grass next to trees and buildings .'
HYP:  'y is next to some trees and buildings .'
   ROUGE-4 Recall:    11.11%
   ROUGE-4 Precision: 16.67%
   ROUGE-4 F1:        13.33%

REF:  'The roundabout with four 

In [None]:
# Report overall averages
avg_r = sum(all_recalls) / len(all_recalls)
avg_p = sum(all_precisions) / len(all_precisions)
avg_f = sum(all_f1s) / len(all_f1s)
print("=== AVERAGE ROUGE-4 METRICS ===")
print(f"Recall:    {avg_r*100:.2f}")
print(f"Precision: {avg_p*100:.2f}")
print(f"F1:        {avg_f*100:.2f}")

=== AVERAGE ROUGE-4 METRICS ===
Recall:    10.74
Precision: 13.93
F1:        11.96


In [None]:
import os

save_dir = "/content/drive/MyDrive/DI725 - Transformers and Attention-based Deep Networks/Term Project/siglip-gpt2-custom_vlm_finetuned"
os.makedirs(save_dir, exist_ok=True)

# Save tokenizer
tokenizer.save_pretrained(save_dir)

# Save merged language model (now a clean GPT-2)
model.language_model.save_pretrained(save_dir)

# Save vision encoder and image processor
model.vision_model.save_pretrained(f"{save_dir}/vision_encoder")
image_processor.save_pretrained(f"{save_dir}/vision_encoder")

# Save vision projection layer
torch.save(model.vision_proj.state_dict(), f"{save_dir}/vision_proj.pt")

# Save config for reinitialization
import json
config = {
    "vision_encoder_path": "vision_encoder",
    "language_model_path": ".",
    "vision_proj_path": "vision_proj.pt",
    "vision_hidden_size": model.vision_proj.in_features,
    "language_hidden_size": model.vision_proj.out_features
}
with open(os.path.join(save_dir, "custom_vlm_config.json"), "w") as f:
    json.dump(config, f, indent=2)


In [None]:
print("DONE")

DONE
