<a href="https://colab.research.google.com/github/azernik/semeval_2025_task1/blob/main/admire_generate_predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Download and load data

In [3]:
import gdown

# dev dataset
file_id = "1RCTQGF5DG0SmiU-GMYJ5owQ900BAsjno"
url = f"https://drive.google.com/uc?id={file_id}"
gdown.download(url, "taskA_dev.zip", quiet=True)
! unzip -q - taskA_dev.zip

In [4]:
import os
import pandas as pd

# define locations
taska_folder = "dev"
taska_tsv_filename = "subtask_a_dev.tsv"

# load data
df = pd.read_csv(f"{taska_folder}/{taska_tsv_filename}", delimiter="\t")

In [5]:
# For reference, not required

df['sentence_type_true'] = [
    'idiomatic',  # monkey business
    'literal',    # grass roots
    'literal',    # marching orders
    'literal',    # panda car
    'idiomatic',  # bread and butter
    'literal',    # chocolate teapot
    'idiomatic',  # pig's ear
    'idiomatic',  # best man
    'literal',    # big cheese
    'idiomatic',  # eager beaver
    'literal',    # hair of the dog
    'idiomatic',  # thin ice
    'idiomatic',  # snake in the grass
    'literal',    # flea market
    'literal',    # big fish
]

In [6]:
def preprocess_data(df, dir_name):
    """
    Loads and preprocesses the dataset, setting up image paths.
    """
    image_name_cols = ['image1_name', 'image2_name', 'image3_name', 'image4_name', 'image5_name']
    df['image_paths'] = df.apply(lambda row: [os.path.join(dir_name, row['compound'].replace("'", "_"), row[image_name]) for image_name in image_name_cols], axis=1)
    return df

  df = preprocess_data(df, taska_folder)

In [27]:
# For scoring

df['top_1_true'] = [
    '04129294826.png',  # image1 / monkey business
    '88610497135.png',  # image5 / grass roots
    '65125915005.png',  # image5 / marching orders
    '38506509761.png',  # image3 / panda car
    '98148844665.png',  # image5 (?) / bread and butter
    '24249382249.png',  # image3 / chocolate teapot
    '28970148854.png',  # image1 / pig's ear
    '95555063454.png',  # image5 / best man
    '22784021150.png',  # image3 / big cheese
    '97713215186.png',  # image5 / eager beaver
    '92505052147.png',  # image5 / hair of the dog
    '66964681897.png',  # image4 / thin ice
    '06740727918.png',  # image1 / snake in the grass
    '70941962850.png',  # image5 / flea market
    '66431812447.png',  # image / big fish
]

df['top_1_col'] = [
    'image1',  # image1 / monkey business
    'image5',  # image5 / grass roots
    'image5',  # image5 / marching orders
    'image3',  # image3 / panda car
    'image5',  # image5 (?) / bread and butter
    'image3',  # image3 / chocolate teapot
    'image1',  # image1 / pig's ear
    'image5',  # image5 / best man
    'image3',  # image3 / big cheese
    'image5',  # image5 / eager beaver
    'image5',  # image5 / hair of the dog
    'image4',  # image4 / thin ice
    'image1',  # image1 / snake in the grass
    'image5',  # image5 / flea market
    'image3',  # image3 / big fish
]

### Text input generation

In [10]:
from google.colab import userdata
from openai import OpenAI

# OpenAI initialization
client = OpenAI(api_key=userdata.get('OPENAI_PROJECT_KEY'))

In [11]:
# GPT Prompt - Sentence type
import json

def gpt_sentence_types(compounds, sentences):
    """
    Prompt GPT-4 to get the sentence type, literal or figurative, for a batch of sentences.
    """
    # Create a combined prompt
    samples = "\n\n".join([
        f'Target phrase: "{nc}"\nContext sentence: "{sentence}"' for nc, sentence in zip(compounds, sentences)
    ])

    prompt = f"""
You are a linguistics expert specializing in figurative language. You will be given a set of samples, each containing a \"target phrase\" paired with a \"context sentence\" containing a usage of said phrase.
The target phrases all have idiomatic (i.e. figurative) meanings, but they might be used literally in these context sentences!
For each sample, you are to do the following:
1. Looking at the target phrase in isolation, state its *idiomatic* meaning and its *literal* meaning. The literal meaning might be awkward, as some of these phrases are almost always used idiomatically.
2. *Carefully* consider how the target phrase is used in the context sentence. Is it used in its idiomatic sense (in most cases, they way that we"re used to understanding it), or is it used as a literal composition of its component words?
3. Verbose explanation: Given your familiarity with the phrase"s possible meanings, and having considered how it"s used in the sentence, give an explanation of what the phrase means in the context of the sentence. This can be a few sentences long.
4. Final usage determination: Based on steps 1-3, state whether the phrase"s use in the context sentence is "literal" or "idiomatic".

Example input:
Target phrase: "cold turkey"
Context sentence: "John quit smoking cold turkey and never looked back, not that it was easy."

Target phrase: "ghost town"
Context sentence: "Our wanderings had led us perilously close to the walls of the ghost town where restless spirits haunted the streets, eager to absorb the vitatlity of the living."

---

Example response:
{{"samples": [
    {{
      "target_phrase": "cold turkey",
      "idiomatic_meaning": "To stop a habit or addiction abruptly and completely, without gradually reducing or tapering off. It often refers to ceasing a harmful behavior or substance like smoking or drugs.",
      "literal_meaning": "A cold dish made from turkey meat, often eaten as leftovers."
      "contextual_considerations": "In the sentence, \"John quit smoking cold turkey and never looked back, not that it was easy,\" the phrase \"cold turkey\" clearly does not refer to food. It is used in the context of quitting smoking, which aligns with the idiomatic usage of the term.",
      "verbose_explanation": "The phrase \"cold turkey\" in this sentence means that John abruptly stopped smoking without tapering off or using substitutes like nicotine patches. The description highlights the difficulty of this approach, suggesting that quitting \"cold turkey\" was challenging but ultimately successful. The context does not mention anything about literal turkey, further affirming the idiomatic interpretation.",
      "result": "idiomatic"
    }},
    {{
      "target_phrase": "ghost town",
      "idiomatic_meaning": "A deserted town or settlement that was once populated but is now abandoned, often evoking a sense of desolation or emptiness.",
      "literal_meaning": "A town inhabited by ghosts or supernatural entities, as in fictional or mythological contexts."
      "contextual_considerations": "In the sentence, \"Our wanderings had led us perilously close to the walls of the ghost town where restless spirits haunted the streets, eager to absorb the vitality of the living,\" the description explicitly mentions \"restless spirits\" and their interaction with the living. This strongly suggests a literal interpretation involving supernatural elements.",
      "verbose_explanation": "Here, \"ghost town\" refers to a literal place inhabited by ghosts or spirits, as indicated by the detailed imagery of \"restless spirits\" and their haunting presence. The context does not suggest the metaphorical use of the term as an abandoned, non-supernatural settlement.",
      "result": "literal"
    }},
    #...
]}}

---

Output JSON schema:
{{
  "type": "object",
  "properties": {{
    "samples": {{
      "type": "array",
      "items": {{
        "type": "object",
        "properties": {{
          "target_phrase": {{
            "type": "string"
          }},
          "idiomatic_meaning": {{
            "type": "string"
          }},
          "literal_meaning": {{
            "type": "string"
          }},
          "contextual_considerations": {{
            "type": "string"
          }},
          "verbose_explanation": {{
            "type": "string"
          }},
          "result": {{
            "type": "string",
            "enum": ["idiomatic", "literal"]
          }}
        }},
        "required": [
          "target_phrase",
          "idiomatic_meaning",
          "literal_meaning",
          "contextual_considerations",
          "verbose_explanation",
          "result"
        ]
      }}
    }}
  }},
  "required": ["samples"]
}}

Your turn. These are the samples:
{samples}
"""

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
    )
    try:
        raw_content = response.choices[0].message.content.strip()
        # print("Raw GPT response:", raw_content)  # Debug print
        content = json.loads(raw_content)
        print(json.dumps(content, indent=4))  # Debug formatted output
        return content
    except json.JSONDecodeError as e:
        print(f"JSON decoding error: {e}")
        print("Response content that caused error:", raw_content)
        return {}

In [12]:
batch_size = 10
results = {}
for i in range(0, len(df), batch_size):
    batch = df.iloc[i:i + batch_size]
    compounds = batch['compound'].tolist()
    sentences = batch['sentence'].tolist()
    responses = gpt_sentence_types(compounds, sentences)
    batch_results = {item["target_phrase"]: item["result"] for item in responses["samples"]}
    results.update(batch_results)

df['sentence_type_pred'] = df['compound'].map(results)

{
    "samples": [
        {
            "target_phrase": "monkey business",
            "idiomatic_meaning": "Dishonest or deceitful behavior, often involving trickery or non-transparent activities.",
            "literal_meaning": "The actions or activities of monkeys.",
            "contextual_considerations": "In this sentence, the phrase 'monkey business' is used within the context of describing a place's cleanliness and propriety. There is no suggestion of actual monkeys or their behaviors.",
            "verbose_explanation": "In the given sentence, 'monkey business' is used to emphasize the propriety of the architectural environment referred to. The term implies that no illegal or underhanded activities occur there. No literal monkey-related action is mentioned or implied, thus it's indicative of the idiomatic use of the phrase.",
            "result": "idiomatic"
        },
        {
            "target_phrase": "grass roots",
            "idiomatic_meaning": "The most basic l

In [13]:
# GPT Prompt #2
import json

def gpt_definitions(compounds, sentence_types):
    """
    Generate definitions for target phrases using GPT-4, in batches.
    """
    # Filter out literals since they don't need processing
    input_data = [
        nc for nc, sentence_type in zip(compounds, sentence_types) if sentence_type != "literal"
    ]

    # Skip batch if all are literal
    if not input_data:
        return {nc: nc for nc in compounds}  # Return original NCs for all literals

    # Create a combined prompt
    examples = "\n".join([
        f'The idiom is: "{nc}".' for nc in input_data
    ])

    prompt = f"""
You are a linguistics expert specializing in idioms. For each of the idioms below, do the following steps aloud (in writing):
1. Give a verbose explanation of the idiom, including what connotations it carries or undertones it evokes.
2. Taking into consideration your response for #1, list three potential definitions, no longer than 20 words each, that capture the underlying idea conveyed by the idiom, without referencing it as an idiom, figure of speech, or stylistic expression.
3. Choose the best definition.

Example inputs:
The idiom is: "cold turkey"
The idiom is: "bun in the oven"
The idiom is: "shrinking violet"

---

Example output:
{{"samples": [
    {{
      "target_phrase": "cold turkey",
      "explanation": "Refers to abruptly stopping a behavior, typically a bad habit, without gradual reduction or preparation. Evokes starkness, discomfort, determination, hardship, and raw honesty. Often associated with quitting addictive substances or habits, emphasizing suddenness and self-discipline.",
      "potential_definition_1": "Immediately ceasing a habit or behavior without any preparation.",
      "potential_definition_2": "Stopping something abruptly, often in a challenging or uncomfortable manner.",
      "potential_definition_3": "Making a sharp and decisive change, particularly to quit a dependency.",
      "result": "Stopping something abruptly, often in a challenging or uncomfortable manner."
    }},
    {{
      "target_phrase": "bun in the oven",
      "explanation": "A lighthearted, euphemistic way to say someone is pregnant. Carries warm, nurturing undertones, invoking growth, expectancy, and care. Sometimes used to hint at pregnancy rather than explicitly state it, implying secrecy or anticipation.",
      "potential_definition_1": "Expecting the arrival of a new baby or nurturing a life in progress.",
      "potential_definition_2": "A situation where growth and development are underway.",
      "potential_definition_3": "Preparing for an important, life-changing event, particularly childbirth.",
      "result": "Expecting the arrival of a new baby or nurturing a life in progress."
    }},
    {{
      "target_phrase": "shrinking violet",
      "explanation": "Describes someone who is exceptionally shy, reserved, or introverted, often avoiding attention or interaction. Connotations of delicacy, modesty, and a lack of assertiveness. Sometimes evokes frustration or pity due to extreme timidity or unconfidence.",
      "potential_definition_1": "A person who is extremely shy and avoids attention or interaction.",
      "potential_definition_2": "Someone whose reserved nature makes them hesitant to assert themselves.",
      "potential_definition_3": "A timid individual who prefers to stay out of the spotlight.",
      "result": "A person who is extremely shy and avoids attention or interaction."
    }}
  ]
}}

---

Output JSON schema:
{{
  "type": "object",
  "properties": {{
    "samples": {{
      "type": "array",
      "items": {{
        "type": "object",
        "properties": {{
          "target_phrase": {{
            "type": "string"
          }},
          "explanation": {{
            "type": "string"
          }},
          "potential_definition_1": {{
            "type": "string"
          }},
          "potential_definition_2": {{
            "type": "string"
          }},
          "potential_definition_3": {{
            "type": "string"
          }},
          "result": {{
            "type": "string"
          }}
        }},
        "required": [
          "target_phrase",
          "explanation",
          "potential_definition_1",
          "potential_definition_2",
          "potential_definition_3",
          "result"
        ]
      }}
    }}
  }},
  "required": ["samples"]
}}

Your turn. Here are the samples:
{examples}
"""

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
    )
    try:
        raw_content = response.choices[0].message.content.strip()
        # print("Raw GPT response:", raw_content)  # Debug print
        content = json.loads(raw_content)
        print(json.dumps(content, indent=4))  # Debug formatted output
        return content
    except json.JSONDecodeError as e:
        print(f"JSON decoding error: {e}")
        print("Response content that caused error:", raw_content)
        return {}

In [14]:
# Apply to the DataFrame in batches
batch_size = 10
results = {}
for i in range(0, len(df), batch_size):
    batch = df.iloc[i:i + batch_size]
    compounds = batch['compound'].tolist()
    sentence_types = batch['sentence_type_pred'].tolist()
    responses = gpt_definitions(compounds, sentence_types)
    batch_results = {item["target_phrase"]: item["result"] for item in responses["samples"]}
    results.update(batch_results)

# Map results back to DataFrame
df['text_input'] = df['compound'].map(results)
df['text_input'].fillna(df['compound'], inplace=True)

{
    "samples": [
        {
            "target_phrase": "monkey business",
            "explanation": "Refers to actions or behavior that are deceitful, dishonest, or not to be taken seriously. Often implies mischief, tomfoolery, or irresponsibility. Mostly used to denote unethical undertakings or the initiation of a senseless mess.",
            "potential_definition_1": "Dishonest, mischievous, or foolish behavior.",
            "potential_definition_2": "Actions taken in a dishonest or underhanded manner.",
            "potential_definition_3": "Behavior that is not to be taken seriously, often due to mischief.",
            "result": "Dishonest, mischievous, or foolish behavior."
        },
        {
            "target_phrase": "bread and butter",
            "explanation": "Used to describe a person's main and steady source of income. Can also signify someone's basic needs or the thing which they primarily rely upon. Often has connotations of simplicity, necessity, and regulari

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['text_input'].fillna(df['compound'], inplace=True)


### Execution

#### Image-ranking functions

In [17]:
# !pip install anyio==3.5.0 openai==1.55.3 httpx==0.27.2 --force-reinstall --quiet

In [18]:
# install clip
!pip install -q ftfy regex tqdm
!pip install -q git+https://github.com/openai/CLIP.git

import clip

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for clip (setup.py) ... [?25l[?25hdone


In [20]:
import torch
from PIL import Image
from ast import literal_eval

def get_image_ranking_clip(model, image_processor, image_paths, sentence):
    image_inputs = torch.stack([image_processor(Image.open(ipath)) for ipath in image_paths]).to(device)
    text_input = clip.tokenize(sentence).to(device)

    with torch.no_grad():
        # compute embeddings
        image_features = model.encode_image(image_inputs)
        text_features = model.encode_text(text_input)

    # normalize features
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    # compute similarity scores
    similarity = (100.0 * text_features @ image_features.T).softmax(dim=-1)

    # rank images by similarity
    probs, indices = similarity[0].topk(5)
    return probs, indices

In [32]:
def openclip_image_ranking(model, image_processor, tokenizer, image_paths, sentence):
    image_inputs = torch.stack([preprocess_openclip(Image.open(ipath)) for ipath in image_paths]).to(device)
    text_input = tokenizer([sentence]).to(device)

    with torch.no_grad():
        image_features = model.encode_image(image_inputs)
        text_features = model.encode_text(text_input)

    # normalise features
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    # dot product & softmax
    similarity = (100.0 * text_features @ image_features.T).softmax(dim=-1)

    # order by similarity
    probs, indices = similarity[0].topk(5)
    return probs, indices

In [33]:
def get_image_ranking_align(model, processor, image_paths, sentence):
    image_inputs = [Image.open(ipath) for ipath in image_paths]
    inputs = processor(images=image_inputs ,text=sentence, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    logits_per_text = outputs.logits_per_text[0]
    probs = logits_per_text.softmax(dim=-1)
    ids_sorted = torch.argsort(probs, descending=True)
    return probs[ids_sorted], ids_sorted

### Model config

In [35]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [36]:
!pip install open_clip_torch
import open_clip

openclip_model_version = "ViT-B-32"
model_openclip, _, preprocess_openclip = open_clip.create_model_and_transforms(openclip_model_version, pretrained='laion2b_s34b_b79k')
model_openclip.to(device)
open_clip_tokenizer = open_clip.get_tokenizer(openclip_model_version)
model_openclip.eval()  # model in train mode by default, impacts some models with BatchNorm or stochastic depth active

Collecting open_clip_torch
  Downloading open_clip_torch-2.29.0-py3-none-any.whl.metadata (31 kB)
Downloading open_clip_torch-2.29.0-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: open_clip_torch
Successfully installed open_clip_torch-2.29.0


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


open_clip_pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine

In [46]:
from transformers import AlignProcessor, AlignModel

models = [
    # {
    #     "base_model": "CLIP",
    #     "model_name": "ViT-B/32",
    #     "model": clip.load("ViT-B/32", device)[0],
    #     "preprocess": clip.load("ViT-B/32", device)[1]
    # },
    # {
    #     "base_model": "CLIP",
    #     "model_name": "ViT-L/14",
    #     "model": clip.load("ViT-L/14", device)[0],
    #     "preprocess": clip.load("ViT-L/14", device)[1]
    # },
    {
        "base_model": "CLIP",
        "model_name": "RN50x64",
        "model": clip.load("RN50x64", device)[0],
        "preprocess": clip.load("RN50x64", device)[1]
    },
    # {
    #     "base_model": "Align",
    #     "model_name": "Base",
    #     "model": AlignModel.from_pretrained("kakaobrain/align-base"),
    #     "preprocess": AlignProcessor.from_pretrained("kakaobrain/align-base")
    # },
    {
        "base_model": "open_clip",
        "model_name": openclip_model_version,
        "model": model_openclip,
        "preprocess": preprocess_openclip,
        "tokenizer": open_clip_tokenizer,
    }
]

In [37]:
def get_predictions(model, processor, image_paths_list, text_inputs, base_model, model_name, tokenizer=None):
    """
    Uses get_image_ranking to generate predictions and confidence scores for a image-list, text-input pairs
    """
    predictions, confidence_scores = [], []

    for ipaths, text in zip(image_paths_list, text_inputs):
        if len(ipaths) == 0:
            predictions.append([])
            confidence_scores.append([])
            continue

        # values, indices = get_image_ranking(ipaths, text)
        if base_model == "CLIP":
          values, indices = get_image_ranking_clip(model, processor, ipaths, text)
        elif base_model == "Align":
          values, indices = get_image_ranking_align(model, processor, ipaths, text)
        elif base_model == "open_clip":
          values, indices = openclip_image_ranking(model, processor, tokenizer, ipaths, text)
        else:
          raise ValueError(f"Unknown base_model: {base_model}")
        predictions.append(list(indices.cpu()))
        confidence_scores.append(100 * values)

    return predictions, confidence_scores


In [None]:
def run_test(df, model, processor, base_model, model_name, output_file, tokenizer=None):
    """
    Generates predictions using the provided model and saves the output as a TSV.
    """
    # Generate predictions
    predictions, confidence_scores = get_predictions(
        model=model,
        processor=processor,
        image_paths_list=df['image_paths'],
        text_inputs=df['text_input'],
        base_model=base_model,
        model_name=model_name
    )

    write_submission_file(df, predictions, confidence_scores, output_file)

In [52]:
# Generate predictions

model_config = models[0] # CLIP RN50x64
# model_config = models[1] # OpenCLIP ViT-B-32

predictions, confidence_scores = get_predictions(
    model=model_config['model'],
    processor=model_config['preprocess'],
    image_paths_list=df['image_paths'],
    text_inputs=df['text_input'],
    base_model=model_config['base_model'],
    model_name=model_config['model_name']
)

In [76]:
submission_df = pd.read_csv(f"{taska_folder}/{taska_tsv_filename}", delimiter="\t")
submission_df = submission_df[['compound', 'expected_order']]

formatted_results = []

# Iterate over predictions and confidence scores
for i, (pred, conf) in enumerate(zip(predictions, confidence_scores)):
    # Get ranked file names
    ranked_files = [os.path.basename(df['image_paths'].iloc[i][j]) for j in pred]

    # Update the df DataFrame with the expected order as a Python-style list string
    submission_df.loc[i, 'expected_order'] = str(ranked_files)

    # Create a formatted result for results_df
    formatted_results.append({
        "index": i,
        "compound": df["compound"].iloc[i],
        "ranked_image_files": " ".join(ranked_files),
        "confidence_scores": " ".join(map(lambda x: f"{x:.3f}", conf))
    })

submission_df.to_csv('submission_EN.tsv', sep="\t", index=False)

# For further inspection, uncomment
# results_df = pd.DataFrame(formatted_results)
# results_df['top_1_pred'] = results_df['ranked_image_files'].apply(lambda x: x.split()[0])
# results_df['top_1_true'] = df['top_1_true']
# results_df['top1_correct'] = results_df['top_1_true'] == results_df['top_1_pred']

  df_raw.loc[i, 'expected_order'] = str(ranked_files)
