<a href="https://colab.research.google.com/github/azernik/semeval_2025_task1/blob/main/admire_experiments_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setup and Preprocessing

In [None]:
# for downloading the train zip from Drive
!pip install -q gdown

import gdown

# install clip
!pip install -q ftfy regex tqdm
!pip install -q git+https://github.com/openai/CLIP.git

import clip

  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
!pip install anyio==3.5.0 openai==1.55.3 httpx==0.27.2 --force-reinstall --quiet

In [None]:
# download taskA file from Adam's Drive (public) and unzip
file_id = "105JdQU_u98w_xSYaNNSj-r4RsyTPXZEF"
url = f"https://drive.google.com/uc?id={file_id}"
gdown.download(url, "taskA.zip", quiet=True)
! unzip -q - taskA.zip

replace train/acid test/02817176209.png? [y]es, [n]o, [A]ll, [N]one, [r]ename: A


In [None]:
import os
import pandas as pd

# Folder created by unpacking the archive, and the annotation file inside it.
taska_folder = "train"
taska_tsv_filename = "subtask_a_train.tsv"

# Read the tab-separated annotation table.
df = pd.read_csv(
    f"{taska_folder}/{taska_tsv_filename}",
    delimiter="\t",
)

# Manual label correction: every "pain in the neck" row is relabelled as literal.
df.loc[df['compound'].eq("pain in the neck"), 'sentence_type'] = 'literal'

In [None]:
# df[['compound', 'sentence', 'sentence_type', 'expected_order_indices']]

In [None]:
from ast import literal_eval

# Derive per-row helper columns: image file paths, a filename -> slot-index map,
# and the gold ranking expressed as slot indices.
image_name_cols = [f'image{slot}_name' for slot in range(1, 6)]

# Path of each candidate image; apostrophes in the compound are replaced by
# underscores when building the folder name.
df['image_paths'] = df.apply(
    lambda row: [
        os.path.join(taska_folder, row['compound'].replace("'", "_"), row[col])
        for col in image_name_cols
    ],
    axis=1,
)

# Filename -> position of that image among the five image columns.
df['image_idx_map'] = df.apply(
    lambda row: dict((row[col], slot) for slot, col in enumerate(image_name_cols)),
    axis=1,
)

# 'expected_order' is stored as the string repr of a list of filenames; parse it
# and translate each filename into its slot index.
df['expected_order_indices'] = df.apply(
    lambda row: [row['image_idx_map'][fname] for fname in literal_eval(row['expected_order'])],
    axis=1,
)

### Model-specific functions

In [None]:
import torch
from PIL import Image
from ast import literal_eval

def get_image_ranking_clip(model, image_processor, image_paths, sentence):
    """
    Rank candidate images against a sentence with an OpenAI CLIP model.

    Relies on the notebook-level names `device`, `clip`, `Image` and `torch`.

    Args:
        model: object exposing `encode_image` and `encode_text`.
        image_processor: callable turning an opened image into an image tensor.
        image_paths: paths of the candidate images.
        sentence: text the images are scored against.

    Returns:
        (probs, indices): softmax-normalised similarity scores in descending
        order and the positions in `image_paths` they belong to. Both are
        empty tensors when `image_paths` is empty.
    """
    if len(image_paths) == 0:
        # torch.stack() rejects an empty list, so short-circuit instead of crashing
        return torch.empty(0), torch.empty(0, dtype=torch.long)

    image_inputs = torch.stack([image_processor(Image.open(ipath)) for ipath in image_paths]).to(device)
    text_input = clip.tokenize(sentence).to(device)

    with torch.no_grad():
        # compute embeddings
        image_features = model.encode_image(image_inputs)
        text_features = model.encode_text(text_input)

    # normalize features so the dot product below is a cosine similarity
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    # compute similarity scores (one row: the single sentence vs. every image)
    similarity = (100.0 * text_features @ image_features.T).softmax(dim=-1)

    # Rank *all* candidates. A hard-coded k=5 raises as soon as a sample has
    # fewer than five images and would silently drop images beyond five.
    probs, indices = similarity[0].topk(similarity.shape[-1])
    return probs, indices

In [None]:
def get_image_ranking_align(model, processor, image_paths, sentence):
    """
    Rank candidate images against a sentence with a model that returns
    `logits_per_text` (used here for ALIGN).

    The processor receives the opened images together with the sentence and
    must return a mapping that is unpacked into the model call.

    Returns:
        (probs, indices): softmax scores sorted in descending order and the
        positions in `image_paths` they belong to.
    """
    pil_images = [Image.open(path) for path in image_paths]
    batch = processor(images=pil_images, text=sentence, return_tensors="pt")

    with torch.no_grad():
        # first (and only) text row: one logit per candidate image
        text_to_image_logits = model(**batch).logits_per_text[0]

    scores = text_to_image_logits.softmax(dim=-1)
    order = torch.argsort(scores, descending=True)
    return scores[order], order

In [None]:
!pip install open_clip_torch
import open_clip

def openclip_image_ranking(model, image_processor, tokenizer, image_paths, sentence):
    """
    Rank candidate images against a sentence with an open_clip model.

    Relies on the notebook-level names `device`, `Image` and `torch`.

    Args:
        model: object exposing `encode_image` and `encode_text`.
        image_processor: callable turning an opened image into an image tensor.
        tokenizer: callable turning a list of strings into a token tensor.
        image_paths: paths of the candidate images.
        sentence: text the images are scored against.

    Returns:
        (probs, indices): softmax-normalised similarity scores in descending
        order and the positions in `image_paths` they belong to. Both are
        empty tensors when `image_paths` is empty.
    """
    if len(image_paths) == 0:
        # torch.stack() rejects an empty list, so short-circuit instead of crashing
        return torch.empty(0), torch.empty(0, dtype=torch.long)

    # Use the processor that was passed in. The previous version ignored the
    # parameter and read the global `preprocess_openclip`, which is only
    # defined in a later cell and may not match `model`.
    image_inputs = torch.stack([image_processor(Image.open(ipath)) for ipath in image_paths]).to(device)
    text_input = tokenizer([sentence]).to(device)

    with torch.no_grad():
        image_features = model.encode_image(image_inputs)
        text_features = model.encode_text(text_input)

    # normalise features so the dot product below is a cosine similarity
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    # dot product & softmax
    similarity = (100.0 * text_features @ image_features.T).softmax(dim=-1)

    # Order *all* candidates by similarity; a hard-coded k=5 raises for samples
    # with fewer than five images.
    probs, indices = similarity[0].topk(similarity.shape[-1])
    return probs, indices



### General functions for experiment execution

In [None]:
def get_predictions(model, processor, image_paths_list, text_inputs, base_model, model_name):
    """
    Rank the candidate images of every example with the requested model family.

    `base_model` selects the ranking routine ("CLIP", "Align" or "open_clip");
    `model_name` is only consulted for "open_clip", where it picks the tokenizer.

    Returns two parallel lists: per-example ranked image indices, and
    per-example confidence scores (the ranking scores multiplied by 100).
    Examples without image paths contribute an empty list to both.

    Raises ValueError when an example with images is reached and `base_model`
    is not one of the supported families.
    """
    # open_clip needs a model-specific tokenizer; fetch it once, before the loop
    tokenizer = open_clip.get_tokenizer(model_name) if base_model == "open_clip" else None

    ranked_indices, scaled_scores = [], []
    for candidate_paths, sentence in zip(image_paths_list, text_inputs):
        if len(candidate_paths) == 0:
            # nothing to rank: keep the output lists aligned with the inputs
            ranked_indices.append([])
            scaled_scores.append([])
        else:
            if base_model == "CLIP":
                scores, order = get_image_ranking_clip(model, processor, candidate_paths, sentence)
            elif base_model == "Align":
                scores, order = get_image_ranking_align(model, processor, candidate_paths, sentence)
            elif base_model == "open_clip":
                scores, order = openclip_image_ranking(model, processor, tokenizer, candidate_paths, sentence)
            else:
                raise ValueError(f"Unknown base_model: {base_model}")
            ranked_indices.append(list(order.cpu()))
            scaled_scores.append(100 * scores)

    return ranked_indices, scaled_scores


In [None]:
from scipy.stats import spearmanr

def evaluate_predictions(predictions, df, weights=(0.4, 0.3, 0.2, 0.1, 0.0)):
    """
    Score predicted image rankings against the expected order.

    Args:
        predictions: one ranking per row of `df` (positions into that row's
            `image_paths`); an empty ranking means "no prediction" and is
            skipped.
        df: frame with `image_paths`, `image_idx_map` and
            `expected_order_indices` columns.
        weights: credit given for a correct image at each rank position.

    Returns:
        dict with
        - "top1_accuracy": correct first picks divided by `len(predictions)`
          (skipped examples therefore count as misses),
        - "average_spearman" / "average_weighted_accuracy": means over the
          scored examples,
        - "spearman_scores" / "weighted_scores": per-example values, one entry
          per *scored* (non-empty) prediction.
        Averages are NaN instead of raising ZeroDivisionError when there is
        nothing to average.
    """
    def _mean(values):
        # NaN marks "undefined" without crashing the whole experiment run
        return sum(values) / len(values) if values else float("nan")

    correct_top1 = 0
    spearman_scores, weighted_scores = [], []

    for i, ranking in enumerate(predictions):
        if len(ranking) == 0:
            continue

        # Ground truth and predictions, both expressed as image slot indices
        row_paths = df['image_paths'].iloc[i]
        row_idx_map = df['image_idx_map'].iloc[i]
        pred_order = [row_idx_map[os.path.basename(row_paths[j])] for j in ranking]
        ground_truth_order = df['expected_order_indices'].iloc[i]

        # Top-1 accuracy
        if pred_order[0] == ground_truth_order[0]:
            correct_top1 += 1

        # Spearman correlation
        # NOTE(review): this correlates the two index sequences as given
        # (position k holds the image ranked k-th) — confirm this matches the
        # task's official scoring.
        score, _ = spearmanr(pred_order, ground_truth_order)
        spearman_scores.append(score)

        # Weighted accuracy: zip stops at the shortest sequence, so a ranking
        # longer than `weights` no longer raises IndexError.
        weighted_score = sum(
            w for w, img, expected in zip(weights, pred_order, ground_truth_order) if img == expected
        )
        weighted_scores.append(weighted_score)

    return {
        "top1_accuracy": correct_top1 / len(predictions) if len(predictions) else float("nan"),
        "average_spearman": _mean(spearman_scores),
        "average_weighted_accuracy": _mean(weighted_scores),
        "spearman_scores": spearman_scores,
        "weighted_scores": weighted_scores
    }


In [None]:
import csv

def save_results(experiment_name, base_model, model_name, metrics, results_file="experiment_results.csv"):
    """
    Append one summary row for an experiment to a CSV results log.

    The row holds the model family, model name, experiment name and the three
    aggregate metrics taken from `metrics`. A header line is written only when
    `results_file` does not exist yet.
    """
    aggregate_keys = ("top1_accuracy", "average_spearman", "average_weighted_accuracy")

    # Identification columns first, then the aggregate metrics
    results_row = dict(base_model=base_model, model=model_name, experiment=experiment_name)
    results_row.update((key, metrics[key]) for key in aggregate_keys)

    # Decide about the header before opening: append mode creates the file
    is_new_file = not os.path.exists(results_file)
    with open(results_file, mode="a", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=list(results_row))
        if is_new_file:
            writer.writeheader()
        writer.writerow(results_row)

    print(f"Results saved to {results_file}")

In [None]:
def run_experiment(model, processor, df, image_paths, text_inputs, model_name, experiment_name, base_model):
    """
    Run one experiment end to end: rank the images for every text input,
    score the rankings against `df`, append the summary to the results log
    and print the headline metrics.

    Returns `(metrics, predictions, confidence_scores)`.
    """
    ranking_output = get_predictions(model, processor, image_paths, text_inputs, base_model, model_name)
    predictions, confidence_scores = ranking_output

    metrics = evaluate_predictions(predictions, df)
    save_results(experiment_name, base_model, model_name, metrics)

    # Headline numbers for quick inspection in the notebook output
    print(f"Top-1 Accuracy: {metrics['top1_accuracy'] * 100:.2f}%")
    print(f"Average Spearman Correlation: {metrics['average_spearman']:.2f}")
    print(f"Average Weighted Accuracy: {metrics['average_weighted_accuracy']:.2f}")

    return metrics, predictions, confidence_scores

In [None]:
import re

def save_predictions(df, image_paths, predictions, confidence_scores, metrics, prefix, preds_dir='predictions'):
    """
    Save detailed predictions and confidence scores for each example.

    Args:
        df: frame with `compound`, `image_idx_map` and
            `expected_order_indices` columns.
        image_paths: Series of per-row image path lists (accessed via `.iloc`).
        predictions: per-row rankings (positions into that row's image paths).
        confidence_scores: per-row scores; each element must support `.item()`.
        metrics: dict with "spearman_scores" and "weighted_scores". As produced
            by `evaluate_predictions`, these lists contain one entry per
            NON-empty prediction, so they are consumed with their own cursor
            rather than indexed by row number.
        prefix: experiment label; sanitised into the output file name.
        preds_dir: output directory, created if missing.

    Rows whose prediction is empty are written with blank score fields instead
    of raising IndexError.
    """
    # create the output directory if it doesn't exist (no race between check and create)
    os.makedirs(preds_dir, exist_ok=True)

    # generate output filename: spaces -> underscores, drop anything unusual
    prefix = prefix.strip().replace(" ", "_")
    prefix = re.sub(r'[^a-zA-Z0-9_-]', '', prefix)
    output_path = f"{preds_dir}/{prefix}_preds.csv"

    spearman_scores = metrics["spearman_scores"]
    weighted_scores = metrics["weighted_scores"]
    score_idx = 0  # position in the per-scored-example metric lists
    with open(output_path, mode="w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["index", "compound", "ground_truth_order", "predicted_order", "top1_score", "spearman_score", "weighted_score", "confidence_scores"])

        for i, (pred, conf) in enumerate(zip(predictions, confidence_scores)):
            ground_truth_order = df["expected_order_indices"].iloc[i]

            if len(pred) == 0:
                # no ranking was produced for this row, hence no scores either
                writer.writerow([i, df["compound"].iloc[i], ground_truth_order, [], "", "", "", []])
                continue

            pred_order = [df['image_idx_map'].iloc[i][os.path.basename(image_paths.iloc[i][j])] for j in pred]
            top1_score = 1 if pred_order[0] == ground_truth_order[0] else 0
            spearman_score = round(spearman_scores[score_idx], 3)
            weighted_score = round(weighted_scores[score_idx], 3)
            score_idx += 1
            formatted_conf_scores = [round(c.item(), 3) for c in conf]
            writer.writerow([i, df["compound"].iloc[i], ground_truth_order, pred_order, top1_score, spearman_score, weighted_score, formatted_conf_scores])

    print(f"Predictions saved to {output_path}")


### Prompt GPT & populate data columns

#### GPT Prompt 1: NC definition

In [None]:
# GPT Prompt #1

import json
from google.colab import userdata
from openai import OpenAI

# initialize openai client with colab secret key
client = OpenAI(api_key=userdata.get('OPENAI_PROJECT_KEY'))

def generate_paraphrases_batched(compounds, sentence_types):
    """
    Ask GPT-4 for a definition of every non-literal compound in one request.

    Returns a dict mapping idiom -> definition as reported by the model. If
    every entry of the batch is literal, no request is made and each compound
    maps to itself. Any failure (request or response parsing) is printed and
    yields an empty dict.
    """
    # Only non-literal compounds are sent to the model
    idioms_to_define = []
    for nc, sentence_type in zip(compounds, sentence_types):
        if sentence_type != "literal":
            idioms_to_define.append(nc)

    # All-literal batch: nothing to ask, hand back the compounds unchanged
    if len(idioms_to_define) == 0:
        return {name: name for name in compounds}

    # One line per idiom, embedded into the prompt below
    examples = "\n".join(f'The idiom is: "{nc}".' for nc in idioms_to_define)

    prompt = f"""
You are a linguistics expert specializing in idioms. For each of the idioms below, provide a definition that balances accuracy with its emotional and cultural essence.

Example:
Idiom: "cold turkey"
Definition: "Abruptly quitting a habit, marked by discomfort and determination."

Idioms to define:
{examples}

Respond in this format:
{{"idioms": [
    {{"idiom": "cold turkey", "definition": "Abruptly quitting a habit, marked by discomfort and determination."}},
    {{"idiom": "another idiom", "definition": "Its definition here."}}
]}}
"""

    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]

    try:
        response = client.chat.completions.create(model="gpt-4", messages=chat_messages)
        # The reply is expected to be exactly the JSON document requested above
        raw_reply = response.choices[0].message.content.strip()
        results = {}
        for entry in json.loads(raw_reply)["idioms"]:
            results[entry["idiom"]] = entry["definition"]
        for idiom, definition in results.items():
            print(f"Idiom: {idiom}\nDefinition: {definition}\n")
    except Exception as e:
        print(f"Error generating paraphrases: {e}")
        return {}
    return results

In [None]:
# Query GPT in batches of `batch_size` rows, accumulating compound -> definition
batch_size = 10
results = {}
for i in range(0, len(df), batch_size):
    batch = df.iloc[i:i + batch_size]
    compounds = batch['compound'].tolist()
    sentence_types = batch['sentence_type'].tolist()
    batch_results = generate_paraphrases_batched(compounds, sentence_types)
    results.update(batch_results)

# Map results back to DataFrame
df['gpt_prompt_1_resp'] = df['compound'].map(results)

# Rows without a GPT definition (literal rows, failed batches, or idioms the
# model returned under a different spelling) fall back to the compound itself.
# Assign the result rather than calling fillna(inplace=True) on the selected
# column: that chained form emits a FutureWarning and stops updating `df` in
# pandas 3.0.
df['gpt_prompt_1_resp'] = df['gpt_prompt_1_resp'].fillna(df['compound'])

Idiom: elbow grease
Definition: Hard physical work, especially vigorous cleaning, characterized by personal effort and perseverance.

Idiom: night owl
Definition: A person habitually active or awake at night, often associated with creativity or solitude.

Idiom: heart of gold
Definition: Incredibly kind and generous nature, reflecting a person's ability to be compassionate and unselfish.

Idiom: agony aunt
Definition: A person, often a columnist, who gives advice to people with personal problems, embodying sensitivity and wisdom.

Idiom: shrinking violet
Definition: A person who is shy or modest and avoids drawing attention to themselves, often associated with introversion or lack of confidence.

Idiom: banana republic
Definition: A small nation, especially in the tropics, dependent on one crop or the influx of foreign capital, often associated with instability, corruption, and inequality.

Idiom: private eye
Definition: A private investigator, often operating outside law enforcement, 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['gpt_prompt_1_resp'].fillna(df['compound'], inplace=True)


#### GPT Prompt 2: NC definition with multi-step reasoning

In [None]:
# GPT Prompt #2

import json
from google.colab import userdata
from openai import OpenAI

# initialize openai client with colab secret key
client = OpenAI(api_key=userdata.get('OPENAI_PROJECT_KEY'))

def generate_paraphrases_batched_2(compounds, sentence_types):
    """
    Ask GPT-4 for a definition of every non-literal compound, prompting it to
    reason in steps (verbose explanation, three candidates, final choice).

    Returns a dict mapping idiom -> chosen definition. If every entry of the
    batch is literal, no request is made and each compound maps to itself.
    Any failure (request or response parsing) is printed and yields an empty
    dict.
    """
    # Only non-literal compounds are sent to the model
    idioms_to_define = []
    for nc, sentence_type in zip(compounds, sentence_types):
        if sentence_type != "literal":
            idioms_to_define.append(nc)

    # All-literal batch: nothing to ask, hand back the compounds unchanged
    if len(idioms_to_define) == 0:
        return {name: name for name in compounds}

    # One line per idiom, embedded into the prompt below
    examples = "\n".join(f'The idiom is: "{nc}".' for nc in idioms_to_define)

    prompt = f"""
You are a linguistics expert specializing in idioms. For each of the idioms below, do the following steps aloud (in writing):
1. Give a verbose explanation of the idiom, including what connotations it carries or undertones it evokes.
2. List three potential definitions, no longer than 20 words each, that capture the essence of the phrase in a general manner.
3. Choose the best definition.

Example #1:
Idiom: "cold turkey"
Definition: "Abruptly quitting a habit or addiction, overcoming discomfort or pain, requiring determination."

Example #2:
Idiom: "piece of cake"
Definition: "Easy, simple to accomplish, requiring little effort, not a problem."

Idioms to define:
{examples}

Respond in this format:
{{"idioms": [
    {{"idiom": "cold turkey", "verbose_definition": "<verbose_definition>", "possible_definitions": "<three possible definitions>", "definition": "Abruptly quitting a habit, marked by discomfort and determination."}},
    {{"idiom": "<another idiom>", "verbose_definition": "<verbose_definition>", "possible_definitions": "<three possible definitions>" , "definition": "<its definition>" }}
]}}
"""

    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]

    try:
        response = client.chat.completions.create(model="gpt-4", messages=chat_messages)
        # The reply is expected to be exactly the JSON document requested above
        raw_reply = response.choices[0].message.content.strip()
        content = json.loads(raw_reply)
        results = {}
        for entry in content["idioms"]:
            results[entry["idiom"]] = entry["definition"]
        # Show the intermediate reasoning fields alongside the final choice
        for item in content["idioms"]:
            print(f"Idiom: {item['idiom']}\nVerbose: {item['verbose_definition']}\nPossible: {item['possible_definitions']}\nDefinition: {item['definition']}\n")
    except Exception as e:
        print(f"Error generating paraphrases: {e}")
        return {}
    return results

In [None]:
# Query GPT in batches of `batch_size` rows, accumulating compound -> definition
batch_size = 10
results = {}
for i in range(0, len(df), batch_size):
    batch = df.iloc[i:i + batch_size]
    compounds = batch['compound'].tolist()
    sentence_types = batch['sentence_type'].tolist()
    batch_results = generate_paraphrases_batched_2(compounds, sentence_types)
    results.update(batch_results)

# Map results back to DataFrame
df['gpt_prompt_2_resp'] = df['compound'].map(results)

# Rows without a GPT definition (literal rows, failed batches, or idioms the
# model returned under a different spelling) fall back to the compound itself.
# Assign the result rather than calling fillna(inplace=True) on the selected
# column: that chained form emits a FutureWarning and stops updating `df` in
# pandas 3.0.
df['gpt_prompt_2_resp'] = df['gpt_prompt_2_resp'].fillna(df['compound'])

Idiom: elbow grease
Verbose: Hard work and determination applied to a menial or difficult task, often referring to a physically demanding one. The idiom evokes the image of putting in extra effort or exertion.
Possible: ['Applying hard work and commitment.', 'Effort used in scrubbing or cleaning.', 'Physical exertion done determinedly.']
Definition: Applying hard work and commitment.

Idiom: night owl
Verbose: Refers to a person who is naturally most alert, productive, or creative during the night or late hours. The phrase evokes the nocturnal habits of the owl.
Possible: ['Individual who stays up late.', 'Person preferring late hours for productivity.', 'Someone active during night-time.']
Definition: Person preferring late hours for productivity.

Idiom: heart of gold
Verbose: Describes someone who is genuinely kind, compassionate, or selfless, much like the way gold is valued for its worth. This idiom suggests selflessness and moral worth.
Possible: ['Being kind and generous.', 'Cha

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['gpt_prompt_2_resp'].fillna(df['compound'], inplace=True)


#### Alternative chain-of-thought prompt

In [None]:
# GPT Prompt #4

import json
from google.colab import userdata
from openai import OpenAI

# initialize openai client with colab secret key
client = OpenAI(api_key=userdata.get('OPENAI_PROJECT_KEY'))

def _extract_json_object(text, marker):
    """Return the first JSON object that follows `marker` in `text` (or the first one anywhere if the marker is absent)."""
    decoder = json.JSONDecoder()
    marker_pos = text.find(marker)
    # str.find returns -1 when the marker is missing; search from the start then
    search_from = marker_pos + len(marker) if marker_pos != -1 else 0

    brace = text.find("{", search_from)
    while brace != -1:
        try:
            # raw_decode parses one JSON value and ignores whatever follows it
            candidate, _ = decoder.raw_decode(text, brace)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        brace = text.find("{", brace + 1)
    raise ValueError("no JSON object found in model reply")


def generate_paraphrases_4(compound, sentence):
    """
    Ask GPT-4 whether `compound` is used literally or idiomatically in
    `sentence` and what it means there, using a few-shot chain-of-thought
    prompt.

    Returns the parsed JSON object from the model's answer (the prompt asks
    for the keys "phrase", "usage" and "meaning"). Any failure — request
    error, no parsable JSON object, or a missing key — is printed and yields
    an empty dict.

    The JSON object is located after the "Example 3 response" marker when the
    model echoes it, otherwise anywhere in the reply, and text following the
    object is ignored. The previous fixed-offset slice mis-parsed replies
    without the marker or with any trailing text.
    """
    prompt = f"""You are a linguistics expert specializing in idioms. Given a potentially idiomatic phrase, and a sentence containing that phrase, you should determine whether the phrase is used literally or idiomatically in this particular instance, and describe the meaning of the phrase in this context.
Explain your reasoning.

Example #1
Phrase: "kick the bucket"
Sentence: "As John stood up from the table he accidentally kicked the bucket that was hidden underneath it."
Reasoning: To kick is to strike with the foot. A bucket is a container made of metal or plastic for carrying water. Kicking a bucket therefore means striking a container with the foot.
Alternatively, to kick the bucket, when used idiomatically, can mean to die.
Which option makes more sense in this case?
In the literal case, the sentence would mean that John is kicking a physical object, which we are told is underneath the table. It is plausible that a bucket could be hidden underneath a table and that standing up would cause one to kick that bucket.
In the idiomatic case, the sentence would mean that John accidentally dies as he stands up from the table, but then what is hidden underneath the table? The action of dying cannot be underneath a table. This usage does not make sense.
Therefore, the phrase is used literally in this instance.
Example 1 response: {{"phrase": "kick the bucket", "usage": "literal", "meaning": "kicking a bucket, which might cause pain in the foot or cause a loud noise."}}


Example #2
Idiom: "piece of cake"
Sentence: "Sarah thought the final exam was a piece of cake compared to the midterm."
Reasoning: A cake is a baked dessert, a piece of cake is a small amount of desert.
Alternatively, in idiomatic usage, something is considered a piece of cake if it is easy or requires little effort.
Which option makes more sense in this case?
In the literal case, the sentence would mean that Sarah thinks her exam is a piece of dessert. The piece of cake is compared to a different exam. It is unlikely that an exam is literally a piece of cake, and cake is not a comparable value like distance or size or intensity. This usage is unlikely.
In the idiomatic case, the sentence would mean that Sarah thinks her final exam was easy compared to the midterm. It is common to compare one exam with another in terms of its difficulty, so this usage makes sense.
Therefore, the phrase is used idiomatically in this instance.
Example 2 response: {{"phrase": "piece of cake", "usage": "idiomatic", "meaning": "an activity or task that is delightfully easy, requiring little effort and quickly accomplished."}}


Example #3
Phrase: "{compound}"
Sentence: "{sentence}"
"""

    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
        )
        reply = response.choices[0].message.content.strip()
        content = _extract_json_object(reply, marker='Example 3 response')
        print(f"Idiom: {content['phrase']}\nMeaning: {content['meaning']}\nUsage: {content['usage']}")
        return content
    except Exception as e:
        print(f"Error generating paraphrases: {e}")
        return {}

In [None]:
# Query GPT once per row (this prompt is not batched)
results = []
for i in range(len(df)):
    batch = df.iloc[i]
    compound = batch['compound']
    sentence = batch['sentence']
    batch_results = generate_paraphrases_4(compound, sentence)
    results.append(batch_results)

# Map results back to DataFrame.
# generate_paraphrases_4 returns {} when the request or the parsing fails, so
# index with .get(): a failed row falls back to its compound as text input and
# gets no usage label, instead of a KeyError discarding every response.
df['gpt_prompt_4_resp'] = [res.get('meaning', nc) for res, nc in zip(results, df['compound'])]
df['gpt_prompt_4_type'] = [res.get('usage') for res in results]

In [None]:
# with open('gpt_prompt_4_results.csv', mode="w", newline="", encoding="utf-8") as file:
#     fieldnames = results[0].keys()
#     writer = csv.DictWriter(file, fieldnames=fieldnames)
#     writer.writeheader()
#     writer.writerows(results)

In [None]:
# True where GPT's usage label is exactly equal to the gold `sentence_type` string.
df['gpt_prompt_4_correct_type'] = df['gpt_prompt_4_type'] == df['sentence_type']

In [None]:
# Number of rows whose GPT usage label matched the gold label.
df['gpt_prompt_4_correct_type'].sum()

#### GPT Prompt 3: NC in-context definition, prompt decomposition (subtasks)

In [None]:
# GPT Prompt #3

import json
from google.colab import userdata
from openai import OpenAI

# initialize openai client with colab secret key
client = OpenAI(api_key=userdata.get('OPENAI_PROJECT_KEY'))

def generate_paraphrases_batched_3(compounds, sentences):
    """
    Ask GPT-4 for an in-context definition of each phrase, given the sentence
    it appears in, using a prompt decomposed into explicit sub-steps.

    Returns a dict mapping phrase -> the "definition" field of the model's
    answer. The full parsed answer is printed as indented JSON. Any failure
    (request or response parsing) is printed and yields an empty dict.
    """
    # One "Phrase / Sentence" pair per sample, separated by blank lines
    sample_blocks = [
        f'Phrase: "{nc}"\nSentence: "{sentence}"' for nc, sentence in zip(compounds, sentences)
    ]
    examples = "\n\n".join(sample_blocks)

    prompt = f"""
You are a linguistics expert specializing in figurative language. You will be given a set samples, each containing a potentially figurative English phrase paired with a sentence that said phrase is used in.
For each sample, you are to do the following:
1. Read the sentence; consider how the phrase is used in the sentence. It might be used figuratively (i.e. as an idiom), and it might be used literally (i.e. word composition).
2. Verbose explanation: Given your familiarity with the phrase's possible meanings, and having considered how it's used in the sentence, give a verbose explanation of what the phrase means in the context of the sentence. This can be a few sentences long.
3. Determine usage: State whether the phrase is used figuratively or literally in the sentence.
4. Definition: A concise, generalized definition of the phrase in this sentence.
5. Other usage definition: Assuming the phrase has both a literal definition and a figurative definition, give the definition for the OTHER usage.

Example #1:
<Sample>
Phrase: "cold turkey"
Sentence: "John quit smoking cold turkey and never looked back, not that it was easy."
---
<Output>
#3 - "Figuratively"
#4 - "Abruptly quitting a habit, marked by discomfort and determination"
#5 - "A turkey, which is a type of bird, that is cold"

Example #2:
<Sample>
Phrase: "piece of cake"
Sentence: "The boy eyed the piece of cake from afar as the waitress approached from across the room."
---
<Output>
#3 - "Literally"
#4 - "A slice of a sweet, baked dessert"
#5 - "A task that can be completed with no difficulty; something easy to accomplish"

Respond in this format:
{{"samples": [
    {{"phrase": "cold turkey", "verbose_definition": "<see #2 above>", "usage": "<see #3 above>", "definition": "<see #4 above>", "other_definition": "<see #5 above>"}},
    {{"phrase": "piece of cake", "verbose_definition": "<see #2 above>", "usage": "<see #3 above>", "definition": "<see #4 above>", "other_definition": "<see #5 above>" }}
]}}

In your definitions, do not preface your definition with phrases like, "The literal meaning would be..." or "The figurative meaning is...". Just give the definition, such that it could be used in a downstream task.
These are the samples:
{examples}
"""

    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]

    try:
        response = client.chat.completions.create(model="gpt-4", messages=chat_messages)
        # The reply is expected to be exactly the JSON document requested above
        raw_reply = response.choices[0].message.content.strip()
        content = json.loads(raw_reply)
        print(json.dumps(content, indent=4))
        results = {}
        for entry in content["samples"]:
            results[entry["phrase"]] = entry["definition"]
    except Exception as e:
        print(f"Error generating paraphrases: {e}")
        return {}
    return results

In [None]:
# Query GPT in batches of `batch_size` rows, accumulating compound -> definition
batch_size = 5
results = {}
for i in range(0, len(df), batch_size):
    batch = df.iloc[i:i + batch_size]
    compounds = batch['compound'].tolist()
    sentences = batch['sentence'].tolist()
    batch_results = generate_paraphrases_batched_3(compounds, sentences)
    results.update(batch_results)

# Map results back to DataFrame
df['gpt_prompt_3_resp'] = df['compound'].map(results)

# A failed batch (or a phrase the model returned under a different spelling)
# leaves NaN here, which would later be passed to the text encoders as input.
# Fall back to the compound itself, as the Prompt 1 and Prompt 2 cells do.
df['gpt_prompt_3_resp'] = df['gpt_prompt_3_resp'].fillna(df['compound'])

{
    "samples": [
        {
            "phrase": "elbow grease",
            "verbose_definition": "In the context of the sentence, 'elbow grease' is used to denote effort and hard work that one puts into a task or activity. It implies a scenario where the person has devoted considerable physical effort and possibly time to make the engine function again.",
            "usage": "Figuratively",
            "definition": "Determined effort, energy, and hard work",
            "other_definition": "Natural oil created by the skin around the elbow joint."
        },
        {
            "phrase": "night owl",
            "verbose_definition": "The phrase 'night owl' in this sentence is used to denote someone who is more active, alert, or productive during the nighttime hours as opposed to the daytime. It's a figurative expression to describe the person's sleep habits and preferred active hours.",
            "usage": "Figuratively",
            "definition": "A person who is habitually a

In [None]:
# Write each row's inputs next to the three GPT responses to gpt_prompt_responses.csv (row index omitted).
df[['compound', 'sentence_type', 'sentence', 'gpt_prompt_1_resp', 'gpt_prompt_2_resp', 'gpt_prompt_3_resp']].to_csv('gpt_prompt_responses.csv', index=False)

#### Prompts 4 and 5 (variations of 3)

In [None]:
# "NC for literal": literal rows use the bare noun compound as text input,
# every other row keeps its GPT Prompt 3 definition. The two branches were
# previously swapped, so idiomatic rows received the compound instead.
df['gpt_prompt_3_with_nc_for_literal'] = df.apply(
    lambda row: row['compound'] if row['sentence_type'] == 'literal' else row['gpt_prompt_3_resp'],
    axis=1
)

In [None]:
# "Sentence for literal": literal rows use the full context sentence as text
# input, every other row keeps its GPT Prompt 3 definition. This cell used to
# be a verbatim copy of the "nc for literal" cell and never read the sentence.
df['gpt_prompt_3_with_sentence_for_literal'] = df.apply(
    lambda row: row['sentence'] if row['sentence_type'] == 'literal' else row['gpt_prompt_3_resp'],
    axis=1
)

In [None]:
# Load per-row sentence-type predictions.
# NOTE(review): no cell shown here creates sentence_type_predictions.csv — presumably it is
# exported by a separate classifier run and uploaded manually; TODO document its source.
df_sentence_type_preds = pd.read_csv('sentence_type_predictions.csv')

In [None]:
# Sanity check: the predictions file and the dataset should have the same number of rows.
print(len(df_sentence_type_preds))
print(len(df))

70
70


In [None]:
# Translate the numeric predicted label into the dataset's label strings:
# 0 -> 'literal', anything else -> 'idiomatic'.
# NOTE(review): the assignment aligns on index, so this assumes the predictions file lists
# rows in the same order as df — the length check only compares sizes; TODO confirm.
df['sentence_type_pred'] = df_sentence_type_preds['predicted_label'].apply(
    lambda x: 'literal' if x == 0 else 'idiomatic'
)


In [None]:
# Copy the predictions file's 'is_correct' column into df (matched to df rows by index).
df['sentence_type_pred_correct'] = df_sentence_type_preds['is_correct']

In [None]:
# Preview the first rows, including the columns added above.
df.head()

Unnamed: 0,compound,subset,sentence_type,sentence,expected_order,image1_name,image1_caption,image2_name,image2_caption,image3_name,...,image_paths,image_idx_map,expected_order_indices,gpt_prompt_1_resp,gpt_prompt_2_resp,gpt_prompt_3_resp,gpt_prompt_3_with_nc_for_literal,gpt_prompt_3_with_sentence_for_literal,sentence_type_pred,sentence_type_pred_correct
0,elbow grease,Train,idiomatic,It took a lot of elbow grease to get the old e...,"['35234427395.png', '53378381715.png', '399382...",35234427395.png,The image depicts a hand holding a sponge and ...,39938261459.png,The image depicts a hand wearing a yellow work...,53378381715.png,...,"[train/elbow grease/35234427395.png, train/elb...","{'35234427395.png': 0, '39938261459.png': 1, '...","[0, 2, 1, 4, 3]","Hard physical work, especially vigorous cleani...",Applying hard work and commitment.,"Determined effort, energy, and hard work",elbow grease,elbow grease,literal,False
1,night owl,Train,idiomatic,"It's a constant battle for us, as he is a morn...","['61697797701.png', '93189810779.png', '893752...",00982495584.png,The image depicts a nighttime scene with a lar...,61697797701.png,The image depicts a cartoon-style illustration...,89375227504.png,...,"[train/night owl/00982495584.png, train/night ...","{'00982495584.png': 0, '61697797701.png': 1, '...","[1, 3, 2, 0, 4]","A person habitually active or awake at night, ...",Person preferring late hours for productivity.,A person who is habitually active or awake at ...,night owl,night owl,literal,False
2,heart of gold,Train,idiomatic,Even the somewhat seedy failed private eye has...,"['86137977215.png', '78062290185.png', '542405...",54240592941.png,"The image depicts a large, metallic safe with ...",78062290185.png,The image depicts a joyful scene featuring a y...,86137977215.png,...,"[train/heart of gold/54240592941.png, train/he...","{'54240592941.png': 0, '78062290185.png': 1, '...","[2, 1, 0, 4, 3]","Incredibly kind and generous nature, reflectin...",Being kind and generous.,A person with a very kind and good nature,heart of gold,heart of gold,idiomatic,True
3,agony aunt,Sample,idiomatic,ESA's Space Weather Office is like Europe's st...,"['83600499282.png', '57658144685.png', '025128...",02512838127.png,The image depicts a serene outdoor scene featu...,32964421720.png,The image depicts a cartoon-style illustration...,57658144685.png,...,"[train/agony aunt/02512838127.png, train/agony...","{'02512838127.png': 0, '32964421720.png': 1, '...","[3, 2, 0, 1, 4]","A person, often a columnist, who gives advice ...",Person who offers advice or solutions.,A person who offers advice or solutions to pro...,agony aunt,agony aunt,idiomatic,True
4,shrinking violet,Train,idiomatic,"This aged, rich wine is no shrinking violet wi...","['77861539717.png', '68016869942.png', '118443...",11844321898.png,The image depicts a bouquet of purple tulips a...,45394842176.png,"The image depicts a magnifying glass, commonly...",68016869942.png,...,"[train/shrinking violet/11844321898.png, train...","{'11844321898.png': 0, '45394842176.png': 1, '...","[3, 2, 0, 1, 4]",A person who is shy or modest and avoids drawi...,Excessively shy individual.,Referred to something not shrinking back or be...,shrinking violet,shrinking violet,idiomatic,True


### Experiment and model configurations

#### Experiments config

In [None]:
# Each experiment pairs a display name with the df column used as text input.
experiments = [
    {"name": experiment_label, "text_inputs": df[text_column]}
    for experiment_label, text_column in (
        ("Baseline (Sentences)", 'sentence'),
        ("NC-Only", 'compound'),
        ("GPT Prompt 1", 'gpt_prompt_1_resp'),
        ("GPT Prompt 2", 'gpt_prompt_2_resp'),
        ("GPT Prompt 3", 'gpt_prompt_3_resp'),
        ("GPT Prompt 3 with nc for literal", 'gpt_prompt_3_with_nc_for_literal'),
        ("GPT Prompt 3 with sentence for literal", 'gpt_prompt_3_with_sentence_for_literal'),
    )
]

In [None]:
# NOTE(review): 'text_input_exp_4' is not created by any cell shown above, so on a fresh
# "Restart & Run All" this cell raises KeyError unless the column is built elsewhere.
# Presumably it holds the sentence for rows predicted literal and a GPT definition
# otherwise — TODO confirm and add the defining cell before this one.
experiments.append({
        "name": "GPT Prompt 4 with sentence if predicted literal",
        "text_inputs": df['text_input_exp_4']
    })

#### Models config

In [None]:
from transformers import AlignProcessor, AlignModel

device = "cuda" if torch.cuda.is_available() else "cpu"

# Model registry consumed by the experiment loops. Every entry carries:
#   base_model - model family ("CLIP", "Align", ...)
#   model_name - checkpoint identifier within that family
#   model      - the loaded model object
#   preprocess - the matching preprocessing / processor object
models = []

# clip.load returns a (model, preprocess) pair, so each checkpoint is loaded
# once and unpacked instead of calling clip.load twice and indexing each result.
for clip_model_name in ("ViT-B/32", "ViT-L/14", "RN50x64"):
    clip_model, clip_preprocess = clip.load(clip_model_name, device)
    models.append({
        "base_model": "CLIP",
        "model_name": clip_model_name,
        "model": clip_model,
        "preprocess": clip_preprocess
    })

# NOTE(review): unlike the CLIP models, the Align model is not explicitly moved
# to `device` here - confirm that run_experiment handles device placement.
models.append({
    "base_model": "Align",
    "model_name": "Base",
    "model": AlignModel.from_pretrained("kakaobrain/align-base"),
    "preprocess": AlignProcessor.from_pretrained("kakaobrain/align-base")
})

100%|███████████████████████████████████████| 338M/338M [00:06<00:00, 53.9MiB/s]
100%|███████████████████████████████████████| 890M/890M [00:11<00:00, 77.9MiB/s]
100%|█████████████████████████████████████| 1.26G/1.26G [00:22<00:00, 60.7MiB/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/5.25k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/690M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
# Load the OpenCLIP checkpoint and register it next to the other models.
openclip_model_version = "ViT-B-32"
model_openclip, _, preprocess_openclip = open_clip.create_model_and_transforms(openclip_model_version, pretrained='laion2b_s34b_b79k')
model_openclip.to(device)
model_openclip.eval()  # model in train mode by default, impacts some models with BatchNorm or stochastic depth active

# Drop any open_clip entry for this version left by an earlier execution of this
# cell, so re-running it replaces the entry instead of registering a duplicate.
models[:] = [
    entry for entry in models
    if (entry["base_model"], entry["model_name"]) != ("open_clip", openclip_model_version)
]
models.append({
    "base_model": "open_clip",
    "model_name": openclip_model_version,
    "model": model_openclip,
    "preprocess": preprocess_openclip
})

open_clip_pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

### RUN ALL

In [None]:
# Full sweep: every experiment's text inputs are evaluated with every registered
# model, and each run's predictions are written out right after it finishes.
for exp in experiments:
    print(f"\nRunning \"{exp['name']}\" on {len(models)} models...")

    for idx, conf in enumerate(models):
        print(f"\nModel [{idx+1}/{len(models)}]: {conf['model_name']} ({conf['base_model']})")

        # Prefix handed to save_predictions for this (model, experiment) run.
        preds_prefix = f"{conf['base_model']}_{conf['model_name']}_{exp['name'].lower()}"

        metrics, preds, confidence = run_experiment(
            df=df,
            image_paths=df['image_paths'],
            text_inputs=exp['text_inputs'],
            model=conf['model'],
            processor=conf['preprocess'],
            base_model=conf['base_model'],
            model_name=conf['model_name'],
            experiment_name=exp['name']
        )
        save_predictions(
            df=df,
            image_paths=df['image_paths'],
            predictions=preds,
            confidence_scores=confidence,
            metrics=metrics,
            prefix=preds_prefix
        )


Running "Baseline (Sentences)" on 5 models...

Model [1/5]: ViT-B/32 (CLIP)
Results saved to experiment_results.csv
Top-1 Accuracy: 45.71%
Average Spearman Correlation: 0.12
Average Weighted Accuracy: 0.35
Predictions saved to predictions/CLIP_ViT-B32_baseline_sentences_preds.csv

Model [2/5]: ViT-L/14 (CLIP)
Results saved to experiment_results.csv
Top-1 Accuracy: 47.14%
Average Spearman Correlation: 0.15
Average Weighted Accuracy: 0.36
Predictions saved to predictions/CLIP_ViT-L14_baseline_sentences_preds.csv

Model [3/5]: RN50x64 (CLIP)
Results saved to experiment_results.csv
Top-1 Accuracy: 50.00%
Average Spearman Correlation: 0.18
Average Weighted Accuracy: 0.37
Predictions saved to predictions/CLIP_RN50x64_baseline_sentences_preds.csv

Model [4/5]: Base (Align)
Results saved to experiment_results.csv
Top-1 Accuracy: 48.57%
Average Spearman Correlation: 0.15
Average Weighted Accuracy: 0.39
Predictions saved to predictions/Align_Base_baseline_sentences_preds.csv

Model [5/5]: ViT-B

In [None]:
import os
import pandas as pd

def combine_predictions(models, experiments, preds_dir='predictions'):
    """
    Combine the per-run prediction CSV files into a single DataFrame.

    For every (experiment, model) pair the expected file name is rebuilt from
    the model's base_model / model_name and the lower-cased experiment name
    (spaces become underscores, then every character outside [a-zA-Z0-9_-] is
    removed). This presumably mirrors the naming used when the predictions were
    saved - TODO confirm against save_predictions.

    Args:
        models: iterable of dicts with at least 'base_model' and 'model_name'.
        experiments: iterable of dicts with at least 'name'.
        preds_dir: directory that holds the '<prefix>_preds.csv' files.

    Returns:
        A DataFrame with the rows of every file that was found, plus the
        columns 'base_model', 'model_name' and 'experiment' identifying the
        run each row came from. An empty DataFrame if no file was found.
        Missing files are reported with a printed warning and skipped.
    """
    # Imported locally so the function does not depend on another notebook cell
    # having imported `re` (this cell itself only imports os and pandas).
    import re

    combined_preds = []

    for exp in experiments:
        for conf in models:
            # Rebuild the sanitized file-name prefix for this run.
            prefix = f"{conf['base_model']}_{conf['model_name']}_{exp['name'].lower()}"
            prefix = prefix.strip().replace(" ", "_")
            prefix = re.sub(r'[^a-zA-Z0-9_-]', '', prefix)

            preds_file_path = f"{preds_dir}/{prefix}_preds.csv"

            if not os.path.exists(preds_file_path):
                print(f"Warning: Predictions file not found: {preds_file_path}")
                continue

            print(f"Loading predictions from: {preds_file_path}")

            # Named preds_df (not df) so the notebook-level `df` is not shadowed.
            preds_df = pd.read_csv(preds_file_path)

            # Tag every row with the run it belongs to.
            preds_df['base_model'] = conf['base_model']
            preds_df['model_name'] = conf['model_name']
            preds_df['experiment'] = exp['name']

            combined_preds.append(preds_df)

    # pd.concat raises on an empty list, so fall back to an empty frame.
    combined_preds_df = pd.concat(combined_preds, ignore_index=True) if combined_preds else pd.DataFrame()
    return combined_preds_df


# Merge every predictions file found on disk for the configured models/experiments.
combined_preds_df = combine_predictions(models=models, experiments=experiments)

# Keep a copy of the merged table on disk (row index omitted).
combined_preds_df.to_csv('combined_predictions.csv', index=False)
print("Combined predictions saved to combined_predictions.csv")


Loading predictions from: predictions/CLIP_ViT-B32_baseline_sentences_preds.csv
Loading predictions from: predictions/CLIP_ViT-L14_baseline_sentences_preds.csv
Loading predictions from: predictions/CLIP_RN50x64_baseline_sentences_preds.csv
Loading predictions from: predictions/Align_Base_baseline_sentences_preds.csv
Loading predictions from: predictions/open_clip_ViT-B-32_baseline_sentences_preds.csv
Loading predictions from: predictions/CLIP_ViT-B32_nc-only_preds.csv
Loading predictions from: predictions/CLIP_ViT-L14_nc-only_preds.csv
Loading predictions from: predictions/CLIP_RN50x64_nc-only_preds.csv
Loading predictions from: predictions/Align_Base_nc-only_preds.csv
Loading predictions from: predictions/open_clip_ViT-B-32_nc-only_preds.csv
Loading predictions from: predictions/CLIP_ViT-B32_gpt_prompt_1_preds.csv
Loading predictions from: predictions/CLIP_ViT-L14_gpt_prompt_1_preds.csv
Loading predictions from: predictions/CLIP_RN50x64_gpt_prompt_1_preds.csv
Loading predictions from:

In [None]:
import numpy as np

# Export each compound with its sentence, sentence type and the GPT prompt
# response columns to a CSV file (row index omitted).
df.loc[:, [
    'compound',
    'sentence_type',
    'sentence',
    'gpt_prompt_1_resp',
    'gpt_prompt_2_resp',
    'gpt_prompt_3_resp',
    'gpt_prompt_3_with_nc_for_literal',
    'gpt_prompt_3_with_sentence_for_literal',
]].to_csv('gpt_prompt_responses_2.csv', index=False)


### Deprecated (run specific standalone experiments)

#### Baseline 1: Sentence only

In [None]:
# Baseline: full context sentence as the text input for every model.
exp_name = "Baseline (Sentences)"

print(f"\nRunning \"{exp_name}\" on {len(models)} models...")
for idx, conf in enumerate(models):
  print(f"\nModel [{idx+1}/{len(models)}]: {conf['model_name']} ({conf['base_model']})")

  metrics, preds, confidence = run_experiment(
      model=conf['model'],
      processor=conf['preprocess'],
      df=df,
      image_paths=df['image_paths'],
      # Sentence baseline: use the sentence column (this previously passed
      # df['compound'], which silently re-ran the NC-only experiment).
      text_inputs=df['sentence'],
      model_name=conf['model_name'],
      experiment_name=exp_name,
      base_model=conf['base_model']
  )
  save_predictions(
      df=df,
      image_paths=df['image_paths'],
      predictions=preds,
      confidence_scores=confidence,
      metrics=metrics,
      prefix=f"{conf['base_model']}_{conf['model_name']}_{exp_name.lower()}"
  )

#### Baseline 2: NC only

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Experiment 2: NC-only - the nominal compound alone is the text input.
exp_name = "NC-Only"

print(f"\nRunning \"{exp_name}\" on {len(models)} models...")
for idx, conf in enumerate(models):
  print(f"\nModel [{idx+1}/{len(models)}]: {conf['model_name']} ({conf['base_model']})")

  metrics, preds, confidence = run_experiment(
      model=conf['model'],
      processor=conf['preprocess'],
      df=df,
      image_paths=df['image_paths'],
      text_inputs=df['compound'],
      model_name=conf['model_name'],
      experiment_name=exp_name,
      base_model=conf['base_model']
  )
  save_predictions(
      df=df,
      image_paths=df['image_paths'],
      predictions=preds,
      confidence_scores=confidence,
      metrics=metrics,
      # Lower-case the experiment name like every other cell does, so the file
      # name matches the prefix that combine_predictions looks for.
      prefix=f"{conf['base_model']}_{conf['model_name']}_{exp_name.lower()}"
  )

#### Experiment 3: GPT Prompt 1

In [None]:
# Experiment 3: GPT Prompt 1 - the GPT response to prompt 1 is the text input.
exp_name = "GPT Prompt 1"

print(f"\nRunning \"{exp_name}\" on {len(models)} models...")
for idx, conf in enumerate(models):
  print(f"\nModel [{idx+1}/{len(models)}]: {conf['model_name']} ({conf['base_model']})")

  metrics, preds, confidence = run_experiment(
      model=conf['model'],
      processor=conf['preprocess'],
      df=df,
      image_paths=df['image_paths'],
      # Use the prompt-1 responses (this previously passed df['compound'],
      # which re-ran the NC-only experiment under the "GPT Prompt 1" name).
      text_inputs=df['gpt_prompt_1_resp'],
      model_name=conf['model_name'],
      experiment_name=exp_name,
      base_model=conf['base_model']
  )
  save_predictions(
      df=df,
      image_paths=df['image_paths'],
      predictions=preds,
      confidence_scores=confidence,
      metrics=metrics,
      prefix=f"{conf['base_model']}_{conf['model_name']}_{exp_name.lower()}"
  )

In [None]:
# Bundle the predictions folder, the experiment results file and the exported
# GPT prompt responses into a single archive.
# NOTE(review): combined_predictions.csv is not part of the archive - confirm
# whether that is intentional.
!zip -r results_1751.zip predictions/ experiment_results.csv gpt_prompt_responses_2.csv
# Colab-only: hand the archive to the browser as a file download.
from google.colab import files
files.download('results_1751.zip')


  adding: predictions/ (stored 0%)
  adding: predictions/CLIP_ViT-L14_gpt_prompt_3_with_sentence_for_literal_preds.csv (deflated 64%)
  adding: predictions/CLIP_ViT-B32_gpt_prompt_3_preds.csv (deflated 62%)
  adding: predictions/CLIP_ViT-B32_gpt_prompt_3_with_nc_for_literal_preds.csv (deflated 62%)
  adding: predictions/CLIP_ViT-L14_gpt_prompt_3_with_nc_for_literal_preds.csv (deflated 63%)
  adding: predictions/CLIP_ViT-L14_gpt_prompt_3_preds.csv (deflated 63%)
  adding: predictions/CLIP_ViT-L14_gpt_prompt_2_preds.csv (deflated 63%)
  adding: predictions/Align_Base_gpt_prompt_1_preds.csv (deflated 62%)
  adding: predictions/open_clip_ViT-B-32_gpt_prompt_2_preds.csv (deflated 65%)
  adding: predictions/Align_Base_gpt_prompt_3_preds.csv (deflated 62%)
  adding: predictions/CLIP_ViT-B32_gpt_prompt_3_with_sentence_for_literal_preds.csv (deflated 62%)
  adding: predictions/Align_Base_gpt_prompt_3_with_sentence_for_literal_preds.csv (deflated 62%)
  adding: predictions/CLIP_RN50x64_gpt_promp

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>