In [1]:
import pandas as pd
from datasets import load_dataset
from pprint import pprint
from tqdm import tqdm   
import random
import os
import json
# shuffle list
random.seed(42)
from datasets import concatenate_datasets

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Instruction templates for the visual-entailment task. Each template contains the
# placeholder REPLACE_CLAIM exactly once; it is substituted with the quoted claim
# when the conversations are built. The templates are authored with the label pair
# "Entails or Contradicts" and rewritten in one pass to the label vocabulary
# "entailment or contradiction".
prompts = [
    template.replace('Entails or Contradicts', 'entailment or contradiction')
    for template in (
        "Does the image entail or contradict the claim REPLACE_CLAIM? Explain your reasoning and provide a label between Entails or Contradicts.",
        "Is the image consistent with the statement REPLACE_CLAIM? Justify your answer and classify it as either Entails or Contradicts.",
        "Does the picture support or refute the assertion REPLACE_CLAIM? Offer your rationale and select a label: Entails or Contradicts.",
        "Can the image be seen as validating or opposing the claim REPLACE_CLAIM? Explain your thought process and assign a label of Entails or Contradicts.",
        "Is there agreement or disagreement between the image and the claim REPLACE_CLAIM? Provide your analysis and choose between Entails or Contradicts.",
        "Does this image confirm or deny the claim REPLACE_CLAIM? Discuss your reasoning and determine a label: Entails or Contradicts.",
        "Is the image in harmony with or in conflict with the statement REPLACE_CLAIM? Explain your justification and label it as Entails or Contradicts.",
        "Does the image corroborate or dispute the claim REPLACE_CLAIM? Outline your reasoning and categorize it under Entails or Contradicts.",
        "Is the depiction aligned with or against the claim REPLACE_CLAIM? Share your evaluation and identify it as either Entails or Contradicts.",
        "Does the visual evidence support or counter the claim REPLACE_CLAIM? Provide your explanation and assign it a label of Entails or Contradicts.",
        "Is the content of the image endorsing or challenging the claim REPLACE_CLAIM? Justify your position and label it as Entails or Contradicts.",
        "Does the illustration affirm or negate the claim REPLACE_CLAIM? Articulate your reasoning and apply a label: Entails or Contradicts.",
        "Is the portrayal in the image consistent with or contradictory to the claim REPLACE_CLAIM? Offer your insights and select between Entails or Contradicts.",
        "Does the image agree with or dispute the claim REPLACE_CLAIM? Explain your analysis and mark it as Entails or Contradicts.",
        "Is the image's message supporting or opposing the claim REPLACE_CLAIM? Discuss your rationale and determine the appropriate label: Entails or Contradicts.",
        "Does the illustration affirm or contest the claim REPLACE_CLAIM? Provide your argument and choose a label: Entails or Contradicts.",
        "Is the visual portrayal compatible with or adverse to the claim REPLACE_CLAIM? Justify your viewpoint and label it as Entails or Contradicts.",
        "Does the image's depiction validate or refute the claim REPLACE_CLAIM? Explain your point of view and select a label: Entails or Contradicts.",
        "Is the visual content in agreement or disagreement with the claim REPLACE_CLAIM? Offer your explanation and categorize it under Entails or Contradicts.",
        "Does the image's narrative confirm or disprove the claim REPLACE_CLAIM? Discuss your reasoning and identify it as either Entails or Contradicts.",
        "Is the image's representation supportive of or contradictory to the claim REPLACE_CLAIM? Articulate your analysis and assign the label: Entails or Contradicts.",
    )
]

# Convert V-FLUTE

In [6]:
# Fetch the three V-FLUTE splits from the Hugging Face hub, in the order
# train -> validation -> test, caching downloads under ./huggingface_cache.
data_train, data_valid, data_test = (
    load_dataset("ColumbiaNLP/V-FLUTE", cache_dir="./huggingface_cache", split=split_name)
    for split_name in ("train", "validation", "test")
)

## All prompts (with images)

In [None]:
# Build the V-FLUTE fine-tuning files: for every split, export each image to disk and
# write one CSV plus one LLaVA-style conversation JSON.
#
# For a quick smoke test, replace the zip(...) below with
#   zip(["test"], [data_test.select(range(10))])

random.seed(42)  # fixes the sequence of random.choice(prompts) draws below

# Root directory for the exported images. Each image is written to
#   <save_dir_path>/<source_dataset>/<split>/<i>.<ext>
# which is exactly the relative path stored in the record's "image" field, so the
# JSON and the files on disk stay in sync. Saving everything flat as
# <save_dir_path>/<i>.<ext> would make later splits overwrite earlier ones, because
# the index i restarts at 0 for every split.
save_dir_path = "ENTER YOUR PATH"

# Make sure the destination for the CSV/JSON files exists before the first write.
out_dir = "../data/flute-v-llava-clean"
os.makedirs(out_dir, exist_ok=True)

for split, data_portion in zip(["train", "valid", "test"],
                                [data_train, data_valid, data_test]):
    ft_data = []
    for i, row in tqdm(enumerate(data_portion), total=len(data_portion)):
        source_dataset = row['source_dataset']
        image = row['image']

        # Keep JPEGs as .jpg; anything else is written as .png.
        img_format = "jpg" if image.format == "JPEG" else "png"
        # Relative path recorded in the outputs.
        im_path = f"{source_dataset}/{split}/{i}.{img_format}"
        # Absolute/anchored location of the same file under the image root.
        image_dir = os.path.join(save_dir_path, source_dataset, split)
        os.makedirs(image_dir, exist_ok=True)
        image.save(os.path.join(image_dir, f"{i}.{img_format}"))
        img_id = f"{source_dataset}-{split}-{i}"

        claim = row['claim'].strip()
        expl = row['explanation'].strip()
        label = row['label']

        # Draw one template uniformly from ALL prompts and insert the quoted claim.
        sampled_prompt = random.choice(prompts)
        sampled_prompt_repl = sampled_prompt.replace("REPLACE_CLAIM", '"' + f"{claim}" + '"').strip()

        # LLaVA-style record; "prompt" keeps the raw template (placeholder intact),
        # while the human turn carries the filled-in prompt.
        transformed = {
            "id": img_id,
            "source_dataset": source_dataset,
            "phenomenon": row['phenomenon'],
            "claim": claim,
            "label": label,
            "explanation": expl,
            "prompt": sampled_prompt,
            "image": im_path,
            "conversations": [
                {
                    "from": "human",
                    "value": f"<image>\n{sampled_prompt_repl}"
                },
                {
                    "from": "gpt",
                    "value": f"{expl}\nLABEL: {label}"
                }
            ]
        }
        ft_data.append([img_id,
                        source_dataset,
                        row['phenomenon'],
                        im_path,
                        claim,
                        label,
                        expl,
                        sampled_prompt,
                        transformed])

    df = pd.DataFrame(ft_data, columns=["id", "source_dataset", "phenomenon", "path",
                                        "claim", "label", "explanation",
                                        "prompt", "transformed"])
    # Deterministic shuffle of the rows before writing.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    df.to_csv(f"{out_dir}/vflute-v2-{split}.csv", index=False)
    print(df.shape)
    print(df['source_dataset'].value_counts())
    print(df['source_dataset'].value_counts()/df.shape[0]*100)
    with open(f'{out_dir}/vflute-v2-{split}.json', 'w') as f:
        json.dump(df['transformed'].to_list(), f, indent=4, ensure_ascii=False)


## No image

In [7]:
# "No image" ablation: build the same V-FLUTE records as the image-based export, but
# point every record at a single placeholder image instead of the real one. Nothing
# is written to the image folders by this cell.
#
# For a quick smoke test, replace the zip(...) below with
#   zip(["test"], [data_test.select(range(10))])

random.seed(42)  # fixes the sequence of random.choice(prompts) draws below

# Make sure the destination for the CSV/JSON files exists before the first write.
out_dir = "../data/flute-v-llava-clean"
os.makedirs(out_dir, exist_ok=True)

for split, data_portion in zip(["train", "valid", "test"],
                                [data_train, data_valid, data_test]):
    ft_data = []
    for i, row in tqdm(enumerate(data_portion), total=len(data_portion)):

        # Shared placeholder path used for every record
        # (presumably a blank image -- TODO confirm the file exists where the trainer looks).
        im_path = "white.png"
        img_id = f"{row['source_dataset']}-{split}-{i}"

        claim = row['claim'].strip()
        expl = row['explanation'].strip()
        label = row['label']

        # Draw one template uniformly from ALL prompts and insert the quoted claim.
        sampled_prompt = random.choice(prompts)
        sampled_prompt_repl = sampled_prompt.replace("REPLACE_CLAIM", '"' + f"{claim}" + '"').strip()

        # LLaVA-style record; "prompt" keeps the raw template (placeholder intact),
        # while the human turn carries the filled-in prompt.
        transformed = {
            "id": img_id,
            "source_dataset": row['source_dataset'],
            "phenomenon": row['phenomenon'],
            "claim": claim,
            "label": label,
            "explanation": expl,
            "prompt": sampled_prompt,
            "image": im_path,
            "conversations": [
                {
                    "from": "human",
                    "value": f"<image>\n{sampled_prompt_repl}"
                },
                {
                    "from": "gpt",
                    "value": f"{expl}\nLABEL: {label}"
                }
            ]
        }
        ft_data.append([img_id,
                        row['source_dataset'],
                        row['phenomenon'],
                        im_path,
                        claim,
                        label,
                        expl,
                        sampled_prompt,
                        transformed])

    df = pd.DataFrame(ft_data, columns=["id", "source_dataset", "phenomenon", "path",
                                        "claim", "label", "explanation",
                                        "prompt", "transformed"])
    # Deterministic shuffle of the rows before writing.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    df.to_csv(f"{out_dir}/vflute-v2-noimage-{split}.csv", index=False)
    print(df.shape)
    print(df['source_dataset'].value_counts())
    print(df['source_dataset'].value_counts()/df.shape[0]*100)
    with open(f'{out_dir}/vflute-v2-noimage-{split}.json', 'w') as f:
        json.dump(df['transformed'].to_list(), f, indent=4, ensure_ascii=False)


100%|██████████| 4578/4578 [01:38<00:00, 46.66it/s] 


(4578, 9)
source_dataset
memecap       1566
irfl          1082
muse           830
vismet         649
nycartoons     451
Name: count, dtype: int64
source_dataset
memecap       34.207077
irfl          23.634775
muse          18.130188
vismet        14.176496
nycartoons     9.851464
Name: count, dtype: float64


100%|██████████| 726/726 [00:14<00:00, 51.26it/s] 


(726, 9)
source_dataset
irfl          217
memecap       196
vismet        107
muse          106
nycartoons    100
Name: count, dtype: int64
source_dataset
irfl          29.889807
memecap       26.997245
vismet        14.738292
muse          14.600551
nycartoons    13.774105
Name: count, dtype: float64


100%|██████████| 723/723 [00:14<00:00, 49.41it/s] 


(723, 9)
source_dataset
irfl          220
memecap       196
muse          106
vismet        101
nycartoons    100
Name: count, dtype: int64
source_dataset
irfl          30.428769
memecap       27.109267
muse          14.661134
vismet        13.969571
nycartoons    13.831259
Name: count, dtype: float64


# e-ViL

In [25]:
# Convert the e-ViL CSVs (esnlive_{train,dev,test}.csv) into LLaVA-style conversation
# JSON files, one per split.
data_dir = "../e-ViL/data"

# Load each split and drop rows labelled "neutral", leaving the remaining gold labels
# to match the two-way label choice offered by the prompts.
train_df = pd.read_csv(f"{data_dir}/esnlive_train.csv")
train_df = train_df[train_df['gold_label'] != 'neutral']
valid_df = pd.read_csv(f"{data_dir}/esnlive_dev.csv")
valid_df = valid_df[valid_df['gold_label'] != 'neutral']
test_df = pd.read_csv(f"{data_dir}/esnlive_test.csv")
test_df = test_df[test_df['gold_label'] != 'neutral']
print(valid_df.shape, train_df.shape, test_df.shape)

# Make sure the destination for the JSON files exists before the first write.
evil_out_dir = "../data/evil-llava-clean"
os.makedirs(evil_out_dir, exist_ok=True)

random.seed(42)  # fixes the sequence of random.choice(prompts) draws below
for split, dataset in zip(["train", "valid", "test"],
                                [train_df, valid_df, test_df]):

    ft_data = []
    # total= gives tqdm a real progress bar; iterrows() alone has no length.
    for _idx, row in tqdm(dataset.iterrows(), total=len(dataset)):
        # Draw one template uniformly from ALL prompts and insert the quoted hypothesis.
        sampled_prompt = random.choice(prompts)
        sampled_prompt = sampled_prompt.replace("REPLACE_CLAIM", '"' + f"{row['hypothesis']}" + '"')
        transformed = {
            # NOTE(review): the id is built from the split and Flickr30kID only, so rows
            # that share an image would share an id -- confirm that is acceptable.
            "id": f"evil-{split}-{row['Flickr30kID']}",
            "image": f"evil/flickr30k_images/flickr30k_images/{row['Flickr30kID']}",
            "conversations": [
                {
                    "from": "human",
                    "value": f"<image>\n{sampled_prompt}"
                },
                {
                    "from": "gpt",
                    "value": f"{row['explanation']}\nLABEL: {row['gold_label']}"
                }
            ]
        }
        ft_data.append(transformed)
    with open(f'{evil_out_dir}/{split}.json', 'w') as f:
        json.dump(ft_data, f, indent=4)

(10897, 6) (275815, 6) (10939, 6)


275815it [00:15, 17295.63it/s]
10897it [00:00, 18311.54it/s]
10939it [00:00, 19143.77it/s]


# e-ViL + V-FLUTE


In [12]:
# Read the V-FLUTE training conversations back from disk and anchor every image
# path under the "VFLUTE-v2" folder. The list is reloaded from the file on each run,
# so re-executing this cell does not stack the prefix.
with open('../data/flute-v-llava-clean/vflute-v2-train.json', 'r') as src:
    train_json_vflute = json.load(src)
for record in train_json_vflute:
    record['image'] = "/".join(("VFLUTE-v2", record['image']))
# Show the first record as a sanity check.
train_json_vflute[0]

In [29]:
# Read the V-FLUTE validation conversations, anchor every image path under the
# "VFLUTE-v2" folder, and write the result out as the combined-run validation file.
# NOTE(review): the output is named evil_vflute_valid.json, but only V-FLUTE rows are
# written here (no e-ViL validation rows are added) -- confirm this is intended.
with open('../data/flute-v-llava-clean/vflute-v2-valid.json', 'r') as src:
    valid_json_vflute = json.load(src)
for record in valid_json_vflute:
    record['image'] = "/".join(("VFLUTE-v2", record['image']))
with open('../data/flute-v-llava-clean/evil_vflute_valid.json', 'w') as dst:
    json.dump(valid_json_vflute, dst, indent=4)

In [27]:
# Read the e-ViL training conversations back from disk. Their "image" values already
# carry the "evil/flickr30k_images/flickr30k_images/" prefix from the conversion
# step, so no path rewriting is done here.
with open('../data/evil-llava-clean/train.json', 'r') as src:
    train_json_evil = json.load(src)
# Show the first record as a sanity check.
train_json_evil[0]

{'id': 'evil-train-4564320256.jpg',
 'image': 'evil/flickr30k_images/flickr30k_images/4564320256.jpg',
 'conversations': [{'from': 'human',
   'value': '<image>\nIs the image\'s representation supportive of or contradictory to the claim "Two old men robbing a convenience store."? Articulate your analysis and assign the label: entailment or contradiction.'},
  {'from': 'gpt',
   'value': 'A lady and her granddaughter cannot also be two men\nLABEL: contradiction'}]}

In [28]:
# Combined training set: all e-ViL records followed by all V-FLUTE records
# (order preserved, no shuffling at this stage).
train_json_evil_flute = [*train_json_evil, *train_json_vflute]
# Report the size of each part and of the merged list.
print(len(train_json_evil), len(train_json_vflute), len(train_json_evil_flute))
with open('../data/flute-v-llava-clean/evil_vflute_train.json', 'w') as dst:
    json.dump(train_json_evil_flute, dst, indent=4)

275815 4578 280393
