<h1>Prepare the dataset for the experiments</h1>
This notebook prepares the CAD (counterfactually-augmented data) from Flickr generated by Kaushik et al.


We first import the three original splits in the repository, i.e. train, dev, and test.
Then, we map each original instance to its corresponding revision (either a premise revision or a hypothesis revision).

Then, we generate a 5-fold cross-validation split for our experiments.

For every fold, we take the training portion and sample a fraction "val_prop" of it to use as the validation set; the remaining rows form the training set.

script version: TODO prepare_flickrr_for_experiments.py

In [78]:
import pandas as pd
import numpy as np
import random
import sklearn.model_selection
import os

def load_set(url):
    """Read a tab-separated file into a DataFrame.

    Args:
        url: location handed straight to ``pandas.read_csv`` (the notebook
            passes raw GitHub URLs).

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    frame = pd.read_csv(url, sep="\t")
    return frame

## Build original dataset

In [49]:
# Base URL of the original (non-revised) NLI splits in the CAD repository.
ORIGINAL_NLI_URL = 'https://raw.githubusercontent.com/acmi-lab/counterfactually-augmented-data/master/NLI/original/'
# Column names used by the rest of the notebook for the original pairs.
ORIGINAL_COLUMN_NAMES = {"sentence1": "premise", "sentence2": "hypothesis", "gold_label": "label"}

origin_dev = load_set(ORIGINAL_NLI_URL + 'dev.tsv').rename(columns=ORIGINAL_COLUMN_NAMES)
print(f"len of dev:{len(origin_dev)}")

origin_test = load_set(ORIGINAL_NLI_URL + 'test.tsv').rename(columns=ORIGINAL_COLUMN_NAMES)
print(f"len of test:{len(origin_test)}")

origin_train = load_set(ORIGINAL_NLI_URL + 'train.tsv').rename(columns=ORIGINAL_COLUMN_NAMES)
# The label previously read "train of dev", although the number is the train size.
print(f"len of train:{len(origin_train)}")

# Stack the three splits (dev, test, train order) into one frame with a fresh 0..n-1 index.
dataset_original = pd.concat([origin_dev, origin_test, origin_train], ignore_index=True)

print(f"len of dataset:{len(dataset_original)}")
dataset_original.head(1)

len of dev:200
len of test:400
train of dev:1666
len of dataset:2266


Unnamed: 0,premise,hypothesis,label
0,The little boy in jean shorts kicks the soccer...,A little boy is playing soccer outside.,neutral


## Build revised premises dataset (RP task)

In [50]:
def find_original_prems(df, original_df):
    """Attach the original premise and label to every revised-premise row.

    Each row of ``df`` is matched to the first row of ``original_df`` whose
    ``hypothesis`` equals the row's ``original_hyp``.

    Args:
        df: frame with an ``original_hyp`` column. It is modified in place.
        original_df: frame with ``premise``, ``hypothesis`` and ``label``
            columns.

    Returns:
        The same ``df`` object with two new columns, ``original_prem`` and
        ``original_label``. Rows without a match hold NaN in both.
    """
    # Index the originals once instead of rescanning them for every revised
    # row; this turns the former nested-loop search into a single pass.
    first_match = {}
    for hyp, prem, label in zip(
        original_df["hypothesis"], original_df["premise"], original_df["label"]
    ):
        # A missing hypothesis can never equal anything, so it is never a match.
        if pd.isna(hyp):
            continue
        # setdefault keeps the first occurrence, like the original early `break`.
        first_match.setdefault(hyp, (prem, label))

    unmatched = (np.nan, np.nan)
    matches = [first_match.get(hyp, unmatched) for hyp in df["original_hyp"]]

    df["original_prem"] = [prem for prem, _ in matches]
    df["original_label"] = [label for _, label in matches]
    return df

In [53]:
def _load_revised_premise_split(split, original_split_df):
    """Load one revised-premise split and pair it with its original rows.

    Args:
        split: split name used in the file name ("dev", "test" or "train").
        original_split_df: the matching original split, searched for the
            original premise and label.

    Returns:
        The revised rows with ``task``, ``counter_hyp``, ``original_prem`` and
        ``original_label`` added, minus every row that holds a NaN.
    """
    revised = load_set(
        'https://raw.githubusercontent.com/acmi-lab/counterfactually-augmented-data/master/NLI/revised_premise/'
        + split + '.tsv'
    )

    # add columns (the hypothesis is not revised in this task, so it stays empty)
    revised["task"] = "RP"
    revised["counter_hyp"] = ""

    # rename columns
    revised = revised.rename(columns={"sentence1": "counter_prem", "sentence2": "original_hyp", "gold_label": "counter_label"})

    # find original premises
    revised = find_original_prems(revised, original_split_df)
    # dropna() removes rows with a NaN in any column, which includes the rows
    # for which no original premise was found.
    revised = revised.dropna()
    print(f"Done with {split}!")
    return revised


dev = _load_revised_premise_split("dev", origin_dev)
test = _load_revised_premise_split("test", origin_test)
train = _load_revised_premise_split("train", origin_train)

# append datasets
dataset_rp = pd.concat([dev, test, train], ignore_index=True)
print(f"len of dataset:{len(dataset_rp)}")
dataset_rp.head(3)

Done with dev!
Done with test!
Done with train!
len of dataset:4521


Unnamed: 0,counter_prem,original_hyp,counter_label,task,counter_hyp,original_prem,original_label
0,The little boy in jean shorts kicks the soccer...,A little boy is playing soccer outside.,entailment,RP,,The little boy in jean shorts kicks the soccer...,neutral
1,The little boy in jean shorts kicks the soccer...,A little boy is playing soccer outside.,contradiction,RP,,The little boy in jean shorts kicks the soccer...,neutral
2,Lovers running hand in hand.,people running holding hands because they are ...,entailment,RP,,Friends running a race hand in hand.,neutral


## Build revised hypotheses dataset (RH task)

In [54]:
def find_original_hyps(df, original_df):
    """Attach the original hypothesis and label to every revised-hypothesis row.

    Each row of ``df`` is matched to the first row of ``original_df`` whose
    ``premise`` equals the row's ``original_prem``.

    Args:
        df: frame with an ``original_prem`` column. It is modified in place.
        original_df: frame with ``premise``, ``hypothesis`` and ``label``
            columns.

    Returns:
        The same ``df`` object with two new columns, ``original_hyp`` and
        ``original_label``. Rows without a match hold NaN in both.
    """
    # Index the originals once instead of rescanning them for every revised
    # row; this turns the former nested-loop search into a single pass.
    first_match = {}
    for prem, hyp, label in zip(
        original_df["premise"], original_df["hypothesis"], original_df["label"]
    ):
        # A missing premise can never equal anything, so it is never a match.
        if pd.isna(prem):
            continue
        # setdefault keeps the first occurrence, like the original early `break`.
        first_match.setdefault(prem, (hyp, label))

    unmatched = (np.nan, np.nan)
    matches = [first_match.get(prem, unmatched) for prem in df["original_prem"]]

    df["original_hyp"] = [hyp for hyp, _ in matches]
    df["original_label"] = [label for _, label in matches]
    return df

In [55]:
def _load_revised_hypothesis_split(split, original_split_df):
    """Load one revised-hypothesis split and pair it with its original rows.

    Args:
        split: split name used in the file name ("dev", "test" or "train").
        original_split_df: the matching original split, searched for the
            original hypothesis and label.

    Returns:
        The revised rows with ``task``, ``counter_prem``, ``original_hyp`` and
        ``original_label`` added, minus every row that holds a NaN.
    """
    revised = load_set(
        'https://raw.githubusercontent.com/acmi-lab/counterfactually-augmented-data/master/NLI/revised_hypothesis/'
        + split + '.tsv'
    )

    # add columns (the premise is not revised in this task, so it stays empty)
    revised["task"] = "RH"
    revised["counter_prem"] = ""

    # rename columns
    revised = revised.rename(columns={"sentence1": "original_prem", "sentence2": "counter_hyp", "gold_label": "counter_label"})

    # find original hypotheses
    revised = find_original_hyps(revised, original_split_df)
    # dropna() removes rows with a NaN in any column, which includes the rows
    # for which no original hypothesis was found.
    revised = revised.dropna()
    print(f"Done with {split}!")
    return revised


dev = _load_revised_hypothesis_split("dev", origin_dev)
test = _load_revised_hypothesis_split("test", origin_test)
train = _load_revised_hypothesis_split("train", origin_train)

# append datasets
dataset_rh = pd.concat([dev, test, train], ignore_index=True)
print(f"len of dataset:{len(dataset_rh)}")
dataset_rh.head(3)

Done with dev!
Done with test!
Done with train!
len of dataset:4532


Unnamed: 0,original_prem,counter_hyp,counter_label,task,counter_prem,original_hyp,original_label
0,The little boy in jean shorts kicks the soccer...,A little boy is playing cricket.,contradiction,RH,,A little boy is playing soccer outside.,neutral
1,The little boy in jean shorts kicks the soccer...,A little boy is playing soccer.,entailment,RH,,A little boy is playing soccer outside.,neutral
2,Friends running a race hand in hand.,The people running hand and hand aren't friends.,contradiction,RH,,people running holding hands because they are ...,neutral


## Save datasets to disk

In [67]:
dataset_path = "../cad_flickr_nli/"
# Create the output folder up front: to_csv does not create missing
# directories, so a fresh checkout would otherwise fail here.
os.makedirs(dataset_path, exist_ok=True)

# One TSV per dataset, with a header row and without the pandas index.
dataset_rp.to_csv(dataset_path + "revised_prems.tsv", index=False, header=True, sep='\t')
dataset_rh.to_csv(dataset_path + "revised_hyps.tsv", index=False, header=True, sep='\t')
dataset_original.to_csv(dataset_path + "original.tsv", index=False, header=True, sep='\t')

# Prepare datasets for the experiments

In [60]:
# Folder holding the TSV files read below.
# Same value as in the save cell; presumably repeated so this section can be
# run without re-executing the build cells above (confirm before removing).
dataset_path = "../cad_flickr_nli/"

In [77]:
# Reload the revised-premise pairs from disk and show how the original and the
# counterfactual labels are distributed.
# NOTE(review): the empty-string placeholder column (counter_hyp) presumably
# comes back as NaN after this CSV round trip; confirm before relying on it.
dataset_rp = pd.read_csv(dataset_path + "revised_prems.tsv", sep="\t")
print(dataset_rp["original_label"].value_counts(), end="\n\n")
print(dataset_rp["counter_label"].value_counts())

entailment       1559
neutral          1498
contradiction    1464
Name: original_label, dtype: int64

contradiction    1529
neutral          1510
entailment       1482
Name: counter_label, dtype: int64


In [75]:
# Reload the revised-hypothesis pairs from disk and show how the original and
# the counterfactual labels are distributed.
dataset_rh = pd.read_csv(dataset_path + "revised_hyps.tsv", sep="\t")
print(dataset_rh["original_label"].value_counts(), end="\n\n")
print(dataset_rh["counter_label"].value_counts())

entailment       1572
neutral          1508
contradiction    1452
Name: original_label, dtype: int64

contradiction    1534
neutral          1514
entailment       1484
Name: counter_label, dtype: int64


In [80]:
# Merge both tasks into one frame (RP rows first, then RH rows) with a clean
# 0..n-1 index.
both_tasks = [dataset_rp, dataset_rh]
dataset = pd.concat(both_tasks, ignore_index=True).reset_index(drop=True)
dataset.head(1)

Unnamed: 0,counter_prem,original_hyp,counter_label,task,counter_hyp,original_prem,original_label
0,The little boy in jean shorts kicks the soccer...,A little boy is playing soccer outside.,entailment,RP,,The little boy in jean shorts kicks the soccer...,neutral


Labels are roughly evenly distributed across the dataset, so a random split should preserve approximately the same class proportions.

In [85]:
# Number of cross-validation folds (passed to KFold as n_splits).
K = 5
# Seed handed to random_shuffle_df for the initial shuffle of the dataset.
seed_shuffle = 5
# Seed used both for the KFold split and for sampling each validation set.
random_seed_folds = 2022
# Fraction of each fold's train+validation rows sampled as the validation set.
val_prop = 0.15

In [86]:
def random_shuffle_df(df, seed):
    """Return a reproducibly shuffled copy of ``df``.

    Args:
        df: the DataFrame to shuffle; it is left untouched.
        seed: integer seed that fully determines the row order of the result.

    Returns:
        A new DataFrame with the same rows in shuffled order. Every row keeps
        its original index label.
    """
    # Kept from the original: seeds Python's own RNG for any caller relying on
    # that side effect. It does NOT influence the shuffle below.
    random.seed(seed)
    # The seed has to be passed to the shuffle itself: seeding `random` alone
    # left the previous shuffle unseeded, so every run gave a different order.
    df = df.sample(frac=1, random_state=seed)

    return df

In [87]:
# Shuffle the rows once before the folds are built.
# NOTE: this rebinds `dataset` to a shuffled version of itself, so running the
# cell again shuffles an already-shuffled frame.
dataset = random_shuffle_df(dataset, seed_shuffle)

In [88]:
# Implementing cross validation via sklearn
kf = sklearn.model_selection.KFold(n_splits=K, shuffle=True, random_state=random_seed_folds)

folds = {}
i = 0
for train_index , test_index in kf.split(dataset):
    df_train_and_val, df_test = dataset.iloc[train_index], dataset.iloc[test_index]

    # sample a validation set
    df_val = df_train_and_val.sample(frac=val_prop, random_state=random_seed_folds)
    df_training = df_train_and_val.drop(df_val.index)

    folds[str(i)] = (df_training, df_val, df_test)
    i += 1

In [90]:
# Training split of fold 0: check how labels and tasks are distributed.
exp_fold = folds['0'][0]
print(exp_fold["original_label"].value_counts(), end="\n\n")
print(exp_fold["counter_label"].value_counts(), end="\n\n")
print(exp_fold["task"].value_counts())

entailment       2123
neutral          2037
contradiction    1996
Name: original_label, dtype: int64

contradiction    2072
neutral          2051
entailment       2033
Name: counter_label, dtype: int64

RP    3094
RH    3062
Name: task, dtype: int64


### As expected, fold 0 is balanced across labels and tasks!

In [95]:
for fold in folds:
    # create folder for fold; makedirs with exist_ok=True also creates a
    # missing parent folder and does not fail when the notebook is re-run.
    fold_path = "../cad_flickr_nli/fold_" + fold
    os.makedirs(fold_path, exist_ok=True)

    # write train, val, test (each fold entry is a (training, val, test) tuple)
    df_training, df_val, df_test = folds[fold]
    df_training.to_csv(fold_path + "/training_set.tsv", index=False, header=True, sep='\t')
    df_val.to_csv(fold_path + "/val_set.tsv", index=False, header=True, sep='\t')
    df_test.to_csv(fold_path + "/test_set.tsv", index=False, header=True, sep='\t')

In [96]:
# Read the three files of fold 0 back from disk as a sanity check.
fold_0_dir = "../cad_flickr_nli/fold_0/"
train_set = pd.read_csv(fold_0_dir + "training_set.tsv", sep="\t")
val_set = pd.read_csv(fold_0_dir + "val_set.tsv", sep="\t")
test_set = pd.read_csv(fold_0_dir + "test_set.tsv", sep="\t")

In [97]:
# Report the number of rows in each reloaded split of fold 0.
for split_name, split_df in (("train", train_set), ("val", val_set), ("test", test_set)):
    print(f"Len {split_name}:{len(split_df)}")

Len train:6156
Len val:1086
Len test:1811


In [98]:
# Persist the full shuffled dataset (the frame the folds were cut from) as a
# single TSV, with a header row and without the pandas index.
dataset.to_csv("../cad_flickr_nli/cad_flickr_nli.tsv", index=False, header=True, sep='\t')