<h1>Prepare the dataset for the experiments</h1>
This notebook prepares the counterfactually augmented data (CAD) for IMDb released by Kaushik et al.

We first import the three original splits from the repository, i.e. train, dev (validation) and test.
Then we merge them and generate a 5-fold cross-validation split for our experiments.

For every fold, we take the training portion and sample a fraction "val_prop" of it as the validation set; the remaining rows form the training set.

script version: prepare_imdb_for_experiments.py

In [8]:
# Number of cross-validation folds (passed to KFold as n_splits).
K = 5
# Seed handed to randomly_assign_conterfactuals when the paired reviews are shuffled.
random_seed_counter = 5
# Seed used for the KFold split and for sampling each fold's validation set.
random_seed_folds = 2022
# Fraction of each fold's train+val portion that is sampled as validation set.
val_prop = 0.15

In [9]:
import pandas as pd
import random
import sklearn.model_selection
import bs4
import os

In [10]:
def reformat_sentiment(x):
    """Encode a sentiment label as an integer: 1 for 'Positive', 0 for anything else."""
    if x == 'Positive':
        return 1
    return 0

def load_dataset(name):
    """Download one split of the paired CAD sentiment data and normalise it.

    Args:
        name: file name of the split inside the repository folder,
            e.g. "train_paired.tsv".

    Returns:
        The tab-separated file as a DataFrame, with the columns "Sentiment",
        "Text" and "batch_id" renamed to "sentiment", "text" and "paired_id",
        and "sentiment" re-encoded through reformat_sentiment.
    """
    base_url = 'https://raw.githubusercontent.com/acmi-lab/counterfactually-augmented-data/master/sentiment/combined/paired/'
    new_names = {"Sentiment": "sentiment", "Text": "text", "batch_id": "paired_id"}

    # read the split and harmonise the column names in one go
    frame = pd.read_csv(base_url + name, sep='\t').rename(columns=new_names)

    # turn the textual label into its integer encoding
    frame['sentiment'] = frame['sentiment'].apply(reformat_sentiment)

    return frame

def random_shuffle_df(df, seed):
    """Return a reproducibly shuffled copy of ``df``.

    The previous implementation seeded Python's ``random`` module and then
    called a shuffle that draws from NumPy's generator, so ``seed`` had no
    influence on the resulting order. The seed is now passed straight to the
    sampler, which makes the permutation a deterministic function of ``seed``.

    Args:
        df: DataFrame to shuffle; it is not modified.
        seed: integer seed controlling the permutation.

    Returns:
        A new DataFrame with the same rows (and original index labels) in
        shuffled order.
    """
    # Kept so that code relying on the stdlib generator being seeded here
    # keeps observing the same side effect as before.
    random.seed(seed)

    # frac=1 samples every row exactly once, i.e. a full permutation.
    return df.sample(frac=1, random_state=seed)

def set_example_counter(idx, found_idsx):
    """Return 1 the first time ``idx`` is seen and 0 on every later call.

    ``found_idsx`` is the dict used as memory of the ids already met; a new id
    is stored in it (as a key with value 0) before returning.
    """
    first_time = idx not in found_idsx
    if first_time:
        found_idsx[idx] = 0
    return int(first_time)

def randomly_assign_conterfactuals(df, seed=1):
    """Shuffle the rows and flag one row per ``paired_id`` as the counterfactual.

    The frame is shuffled with ``random_shuffle_df(df, seed)``; walking the
    shuffled rows in order, the first row met for a given ``paired_id`` gets
    ``is_counterfactual = 1`` and any later row with the same id gets 0.

    Returns:
        The shuffled DataFrame with the extra ``is_counterfactual`` column.
    """
    shuffled = random_shuffle_df(df, seed)

    # ids already encountered while scanning the shuffled rows
    seen_ids = {}
    shuffled['is_counterfactual'] = [
        set_example_counter(pair_id, seen_ids)
        for pair_id in shuffled['paired_id'].values
    ]

    return shuffled

# prepare the dataset with input-counterfactuals instances
def prepare_dataframe_with_counterfacuals(df):
    """Turn the long, one-review-per-row frame into one row per pair.

    Args:
        df: DataFrame with the columns 'paired_id', 'text', 'sentiment' and
            'is_counterfactual' (truthy for the counterfactual review).

    Returns:
        DataFrame sorted by 'paired_id' with the columns 'paired_id',
        'example', 'label_ex', 'counterfactual' and 'label_counter'. Both
        texts have their HTML markup stripped.

    Raises:
        ValueError: if a paired_id does not hold exactly one example and one
            counterfactual. Without this check the output columns would be
            built from lists of different lengths or, worse, pair up texts
            that belong to different ids.
    """
    paired_id_column = []
    example_column = []
    label_ex = []
    counter_column = []
    label_counter = []

    # Group on the plain column name (not a one-element list) and iterate the
    # groups directly: the keys are then plain paired_id values, without
    # relying on how pandas handles length-1 list keys in get_group.
    for group_id, group in df.groupby("paired_id"):
        examples = []
        counters = []
        for is_counter, text, label in zip(group['is_counterfactual'].values,
                                           group['text'].values,
                                           group['sentiment'].values):
            if is_counter:
                counters.append((text, label))
            else:
                examples.append((text, label))

        if len(examples) != 1 or len(counters) != 1:
            raise ValueError(
                f"paired_id {group_id!r} must contain exactly one example and one "
                f"counterfactual, found {len(examples)} example(s) and "
                f"{len(counters)} counterfactual(s)"
            )

        example_text, example_label = examples[0]
        counter_text, counter_label = counters[0]

        paired_id_column.append(group_id)
        example_column.append(example_text)
        label_ex.append(example_label)
        counter_column.append(counter_text)
        label_counter.append(counter_label)

    # clean the text from html tags
    example_column = [bs4.BeautifulSoup(el, "lxml").text for el in example_column]
    counter_column = [bs4.BeautifulSoup(el, "lxml").text for el in counter_column]

    # add the new columns to a new dataframe
    d = {'paired_id': paired_id_column,
         'example': example_column,
         'label_ex': label_ex,
         'counterfactual': counter_column,
         'label_counter': label_counter}
    df_with_counterfactuals = pd.DataFrame(data=d)

    return df_with_counterfactuals.sort_values(by="paired_id", ascending=True)

In [11]:
# Download the three original splits of the paired data via load_dataset.
training_set = load_dataset("train_paired.tsv")
dev_set = load_dataset("dev_paired.tsv")
test_set = load_dataset("test_paired.tsv")
# Report the container type and the number of rows (single reviews) per split.
print(f"Datasets are of type {type(test_set)}")
print(f"# of samples in the training set:{len(training_set)}")
print(f"# of samples in the dev set:{len(dev_set)}")
print(f"# of samples in the test set:{len(test_set)}")

Datasets are of type <class 'pandas.core.frame.DataFrame'>
# of samples in the training set:3414
# of samples in the dev set:490
# of samples in the test set:976


In [12]:
# append the 3 datasets
# Merge the three splits into a single frame; ignore_index=True renumbers the
# rows so the merged frame has a fresh 0..n-1 index.
imdb_dataframe = pd.concat([training_set, dev_set, test_set], ignore_index=True)
print(f"# of samples:{len(imdb_dataframe)}")
imdb_dataframe.head(2)

# of samples:4880


Unnamed: 0,sentiment,text,paired_id
0,0,"Long, boring, blasphemous. Never have I been s...",4
1,1,"Long, fascinating, soulful. Never have I been ...",4


In [13]:
# Work on a deep copy so imdb_dataframe itself is left untouched, flag one
# review per pair as counterfactual, then reshape to one row per pair.
df_processed = randomly_assign_conterfactuals(imdb_dataframe.copy(deep=True), random_seed_counter)
df_processed = prepare_dataframe_with_counterfacuals(df_processed)
print(f"# of samples:{len(df_processed)}")
df_processed.head(2)

# of samples:2440


Unnamed: 0,paired_id,example,label_ex,counterfactual,label_counter
0,4,"Long, boring, blasphemous. Never have I been s...",0,"Long, fascinating, soulful. Never have I been ...",1
1,13,"If you haven't seen this, it's terrible. It is...",0,"If you haven't seen this, it's incredible. It ...",1


In [14]:
# Implementing cross validation via sklearn
kf = sklearn.model_selection.KFold(n_splits=K, shuffle=True, random_state=random_seed_folds)

folds = {}
i = 0
for train_index , test_index in kf.split(df_processed):
    df_train_and_val, df_test = df_processed.iloc[train_index], df_processed.iloc[test_index]

    # sample a validation set
    df_val = df_train_and_val.sample(frac=val_prop, random_state=random_seed_folds)
    df_training = df_train_and_val.drop(df_val.index)

    folds[str(i)] = (df_training, df_val, df_test)
    i += 1

In [15]:
# Display the training split (tuple position 0) of the first fold.
folds['0'][0]

Unnamed: 0,paired_id,example,label_ex,counterfactual,label_counter
0,4,"Long, boring, blasphemous. Never have I been s...",0,"Long, fascinating, soulful. Never have I been ...",1
1,13,"If you haven't seen this, it's terrible. It is...",0,"If you haven't seen this, it's incredible. It ...",1
2,40,Not good! Rent or buy the original! Watch this...,0,"So good! Rent or buy the original, too! Watch ...",1
3,46,"being a NI supporter, it's easy to objectively...",1,"being a NI supporter, it's hard to objectively...",0
4,47,"This movie is so bad, it can only be compared ...",0,"This movie is so good, it can only be compared...",1
...,...,...,...,...,...
2423,22322,A lot has been said about Shinjuku Triad Socie...,1,A lot has been said about Shinjuku Triad Socie...,0
2424,22368,Greetings again from the darkness. Mary Heron ...,0,Greetings again from the darkness. Mary Heron ...,1
2425,22381,Director Douglas Sirk strikesout again with th...,0,"Director Douglas Sirk scores again with this, ...",1
2426,22387,** possible spoilers **I hate this film and no...,0,** possible spoilers **I like this film and ha...,1


In [18]:
for fold in folds:
    # create folder for fold; makedirs also creates the parent "../cad_imdb"
    # when it is missing, and exist_ok=True keeps a re-run of this cell from
    # failing because the folder already exists
    fold_path = "../cad_imdb/fold_" + fold
    os.makedirs(fold_path, exist_ok=True)

    # write train, val, test as tab-separated files with a header row
    folds[fold][0].to_csv(fold_path + "/training_set", index=False, header=True, sep='\t')
    folds[fold][1].to_csv(fold_path + "/val_set", index=False, header=True, sep='\t')
    folds[fold][2].to_csv(fold_path +  "/test_set", index=False, header=True, sep='\t')

In [19]:
# Read the files of fold 0 back from disk to check what was written.
# NOTE(review): test_set rebinds the name that previously held the raw test
# split returned by load_dataset("test_paired.tsv").
train_set = pd.read_csv("../cad_imdb/fold_0/training_set", sep='\t')
val_set = pd.read_csv("../cad_imdb/fold_0/val_set", sep='\t')
test_set = pd.read_csv("../cad_imdb/fold_0/test_set", sep='\t')

In [20]:
# Number of rows in each re-loaded split of fold 0.
print(f"Len train:{len(train_set)}")
print(f"Len val:{len(val_set)}")
print(f"Len test:{len(test_set)}")

Len train:1659
Len val:293
Len test:488
