In [1]:
import os
# Redirect the Hugging Face hub and datasets caches to the shared project
# directory. This runs before `datasets` is imported in the next cell
# (presumably so the library picks these values up at import time - confirm).
os.environ.update(
    {
        "HF_HUB_CACHE": "/NS/formal-grammar-and-memorization/nobackup/shared/huggingface_cache/hub",
        "HF_DATASETS_CACHE": "/NS/formal-grammar-and-memorization/nobackup/shared/huggingface_cache/datasets",
    }
)

In [2]:
from datasets import load_dataset

# Step 1: fetch the RTE configuration of GLUE and convert the two splits
# used by this notebook into pandas DataFrames.
dataset = load_dataset("glue", "rte")
df_train, df_validation = (
    dataset[split_name].to_pandas() for split_name in ("train", "validation")
)
# The "test" split is not loaded here:
# df_test = dataset["test"].to_pandas()

In [3]:
# Preview the raw training split (sentence1, sentence2, label, idx) before
# any derived columns are added.
df_train

Unnamed: 0,sentence1,sentence2,label,idx
0,No Weapons of Mass Destruction Found in Iraq Yet.,Weapons of Mass Destruction Found in Iraq.,1,0
1,"A place of sorrow, after Pope John Paul II die...",Pope Benedict XVI is the new leader of the Rom...,0,1
2,Herceptin was already approved to treat the si...,Herceptin can be used to treat breast cancer.,0,2
3,"Judie Vivian, chief executive at ProMedica, a ...",The previous name of Ho Chi Minh City was Saigon.,0,3
4,A man is due in court later charged with the m...,Paul Stewart Hutchinson is accused of having s...,1,4
...,...,...,...,...
2485,There is none. They found as many weapons in t...,Weapons of mass destruction found in Iraq.,1,2485
2486,"Dr. Eric Goosby, a pioneer in the fight agains...",Pepfar is committed to fighting AIDS.,0,2486
2487,"NASA's Saturn exploration spacecraft, Cassini ...",Titan is the fifteenth of Saturn's known satel...,1,2487
2488,Brooklyn Borough Hall featured a Who's Who in ...,The Brooklyn Book Festival is held in Brooklyn...,0,2488


In [4]:
# Integer class id -> label text appended after "Label: " in every sequence.
# NOTE(review): GLUE RTE is commonly described as entailment / not_entailment;
# confirm that "contradiction" is the intended wording for class 1.
label_map = {0: "entailment", 1: "contradiction"}

In [5]:
import random


def process_columns(df, label_names=None):
    """Build the original-label and flipped-label text sequences for one split.

    The input frame is copied, so the caller's DataFrame is never modified.
    The returned copy carries five additional columns:

    * ``label_flipped``     - the opposite binary label (0 <-> 1)
    * ``label_str``         - text for ``label``
    * ``label_flipped_str`` - text for ``label_flipped``
    * ``in-language``       - three lines: "Sentence 1: <sentence1>",
      "Sentence 2: <sentence2>", "Label: <label_str>"
    * ``out-language``      - the same three lines, ending in
      ``label_flipped_str`` instead

    Args:
        df: DataFrame with text columns ``sentence1`` and ``sentence2`` and a
            ``label`` column whose values are all 0 or 1.
        label_names: Optional mapping from label id to label text. When
            omitted, the notebook-level ``label_map`` is used.

    Returns:
        A new DataFrame containing the input columns plus the five above.

    Raises:
        ValueError: if ``label`` contains anything other than 0/1, or if
            ``label_names`` has no entry for 0 or for 1.
    """
    # Kept from the original implementation: this reseeds the global RNG even
    # though nothing below draws random numbers.
    # NOTE(review): drop once it is confirmed no later cell relies on it.
    random.seed(5)

    if label_names is None:
        label_names = label_map

    missing_names = [label_id for label_id in (0, 1) if label_id not in label_names]
    if missing_names:
        raise ValueError(f"label_names has no text for label id(s): {missing_names}")

    # Fail loudly on non-binary labels: the previous `0 if x == 1 else 1` flip
    # silently turned every unexpected value into 1 and the label map then
    # produced NaN text for it.
    unexpected = df['label'][~df['label'].isin([0, 1])]
    if not unexpected.empty:
        raise ValueError(
            f"process_columns expects binary labels 0/1, found: {unexpected.unique().tolist()}"
        )

    out = df.copy()
    # With labels restricted to {0, 1}, `1 - label` is always the other class.
    out['label_flipped'] = 1 - out['label']
    out['label_str'] = out['label'].map(label_names)
    out['label_flipped_str'] = out['label_flipped'].map(label_names)

    # Both sequences share everything except the final label word.
    shared_prefix = 'Sentence 1: ' + out["sentence1"] + '\nSentence 2: ' + out["sentence2"] + '\nLabel: '
    out['in-language'] = shared_prefix + out["label_str"]
    out['out-language'] = shared_prefix + out["label_flipped_str"]
    return out

# Derive the label/sequence columns for both splits (train first, then
# validation) and display the resulting shapes.
df_train, df_validation = (
    process_columns(split_df) for split_df in (df_train, df_validation)
)
# df_test = process_columns(df_test)
(df_train.shape, df_validation.shape)

((2490, 9), (277, 9))

In [6]:
# Training example at row position 2, with its original label.
print(df_train['in-language'].iloc[2])

Sentence 1: Herceptin was already approved to treat the sickest breast cancer patients, and the company said, Monday, it will discuss with federal regulators the possibility of prescribing the drug for more breast cancer patients.
Sentence 2: Herceptin can be used to treat breast cancer.
Label: entailment


In [7]:
# Same training example, ending in the flipped label.
print(df_train['out-language'].iloc[2])

Sentence 1: Herceptin was already approved to treat the sickest breast cancer patients, and the company said, Monday, it will discuss with federal regulators the possibility of prescribing the drug for more breast cancer patients.
Sentence 2: Herceptin can be used to treat breast cancer.
Label: contradiction


In [8]:
# Validation example at row position 2, with its original label.
print(df_validation['in-language'].iloc[2])

Sentence 1: Cairo is now home to some 15 million people - a burgeoning population that produces approximately 10,000 tonnes of rubbish per day, putting an enormous strain on public services. In the past 10 years, the government has tried hard to encourage private investment in the refuse sector, but some estimate 4,000 tonnes of waste is left behind every day, festering in the heat as it waits for someone to clear it up. It is often the people in the poorest neighbourhoods that are worst affected. But in some areas they are fighting back. In Shubra, one of the northern districts of the city, the residents have taken to the streets armed with dustpans and brushes to clean up public areas which have been used as public dumps.
Sentence 2: 15 million tonnes of rubbish are produced daily in Cairo.
Label: contradiction


In [9]:
# Same validation example, ending in the flipped label.
print(df_validation['out-language'].iloc[2])

Sentence 1: Cairo is now home to some 15 million people - a burgeoning population that produces approximately 10,000 tonnes of rubbish per day, putting an enormous strain on public services. In the past 10 years, the government has tried hard to encourage private investment in the refuse sector, but some estimate 4,000 tonnes of waste is left behind every day, festering in the heat as it waits for someone to clear it up. It is often the people in the poorest neighbourhoods that are worst affected. But in some areas they are fighting back. In Shubra, one of the northern districts of the city, the residents have taken to the streets armed with dustpans and brushes to clean up public areas which have been used as public dumps.
Sentence 2: 15 million tonnes of rubbish are produced daily in Cairo.
Label: entailment


In [11]:
# Record the whitespace-stripped character length of every sequence, for both
# splits and both sequence variants, in a matching "<column>_len" column.
for split_df in (df_train, df_validation):
    for text_col in ('in-language', 'out-language'):
        split_df[text_col + '_len'] = split_df[text_col].apply(
            lambda text: len(text.strip())
        )

In [12]:
# Shapes of the subsets whose stripped sequence is empty; a leading 0 in each
# shape means no empty sequences. Order: train in/out, validation in/out.
tuple(
    split_df[split_df[len_col] == 0].shape
    for split_df in (df_train, df_validation)
    for len_col in ('in-language_len', 'out-language_len')
)

((0, 11), (0, 11), (0, 11), (0, 11))

In [13]:
# Collect the sequences to export: original-label text for train and
# validation, plus the flipped-label validation text.
rte_dataset_in_distribution_dict = {
    key: source_df[text_col].tolist()
    for key, source_df, text_col in (
        ("train_sequences", df_train, 'in-language'),
        ("test_sequences", df_validation, 'in-language'),
        ("non_grammatical_test_sequences_edit_distance_1", df_validation, 'out-language'),
    )
}

In [14]:
# Spot-check the first validation sequence carrying its original label.
print(rte_dataset_in_distribution_dict["test_sequences"][0])

Sentence 1: Dana Reeve, the widow of the actor Christopher Reeve, has died of lung cancer at age 44, according to the Christopher Reeve Foundation.
Sentence 2: Christopher Reeve had an accident.
Label: contradiction


In [16]:
# The same validation example from the flipped-label list, for comparison.
print(rte_dataset_in_distribution_dict["non_grammatical_test_sequences_edit_distance_1"][0])

Sentence 1: Dana Reeve, the widow of the actor Christopher Reeve, has died of lung cancer at age 44, according to the Christopher Reeve Foundation.
Sentence 2: Christopher Reeve had an accident.
Label: entailment


In [None]:
import pickle

# Destination of the pickled sequences dictionary, relative to the notebook's
# working directory.
filename_in_distribution = "../data/rte_dataset_in_distribution/sequences_w_edit_distance_rte_dataset_in_distribution_10000_5.pkl"

# Create the parent directory in-process instead of shelling out to
# `mkdir -p`: this is portable, derives the directory from the filename so the
# two cannot drift apart, and raises on failure instead of returning an
# ignored exit status.
os.makedirs(os.path.dirname(filename_in_distribution), exist_ok=True)

with open(filename_in_distribution, 'wb') as f:
    pickle.dump(rte_dataset_in_distribution_dict, f)
