# Prepare Data for NLI finetuning

In [3]:
from dotenv import load_dotenv
from pathlib import Path
import os

dotenv_path = Path('aic_averitec/.env')
load_dotenv(dotenv_path)

DATASTORE_PATH = os.environ.get("DATASTORE_PATH")
DATASET_PATH = os.environ.get("DATASET_PATH")
MODELS_PATH = os.environ.get("MODELS_PATH")

%load_ext autoreload
%autoreload 2

DEV_PATH = str(os.path.join(DATASET_PATH, 'dev.json'))
TRAIN_PATH = str(os.path.join(DATASET_PATH, 'train.json'))

In [4]:
import json
#load originial data
with open(TRAIN_PATH, "r") as f:
    training_data = json.load(f)

print(training_data[0])
print(len(training_data))

{'claim': 'Hunter Biden had no experience in Ukraine or in the energy sector when he joined the board of Burisma.', 'required_reannotation': False, 'label': 'Supported', 'justification': 'No former experience stated.', 'claim_date': '25-8-2020', 'speaker': 'Pam Bondi', 'original_claim_url': None, 'fact_checking_article': 'https://web.archive.org/web/20210111003633/https://www.politifact.com/article/2020/aug/26/fact-checking-second-night-2020-rnc/', 'reporting_source': 'Speech at The Republican National Convention', 'location_ISO_code': 'US', 'claim_types': ['Position Statement'], 'fact_checking_strategies': ['Written Evidence'], 'questions': [{'question': 'Did Hunter Biden have any experience in the energy sector at the time he joined the board of the  Burisma energy company in 2014', 'answers': [{'answer': 'No', 'answer_type': 'Boolean', 'source_url': 'https://en.wikipedia.org/wiki/Hunter_Biden', 'source_medium': 'Web text', 'boolean_explanation': "Hunter bidens previous career histor

In [45]:
def get_result_data(input_data):
    result_data = []
    for data in input_data:
        for question in data["questions"]:
            for answer in question["answers"]:
                #discard conflictign evidence/cherrypicking
                if data["label"] != "Conflicting Evidence/Cherrypicking":
                    if answer["answer_type"] == "Boolean":
                        result_data.append({"claim": data["claim"], "evidence": answer["boolean_explanation"], "label": data["label"]})
                    elif answer["answer_type"] == "Unanswerable":
                        pass
                    else:
                        result_data.append({"claim": data["claim"], "evidence": answer["answer"], "label": data["label"]})
    
    return result_data

In [46]:
#extract the text and the label to json

result_data = get_result_data(training_data)

print(result_data[0])
print(len(result_data))

{'claim': 'Hunter Biden had no experience in Ukraine or in the energy sector when he joined the board of Burisma.', 'evidence': "Hunter bidens previous career history does not include work for energy company's.", 'label': 'Supported'}
7321


In [44]:
def save_to_jsonl(data, dataset_name):
    with open(os.path.join(DATASET_PATH, dataset_name), "w") as f:
        for item in data:
            f.write(json.dumps(item) + "\n")

In [47]:
save_to_jsonl(result_data, "train_nli_a.jsonl")

#### QA nli data

In [48]:
def get_result_data_qa(input_data):
    result_data_qa = []
    for data in input_data:
        for question in data["questions"]:
            for answer in question["answers"]:
                #discard conflictign evidence/cherrypicking
                if data["label"] != "Conflicting Evidence/Cherrypicking":
                    if answer["answer_type"] == "Boolean":
                        result_data_qa.append({"claim": data["claim"], "evidence": question["question"] + " " + answer["answer"] + " " +  answer["boolean_explanation"], "label": data["label"]})
                    else:
                        result_data_qa.append({"claim": data["claim"], "evidence": question["question"] + " " + answer["answer"], "label": data["label"]})

    return result_data_qa

In [49]:
#extract the text and the label to json

result_data_qa = get_result_data_qa(training_data)

print(result_data_qa[0])
print(len(result_data_qa))

{'claim': 'Hunter Biden had no experience in Ukraine or in the energy sector when he joined the board of Burisma.', 'evidence': "Did Hunter Biden have any experience in the energy sector at the time he joined the board of the  Burisma energy company in 2014 No Hunter bidens previous career history does not include work for energy company's.", 'label': 'Supported'}
7688


In [50]:
save_to_jsonl(result_data_qa, "train_nli_qa.jsonl")

#### 4 labels, concat



In [51]:
def get_result_data_4concat(input_data):
    result_data_qa = []
    for data in input_data:
        evidence:str = ""
        for question in data["questions"]:
            for answer in question["answers"]:
                if answer["answer_type"] == "Boolean":
                    evidence += question["question"] + " " + answer["answer"] + " " +  answer["boolean_explanation"] + " "
                else:
                    evidence += question["question"] + " " + answer["answer"] + " "
        
        #evidence without last space
        result_data_qa.append({"claim": data["claim"], "evidence": evidence[:-1], "label": data["label"]})

            
            

    return result_data_qa

In [52]:
result_data_4concat = get_result_data_4concat(training_data)

print(result_data_4concat[0])
print(len(result_data_4concat))

{'claim': 'Hunter Biden had no experience in Ukraine or in the energy sector when he joined the board of Burisma.', 'evidence': "Did Hunter Biden have any experience in the energy sector at the time he joined the board of the  Burisma energy company in 2014 No Hunter bidens previous career history does not include work for energy company's. Did Hunter Biden have any experience in Ukraine at the time he joined the board of the  Burisma energy company in 2014 No Hunter Bidens previous career history does not include working with Ukrainian company's.", 'label': 'Supported'}
3068


In [53]:
save_to_jsonl(result_data_4concat, "train_nli_4concat.jsonl")

#### 4 labels concat in all orders

In [39]:
from itertools import permutations
import random
import numpy as np

def get_result_data_4concat_all_orders(input_data):
    result_data_qa = []
    for data in input_data:
        

        qas = [(q["question"],a["answer"] if a["answer_type"] != "Boolean" else a["answer"] + " "+ a["boolean_explanation"]) for q in data["questions"] for a in q["answers"]]
        
        for i in range(min(len(qas), 10)):
            random.shuffle(qas)
            perm = qas
            evidence:str = ""
            for qa in perm:
                evidence += qa[0] + " " + qa[1] + " "

            result_data_qa.append({"claim": data["claim"], "evidence": evidence[:-1], "label": data["label"]})
            

    return result_data_qa

In [54]:
result_data_4_concat_all_orders = get_result_data_4concat_all_orders(training_data)

print(result_data_4_concat_all_orders[20])
print(len(result_data_4_concat_all_orders))

{'claim': 'Biden will take away the Second Amendment.', 'evidence': 'Has Joe Biden\'s plan for gun regulation and control infringed on the second amendment Biden’s plan to end gun violence says "It’s within our grasp to end our gun violence epidemic and respect the Second Amendment, which is limited,"', 'label': 'Refuted'}
8451


In [55]:
save_to_jsonl(result_data_4_concat_all_orders, "train_nli_4concat_all_orders.jsonl")

#### 4 labesl concat with [SEP]

In [56]:
def get_result_data_4_sep(input_data):
    result_data_qa = []
    for data in input_data:
        evidence:str = ""
        for question in data["questions"]:
            for answer in question["answers"]:
                if answer["answer_type"] == "Boolean":
                    evidence += question["question"] + " " + answer["answer"] + " " +  answer["boolean_explanation"] + "[SEP]"
                else:
                    evidence += question["question"] + " " + answer["answer"] + "[SEP]"
        
        #evidence without last space
        result_data_qa.append({"claim": data["claim"], "evidence": evidence[:-5], "label": data["label"]})

            
            

    return result_data_qa

In [57]:
result_data_4_sep = get_result_data_4_sep(training_data)

print(result_data_4_sep[100])
print(len(result_data_4_sep))

{'claim': 'Facebook deleted a photo of Melania Trump with her sister and mother.', 'evidence': 'Does the photo show Melania with her sister and her mother? it shows Melania, her sister and audrey gruss (not Melanias mother)[SEP]Was this image removed by facebook? No the photo was not removed from Facebook', 'label': 'Refuted'}
3068


In [14]:
save_to_jsonl(result_data_4_sep, "train_nli_4sep.jsonl")

#### bin cherrypicking

In [58]:

def get_result_data_bin_concat(input_data):
    result_data_qa = []
    for data in input_data:
        evidence:str = ""
        for question in data["questions"]:
            for answer in question["answers"]:
                if answer["answer_type"] == "Boolean":
                    evidence += question["question"] + " " + answer["answer"] + " " +  answer["boolean_explanation"] + " "
                else:
                    evidence += question["question"] + " " + answer["answer"] + " "
        
        #evidence without last space
        label = 1 if data["label"] == "Conflicting Evidence/Cherrypicking" else 0
        result_data_qa.append({"claim": data["claim"], "evidence": evidence[:-1], "label": label})

            
            

    return result_data_qa

In [59]:
result_data_bin_concat = get_result_data_bin_concat(training_data)

print(result_data_bin_concat[0])
print(len(result_data_bin_concat))

{'claim': 'Hunter Biden had no experience in Ukraine or in the energy sector when he joined the board of Burisma.', 'evidence': "Did Hunter Biden have any experience in the energy sector at the time he joined the board of the  Burisma energy company in 2014 No Hunter bidens previous career history does not include work for energy company's. Did Hunter Biden have any experience in Ukraine at the time he joined the board of the  Burisma energy company in 2014 No Hunter Bidens previous career history does not include working with Ukrainian company's.", 'label': 0}
3068


In [19]:
save_to_jsonl(result_data_bin_concat, "train_nli_bin_concat.jsonl")

## STATS

In [60]:
labels = [dat["label"] for dat in result_data]
labels_qa = [dat["label"] for dat in result_data_qa]
labels_4concat = [dat["label"] for dat in result_data_4concat]
labels_bin_concat = [dat["label"] for dat in result_data_bin_concat]
labels_4sep = [dat["label"] for dat in result_data_4_sep]
labels_4_concat_all_orders = [dat["label"] for dat in result_data_4_concat_all_orders]

In [61]:
import numpy as np

unique, counts = np.unique(labels, return_counts=True)
print(dict(zip(unique, counts)))

unique, counts = np.unique(labels_qa, return_counts=True)
print(dict(zip(unique, counts)))

unique, counts = np.unique(labels_4concat, return_counts=True)
print(dict(zip(unique, counts)))

unique, counts = np.unique(labels_bin_concat, return_counts=True)
print(dict(zip(unique, counts)))

unique, counts = np.unique(labels_4sep, return_counts=True)
print(dict(zip(unique, counts)))

unique, counts = np.unique(labels_4_concat_all_orders, return_counts=True)
print(dict(zip(unique, counts)))

{'Not Enough Evidence': 626, 'Refuted': 4471, 'Supported': 2224}
{'Not Enough Evidence': 839, 'Refuted': 4599, 'Supported': 2250}
{'Conflicting Evidence/Cherrypicking': 195, 'Not Enough Evidence': 282, 'Refuted': 1742, 'Supported': 849}
{0: 2873, 1: 195}
{'Conflicting Evidence/Cherrypicking': 195, 'Not Enough Evidence': 282, 'Refuted': 1742, 'Supported': 849}
{'Conflicting Evidence/Cherrypicking': 789, 'Not Enough Evidence': 839, 'Refuted': 4586, 'Supported': 2237}


### Dev

In [62]:
#do the same for dev data
with open(DEV_PATH, "r") as f:
    dev_data = json.load(f)

In [64]:
result_data = get_result_data(dev_data)
result_data_qa = get_result_data_qa(dev_data)
result_data_4concat = get_result_data_4concat(dev_data)
result_data_bin_concat = get_result_data_bin_concat(dev_data)
result_data_4sep = get_result_data_4_sep(dev_data)
result_data_4_concat_all_orders = get_result_data_4concat_all_orders(dev_data)

In [65]:
save_to_jsonl(result_data, "dev_nli_a.jsonl")
save_to_jsonl(result_data_qa, "dev_nli_qa.jsonl")
save_to_jsonl(result_data_4concat, "dev_nli_4concat.jsonl")
save_to_jsonl(result_data_bin_concat, "dev_nli_bin_concat.jsonl")
save_to_jsonl(result_data_4sep, "dev_nli_4sep.jsonl")
save_to_jsonl(result_data_4_concat_all_orders, "dev_nli_4concat_all_orders.jsonl")

### Try loading with HF

In [36]:
from datasets import load_dataset

dataset = load_dataset("json", data_files = {"train": os.path.join(DATASET_PATH, "train_nli_a.jsonl"), "dev": os.path.join(DATASET_PATH, "dev_nli_a.jsonl")})
print(dataset)
print(dataset["train"][0])

Generating train split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['claim', 'evidence', 'label'],
        num_rows: 7321
    })
    dev: Dataset({
        features: ['claim', 'evidence', 'label'],
        num_rows: 1227
    })
})
{'claim': 'Hunter Biden had no experience in Ukraine or in the energy sector when he joined the board of Burisma.', 'evidence': "Hunter bidens previous career history does not include work for energy company's.", 'label': 'Supported'}


In [37]:
from transformers import AutoTokenizer
model_id = "microsoft/deberta-v3-large"
tokenizer = AutoTokenizer.from_pretrained(model_id)



In [38]:
def tokenize_function(examples):
    example = tokenizer(examples["claim"], examples["evidence"], truncation=True)
    example["label"] = examples["label"]
    return example

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/7321 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/1227 [00:00<?, ? examples/s]

In [39]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['claim', 'evidence', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7321
    })
    dev: Dataset({
        features: ['claim', 'evidence', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1227
    })
})