# Prepare Data for NLI finetuning

In [1]:
from dotenv import load_dotenv
from pathlib import Path
import os

dotenv_path = Path('aic_averitec/.env')
load_dotenv(dotenv_path)

DATASTORE_PATH = os.environ.get("DATASTORE_PATH")
DATASET_PATH = os.environ.get("DATASET_PATH")
MODELS_PATH = os.environ.get("MODELS_PATH")

%load_ext autoreload
%autoreload 2

DEV_PATH = str(os.path.join(DATASET_PATH, 'dev.json'))
TRAIN_PATH = str(os.path.join(DATASET_PATH, 'train.json'))

In [22]:
import json
#load originial data
with open(TRAIN_PATH, "r") as f:
    training_data = json.load(f)

print(training_data[0])
print(len(training_data))

{'claim': 'Hunter Biden had no experience in Ukraine or in the energy sector when he joined the board of Burisma.', 'required_reannotation': False, 'label': 'Supported', 'justification': 'No former experience stated.', 'claim_date': '25-8-2020', 'speaker': 'Pam Bondi', 'original_claim_url': None, 'fact_checking_article': 'https://web.archive.org/web/20210111003633/https://www.politifact.com/article/2020/aug/26/fact-checking-second-night-2020-rnc/', 'reporting_source': 'Speech at The Republican National Convention', 'location_ISO_code': 'US', 'claim_types': ['Position Statement'], 'fact_checking_strategies': ['Written Evidence'], 'questions': [{'question': 'Did Hunter Biden have any experience in the energy sector at the time he joined the board of the  Burisma energy company in 2014', 'answers': [{'answer': 'No', 'answer_type': 'Boolean', 'source_url': 'https://en.wikipedia.org/wiki/Hunter_Biden', 'source_medium': 'Web text', 'boolean_explanation': "Hunter bidens previous career histor

In [23]:
def get_result_data(input_data):
    result_data = []
    for data in input_data:
        for question in data["questions"]:
            for answer in question["answers"]:
                #discard conflictign evidence/cherrypicking
                if data["label"] != "Conflicting Evidence/Cherrypicking":
                    if answer["answer_type"] == "Boolean":
                        result_data.append({"claim": data["claim"], "evidence": answer["boolean_explanation"], "label": data["label"]})
                    elif answer["answer_type"] == "Unanswerable":
                        pass
                    else:
                        result_data.append({"claim": data["claim"], "evidence": answer["answer"], "label": data["label"]})
    
    return result_data

In [24]:
#extract the text and the label to json

result_data = get_result_data(training_data)

print(result_data[0])
print(len(result_data))

{'claim': 'Hunter Biden had no experience in Ukraine or in the energy sector when he joined the board of Burisma.', 'evidence': "Hunter bidens previous career history does not include work for energy company's.", 'label': 'Supported'}
7321


In [25]:
def save_to_jsonl(data, dataset_name):
    with open(os.path.join(DATASET_PATH, dataset_name), "w") as f:
        for item in data:
            f.write(json.dumps(item) + "\n")

In [26]:
save_to_jsonl(result_data, "train_nli_a.jsonl")

#### QA nli data

In [27]:
def get_result_data_qa(input_data):
    result_data_qa = []
    for data in input_data:
        for question in data["questions"]:
            for answer in question["answers"]:
                #discard conflictign evidence/cherrypicking
                if data["label"] != "Conflicting Evidence/Cherrypicking":
                    result_data_qa.append({"claim": data["claim"], "question": question["question"], "evidence": answer["answer"], "label": data["label"]})

    return result_data_qa

In [28]:
#extract the text and the label to json

result_data_qa = get_result_data_qa(training_data)

print(result_data_qa[0])
print(len(result_data_qa))

{'claim': 'Hunter Biden had no experience in Ukraine or in the energy sector when he joined the board of Burisma.', 'question': 'Did Hunter Biden have any experience in the energy sector at the time he joined the board of the  Burisma energy company in 2014', 'evidence': 'No', 'label': 'Supported'}
7688


In [29]:
save_to_jsonl(result_data_qa, "train_nli_qa.jsonl")

## STATS

In [8]:
labels = [dat["label"] for dat in result_data]
labels_qa = [dat["label"] for dat in result_data_qa]

In [10]:
import numpy as np

unique, counts = np.unique(labels, return_counts=True)
print(dict(zip(unique, counts)))

unique, counts = np.unique(labels_qa, return_counts=True)
print(dict(zip(unique, counts)))

{'Not Enough Evidence': 626, 'Refuted': 4471, 'Supported': 2224}
{'Not Enough Evidence': 839, 'Refuted': 4599, 'Supported': 2250}


### Dev

In [30]:
#do the same for dev data
with open(DEV_PATH, "r") as f:
    dev_data = json.load(f)

In [31]:
result_data = get_result_data(dev_data)
result_data_qa = get_result_data_qa(dev_data)

In [32]:
save_to_jsonl(result_data, "dev_nli_a.jsonl")
save_to_jsonl(result_data_qa, "dev_nli_qa.jsonl")

### Try loading with HF

In [36]:
from datasets import load_dataset

dataset = load_dataset("json", data_files = {"train": os.path.join(DATASET_PATH, "train_nli_a.jsonl"), "dev": os.path.join(DATASET_PATH, "dev_nli_a.jsonl")})
print(dataset)
print(dataset["train"][0])

Generating train split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['claim', 'evidence', 'label'],
        num_rows: 7321
    })
    dev: Dataset({
        features: ['claim', 'evidence', 'label'],
        num_rows: 1227
    })
})
{'claim': 'Hunter Biden had no experience in Ukraine or in the energy sector when he joined the board of Burisma.', 'evidence': "Hunter bidens previous career history does not include work for energy company's.", 'label': 'Supported'}


In [37]:
from transformers import AutoTokenizer
model_id = "microsoft/deberta-v3-large"
tokenizer = AutoTokenizer.from_pretrained(model_id)



In [38]:
def tokenize_function(examples):
    example = tokenizer(examples["claim"], examples["evidence"], truncation=True)
    example["label"] = examples["label"]
    return example

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/7321 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/1227 [00:00<?, ? examples/s]

In [39]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['claim', 'evidence', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7321
    })
    dev: Dataset({
        features: ['claim', 'evidence', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1227
    })
})