# AVeriTeC Baseline


In [None]:
# Install the dependencies listed in the AVeriTeC requirements file.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install -r AVeriTeC/requirements.txt

## 0. Search for Evidence?

## 1. Scrape text from the URLs obtained by searching queries with the Google API

## 2. Rank the sentences in the knowledge store with BM25

In [3]:
import argparse
import json
import os
import time
import numpy as np
import nltk
from rank_bm25 import BM25Okapi


def combine_all_sentences(knowledge_file):
    """Flatten a JSON-lines knowledge store into parallel sentence/URL lists.

    Every non-blank line of ``knowledge_file`` is parsed as a JSON object.
    Its ``"url2text"`` entry (a list of sentences) is appended to the result,
    and its ``"url"`` entry is repeated once per sentence so that
    ``sentences[k]`` and ``urls[k]`` always belong together.

    Args:
        knowledge_file: Path of the UTF-8 encoded JSON-lines file to read.

    Returns:
        A tuple ``(sentences, urls, num_records)`` where ``num_records`` is
        the number of JSON records read (0 for an empty file).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"url"`` or ``"url2text"``.
    """
    sentences, urls = [], []
    # Counted explicitly: relying on the loop variable after the loop fails
    # with UnboundLocalError when the file contains no lines at all.
    num_records = 0

    with open(knowledge_file, "r", encoding="utf-8") as json_file:
        for line in json_file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data = json.loads(line)
            page_sentences = data["url2text"]
            sentences.extend(page_sentences)
            urls.extend([data["url"]] * len(page_sentences))
            num_records += 1
    return sentences, urls, num_records


def retrieve_top_k_sentences(query, document, urls, top_k):
    """Return the ``top_k`` sentences that best match ``query`` under BM25.

    Args:
        query: The query string (tokenized with ``nltk.word_tokenize``).
        document: List of candidate sentences.
        urls: List parallel to ``document``; ``urls[k]`` is returned alongside
            ``document[k]``.
        top_k: Maximum number of sentences to return.

    Returns:
        A tuple ``(top_sentences, top_urls)`` ordered by descending BM25
        score. Both lists are empty when ``document`` is empty.
    """
    # BM25Okapi divides by the corpus size while computing the average
    # document length, so an empty corpus would raise ZeroDivisionError.
    if len(document) == 0:
        return [], []

    tokenized_docs = [nltk.word_tokenize(doc) for doc in document]
    bm25 = BM25Okapi(tokenized_docs)
    scores = bm25.get_scores(nltk.word_tokenize(query))
    # argsort is ascending; reverse it to get the best-scoring indices first.
    top_k_idx = np.argsort(scores)[::-1][:top_k]

    return [document[i] for i in top_k_idx], [urls[i] for i in top_k_idx]

In [5]:
KNOWLEDGE_STORE_DIR = "AVeriTeC/data_store/output_dev"  # Directory with one "<claim index>.json" knowledge-store file per claim.
CLAIM_FILE = "AVeriTeC/data/dev.json"  # The path of the file that stores the claims.
JSON_OUTPUT = "AVeriTeC/data_store/dev_top_k.json"  # Output file (one JSON object per line) with the top sentences for each claim.
TOP_K = 100  # How many sentences should we pick out with BM25.
START = 0  # Starting index of the files to process.
END = -1  # End index (exclusive) of the files to process; -1 means "all files in KNOWLEDGE_STORE_DIR".

# Load the claims first and close the file right away, so the handle is not
# held open for the whole (long-running) ranking loop below.
with open(CLAIM_FILE, "r", encoding="utf-8") as json_file:
    target_examples = json.load(json_file)

# Set END to the number of files in the knowledge store directory.
if END == -1:
    END = len(os.listdir(KNOWLEDGE_STORE_DIR))
    print(END)

# A range supports constant-time membership tests, unlike the equivalent list.
files_to_process = range(START, END)
total = len(files_to_process)

with open(JSON_OUTPUT, "w", encoding="utf-8") as output_json:
    done = 0
    for idx, example in enumerate(target_examples):
        if idx not in files_to_process:
            continue

        print(f"Processing claim {idx}... Progress: {done + 1} / {total}")

        # Load the knowledge store for this example.
        document_in_sentences, sentence_urls, num_urls_this_claim = (
            combine_all_sentences(os.path.join(KNOWLEDGE_STORE_DIR, f"{idx}.json"))
        )
        print(
            f"Obtained {len(document_in_sentences)} sentences from {num_urls_this_claim} urls."
        )

        # Retrieve top_k sentences with bm25.
        st = time.time()
        top_k_sentences, top_k_urls = retrieve_top_k_sentences(
            example["claim"], document_in_sentences, sentence_urls, TOP_K
        )
        print(f"Top {TOP_K} retrieved. Time elapsed: {time.time() - st}.")

        json_data = {
            "claim_id": idx,
            "claim": example["claim"],
            f"top_{TOP_K}": [
                {"sentence": sent, "url": url}
                for sent, url in zip(top_k_sentences, top_k_urls)
            ],
        }
        output_json.write(json.dumps(json_data, ensure_ascii=False) + "\n")
        # Flush after every claim so that an interrupted run still leaves all
        # finished claims on disk instead of in the write buffer.
        output_json.flush()
        done += 1


500
Processing claim 0... Progress: 1 / 500
Obtained 534489 sentences from 825 urls.
Top 100 retrieved. Time elapsed: 67.17283320426941.
Processing claim 1... Progress: 2 / 500
Obtained 344674 sentences from 714 urls.
Top 100 retrieved. Time elapsed: 42.56042766571045.
Processing claim 2... Progress: 3 / 500
Obtained 1182335 sentences from 1300 urls.
Top 100 retrieved. Time elapsed: 151.2538583278656.
Processing claim 3... Progress: 4 / 500
Obtained 1104825 sentences from 1041 urls.
Top 100 retrieved. Time elapsed: 136.4850766658783.
Processing claim 4... Progress: 5 / 500
Obtained 552547 sentences from 1064 urls.
Top 100 retrieved. Time elapsed: 69.11488580703735.
Processing claim 5... Progress: 6 / 500
Obtained 960056 sentences from 822 urls.
Top 100 retrieved. Time elapsed: 116.33663892745972.
Processing claim 6... Progress: 7 / 500
Obtained 723400 sentences from 1392 urls.
Top 100 retrieved. Time elapsed: 97.06489825248718.
Processing claim 7... Progress: 8 / 500
Obtained 729765 se

KeyboardInterrupt: 

## 3. Generate question-answer pairs for the top sentences

In [None]:
import argparse
import time
import json
import nltk
from rank_bm25 import BM25Okapi
import numpy as np
import torch
from transformers import BloomTokenizerFast, BloomForCausalLM


def claim2prompts(example):
    """Yield few-shot prompt entries built from an annotated example.

    For every non-empty question of ``example["questions"]`` and every usable
    answer to it, one pair is produced:

    * ``Extractive`` / ``Abstractive`` answers use the ``"answer"`` text.
    * ``Boolean`` answers use ``"<answer>, because <boolean_explanation>"``
      with the explanation lower-cased and stripped.
    * Answers of any other type, and blank answers, are skipped.

    Args:
        example: Dict with a ``"questions"`` list; each question has a
            ``"question"`` string and an ``"answers"`` list of dicts carrying
            ``"answer_type"``, ``"answer"`` and (for Boolean answers)
            ``"boolean_explanation"``.

    Yields:
        Tuples ``(prompt_lookup_str, prompt)``: the answer text used as the
        retrieval key, and the prompt
        ``"Evidence:  <answer>\\nQuestion answered: <question>"``.
    """
    # claim_str = "Claim: " + claim + "||Evidence: "
    claim_str = "Evidence: "

    for question in example["questions"]:
        q_text = question["question"].strip()
        if len(q_text) == 0:
            continue

        if not q_text.endswith("?"):
            q_text += "?"

        answer_strings = []
        for a in question["answers"]:
            if a["answer_type"] in ["Extractive", "Abstractive"]:
                answer_strings.append(a["answer"])
            elif a["answer_type"] == "Boolean":
                answer_strings.append(
                    a["answer"]
                    + ", because "
                    + a["boolean_explanation"].lower().strip()
                )

        for a_text in answer_strings:
            # A blank answer carries no evidence, and indexing a_text[-1]
            # below would raise IndexError on an empty string.
            if len(a_text.strip()) == 0:
                continue

            if a_text[-1] not in [".", "!", ":", "?"]:
                a_text += "."

            # prompt_lookup_str = claim + " " + a_text
            prompt_lookup_str = a_text
            # "||" is a temporary separator: newlines inside the texts are
            # flattened to spaces first, then "||" becomes the only newline.
            this_q_claim_str = (
                claim_str + " " + a_text.strip() + "||Question answered: " + q_text
            )
            yield (
                prompt_lookup_str,
                this_q_claim_str.replace("\n", " ").replace("||", "\n"),
            )



    
                

In [None]:
REFERENCE_CORPUS = "AVeriTeC/data/train.json"  # The path of the reference corpus (source of the few-shot examples).
TARGET_FILE = "AVeriTeC/data/dev.json"  # The path of the file that stores the claim.
# Input: the JSON-lines file written by the BM25 sentence-ranking step
# (its JSON_OUTPUT); the path must match that step's output path.
TOP_K_TARGET_KNOWLEDGE = "AVeriTeC/data_store/dev_top_k.json"
OUTPUT_QUESTIONS = "data_store/dev_top_k_qa.json"  # Output: JSON-lines file with the generated (question, answer, url) triples per claim.
TOP_K = 100  # Selects the "top_<TOP_K>" key of each input line; must equal the TOP_K used when ranking.

# few-shot learning from the training set
with open(REFERENCE_CORPUS, "r", encoding="utf-8") as json_file:
    train_examples = json.load(json_file)

prompt_corpus, tokenized_corpus = [], []

for example in train_examples:
    for lookup_str, prompt in claim2prompts(example):
        entry = nltk.word_tokenize(lookup_str)
        tokenized_corpus.append(entry)
        prompt_corpus.append(prompt)

prompt_bm25 = BM25Okapi(tokenized_corpus)

# Load the bloom model:
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-7b1")
model = BloomForCausalLM.from_pretrained(
    "bigscience/bloom-7b1",
    device_map="auto",
    torch_dtype=torch.bfloat16,
    offload_folder="./offload",
)

# Number of few-shot examples placed in front of every evidence sentence.
prompt_n = 10

# Open the input before the output: opening with "w" truncates the output
# file, which must not happen when the input turns out to be missing.
with open(TOP_K_TARGET_KNOWLEDGE, "r", encoding="utf-8") as json_file, open(
    OUTPUT_QUESTIONS, "w", encoding="utf-8"
) as output_file:
    for i, line in enumerate(json_file):
        data = json.loads(line)
        top_k_sentences_urls = data[f"top_{TOP_K}"]
        claim = data["claim"]
        claim_id = data["claim_id"]

        bm25_qau = []  # question, answer, url
        # Generate questions for those top k:
        for sent_i, sentences_urls in enumerate(top_k_sentences_urls):
            prompt_lookup_str = sentences_urls["sentence"]
            url = sentences_urls["url"]

            # Pick the training prompts whose answers are most similar to
            # this evidence sentence and use them as few-shot examples.
            prompt_s = prompt_bm25.get_scores(nltk.word_tokenize(prompt_lookup_str))
            prompt_top_n = np.argsort(prompt_s)[::-1][:prompt_n]
            prompt_docs = [prompt_corpus[doc_idx] for doc_idx in prompt_top_n]

            claim_prompt = (
                "Evidence: "
                + prompt_lookup_str.replace("\n", " ")
                + "\nQuestion answered: "
            )

            prompt = "\n\n".join(prompt_docs + [claim_prompt])

            inputs = tokenizer([prompt], padding=True, return_tensors="pt").to(
                model.device
            )
            st = time.time()
            outputs = model.generate(
                inputs["input_ids"],
                # Pass the mask explicitly instead of letting generate() infer it.
                attention_mask=inputs["attention_mask"],
                max_length=5000,
                num_beams=2,
                no_repeat_ngram_size=2,
                early_stopping=True,
            )
            print(f"Generated QA for sent {sent_i} in file {i}. Time elapsed: {time.time() - st}")

            # Decode only the newly generated tokens (drop the echoed prompt).
            tgt_text = tokenizer.batch_decode(
                outputs[:, inputs["input_ids"].shape[-1] :],
                skip_special_tokens=True,
            )[0]

            # We are not allowed to generate more than 250 characters:
            tgt_text = tgt_text[:250]

            # Keep the text up to the first "?" as the question; the evidence
            # sentence itself serves as the answer.
            qau_pair = [
                tgt_text.strip().split("?")[0].replace("\n", " ") + "?",
                prompt_lookup_str.replace("\n", " "),
                url,
            ]

            bm25_qau.append(qau_pair)

        json_data = {
            "claim_id": claim_id,
            "claim": claim,
            "bm25_qau": bm25_qau,
        }
        output_file.write(json.dumps(json_data, ensure_ascii=False) + "\n")
        output_file.flush()



## 4. Rerank the QA pairs

In [None]:
import argparse
import json
import torch
import tqdm
from transformers import BertTokenizer, BertForSequenceClassification
from src.models.DualEncoderModule import DualEncoderModule


def triple_to_string(x):
    """Join the stripped items of ``x`` with the ``" </s> "`` separator."""
    stripped_parts = (part.strip() for part in x)
    return " </s> ".join(stripped_parts)

TOP_K_QA_FILE = "data_store/dev_top_k_qa.json"  # JSON-lines file with claim and top k generated question-answer pairs.
OUTPUT_FILE = "data_store/dev_top_3_rerank_qa.json"  # JSON-lines file with the top reranked question-answer pairs.
BEST_CHECKPOINT = "pretrained_models/bert_dual_encoder.ckpt"
TOP_N = 3  # top_n question answer pairs as evidence to keep.

examples = []
# The question-generation step writes this file as UTF-8 with non-ASCII
# characters left unescaped, so it must not be read with the platform default.
with open(TOP_K_QA_FILE, "r", encoding="utf-8") as f:
    for line in f:
        examples.append(json.loads(line))

bert_model_name = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(bert_model_name)
bert_model = BertForSequenceClassification.from_pretrained(
    bert_model_name, num_labels=2, problem_type="single_label_classification"
)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
trained_model = DualEncoderModule.load_from_checkpoint(
    BEST_CHECKPOINT, tokenizer=tokenizer, model=bert_model
).to(device)
# Inference only: make sure dropout is disabled.
trained_model.eval()

with open(OUTPUT_FILE, "w", encoding="utf-8") as output_file:
    for example in tqdm.tqdm(examples):
        bm25_qau = example.get("bm25_qau", [])
        claim = example["claim"]

        values = [[question, answer, url] for question, answer, url in bm25_qau]
        strs_to_score = [
            triple_to_string([claim, question, answer])
            for question, answer, _url in values
        ]

        if len(values) > 0:
            encoded_dict = tokenizer(
                strs_to_score,
                max_length=512,
                padding="longest",
                truncation=True,
                return_tensors="pt",
            ).to(device)

            input_ids = encoded_dict["input_ids"]
            attention_masks = encoded_dict["attention_mask"]

            # No gradients are needed for scoring; skipping the autograd graph
            # keeps memory usage down when all pairs are scored in one batch.
            with torch.no_grad():
                logits = trained_model(input_ids, attention_mask=attention_masks).logits

            # Probability of class 1 is used as the relevance score.
            scores = torch.softmax(logits, dim=-1)[:, 1]

            # Plain Python ints for indexing the `values` list.
            top_n = torch.argsort(scores, descending=True)[:TOP_N].tolist()
            evidence = [
                {
                    "question": values[i][0],
                    "answer": values[i][1],
                    "url": values[i][2],
                }
                for i in top_n
            ]
        else:
            evidence = []

        json_data = {
            "claim_id": example["claim_id"],
            "claim": claim,
            "evidence": evidence,
        }
        output_file.write(json.dumps(json_data, ensure_ascii=False) + "\n")
        output_file.flush()


## 5. Veracity prediction

In [None]:
import argparse
import json
import tqdm
import torch
import pytorch_lightning as pl
from transformers import BertTokenizer, BertForSequenceClassification
from src.models.SequenceClassificationModule import SequenceClassificationModule


# Veracity label names. The prediction code looks a name up by position
# (LABEL[answer]), so the order of the entries must not change.
LABEL = [
    "Supported",  # index 0
    "Refuted",  # index 1
    "Not Enough Evidence",  # index 2
    "Conflicting Evidence/Cherrypicking",  # index 3
]


class SequenceClassificationDataLoader(pl.LightningDataModule):
    """Tokenization and string-formatting helpers for veracity classification.

    The constructor only stores its arguments. Of those, the methods defined
    here read ``tokenizer`` only.
    """

    def __init__(self, tokenizer, data_file, batch_size, add_extra_nee=False):
        super().__init__()
        self.tokenizer = tokenizer
        self.data_file = data_file
        self.batch_size = batch_size
        self.add_extra_nee = add_extra_nee

    def tokenize_strings(
        self,
        source_sentences,
        max_length=400,
        pad_to_max_length=False,
        return_tensors="pt",
    ):
        """Tokenize ``source_sentences`` with truncation to ``max_length``.

        Padding goes to ``max_length`` when ``pad_to_max_length`` is truthy,
        otherwise to the longest sequence in the batch.

        Returns:
            The tuple ``(input_ids, attention_mask)`` taken from the
            tokenizer's output.
        """
        if pad_to_max_length:
            padding_strategy = "max_length"
        else:
            padding_strategy = "longest"

        encoding = self.tokenizer(
            source_sentences,
            max_length=max_length,
            padding=padding_strategy,
            truncation=True,
            return_tensors=return_tensors,
        )
        return encoding["input_ids"], encoding["attention_mask"]

    def quadruple_to_string(self, claim, question, answer, bool_explanation=""):
        """Format a claim/question/answer (plus optional explanation) as one string.

        The result is ``"[CLAIM] <claim> [QUESTION] <question> <answer>"``,
        followed by ``", because <explanation>"`` (lower-cased, stripped) when
        ``bool_explanation`` is neither ``None`` nor empty.
        """
        suffix = ""
        if bool_explanation:
            suffix = ", because " + bool_explanation.lower().strip()

        pieces = [
            "[CLAIM] ",
            claim.strip(),
            " [QUESTION] ",
            question.strip(),
            " ",
            answer.strip(),
            suffix,
        ]
        return "".join(pieces)

In [None]:
CLAIM_WITH_EVIDENCE_FILE = "data_store/dev_top_3_rerank_qa.json"  # JSON-lines file with claim and top question-answer pairs as evidence.
OUTPUT_FILE = "data_store/dev_veracity_prediction.json"  # Json file with the veracity predictions.
BEST_CHECKPOINT = "pretrained_models/bert_veracity.ckpt"

examples = []
# Use the constants defined above (there is no `args` object in this
# notebook). The reranking step writes this file as UTF-8, so read it as such.
with open(CLAIM_WITH_EVIDENCE_FILE, "r", encoding="utf-8") as f:
    for line in f:
        examples.append(json.loads(line))

bert_model_name = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(bert_model_name)
bert_model = BertForSequenceClassification.from_pretrained(
    bert_model_name, num_labels=4, problem_type="single_label_classification"
)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
trained_model = SequenceClassificationModule.load_from_checkpoint(
    BEST_CHECKPOINT, tokenizer=tokenizer, model=bert_model
).to(device)
# Inference only: make sure dropout is disabled.
trained_model.eval()

dataLoader = SequenceClassificationDataLoader(
    tokenizer=tokenizer,
    data_file="this_is_discontinued",
    batch_size=32,
    add_extra_nee=False,
)

predictions = []

for example in tqdm.tqdm(examples):
    example_strings = [
        dataLoader.quadruple_to_string(
            example["claim"], evidence["question"], evidence["answer"], ""
        )
        for evidence in example["evidence"]
    ]

    if len(example_strings) == 0:
        # If we found no evidence e.g. because google returned 0 pages, just
        # output NEI. The claim must still appear in the predictions file,
        # so record it instead of silently skipping it.
        predictions.append(
            {
                "claim_id": example["claim_id"],
                "claim": example["claim"],
                "evidence": example["evidence"],
                "pred_label": "Not Enough Evidence",
            }
        )
        continue

    tokenized_strings, attention_mask = dataLoader.tokenize_strings(example_strings)
    # No gradients are needed for prediction.
    with torch.no_grad():
        logits = trained_model(
            tokenized_strings.to(device), attention_mask=attention_mask.to(device)
        ).logits
    # One predicted class per evidence string, as plain Python ints.
    example_support = torch.argmax(logits, dim=1).tolist()

    predicted_classes = set(example_support)
    has_true = 0 in predicted_classes
    has_false = 1 in predicted_classes
    # TODO another hack -- we cant have different labels for train and test so we do this
    has_unanswerable = 2 in predicted_classes or 3 in predicted_classes

    # Aggregate the per-evidence predictions into one label index for the claim.
    if has_unanswerable:
        answer = 2
    elif has_true and not has_false:
        answer = 0
    elif not has_true and has_false:
        answer = 1
    else:
        answer = 3

    json_data = {
        "claim_id": example["claim_id"],
        "claim": example["claim"],
        "evidence": example["evidence"],
        "pred_label": LABEL[answer],
    }
    predictions.append(json_data)

with open(OUTPUT_FILE, "w", encoding="utf-8") as output_file:
    json.dump(predictions, output_file, ensure_ascii=False, indent=4)