# Test of veracity prediction

In [1]:
from dotenv import load_dotenv
from pathlib import Path
import os
import pytorch_lightning as pl

dotenv_path = Path('aic_averitec/.env')
load_dotenv(dotenv_path)

DATASTORE_PATH = os.environ.get("DATASTORE_PATH")
DATASET_PATH = os.environ.get("DATASET_PATH")
MODELS_PATH = os.environ.get("MODELS_PATH")

%load_ext autoreload
%autoreload 2

DEV_PATH = str(os.path.join(DATASET_PATH, 'dev.json'))

In [3]:
import json
import tqdm
import torch
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification
from aic_averitec.src.models.SequenceClassificationModule import SequenceClassificationModule
from aic_averitec.src.prediction.veracity_prediction import SequenceClassificationDataLoader

LABEL = [
    "Supported",
    "Refuted",
    "Not Enough Evidence",
    "Conflicting Evidence/Cherrypicking",
]

LABEL_TO_IDX = {label: idx for idx, label in enumerate(LABEL)}

In [4]:
#load examples
CLAIM_WITH_EVIDENCE_FILE = str(os.path.join(DATASTORE_PATH, 'dev_top_3_rerank_qa.json'))

examples = []
with open(CLAIM_WITH_EVIDENCE_FILE) as f:
    for line in f:
        examples.append(json.loads(line))

print(examples)



In [5]:
#load examples from dev.json
with open(DEV_PATH) as f:
    dev_data = json.load(f)

examples = []
for data_point in dev_data:

    evidence = []
    for question in data_point["questions"]:
        q = question["question"]
        for answer in question["answers"]:
            a = answer["answer"]
            evidence.append({"question": q, "answer": a})
    ex = {
        "claim": data_point['claim'],
        "evidence": evidence,
    }

    examples.append(ex)

print(examples)




In [108]:
bert_model_name = "bert-base-uncased"
best_checkpoint = str(os.path.join(MODELS_PATH, 'bert_veracity.ckpt'))

tokenizer = BertTokenizer.from_pretrained(bert_model_name)
bert_model = BertForSequenceClassification.from_pretrained(
    bert_model_name, num_labels=4, problem_type="single_label_classification"
)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
trained_model = SequenceClassificationModule.load_from_checkpoint(
    best_checkpoint, tokenizer=tokenizer, model=bert_model
).to(device)

dataLoader = SequenceClassificationDataLoader(
    tokenizer=tokenizer,
    data_file="this_is_discontinued",
    batch_size=32,
    add_extra_nee=False,)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [109]:
predictions = []

example_supports = []

y_pred_ternary = []
y_pred_us = []

for example in tqdm.tqdm(examples):
    example_strings = []
    for evidence in example["evidence"]:
        example_strings.append(
            dataLoader.quadruple_to_string(
                example["claim"], evidence["question"], evidence["answer"], ""
            )
        )

    if (
        len(example_strings) == 0
    ):  # If we found no evidence e.g. because google returned 0 pages, just output NEI.
        example["label"] = "Not Enough Evidence"
        continue

    tokenized_strings, attention_mask = dataLoader.tokenize_strings(example_strings)
    logits = trained_model(
            tokenized_strings.to(device), attention_mask=attention_mask.to(device)
        ).logits
    
    example_support = torch.argmax(
        logits,
        axis=1,
    )

    example_supports.append(example_support)


    has_unanswerable = False
    has_true = False
    has_false = False

    for v in example_support:
        if v == 0:
            has_true = True
        if v == 1:
            has_false = True
        if v in (
            2,
            3,
        ):  # TODO another hack -- we cant have different labels for train and test so we do this
            has_unanswerable = True

    if has_unanswerable:
        answer = 2
    elif has_true and not has_false:
        answer = 0
    elif not has_true and has_false:
        answer = 1
    else:
        answer = 3

    y_pred_ternary.append(answer)

    if has_true and has_false:
        answer_us = 3
    elif has_true and not has_false:
        answer_us = 0
    elif not has_true and has_false:
        answer_us = 1
    else:
        answer_us = 2 #otherwise NEI


    y_pred_us.append(answer_us)

    

#stack the supports to a tensor
example_supports = torch.stack(example_supports)



100%|██████████| 500/500 [00:07<00:00, 65.54it/s]


RuntimeError: stack expects each tensor to be equal size, but got [2] at entry 0 and [1] at entry 1

In [30]:
print(np.unique(example_supports.detach().cpu().numpy(), return_counts=True))

(array([0, 1, 2]), array([ 418, 1070,   12]))


In [110]:
example_supports

[tensor([1, 1], device='cuda:0'),
 tensor([1], device='cuda:0'),
 tensor([1, 1, 2, 2, 1], device='cuda:0'),
 tensor([1, 2, 1], device='cuda:0'),
 tensor([2, 1, 1, 1], device='cuda:0'),
 tensor([1], device='cuda:0'),
 tensor([1, 1, 1], device='cuda:0'),
 tensor([1, 0], device='cuda:0'),
 tensor([1, 1], device='cuda:0'),
 tensor([0, 2], device='cuda:0'),
 tensor([0, 1], device='cuda:0'),
 tensor([1], device='cuda:0'),
 tensor([1, 1, 0, 1, 1, 1, 0], device='cuda:0'),
 tensor([1], device='cuda:0'),
 tensor([1, 1], device='cuda:0'),
 tensor([0, 0, 0, 1, 0, 0], device='cuda:0'),
 tensor([1, 1], device='cuda:0'),
 tensor([1, 1, 1, 1], device='cuda:0'),
 tensor([1, 0, 1], device='cuda:0'),
 tensor([1, 1], device='cuda:0'),
 tensor([1], device='cuda:0'),
 tensor([1], device='cuda:0'),
 tensor([0], device='cuda:0'),
 tensor([1], device='cuda:0'),
 tensor([1, 1, 1], device='cuda:0'),
 tensor([1], device='cuda:0'),
 tensor([1, 1], device='cuda:0'),
 tensor([0, 1], device='cuda:0'),
 tensor([1, 1],

In [111]:
print(np.unique(y_pred_ternary, return_counts=True))
print(np.unique(y_pred_us, return_counts=True))

(array([0, 1, 2, 3]), array([ 81, 292,  39,  88]))
(array([0, 1, 2, 3]), array([ 91, 312,   1,  96]))


In [12]:
#get y_true
dev_data = json.load(open(DEV_PATH))

dev_labels = [example["label"] for example in dev_data]
y_true = [LABEL_TO_IDX[label] for label in dev_labels]
print(y_true)

print(np.unique(y_true, return_counts=True))

[1, 1, 1, 1, 1, 1, 0, 0, 1, 2, 3, 3, 1, 0, 1, 2, 1, 1, 3, 1, 1, 1, 1, 1, 1, 0, 2, 1, 1, 1, 1, 0, 0, 3, 0, 1, 0, 1, 1, 1, 1, 1, 0, 3, 1, 1, 1, 3, 1, 1, 3, 1, 1, 0, 1, 1, 1, 0, 3, 2, 3, 1, 1, 0, 1, 1, 0, 1, 2, 1, 1, 1, 1, 3, 1, 0, 2, 0, 1, 1, 1, 3, 2, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 3, 3, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 3, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 2, 0, 1, 0, 1, 0, 1, 2, 3, 0, 0, 0, 0, 3, 1, 1, 1, 0, 2, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 2, 0, 1, 2, 1, 1, 1, 1, 1, 2, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 1, 1, 0, 2, 1, 0, 1, 2, 1, 0, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 3, 1, 1, 0, 1, 1, 1, 1, 0, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 1, 1, 1, 0, 1, 3, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 3, 1, 0, 3, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 3, 1, 1, 0, 2, 1, 0, 1, 1, 

In [113]:
from sklearn.metrics import classification_report

print(classification_report(y_true, y_pred_ternary, target_names=LABEL))
print(classification_report(y_true, y_pred_us, target_names=LABEL))

                                    precision    recall  f1-score   support

                         Supported       0.68      0.45      0.54       122
                           Refuted       0.76      0.72      0.74       305
               Not Enough Evidence       0.49      0.54      0.51        35
Conflicting Evidence/Cherrypicking       0.16      0.37      0.22        38

                          accuracy                           0.62       500
                         macro avg       0.52      0.52      0.50       500
                      weighted avg       0.67      0.62      0.64       500

                                    precision    recall  f1-score   support

                         Supported       0.60      0.45      0.52       122
                           Refuted       0.74      0.76      0.75       305
               Not Enough Evidence       1.00      0.03      0.06        35
Conflicting Evidence/Cherrypicking       0.16      0.39      0.22        38

       

                                    precision    recall  f1-score   support

                         Supported       0.51      0.39      0.44       122
                           Refuted       0.72      0.71      0.71       305
               Not Enough Evidence       0.00      0.00      0.00        35
Conflicting Evidence/Cherrypicking       0.09      0.24      0.13        38

                          accuracy                           0.55       500
                         macro avg       0.33      0.33      0.32       500
                      weighted avg       0.57      0.55      0.55       500

                                    precision    recall  f1-score   support

                         Supported       0.50      0.39      0.44       122
                           Refuted       0.72      0.72      0.72       305
               Not Enough Evidence       0.00      0.00      0.00        35
Conflicting Evidence/Cherrypicking       0.09      0.24      0.13        38

                          accuracy                           0.55       500
                         macro avg       0.33      0.34      0.32       500
                      weighted avg       0.57      0.55      0.55       500

## plug-in deberta

In [6]:
class SequenceClassificationDataLoaderNLI(pl.LightningDataModule):
    def __init__(self, tokenizer):
        super().__init__()
        self.tokenizer = tokenizer

    def tokenize_strings(
        self,
        source_sentences,
        max_length=400,
        pad_to_max_length=False,
        return_tensors="pt",
    ):
        encoded_dict = self.tokenizer(
            source_sentences,
            max_length=max_length,
            padding="max_length" if pad_to_max_length else "longest",
            truncation=True,
            return_tensors=return_tensors,
        )

        input_ids = encoded_dict["input_ids"]
        attention_masks = encoded_dict["attention_mask"]

        return input_ids, attention_masks

    def quadruple_to_string(self, claim, question, answer):
        return [
            claim.strip(), answer.strip()
        ]

In [14]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

id2correct_id = {0: 1, 1: 0, 2: 2}

device = "cuda:0" if torch.cuda.is_available() else "cpu"


#model_name = "cross-encoder/nli-deberta-v3-small"
#model_name = "cross-encoder/nli-deberta-v3-large"
#model_name = "sileod/deberta-v3-small-tasksource-nli"
#model_name = "models/averitec/nli/deberta-v3-large/checkpoint-687"
model_name = "models/averitec/nli/nli-deberta-v3-large/checkpoint-1145"

model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

dataLoader_nli = SequenceClassificationDataLoaderNLI(tokenizer)

model.eval()

predictions = []

example_supports = []

y_pred_ternary = []
y_pred_us = []

for example in tqdm.tqdm(examples):
    example_strings = []
    for evidence in example["evidence"]:
        example_strings.append(
            dataLoader_nli.quadruple_to_string(
                example["claim"], evidence["question"], evidence["answer"]
            )
        )

    if (
        len(example_strings) == 0
    ):  # If we found no evidence e.g. because google returned 0 pages, just output NEI.
        example["label"] = "Not Enough Evidence"
        y_pred_ternary.append(2)
        y_pred_us.append(2)
        continue

    tokenized_strings, attention_mask = dataLoader_nli.tokenize_strings(example_strings)
    with torch.no_grad():
        logits = model(tokenized_strings.to(device), attention_mask=attention_mask.to(device)).logits

    example_support = torch.argmax(
        logits,
        axis=1,
    )

    #remap the labels in example support tensor using the id2correct_id
    example_support = torch.tensor([id2correct_id[i.item()] for i in example_support])

    example_supports.append(example_support)

    has_unanswerable = False
    has_true = False
    has_false = False

    for v in example_support:
        if v == 0:
            has_true = True
        if v == 1:
            has_false = True
        if v in (
            2,
            3,
        ):  # TODO another hack -- we cant have different labels for train and test so we do this
            has_unanswerable = True

    if has_unanswerable:
        answer = 2
    elif has_true and not has_false:
        answer = 0
    elif not has_true and has_false:
        answer = 1
    else:
        answer = 3

    y_pred_ternary.append(answer)

    if has_true and has_false:
        answer_us = 3
    elif has_true and not has_false:
        answer_us = 0
    elif not has_true and has_false:
        answer_us = 1
    else:
        answer_us = 2 #otherwise NEI


    y_pred_us.append(answer_us)
    
#stack the supports to a tensor
example_supports = torch.stack(example_supports)

100%|██████████| 500/500 [00:19<00:00, 25.07it/s]


RuntimeError: stack expects each tensor to be equal size, but got [2] at entry 0 and [1] at entry 1

In [9]:
np.unique(example_supports.detach().cpu().numpy(), return_counts=True)

AttributeError: 'list' object has no attribute 'detach'

In [15]:
print(np.unique(y_pred_ternary, return_counts=True))
print(np.unique(y_pred_us, return_counts=True))

(array([0, 1, 2, 3]), array([ 61, 261, 121,  57]))
(array([0, 1, 2, 3]), array([ 86, 318,  21,  75]))


deberta nli

(array([0, 1, 2, 3]), array([ 61, 261, 121,  57]))
(array([0, 1, 2, 3]), array([ 86, 318,  21,  75]))

deberta

(array([0, 1, 2, 3]), array([ 60, 296, 118,  26]))
(array([0, 1, 2, 3]), array([ 88, 347,  25,  40]))

large

(array([0, 1, 2, 3]), array([  6,  13, 479,   2]))
(array([0, 1, 2, 3]), array([ 93, 146, 221,  40]))

small
(array([0, 1, 2, 3]), array([  4,   7, 485,   4]))
(array([0, 1, 2, 3]), array([ 82, 169, 220,  29]))

In [16]:
from sklearn.metrics import classification_report

print(classification_report(y_true, y_pred_ternary, target_names=LABEL))
print(classification_report(y_true, y_pred_us, target_names=LABEL))

                                    precision    recall  f1-score   support

                         Supported       0.75      0.38      0.50       122
                           Refuted       0.79      0.68      0.73       305
               Not Enough Evidence       0.21      0.71      0.32        35
Conflicting Evidence/Cherrypicking       0.12      0.18      0.15        38

                          accuracy                           0.57       500
                         macro avg       0.47      0.49      0.42       500
                      weighted avg       0.69      0.57      0.60       500

                                    precision    recall  f1-score   support

                         Supported       0.63      0.44      0.52       122
                           Refuted       0.75      0.79      0.77       305
               Not Enough Evidence       0.24      0.14      0.18        35
Conflicting Evidence/Cherrypicking       0.15      0.29      0.19        38

       

deberta
                                    precision    recall  f1-score   support

                         Supported       0.88      0.43      0.58       122
                           Refuted       0.80      0.78      0.79       305
               Not Enough Evidence       0.17      0.57      0.26        35
Conflicting Evidence/Cherrypicking       0.15      0.11      0.12        38

                          accuracy                           0.63       500
                         macro avg       0.50      0.47      0.44       500
                      weighted avg       0.73      0.63      0.65       500

                                    precision    recall  f1-score   support

                         Supported       0.73      0.52      0.61       122
                           Refuted       0.77      0.87      0.82       305
               Not Enough Evidence       0.24      0.17      0.20        35
Conflicting Evidence/Cherrypicking       0.15      0.16      0.15        38

                          accuracy                           0.68       500
                         macro avg       0.47      0.43      0.44       500
                      weighted avg       0.67      0.68      0.67       500

small
                                    precision    recall  f1-score   support

                         Supported       0.75      0.02      0.05       122
                           Refuted       0.71      0.02      0.03       305
               Not Enough Evidence       0.07      1.00      0.13        35
Conflicting Evidence/Cherrypicking       0.00      0.00      0.00        38

                          accuracy                           0.09       500
                         macro avg       0.38      0.26      0.05       500
                      weighted avg       0.62      0.09      0.04       500

                                    precision    recall  f1-score   support

                         Supported       0.18      0.12      0.15       122
                           Refuted       0.69      0.38      0.49       305
               Not Enough Evidence       0.07      0.46      0.13        35
Conflicting Evidence/Cherrypicking       0.03      0.03      0.03        38

                          accuracy                           0.30       500
                         macro avg       0.25      0.25      0.20       500
                      weighted avg       0.47      0.30      0.35       500

large
                                    precision    recall  f1-score   support

                         Supported       0.33      0.02      0.03       122
                           Refuted       0.69      0.03      0.06       305
               Not Enough Evidence       0.07      1.00      0.14        35
Conflicting Evidence/Cherrypicking       0.00      0.00      0.00        38

                          accuracy                           0.09       500
                         macro avg       0.27      0.26      0.06       500
                      weighted avg       0.51      0.09      0.05       500

                                    precision    recall  f1-score   support

                         Supported       0.27      0.20      0.23       122
                           Refuted       0.64      0.31      0.42       305
               Not Enough Evidence       0.08      0.49      0.13        35
Conflicting Evidence/Cherrypicking       0.10      0.11      0.10        38

                          accuracy                           0.28       500
                         macro avg       0.27      0.28      0.22       500
                      weighted avg       0.47      0.28      0.33       500