# AVeriTeC Baseline Notebook

## ⬇️ Imports

In [1]:
from dotenv import load_dotenv
from pathlib import Path
import os

dotenv_path = Path('aic_averitec/.env')
load_dotenv(dotenv_path)

DATASTORE_PATH = os.environ.get("DATASTORE_PATH")
DATASET_PATH = os.environ.get("DATASET_PATH")
MODELS_PATH = os.environ.get("MODELS_PATH")

%load_ext autoreload
%autoreload 2

DEV_PATH = str(os.path.join(DATASET_PATH, 'dev.json'))

## 2. Rank the sentences in the knowledge store with BM25

In [32]:
# Reload the autoreload extension before the project modules below are imported.
%reload_ext autoreload

In [33]:
from aic_averitec.src.reranking.bm25_sentences import get_top_k_sentences_bm25 #bm25 reranking

In [None]:
# BM25 ranking over the dev knowledge store; the output path is passed as json_output.
bm25_knowledge_store_dir = os.path.join(DATASTORE_PATH, 'output_dev')
get_top_k_sentences_bm25(
    knowledge_store_dir=str(bm25_knowledge_store_dir),
    claim_file=DEV_PATH,
    json_output="./aic_averitec/data_store/dev_top_k.json",
)

## 2b. Rank whole texts with neural reranking

In [15]:
from aic_averitec.src.reranking.rerank_sentences import get_top_k_sentences_nn
%reload_ext autoreload

get_top_k_sentences_nn(knowledge_store_dir=str(os.path.join(DATASTORE_PATH, 'output_dev')), claim_file=DEV_PATH, json_output="./aic_averitec/data_store/dev_top_k_nn.json", rerank_model = "mixedbread-ai/mxbai-rerank-xsmall-v1", end=10)

Processing claim 0... Progress: 1 / 10
Obtained 825 documents from 825 urls.
Top 10 retrieved. Time elapsed: 29.858542919158936.
Processing claim 1... Progress: 2 / 10
Obtained 714 documents from 714 urls.
Top 10 retrieved. Time elapsed: 23.82103157043457.
Processing claim 2... Progress: 3 / 10
Obtained 1300 documents from 1300 urls.
Top 10 retrieved. Time elapsed: 71.80752301216125.
Processing claim 3... Progress: 4 / 10
Obtained 1041 documents from 1041 urls.
Top 10 retrieved. Time elapsed: 59.356168270111084.
Processing claim 4... Progress: 5 / 10
Obtained 1064 documents from 1064 urls.
Top 10 retrieved. Time elapsed: 33.1887104511261.
Processing claim 5... Progress: 6 / 10
Obtained 822 documents from 822 urls.
Top 10 retrieved. Time elapsed: 47.47010135650635.
Processing claim 6... Progress: 7 / 10
Obtained 1392 documents from 1392 urls.
Top 10 retrieved. Time elapsed: 42.5123815536499.
Processing claim 7... Progress: 8 / 10
Obtained 1092 documents from 1092 urls.
Top 10 retrieved.

## 3. Generate question-answer pairs for the top sentences

In [35]:
from aic_averitec.src.reranking.question_generation_top_sentences import generate_questions_top_k #question generation

In [None]:
# Top-k evidence file used as input for question generation.
# Alternative kept for reference: "./aic_averitec/data_store/dev_top_k.json"
# NOTE(review): the BM25 cell writes dev_top_k.json, while this reads
# dev_top_k_sentences.json from DATASTORE_PATH - confirm which file is intended.
TOP_K_TARGET_KNOWLEDGE = os.path.join(DATASTORE_PATH, 'dev_top_k_sentences.json')

# train.json is passed as the reference corpus, the dev claims as the target.
reference_corpus_path = os.path.join(DATASET_PATH, "train.json")
generate_questions_top_k(
    reference_corpus=reference_corpus_path,
    target_file=DEV_PATH,
    top_k_target_knowledge=TOP_K_TARGET_KNOWLEDGE,
    output_questions="./aic_averitec/data_store/dev_top_k_qa.json",
)

## 4. Rerank the QA pairs

In [38]:
# Install typing_extensions into the kernel's environment; the rerank_questions
# import in the next cell is marked as depending on it.
%pip install typing_extensions

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


In [None]:
from aic_averitec.src.reranking.rerank_questions import rerank_qa_pairs #typing extensions dependency!

In [None]:
# QA pairs to rerank.
# Alternative kept for reference: "./aic_averitec/data_store/dev_top_k_qa.json"
TOP_K_QA_FILE = os.path.join(DATASTORE_PATH, 'dev_top_k_qa.json')

# Checkpoint handed to the reranker as best_checkpoint.
dual_encoder_checkpoint = os.path.join(MODELS_PATH, "bert_dual_encoder.ckpt")
rerank_qa_pairs(
    top_k_qa_file=TOP_K_QA_FILE,
    output_file="./aic_averitec/data_store/dev_top_3_rerank_qa.json",
    best_checkpoint=dual_encoder_checkpoint,
)

## 5. Veracity prediction

In [3]:
import json

# Convert the dev500 questions file from one JSON document to JSON Lines.
# JSON is UTF-8 by specification, so the encoding is set explicitly instead of
# relying on the platform's locale default.
# NOTE(review): assumes the top-level JSON value is a list of records - TODO confirm.
with open("aic_averitec/data_store/dev500_questions.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)

# Write one JSON document per line.
with open("aic_averitec/data_store/dev500_questions.jsonl", "w", encoding="utf-8") as f:
    for item in json_data:
        f.write(json.dumps(item) + "\n")

In [2]:
from aic_averitec.src.prediction.veracity_prediction import veracity_prediction  # veracity prediction

# Claims-with-evidence input. Alternatives kept for reference:
#   "./aic_averitec/data_store/dev_top_3_rerank_qa.json"
#   str(os.path.join(DATASTORE_PATH, 'dev_top_3_rerank_qa.json'))
CLAIM_WITH_EVIDENCE_FILE = "aic_averitec/data_store/dev500_questions.jsonl"

# Checkpoint handed to the predictor as best_checkpoint.
veracity_checkpoint = os.path.join(MODELS_PATH, "bert_veracity.ckpt")
veracity_prediction(
    claim_with_evidence_file=CLAIM_WITH_EVIDENCE_FILE,
    output_file="./aic_averitec/data_store/dev_veracity_prediction.json",
    best_checkpoint=veracity_checkpoint,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [4]:
%pip install --upgrade transformers

Collecting transformers
  Using cached transformers-4.42.3-py3-none-any.whl.metadata (43 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Using cached tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached transformers-4.42.3-py3-none-any.whl (9.3 MB)
Using cached tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.13.3
    Uninstalling tokenizers-0.13.3:
      Successfully uninstalled tokenizers-0.13.3
  You can safely remove it manually.[0m[33m
  You can safely remove it manually.[0m[33m
[0m  Attempting uninstall: transformers
    Found existing installation: transformers 4.29.2
    Uninstalling transformers-4.29.2:
      Successfully uninstalled transformers-4.29.2
[31mERROR: pip's dependency resolver does not currently take into account all the 

In [3]:
from aic_averitec.src.prediction.veracity_prediction import veracity_prediction_v2

# Run the v2 predictor on the dev500 JSONL file with nei_new_eval enabled.
v2_prediction_kwargs = {
    "claim_with_evidence_file": "aic_averitec/data_store/dev500_questions.jsonl",
    "output_file": "aic_averitec/data_store/dev500_veracity_prediction.json",
    "nei_new_eval": True,
}
veracity_prediction_v2(**v2_prediction_kwargs)

100%|██████████| 500/500 [00:22<00:00, 21.92it/s]


In [2]:
from aic_averitec.src.prediction.veracity_prediction import veracity_prediction_4concat

# Checkpoint to evaluate. Alternative kept for reference (noted as the best DeBERTa):
#   "models/averitec/nli_4concat/deberta-v3-large/checkpoint-576"
model = "models/averitec/nli_4concat/mistral-7B-v0.3/checkpoint-15340"  # load with peft! TODO

veracity_prediction_4concat(
    claim_with_evidence_file="aic_averitec/data_store/dev500_questions.jsonl",
    output_file="aic_averitec/data_store/dev500_veracity_prediction_4concat_mistral.json",
    best_checkpoint=model,
    peft=True,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.3 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/500 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
100%|██████████| 500/500 [06:20<00:00,  1.31it/s]


## 6. Evaluate Veracity

In [3]:
import nltk
from aic_averitec.src.prediction.evaluate_veracity import evaluate_veracity  # veracity evaluation

# Fetch the WordNet corpus; kept as the last expression so the cell shows the result.
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /home/mlynatom/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Prediction file to score. Alternatives kept for reference:
#   str(os.path.join(DATASTORE_PATH, 'dev_veracity_prediction.json'))
#   "aic_averitec/data_store/dev500_veracity_prediction.json"
#   "./aic_averitec/data_store/dev_veracity_prediction.json"
#   "aic_averitec/data_store/dev500_veracity_prediction_4concat.json"
PREDICTION_FILE = "aic_averitec/data_store/dev500_veracity_prediction_4concat_mistral.json"

# Evaluate the predictions against the dev claim file.
evaluate_veracity(label_file=DEV_PATH, prediction_file=PREDICTION_FILE)

Question-only score (HU-meteor):             0.4482806389034514
Question-answer score (HU-meteor):           0.272736519646492
Veracity F1 scores:
 * Supported:                                0.6768060836501901
 * Refuted:                                  0.7944732297063903
 * Not Enough Evidence:                      0.11538461538461539
 * Conflicting Evidence/Cherrypicking:       0.2075471698113208
 * macro:                                    0.44855277463812915
 * acc:                                      0.666
--------------------
AVeriTeC scores:
 * Veracity scores (meteor @ 0.1):           0.646
 * Veracity scores (meteor @ 0.2):           0.484
 * Veracity scores (meteor @ 0.25):          0.342
 * Veracity scores (meteor @ 0.3):           0.234
 * Veracity scores (meteor @ 0.4):           0.088
 * Veracity scores (meteor @ 0.5):           0.04
--------------------
AVeriTeC scores by type @ 0.25:
 * Veracity scores (Event/Property Claim):   0.18362806040517682
 * Veracity scores 

#### dev500 Mistral 4concat

Question-only score (HU-meteor):             0.4482806389034514
Question-answer score (HU-meteor):           0.272736519646492
====================
Veracity F1 scores:
 * Supported:                                0.6768060836501901
 * Refuted:                                  0.7944732297063903
 * Not Enough Evidence:                      0.11538461538461539
 * Conflicting Evidence/Cherrypicking:       0.2075471698113208
 * macro:                                    0.44855277463812915
 * acc:                                      0.666
--------------------
AVeriTeC scores:
 * Veracity scores (meteor @ 0.1):           0.646
 * Veracity scores (meteor @ 0.2):           0.484
 * Veracity scores (meteor @ 0.25):          0.342
 * Veracity scores (meteor @ 0.3):           0.234
 * Veracity scores (meteor @ 0.4):           0.088
 * Veracity scores (meteor @ 0.5):           0.04
--------------------
AVeriTeC scores by type @ 0.25:
 * Veracity scores (Event/Property Claim):   0.18362806040517682
 * Veracity scores (Position Statement):     0.220755019960999
 * Veracity scores (Causal Claim):           0.150556101994713
 * Veracity scores (Numerical Claim):        0.20530835082213306
 * Veracity scores (Quote Verification):     0.15591383933272546

#### dev500 DeBERTa 4concat

Question-only score (HU-meteor):             0.4482806389034514
Question-answer score (HU-meteor):           0.272736519646492
====================
Veracity F1 scores:
 * Supported:                                0.6987951807228915
 * Refuted:                                  0.8227194492254732
 * Not Enough Evidence:                      0.13043478260869562
 * Conflicting Evidence/Cherrypicking:       0.19354838709677416
 * macro:                                    0.46137444991345866
 * acc:                                      0.682
--------------------
AVeriTeC scores:
 * Veracity scores (meteor @ 0.1):           0.664
 * Veracity scores (meteor @ 0.2):           0.502
 * Veracity scores (meteor @ 0.25):          0.358
 * Veracity scores (meteor @ 0.3):           0.246
 * Veracity scores (meteor @ 0.4):           0.09
 * Veracity scores (meteor @ 0.5):           0.038
--------------------
AVeriTeC scores by type @ 0.25:
 * Veracity scores (Event/Property Claim):   0.18362806040517682
 * Veracity scores (Position Statement):     0.220755019960999
 * Veracity scores (Causal Claim):           0.150556101994713
 * Veracity scores (Numerical Claim):        0.20530835082213306
 * Veracity scores (Quote Verification):     0.15591383933272546

#### original NLI

Question-only score (HU-meteor):             0.4482806389034514
Question-answer score (HU-meteor):           0.272736519646492
====================
Veracity F1 scores:
 * Supported:                                0.41884816753926696
 * Refuted:                                  0.601980198019802
 * Not Enough Evidence:                      0.125
 * Conflicting Evidence/Cherrypicking:       0.18333333333333335
 * macro:                                    0.33229042472310055
 * acc:                                      0.436
--------------------
AVeriTeC scores:
 * Veracity scores (meteor @ 0.1):           0.422
 * Veracity scores (meteor @ 0.2):           0.32
 * Veracity scores (meteor @ 0.25):          0.234
 * Veracity scores (meteor @ 0.3):           0.164
 * Veracity scores (meteor @ 0.4):           0.084
 * Veracity scores (meteor @ 0.5):           0.038
--------------------
AVeriTeC scores by type @ 0.25:
 * Veracity scores (Event/Property Claim):   0.18362806040517682
 * Veracity scores (Position Statement):     0.220755019960999
 * Veracity scores (Causal Claim):           0.150556101994713
 * Veracity scores (Numerical Claim):        0.20530835082213306
 * Veracity scores (Quote Verification):     0.15591383933272546

#### Baseline

Question-only score (HU-meteor):             0.24041210604919014
Question-answer score (HU-meteor):           0.18547341231661782
====================
Veracity F1 scores:
 * Supported:                                0.4372093023255814
 * Refuted:                                  0.7138157894736843
 * Not Enough Evidence:                      0.0
 * Conflicting Evidence/Cherrypicking:       0.13333333333333333
 * macro:                                    0.32108960628314975
 * acc:                                      0.546
--------------------
AVeriTeC scores:
 * Veracity scores (meteor @ 0.1):           0.452
 * Veracity scores (meteor @ 0.2):           0.186
 * Veracity scores (meteor @ 0.25):          0.092
 * Veracity scores (meteor @ 0.3):           0.05
 * Veracity scores (meteor @ 0.4):           0.012
 * Veracity scores (meteor @ 0.5):           0.002
--------------------
AVeriTeC scores by type @ 0.25:
 * Veracity scores (Event/Property Claim):   0.05979024836242316
 * Veracity scores (Position Statement):     0.07307235683162566
 * Veracity scores (Causal Claim):           0.056993318700669676
 * Veracity scores (Numerical Claim):        0.070268725336461
 * Veracity scores (Quote Verification):     0.06266365543813311