# AVeriTeC Baseline Notebook

## 1. ⬇️ Imports and configuration

In [31]:
from dotenv import load_dotenv
from pathlib import Path
import os

dotenv_path = Path('aic_averitec/.env')
load_dotenv(dotenv_path)

DATASTORE_PATH = os.environ.get("DATASTORE_PATH")
DATASET_PATH = os.environ.get("DATASET_PATH")
MODELS_PATH = os.environ.get("MODELS_PATH")

%load_ext autoreload
%autoreload 2

DEV_PATH = str(os.path.join(DATASET_PATH, 'dev.json'))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 2. Rank the sentences in the knowledge store with BM25

In [32]:
# Reload the autoreload extension before importing the project modules below.
# NOTE(review): the setup cell already runs `%load_ext autoreload`, so this cell
# looks redundant on a fresh kernel — confirm before removing it.
%reload_ext autoreload

In [33]:
from aic_averitec.src.reranking.bm25_sentences import get_top_k_sentences_bm25 #bm25 reranking

In [None]:
# BM25 ranking step: knowledge store and claim file in, ranked-sentence JSON out.
# NOTE(review): the output is named dev_top_k.json, while the question-generation
# cell reads dev_top_k_sentences.json from DATASTORE_PATH — confirm this is intended.
get_top_k_sentences_bm25(
    knowledge_store_dir=os.path.join(DATASTORE_PATH, 'output_dev'),
    claim_file=DEV_PATH,
    json_output="./aic_averitec/data_store/dev_top_k.json",
)

## 3. Generate question-answer pairs for the top sentences

In [35]:
from aic_averitec.src.reranking.question_generation_top_sentences import generate_questions_top_k #question generation

In [None]:
# Top-k knowledge file handed to the question generator, read from the datastore.
# Alternative (file written by the BM25 cell): "./aic_averitec/data_store/dev_top_k.json"
TOP_K_TARGET_KNOWLEDGE = os.path.join(DATASTORE_PATH, 'dev_top_k_sentences.json')

# train.json serves as the reference corpus; the QA output is written to the local data store.
generate_questions_top_k(
    reference_corpus=os.path.join(DATASET_PATH, "train.json"),
    target_file=DEV_PATH,
    top_k_target_knowledge=TOP_K_TARGET_KNOWLEDGE,
    output_questions="./aic_averitec/data_store/dev_top_k_qa.json",
)

## 4. Rerank the QA pairs

In [38]:
%pip install typing_extensions

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


In [None]:
from aic_averitec.src.reranking.rerank_questions import rerank_qa_pairs #typing extensions dependency!

In [None]:
# QA-pair file handed to the reranker, read from the datastore.
# Alternative (file written by the question-generation cell): "./aic_averitec/data_store/dev_top_k_qa.json"
TOP_K_QA_FILE = os.path.join(DATASTORE_PATH, 'dev_top_k_qa.json')

# The checkpoint is resolved under MODELS_PATH; the reranked output goes to the local data store.
rerank_qa_pairs(
    top_k_qa_file=TOP_K_QA_FILE,
    output_file="./aic_averitec/data_store/dev_top_3_rerank_qa.json",
    best_checkpoint=os.path.join(MODELS_PATH, "bert_dual_encoder.ckpt"),
)

## 5. Veracity prediction

In [39]:
from aic_averitec.src.prediction.veracity_prediction import veracity_prediction #veracity prediction

# Claim-with-evidence file passed to veracity_prediction, read from the datastore.
# Alternative (file written by the reranking cell): "./aic_averitec/data_store/dev_top_3_rerank_qa.json"
CLAIM_WITH_EVIDENCE_FILE = os.path.join(DATASTORE_PATH, 'dev_top_3_rerank_qa.json')

# The checkpoint is resolved under MODELS_PATH; predictions go to the local data store.
veracity_prediction(
    claim_with_evidence_file=CLAIM_WITH_EVIDENCE_FILE,
    output_file="./aic_averitec/data_store/dev_veracity_prediction.json",
    best_checkpoint=os.path.join(MODELS_PATH, "bert_veracity.ckpt"),
)

ImportError: cannot import name 'override' from 'typing_extensions' (/mnt/appl/software/typing-extensions/4.3.0-GCCcore-11.3.0/lib/python3.10/site-packages/typing_extensions.py)

## 6. Evaluate Veracity

In [42]:
from aic_averitec.src.prediction.evaluate_veracity import evaluate_veracity # veracity evaluation
import nltk
# Fetch the WordNet corpus; presumably required by the METEOR-based scores
# printed by the evaluation — TODO confirm.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/mlynatom/nltk_data...


True

In [43]:
# Prediction file to score, read from the datastore.
# Alternative (file written by the veracity-prediction cell): "./aic_averitec/data_store/dev_veracity_prediction.json"
PREDICTION_FILE = os.path.join(DATASTORE_PATH, 'dev_veracity_prediction.json')

# The dev claim file is passed as the label file.
evaluate_veracity(
    prediction_file=PREDICTION_FILE,
    label_file=DEV_PATH,
)

Question-only score (HU-meteor):             0.24041210604919014
Question-answer score (HU-meteor):           0.18547341231661782
Veracity F1 scores:
 * Supported:                                0.4372093023255814
 * Refuted:                                  0.7138157894736843
 * Not Enough Evidence:                      0.0
 * Conflicting Evidence/Cherrypicking:       0.13333333333333333
 * macro:                                    0.32108960628314975
 * acc:                                      0.546
--------------------
AVeriTeC scores:
 * Veracity scores (meteor @ 0.1):           0.452
 * Veracity scores (meteor @ 0.2):           0.186
 * Veracity scores (meteor @ 0.25):          0.092
 * Veracity scores (meteor @ 0.3):           0.05
 * Veracity scores (meteor @ 0.4):           0.012
 * Veracity scores (meteor @ 0.5):           0.002
--------------------
AVeriTeC scores by type @ 0.25:
 * Veracity scores (Event/Property Claim):   0.05979024836242316
 * Veracity scores (Position St