# Pipeline 

In [69]:
%load_ext autoreload
%autoreload 2

from averitec import Datapoint
from evidence_generation import GptEvidenceGenerator, GptBatchedEvidenceGenerator
from classification import DefaultClassifier, HuggingfaceClassifier, AverageEnsembleClassifier, LogRegEnsembleClassifier
from retrieval import SimpleFaissRetriever, Retriever, MmrFaissRetriever, SubqueryRetriever
from pipeline import Pipeline, MockPipeline
import pickle
from labels import label2id, id2label
import numpy as np
from sklearn.metrics import classification_report
import random
from tqdm import tqdm
random.seed(111)

import json

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Pipeline test

In [75]:
# Load the selected split and wrap every raw record in a Datapoint.
split = "dev"
path = "/mnt/data/factcheck/averitec-data/"
with open(path + f"data/{split}.json") as f:
    dataset = json.load(f)

# Tag each record with its position in the file as its claim_id.
for i, record in enumerate(dataset):
    record["claim_id"] = i
datapoints = [Datapoint.from_dict(record) for record in dataset]
    

In [71]:
# Use the record at index 150 of `dataset` as the example for the exploratory cells.
datapoint = Datapoint.from_dict(dataset[150])
datapoint

Datapoint(claim='Zimbabwe gets more than half of the African Export Import Bank loans.', claim_id=150, claim_date='7-10-2020', speaker='Tendai Biti', original_claim_url='https://web.archive.org/web/20210129100929im_/https://lh4.googleusercontent.com/WwmjzhYNC1g84X0JmQzruGRgW4tjrB8rAPT9Zx6x-W6xBhQpVIwGW1zw7wImZ6R6JAyQywo-CiCE3_AZtJcxB74aev7jVG1PFobnm3PKanSX0bwWP3Z8E6z-6tDwuPgrSa6mhuz7', reporting_source='Twitter', location_ISO_code='ZW', label='Refuted', metadata={'required_reannotation': False, 'justification': 'Seems the numbers is no where near half. Therefore the claim is refuted', 'fact_checking_article': 'https://web.archive.org/web/20210129100929/https://zimfact.org/does-zimbabwe-get-more-than-half-afreximbank-loans/', 'claim_types': ['Numerical Claim'], 'fact_checking_strategies': ['Numerical Comparison'], 'questions': [{'question': 'How much has Zimbabwe gotten from the Africa Export Import Bank', 'answers': [{'answer': 'Zimbabwe got US$2,78 billion.', 'answer_type': 'Abstracti

In [101]:
# Build a subquery retriever on top of a Faiss retriever that reads the dev 6k
# vector store, then retrieve documents for the example datapoint.
vecstore_path = "/mnt/data/factcheck/averitec-data/data_store/vecstore/dev/6k"
faiss_retriever = SimpleFaissRetriever(path=vecstore_path)
retriever = SubqueryRetriever(faiss_retriever)
retrieval_result = retriever(datapoint)
retrieval_result

RetrievalResult(documents=[Document(metadata={'url': 'https://www.voanews.com/a/afreximbank-pledges-to-post-mugabe-zimbabwe/4161541.html', 'context_before': '', 'context_after': 'The southern African nation, which dumped its hyperinflation-hit currency in 2009, is struggling with a severe dollar crunch that has seen banks fail to avail cash to customers while importers struggle to pay for imports. Finance Minister Patrick Chinamasa promised in a budget speech last week to re-engage with international lenders, curb spending and attract investors to revive the economy. ', 'queries': ['Zimbabwe African Export Import Bank loan percentage 2020']}, page_content='The African Export and Import Bank has pledged up to $1.5 billion in new loans and financial guarantees to Zimbabwe in a major boost for new President Emmerson Mnangagwa\'s government, the bank\'s president and chairman said Tuesday. Mnangagwa, who took over last month after veteran autocrat Robert Mugabe quit following a de facto mi

In [None]:
# Switch the example to the record at index 2 and show its claim, date and speaker.
# NOTE(review): this rebinds `datapoint`; a `retrieval_result` computed earlier for a
# different record is NOT refreshed here, so later cells that pass both may mix
# two different claims — confirm which record is intended.
datapoint = Datapoint.from_dict(dataset[2])
datapoint.claim, datapoint.claim_date, datapoint.speaker

In [102]:
# Run the full pipeline over every datapoint using the `retriever` built earlier,
# then store the results both as a submission JSON and as a pickle of the raw results.
path = "/mnt/data/factcheck/averitec-data/"
target = path + "data_store/vecstore/dev/6k"
PIPELINE_NAME = "subquery+gpt4o"

pipeline = Pipeline(retriever, GptBatchedEvidenceGenerator(), DefaultClassifier())

submission, dump = [], []
for dp in tqdm(datapoints):
    pipeline_result = pipeline(dp)
    dump.append(pipeline_result)
    submission.append(pipeline_result.to_submission())

# Both output files share the same stem and differ only in extension.
submission_stem = f"/mnt/data/factcheck/averitec-data/data_store/submissions/{split}_{PIPELINE_NAME}"
with open(submission_stem + ".json", "w") as f:
    json.dump(submission, f, indent=4)
with open(submission_stem + ".pkl", "wb") as f:
    pickle.dump(dump, f)

100%|██████████| 500/500 [1:31:08<00:00, 10.94s/it]


In [76]:
# Run the full pipeline over every datapoint with an MMR Faiss retriever,
# then store the results both as a submission JSON and as a pickle of the raw results.
path = "/mnt/data/factcheck/averitec-data/"
target = path + "data_store/vecstore/dev/6k"
PIPELINE_NAME = "mmr+gpt4o"

# TODO: TRIPLE CHECK BEFORE EVERY LAUNCH (the vector store path in `target`)
mmr_retriever_for_run = MmrFaissRetriever(target)
pipeline = Pipeline(mmr_retriever_for_run, GptBatchedEvidenceGenerator(), DefaultClassifier())

submission, dump = [], []
for dp in tqdm(datapoints):
    pipeline_result = pipeline(dp)
    dump.append(pipeline_result)
    submission.append(pipeline_result.to_submission())

# Both output files share the same stem and differ only in extension.
submission_stem = f"/mnt/data/factcheck/averitec-data/data_store/submissions/{split}_{PIPELINE_NAME}"
with open(submission_stem + ".json", "w") as f:
    json.dump(submission, f, indent=4)
with open(submission_stem + ".pkl", "wb") as f:
    pickle.dump(dump, f)

100%|██████████| 500/500 [01:54<00:00,  4.38it/s]


In [103]:
# Ask the pipeline's evidence generator for its batch files under the job directory.
# NOTE(review): `pipeline` and PIPELINE_NAME are whatever the most recently executed
# pipeline cell set them to — confirm they belong to the run you intend to batch.
batch_job_dir = f"/mnt/data/factcheck/averitec-data/data_store/batch_jobs/{split}_{PIPELINE_NAME}"
files = pipeline.evidence_generator.get_batch_files(path=batch_job_dir)

In [104]:
# Submit the batch files and wait for completion; the second argument is the
# output.jsonl path inside this run's batch job directory.
batch_output_path = f"/mnt/data/factcheck/averitec-data/data_store/batch_jobs/{split}_{PIPELINE_NAME}/output.jsonl"
batch_results = pipeline.evidence_generator.submit_and_await_batches(files, batch_output_path)

  0%|          | 0/5 [00:00<?, ?it/s]

Batch(id='batch_7QTbWxoS8QazKtX0EBpyvDRN', completion_window='24h', created_at=1721861194, endpoint='/v1/chat/completions', input_file_id='file-2JVYE9rub3JkaQbwHFRJGZHR', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1721947594, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 1'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_7QTbWxoS8QazKtX0EBpyvDRN
waiting for batch to complete BatchRequestCounts(completed=5, failed=0, total=100) batch_7QTbWxoS8QazKtX0EBpyvDRN
waiting for batch to complete BatchRequestCounts(completed=93, failed=0, total=100) batch_7QTbWxoS8QazKtX0EBpyvDRN


waiting for batch to complete BatchRequestCounts(completed=99, failed=0, total=100) batch_7QTbWxoS8QazKtX0EBpyvDRN
waiting for batch to complete BatchRequestCounts(completed=99, failed=0, total=100) batch_7QTbWxoS8QazKtX0EBpyvDRN
waiting for batch to complete BatchRequestCounts(completed=99, failed=0, total=100) batch_7QTbWxoS8QazKtX0EBpyvDRN
waiting for batch to complete BatchRequestCounts(completed=99, failed=0, total=100) batch_7QTbWxoS8QazKtX0EBpyvDRN
waiting for batch to complete BatchRequestCounts(completed=100, failed=0, total=100) batch_7QTbWxoS8QazKtX0EBpyvDRN
batch 1 completed


 20%|██        | 1/5 [01:25<05:40, 85.06s/it]

Batch(id='batch_POrSSUcuQcLrhB6tHPLdVhCV', completion_window='24h', created_at=1721861279, endpoint='/v1/chat/completions', input_file_id='file-86Y96riGfTqzFwHMOkzjoy1q', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1721947679, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 2'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_POrSSUcuQcLrhB6tHPLdVhCV
waiting for batch to complete BatchRequestCounts(completed=5, failed=0, total=100) batch_POrSSUcuQcLrhB6tHPLdVhCV
waiting for batch to complete BatchRequestCounts(completed=95, failed=0, total=100) batch_POrSSUcuQcLrhB6tHPLdVhCV
batch 2 completed


 40%|████      | 2/5 [01:59<02:45, 55.19s/it]

Batch(id='batch_LA1i78namW35w6SMWI9K2A1a', completion_window='24h', created_at=1721861312, endpoint='/v1/chat/completions', input_file_id='file-cHJYW3lvk8SKG5fz6iojQF2y', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1721947712, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 3'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_LA1i78namW35w6SMWI9K2A1a
waiting for batch to complete BatchRequestCounts(completed=11, failed=0, total=100) batch_LA1i78namW35w6SMWI9K2A1a
waiting for batch to complete BatchRequestCounts(completed=84, failed=0, total=100) batch_LA1i78namW35w6SMWI9K2A1a
batch 3 completed


 60%|██████    | 3/5 [02:32<01:30, 45.24s/it]

Batch(id='batch_Z5BPN35JdK1uxJ6G7y4vhIXV', completion_window='24h', created_at=1721861346, endpoint='/v1/chat/completions', input_file_id='file-D4re1zxjzn5HLYVFteNa2D9t', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1721947746, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 4'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_Z5BPN35JdK1uxJ6G7y4vhIXV
waiting for batch to complete BatchRequestCounts(completed=12, failed=0, total=100) batch_Z5BPN35JdK1uxJ6G7y4vhIXV
waiting for batch to complete BatchRequestCounts(completed=92, failed=0, total=100) batch_Z5BPN35JdK1uxJ6G7y4vhIXV
batch 4 completed


 80%|████████  | 4/5 [03:06<00:40, 40.59s/it]

Batch(id='batch_UjJ7mIIKC2SMoICls4tEhrPv', completion_window='24h', created_at=1721861379, endpoint='/v1/chat/completions', input_file_id='file-G0VNnXgRsdxueFvG2IZ5guPh', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1721947779, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 5'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_UjJ7mIIKC2SMoICls4tEhrPv
waiting for batch to complete BatchRequestCounts(completed=5, failed=0, total=100) batch_UjJ7mIIKC2SMoICls4tEhrPv
waiting for batch to complete BatchRequestCounts(completed=91, failed=0, total=100) batch_UjJ7mIIKC2SMoICls4tEhrPv
batch 5 completed


100%|██████████| 5/5 [03:39<00:00, 43.87s/it]


In [105]:
# Combine each first-pass pipeline result with its batch output.
new_dump = []
# Temperature for the fallback generator's client
# (presumably used when a batch output cannot be parsed — confirm).
pipeline.evidence_generator.fallback_gpt_generator.client.temperature = .5
# zip() stops at the shorter input, so only results that have a batch output are updated.
for pipeline_result, batch_result in zip(dump, batch_results):
    new_dump.append(
        pipeline.evidence_generator.update_pipeline_result(pipeline_result, batch_result, pipeline.classifier)
    )


Error parsing JSON for EvidenceGenerator.
 
{
    "questions":
        [
            {"question": "Did Netflix tweet that its content is 'mostly pedophilia-free'?", "answer": "No, there is no evidence that Netflix tweeted this.", "source": "1"},
            {"question": "What was the nature of the backlash against Netflix for streaming 'Cuties'?", "answer": "The backlash was due to accusations that the film sexualizes children and promotes pedophilia.", "source": "2"},
            {"question": "How did Netflix respond to the backlash over 'Cuties'?", "answer": "Netflix apologized for the inappropriate artwork and stated that 'Cuties' is a social commentary against the sexualization of young children.", "source": "3"},
            {"question": "Did Netflix change the promotional materials for 'Cuties' after the backlash?", "answer": "Yes, Netflix changed the poster and description of the film.", "source": "7"},
            {"question": "What was the original description of 'Cuties' on N

In [106]:
# Overwrite this run's submission JSON and pickle with the batch-updated results.
submission_stem = f"/mnt/data/factcheck/averitec-data/data_store/submissions/{split}_{PIPELINE_NAME}"
with open(submission_stem + ".json", "w") as f:
    updated_submission = [d.to_submission() for d in new_dump]
    json.dump(updated_submission, f, indent=4)
with open(submission_stem + ".pkl", "wb") as f:
    pickle.dump(new_dump, f)

In [107]:
%run src/prediction/evaluate_veracity.py --label_file /mnt/data/factcheck/averitec-data/data/dev.json --prediction_file /mnt/data/factcheck/averitec-data/data_store/submissions/{split}_{PIPELINE_NAME}.json

Question-only score (HU-meteor):             0.45140886312940254
Question-answer score (HU-meteor):           0.27770055976133534
Veracity F1 scores:
 * Supported:                                0.6937269372693727
 * Refuted:                                  0.7979966611018364
 * Not Enough Evidence:                      0.20512820512820512
 * Conflicting Evidence/Cherrypicking:       0.038461538461538464
 * macro:                                    0.4338283354902382
 * acc:                                      0.684
--------------------
AVeriTeC scores:
 * Veracity scores (meteor @ 0.1):           0.67
 * Veracity scores (meteor @ 0.2):           0.532
 * Veracity scores (meteor @ 0.25):          0.4
 * Veracity scores (meteor @ 0.3):           0.266
 * Veracity scores (meteor @ 0.4):           0.092
 * Veracity scores (meteor @ 0.5):           0.042
--------------------
AVeriTeC scores by type @ 0.25:
 * Veracity scores (Event/Property Claim):   0.1925972444043578
 * Veracity scores

## Collapsible section begin

In [None]:
from IPython.display import display, Markdown, Latex

In [None]:
# Show the claim, then the URL and first 256 characters of every retrieved document.
# NOTE(review): `retriever` is whatever an earlier cell bound it to; the "knn"
# caption is only accurate if that is a plain kNN retriever — confirm.
knn_retrieval_result = retriever(datapoint)
display(Markdown("### 🗯️ " + datapoint.claim))
display(Markdown("*Retrieved by knn*\n\n"))
for r in knn_retrieval_result:
    display(Markdown(f"**{r.metadata['url']}**\n\n{r.page_content[:256]}"))

In [None]:
# Show the claim, then the URL and first 256 characters of every document
# retrieved by an MMR retriever. MmrFaissRetriever is already imported in the
# notebook's first cell, so it is not re-imported here.
# NOTE(review): assumes the current `retriever` exposes a `.path` attribute — confirm.
mmr_retriever = MmrFaissRetriever(retriever.path)
mmr_retrieval_result = mmr_retriever(datapoint)
display(Markdown("### 🗯️ " + datapoint.claim))
display(Markdown("*Retrieved by MMR*\n\n"))
for r in mmr_retrieval_result:
    display(Markdown(f"**{r.metadata['url']}**\n\n{r.page_content[:256]}"))

In [None]:
# Show the claim, then for every retrieved document its URL, the ';'-joined
# queries stored in its metadata, and the first 256 characters of its text.
# NOTE(review): this wraps whatever `retriever` currently is; if an earlier cell
# already bound it to a SubqueryRetriever, it gets wrapped twice — confirm.
subquery_retriever = SubqueryRetriever(retriever)
subquery_retrieval_result = subquery_retriever(datapoint)
display(Markdown("### 🗯️ " + datapoint.claim))
display(Markdown("*Retrieved by subqueries*\n\n"))
for r in subquery_retrieval_result:
    display(Markdown(f"**{r.metadata['url']}**\n\n*{';'.join(r.metadata['queries'])}*\n\n{r.page_content[:256]}"))

In [None]:
# Inspect the metadata attached to the subquery retrieval result.
subquery_retrieval_result.metadata

## Collapsible section end

In [None]:
# Generate evidence for the example datapoint from previously retrieved documents.
# "gpt-4o" is presumably the model name — confirm against GptBatchedEvidenceGenerator.
# NOTE(review): `datapoint` and `retrieval_result` come from earlier cells; make sure
# both refer to the same claim before trusting this output.
evidence_generator = GptBatchedEvidenceGenerator("gpt-4o")
evidence_generation_result = evidence_generator(datapoint, retrieval_result)
evidence_generation_result

In [None]:
# Label stored by the evidence generator under the "suggested_label" metadata key.
evidence_generation_result.metadata["suggested_label"]

In [None]:
# Label stored on the example datapoint, for comparison with the suggested one.
datapoint.label

In [None]:
# Classify the example from its generated evidence and retrieved documents,
# then show both the string form and the full result object.
classifier = DefaultClassifier()
classification_result = classifier(datapoint, evidence_generation_result, retrieval_result)
str(classification_result), classification_result

In [None]:
# Run the three components end to end on a second example (record at index 16).
# NOTE(review): this rebinds the global `pipeline`; cells that read
# `pipeline.evidence_generator` or `pipeline.classifier` afterwards will see this
# instance rather than the one from the full-dataset run.
datapoint2 = Datapoint.from_dict(dataset[16])
pipeline = Pipeline(retriever, evidence_generator, classifier)
pipeline_result = pipeline(datapoint2)
pipeline_result

In [None]:
# Classification result (as a string) next to the label stored on the datapoint.
str(pipeline_result.classification_result), datapoint2.label

In [None]:
# Preview the submission form of the single pipeline result.
pipeline_result.to_submission()

In [None]:
# Pickle the single pipeline result to a path relative to the working directory.
# `pickle` is already imported in the notebook's first cell.
with open('data/pipeline_result.pkl', 'wb') as f:
    pickle.dump(pipeline_result, f)

In [None]:
# Evaluate a fixed, previously written submission file (submission_dev_avg_clf.json)
# against the dev labels; both paths are hard-coded and independent of `split`
# and PIPELINE_NAME.
%run src/prediction/evaluate_veracity.py --label_file /mnt/data/factcheck/averitec-data/data/dev.json --prediction_file /mnt/data/factcheck/averitec-data/data_store/submission_dev_avg_clf.json