# Pipeline 

In [1]:
%load_ext autoreload
%autoreload 2

from averitec import Datapoint
from evidence_generation import GptEvidenceGenerator, GptBatchedEvidenceGenerator, DynamicFewShotBatchedEvidenceGenerator
from classification import DefaultClassifier, HuggingfaceClassifier, AverageEnsembleClassifier, LogRegEnsembleClassifier
from retrieval import SimpleFaissRetriever, Retriever, MmrFaissRetriever, SubqueryRetriever
from pipeline import Pipeline, MockPipeline
import pickle
from labels import label2id, id2label
import numpy as np
from sklearn.metrics import classification_report
import random
from tqdm import tqdm
random.seed(111)

import json

## Pipeline test

In [2]:
# Dataset split to run on; it is interpolated into the data, vector-store and output paths.
split = "test"
path = "/mnt/data/factcheck/averitec-data/"

# JSON is UTF-8 by specification, so do not depend on the platform's locale encoding.
# The file is closed as soon as it has been parsed.
with open(path + f"data/{split}.json", encoding="utf-8") as f:
    dataset = json.load(f)

# Every raw record gets a claim_id equal to its position in the file.
for i, record in enumerate(dataset):
    record["claim_id"] = i
datapoints = [Datapoint.from_dict(d) for d in dataset]
    

In [3]:
# Pick record 150 as a single example and display it.
datapoint = Datapoint.from_dict(dataset[150])
datapoint

Datapoint(claim="A research organisation has confirmed Philippine Vice President Leni Robredo received zero votes in the country's Mindanao island.", claim_id=150, claim_date='11-3-2021', speaker='Showbiz Fanaticz', original_claim_url='https://www.youtube.com/watch?v=ErL5IzJkqh8&t=6s', reporting_source='Youtube', location_ISO_code='PH', label=None, metadata={})

In [4]:
# Alternative kept for reference: SimpleFaissRetriever over the dev/6k vector store.
# retriever = SimpleFaissRetriever(path="/mnt/data/factcheck/averitec-data/data_store/vecstore/dev/6k")
# MmrFaissRetriever over the "2k" vector store of the current split, applied to the example datapoint.
retriever = MmrFaissRetriever(path=f"/mnt/data/factcheck/averitec-data/data_store/vecstore/{split}/2k")
retrieval_result = retriever(datapoint)
retrieval_result

RetrievalResult(documents=[Document(metadata={'url': 'https://newsinfo.inquirer.net/1396547/the-4-year-robredo-marcos-vice-presidential-case', 'context_before': '', 'context_after': 'READ: Marcos asks SC to set aside Robredo proclamation, files protest Despite Marcos’ appeal to postpone the proclamation, Robredo still takes her oath of office and is inaugurated as vice president. READ: FULL TEXT: Vice President Leni Robredo inaugural speech In a two-page resolution, the SC summons Robredo to respond in ten days on Marcos’ election protest. The high court also asks the Commission Elections (Comelec) to preserve election returns (ERs) that are subject to protest through a Precautionary Protection Order (PPO). The Marcos camp files a three-page letter-manifestation questioning the Comelec’s decision to issue a resolution approving the stripping of the vote-counting machines (VCMs) and laptops used in the May 9 polls, despite the PPO from PET. The stripping activity involves the backup of 

In [5]:
# Rebinds `datapoint` to record 2; note that `retrieval_result` computed earlier still belongs to record 150.
datapoint = Datapoint.from_dict(dataset[2])
datapoint.claim, datapoint.claim_date, datapoint.speaker

('South African hospital found that traces of the novel coronavirus had survived on the surface of fresh food items for 12 hours during lab tests.',
 '3-12-2021',
 None)

In [9]:
# Full run: build the pipeline (retriever + evidence generator + classifier), apply it to every
# datapoint, and persist the results as a JSON submission plus a pickle of the pipeline results.
from evidence_generation import ClaudeDFSEvidenceGenerator
path = "/mnt/data/factcheck/averitec-data/"
# NOTE(review): `target` is not read anywhere in this cell and names the dev/6k store — confirm it is still needed.
target = path + "data_store/vecstore/dev/6k"
# Combined with `split` to name the output files written at the end of this cell.
PIPELINE_NAME = "mmr+claude-dfewshot"
pipeline = Pipeline(
    #dumps = "/mnt/data/factcheck/averitec-data/data_store/submissions/dev_mmr+gpt4o.pkl",
    # SubqueryRetriever(retriever),
    retriever,
    evidence_generator=ClaudeDFSEvidenceGenerator(), 
    classifier=DefaultClassifier()
)

# Parallel lists: submission-format entries and the pipeline results they were derived from.
submission = []
dump = []

# Results are appended one at a time, so both lists keep whatever was processed if the loop is interrupted.
for dp in tqdm(datapoints):
    pipeline_result = pipeline(dp)
    submission.append(pipeline_result.to_submission())
    dump.append(pipeline_result)
# The output files are only written once the whole loop has finished.
with open(f"/mnt/data/factcheck/averitec-data/data_store/submissions/{split}_{PIPELINE_NAME}.json", "w") as f:
    json.dump(submission, f, indent=4)
with open(f"/mnt/data/factcheck/averitec-data/data_store/submissions/{split}_{PIPELINE_NAME}.pkl", "wb") as f:
    pickle.dump(dump, f)

  1%|          | 25/2215 [03:39<5:00:26,  8.23s/it]

local variable 'answer' referenced before assignment


  2%|▏         | 34/2215 [04:59<5:37:08,  9.28s/it]

In [7]:
# Inspect the classification result of the last entry appended to `dump`.
dump[-1].classification_result

ClassificationResult(probs=array([0.11245721, 0.01521943, 0.83095266, 0.0413707 ]), metadata=None)

In [71]:
# Ask the evidence generator for its batch files (batch_size=75) under this run's batch_jobs folder.
# NOTE(review): relies on `pipeline.evidence_generator` exposing get_batch_files — confirm this holds for the generator the pipeline was built with.
files = pipeline.evidence_generator.get_batch_files(path=f"/mnt/data/factcheck/averitec-data/data_store/batch_jobs/{split}_{PIPELINE_NAME}", batch_size=75)

In [72]:
# Show the paths returned by get_batch_files.
files

['/mnt/data/factcheck/averitec-data/data_store/batch_jobs/test_mmr+gpt4o-dfewshot/batch_1.jsonl',
 '/mnt/data/factcheck/averitec-data/data_store/batch_jobs/test_mmr+gpt4o-dfewshot/batch_2.jsonl',
 '/mnt/data/factcheck/averitec-data/data_store/batch_jobs/test_mmr+gpt4o-dfewshot/batch_3.jsonl',
 '/mnt/data/factcheck/averitec-data/data_store/batch_jobs/test_mmr+gpt4o-dfewshot/batch_4.jsonl',
 '/mnt/data/factcheck/averitec-data/data_store/batch_jobs/test_mmr+gpt4o-dfewshot/batch_5.jsonl',
 '/mnt/data/factcheck/averitec-data/data_store/batch_jobs/test_mmr+gpt4o-dfewshot/batch_6.jsonl',
 '/mnt/data/factcheck/averitec-data/data_store/batch_jobs/test_mmr+gpt4o-dfewshot/batch_7.jsonl',
 '/mnt/data/factcheck/averitec-data/data_store/batch_jobs/test_mmr+gpt4o-dfewshot/batch_8.jsonl',
 '/mnt/data/factcheck/averitec-data/data_store/batch_jobs/test_mmr+gpt4o-dfewshot/batch_9.jsonl',
 '/mnt/data/factcheck/averitec-data/data_store/batch_jobs/test_mmr+gpt4o-dfewshot/batch_10.jsonl',
 '/mnt/data/factche

In [73]:
# Submit the batch files and wait for their results; the second argument is an output.jsonl path in the same batch_jobs folder.
batch_results = pipeline.evidence_generator.submit_and_await_batches(files, f"/mnt/data/factcheck/averitec-data/data_store/batch_jobs/{split}_{PIPELINE_NAME}/output.jsonl")

  0%|          | 0/30 [00:00<?, ?it/s]

Batch(id='batch_tL2RYVAmsm3CmfbzkcxIMwvy', completion_window='24h', created_at=1721945124, endpoint='/v1/chat/completions', input_file_id='file-4a6JzXH9Nvbwx2Dw15jD5Zeq', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722031524, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 1'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_tL2RYVAmsm3CmfbzkcxIMwvy
waiting for batch to complete BatchRequestCounts(completed=2, failed=0, total=75) batch_tL2RYVAmsm3CmfbzkcxIMwvy
waiting for batch to complete BatchRequestCounts(completed=67, failed=0, total=75) batch_tL2RYVAmsm3CmfbzkcxIMwvy
waiting for batch to complete BatchRequestCounts(completed=73, failed=0, total=75) batch_tL2RYVAmsm3CmfbzkcxIMwvy
waiting for b

  3%|▎         | 1/30 [01:25<41:22, 85.61s/it]

Batch(id='batch_gSvaVoB9y1UGMsRBGVOOqlPn', completion_window='24h', created_at=1721945210, endpoint='/v1/chat/completions', input_file_id='file-p44OVIzgSttdjTfLcEPCalzF', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722031610, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 2'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_gSvaVoB9y1UGMsRBGVOOqlPn
waiting for batch to complete BatchRequestCounts(completed=2, failed=0, total=75) batch_gSvaVoB9y1UGMsRBGVOOqlPn
waiting for batch to complete BatchRequestCounts(completed=68, failed=0, total=75) batch_gSvaVoB9y1UGMsRBGVOOqlPn
waiting for batch to complete BatchRequestCounts(completed=75, failed=0, total=75) batch_gSvaVoB9y1UGMsRBGVOOqlPn
batch 2 compl

  7%|▋         | 2/30 [02:10<28:38, 61.37s/it]

Batch(id='batch_RmPn3dzQpjIUGfmvjCMJbXtk', completion_window='24h', created_at=1721945254, endpoint='/v1/chat/completions', input_file_id='file-1bDjonbQbKggiJaDO20a3Qlr', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722031654, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 3'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_RmPn3dzQpjIUGfmvjCMJbXtk
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=75) batch_RmPn3dzQpjIUGfmvjCMJbXtk
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=75) batch_RmPn3dzQpjIUGfmvjCMJbXtk
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=75) batch_RmPn3dzQpjIUGfmvjCMJbXtk


In [60]:
# Rebuild every pipeline result from its batch output, handing the pipeline's classifier along.
new_dump = []
generator = pipeline.evidence_generator
generator.fallback_gpt_generator.client.temperature = .5

# zip() stops at the shorter input, so only results with a batch counterpart are updated.
for pipeline_result, batch_result in zip(dump, batch_results):
    new_dump.append(
        generator.update_pipeline_result(pipeline_result, batch_result, pipeline.classifier)
    )


Error parsing JSON for EvidenceGenerator.
 
{
    "questions":
        [
            {"question": "What does the CDC say about the effectiveness of ordinary masks against wildfire smoke?", "answer": "The CDC states that
failed, using fallback gpt
Error parsing JSON for EvidenceGenerator.
 
{
    "questions":
        [
            {"question": "Did President Trump open a platform to report censorship on social media?", "answer": "Yes, President Trump opened a platform to report censorship on social media.", "source": "1"},
            {"question": "What social media platforms are included in the complaint tool?", "answer": "The complaint tool includes Facebook, Twitter, and YouTube.", "source": "1"},
            {"question": "When was the complaint tool launched?", "answer": "The complaint tool was launched in May 2019.", "source": "4"},
            {"question": "What is the name of the complaint tool?", "answer": "The complaint tool is called the 'Tech Bias Story Sharing Tool'.", "sour

In [47]:
# Inspect the second updated result together with its evidence-generation and classification parts.
(
    new_dump[1],
    new_dump[1].evidence_generation_result,
    new_dump[1].classification_result
)

(PipelineResult(datapoint=Datapoint(claim='Trump Administration claimed songwriter Billie Eilish Is Destroying Our Country In Leaked Documents', claim_id=1, claim_date='31-10-2020', speaker=None, original_claim_url=None, reporting_source='Instagram', location_ISO_code='US', label='Refuted', metadata={'required_reannotation': False, 'justification': 'Seems that the Wzshington post accused the singer wrong lyrics. Therefore the claim is refuted', 'fact_checking_article': 'https://web.archive.org/web/20201103001419/https://leadstories.com/hoax-alert/2020/11/fact-check-trump-administration-did-not-claim-songwriter-billie-eilish-was-destroying-country-in-leaked-documents.html', 'claim_types': ['Position Statement', 'Event/Property Claim'], 'fact_checking_strategies': ['Written Evidence'], 'questions': [{'question': 'Has the Trump administration voiced that Billie Eilish was destroying the country', 'answers': [{'answer': 'A Washington Post story wrongly claimed the Trump administration accu

In [61]:
# Persist the updated results as a JSON submission and a pickle of the result objects.
# The submission records are built BEFORE the file is opened: open(..., "w") truncates the
# target immediately, so a failure inside to_submission() would otherwise leave an empty
# file where a previous submission may have been.
submission_records = [d.to_submission() for d in new_dump]
with open(f"/mnt/data/factcheck/averitec-data/data_store/submissions/{split}_{PIPELINE_NAME}.json", "w") as f:
    json.dump(submission_records, f, indent=4)
with open(f"/mnt/data/factcheck/averitec-data/data_store/submissions/{split}_{PIPELINE_NAME}.pkl", "wb") as f:
    pickle.dump(new_dump, f)

In [None]:
# Run the evaluation script on data_store/submission_dev.json against the dev label file.
%run src/prediction/evaluate_veracity.py --label_file /mnt/data/factcheck/averitec-data/data/dev.json --prediction_file /mnt/data/factcheck/averitec-data/data_store/submission_dev.json

In [26]:
# Run the evaluation script on the dev_subquery+gpt4o submission against the dev label file.
%run src/prediction/evaluate_veracity.py --label_file /mnt/data/factcheck/averitec-data/data/dev.json --prediction_file /mnt/data/factcheck/averitec-data/data_store/submissions/dev_subquery+gpt4o.json

Question-only score (HU-meteor):             0.45140886312940254
Question-answer score (HU-meteor):           0.27770055976133534
Veracity F1 scores:
 * Supported:                                0.6937269372693727
 * Refuted:                                  0.7979966611018364
 * Not Enough Evidence:                      0.20512820512820512
 * Conflicting Evidence/Cherrypicking:       0.038461538461538464
 * macro:                                    0.4338283354902382
 * acc:                                      0.684
--------------------
AVeriTeC scores:
 * Veracity scores (meteor @ 0.1):           0.67
 * Veracity scores (meteor @ 0.2):           0.532
 * Veracity scores (meteor @ 0.25):          0.4
 * Veracity scores (meteor @ 0.3):           0.266
 * Veracity scores (meteor @ 0.4):           0.092
 * Veracity scores (meteor @ 0.5):           0.042
--------------------
AVeriTeC scores by type @ 0.25:
 * Veracity scores (Event/Property Claim):   0.1925972444043578
 * Veracity scores

In [62]:
# Run the evaluation script on the submission of the current split / PIPELINE_NAME.
# NOTE(review): the label file is pinned to dev.json while the prediction file follows `split`;
# the two only correspond when split == "dev" — confirm before trusting the scores.
%run src/prediction/evaluate_veracity.py --label_file /mnt/data/factcheck/averitec-data/data/dev.json --prediction_file /mnt/data/factcheck/averitec-data/data_store/submissions/{split}_{PIPELINE_NAME}.json

Question-only score (HU-meteor):             0.4504587577119231
Question-answer score (HU-meteor):           0.2854870005624111
Veracity F1 scores:
 * Supported:                                0.7357142857142858
 * Refuted:                                  0.8245033112582781
 * Not Enough Evidence:                      0.14925373134328357
 * Conflicting Evidence/Cherrypicking:       0.12244897959183673
 * macro:                                    0.45798007697692106
 * acc:                                      0.72
--------------------
AVeriTeC scores:
 * Veracity scores (meteor @ 0.1):           0.704
 * Veracity scores (meteor @ 0.2):           0.572
 * Veracity scores (meteor @ 0.25):          0.422
 * Veracity scores (meteor @ 0.3):           0.286
 * Veracity scores (meteor @ 0.4):           0.126
 * Veracity scores (meteor @ 0.5):           0.046
--------------------
AVeriTeC scores by type @ 0.25:
 * Veracity scores (Event/Property Claim):   0.20454380902113348
 * Veracity score

In [22]:
# Run the evaluation script on the submission of the current split / PIPELINE_NAME.
# NOTE(review): the label file is pinned to dev.json while the prediction file follows `split`;
# the two only correspond when split == "dev" — confirm before trusting the scores.
%run src/prediction/evaluate_veracity.py --label_file /mnt/data/factcheck/averitec-data/data/dev.json --prediction_file /mnt/data/factcheck/averitec-data/data_store/submissions/{split}_{PIPELINE_NAME}.json

Question-only score (HU-meteor):             0.44559202247101487
Question-answer score (HU-meteor):           0.27486491800924373
Veracity F1 scores:
 * Supported:                                0.6566037735849056
 * Refuted:                                  0.8098360655737705
 * Not Enough Evidence:                      0.1891891891891892
 * Conflicting Evidence/Cherrypicking:       0.0784313725490196
 * macro:                                    0.43351510022422124
 * acc:                                      0.686
--------------------
AVeriTeC scores:
 * Veracity scores (meteor @ 0.1):           0.672
 * Veracity scores (meteor @ 0.2):           0.532
 * Veracity scores (meteor @ 0.25):          0.394
 * Veracity scores (meteor @ 0.3):           0.256
 * Veracity scores (meteor @ 0.4):           0.106
 * Veracity scores (meteor @ 0.5):           0.03
--------------------
AVeriTeC scores by type @ 0.25:
 * Veracity scores (Event/Property Claim):   0.18721511530819013
 * Veracity score

In [21]:
# Run the evaluation script on the dev submission of the current PIPELINE_NAME against the dev label file.
%run src/prediction/evaluate_veracity.py --label_file /mnt/data/factcheck/averitec-data/data/dev.json --prediction_file /mnt/data/factcheck/averitec-data/data_store/submissions/dev_{PIPELINE_NAME}.json

Question-only score (HU-meteor):             0.4505443758501816
Question-answer score (HU-meteor):           0.2848630013774468
Veracity F1 scores:
 * Supported:                                0.7003610108303249
 * Refuted:                                  0.831973898858075
 * Not Enough Evidence:                      0.29508196721311475
 * Conflicting Evidence/Cherrypicking:       0.08163265306122448
 * macro:                                    0.47726238249068476
 * acc:                                      0.726
--------------------
AVeriTeC scores:
 * Veracity scores (meteor @ 0.1):           0.71
 * Veracity scores (meteor @ 0.2):           0.574
 * Veracity scores (meteor @ 0.25):          0.42
 * Veracity scores (meteor @ 0.3):           0.304
 * Veracity scores (meteor @ 0.4):           0.122
 * Veracity scores (meteor @ 0.5):           0.044
--------------------
AVeriTeC scores by type @ 0.25:
 * Veracity scores (Event/Property Claim):   0.19985145249552058
 * Veracity scores 

## Collapsible section begin

In [None]:
from IPython.display import display, Markdown, Latex

In [None]:
# Retrieve documents for the current datapoint with `retriever` and preview them.
knn_retrieval_result = retriever(datapoint)
display(Markdown("### 🗯️ " + datapoint.claim))
display(Markdown("*Retrieved by knn*\n\n"))
# One entry per retrieved document (all of them, not a sample):
# source URL in bold, followed by the first 256 characters of the document text.
for r in knn_retrieval_result:
    display(Markdown(f"**{r.metadata['url']}**\n\n{r.page_content[:256]}"))

In [None]:
from retrieval import MmrFaissRetriever

# Build a second MmrFaissRetriever on the same path as `retriever` and preview its results.
mmr_retriever = MmrFaissRetriever(retriever.path)
mmr_retrieval_result = mmr_retriever(datapoint)
display(Markdown("### 🗯️ " + datapoint.claim))
display(Markdown("*Retrieved by MMR*\n\n"))
# One entry per retrieved document (all of them, not a sample):
# source URL in bold, followed by the first 256 characters of the document text.
for r in mmr_retrieval_result:
    display(Markdown(f"**{r.metadata['url']}**\n\n{r.page_content[:256]}"))

In [None]:
# Wrap `retriever` in a SubqueryRetriever and preview what it returns for the current datapoint.
subquery_retriever = SubqueryRetriever(retriever)
subquery_retrieval_result = subquery_retriever(datapoint)
display(Markdown("### 🗯️ " + datapoint.claim))
display(Markdown("*Retrieved by subqueries*\n\n"))
# One entry per retrieved document (all of them, not a sample): source URL in bold, the
# document's metadata['queries'] joined with ';' in italics, then the first 256 characters of text.
for r in subquery_retrieval_result:
    display(Markdown(f"**{r.metadata['url']}**\n\n*{';'.join(r.metadata['queries'])}*\n\n{r.page_content[:256]}"))

In [None]:
# Inspect the metadata attached to the subquery retrieval result.
subquery_retrieval_result.metadata

## Collapsible section end

In [None]:
# `datapoint` was rebound to another record after `retrieval_result` was first computed, so
# retrieve again for the current datapoint. Otherwise evidence would be generated for one
# claim from documents that were retrieved for a different claim.
retrieval_result = retriever(datapoint)

# Generate evidence for the datapoint from its retrieved documents and display the result.
evidence_generator = GptBatchedEvidenceGenerator("gpt-4o")
evidence_generation_result = evidence_generator(datapoint, retrieval_result)
evidence_generation_result

In [None]:
# The "suggested_label" entry of the evidence-generation metadata.
evidence_generation_result.metadata["suggested_label"]

In [None]:
# Label stored on the datapoint, for comparison with the suggested label.
datapoint.label

In [None]:
# Classify the datapoint from the generated evidence and the retrieval result;
# show both the string form and the raw result object.
classifier = DefaultClassifier()
classification_result = classifier(datapoint, evidence_generation_result, retrieval_result)
str(classification_result), classification_result

In [None]:
# End-to-end check on record 16 using the stage objects created in the previous cells.
# Note: this rebinds `pipeline`, replacing any instance created earlier in the notebook.
datapoint2 = Datapoint.from_dict(dataset[16])
pipeline = Pipeline(retriever, evidence_generator, classifier)
pipeline_result = pipeline(datapoint2)
pipeline_result

In [None]:
# Compare the string form of the classification result with the label stored on the datapoint.
str(pipeline_result.classification_result), datapoint2.label

In [None]:
# Show the submission-format representation of the single pipeline result.
pipeline_result.to_submission()

In [None]:
# Pickle the single pipeline result to data/pipeline_result.pkl.
# The path is relative, so the file lands under the notebook's current working directory.
import pickle
with open('data/pipeline_result.pkl', 'wb') as f:
    pickle.dump(pipeline_result, f)

In [None]:
# Run the evaluation script on data_store/submission_dev_avg_clf.json against the dev label file.
%run src/prediction/evaluate_veracity.py --label_file /mnt/data/factcheck/averitec-data/data/dev.json --prediction_file /mnt/data/factcheck/averitec-data/data_store/submission_dev_avg_clf.json