# Pipeline 

In [8]:
%load_ext autoreload
%autoreload 2

from averitec import Datapoint
from evidence_generation import GptEvidenceGenerator, GptBatchedEvidenceGenerator, DynamicFewShotBatchedEvidenceGenerator
from classification import DefaultClassifier, HuggingfaceClassifier, AverageEnsembleClassifier, LogRegEnsembleClassifier
from retrieval import SimpleFaissRetriever, Retriever, MmrFaissRetriever, SubqueryRetriever
from pipeline import Pipeline, MockPipeline
import pickle
from labels import label2id, id2label
import numpy as np
from sklearn.metrics import classification_report
import random
from tqdm import tqdm
random.seed(111)

import json

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Pipeline test

In [9]:
split = "dev"
path = "/mnt/data/factcheck/averitec-data/"

# Only the JSON parse needs the file handle, so keep the `with` body minimal.
# JSON is UTF-8 by specification; state it explicitly instead of relying on the locale default.
with open(path + f"data/{split}.json", encoding="utf-8") as f:
    dataset = json.load(f)

# Give every raw record its list position as `claim_id` (overwrites any existing value).
for i, record in enumerate(dataset):
    record["claim_id"] = i

# Typed view of the same records; used by the full pipeline run further down.
datapoints = [Datapoint.from_dict(d) for d in dataset]
    

In [10]:
# Use record 150 of the loaded split as the running example and display it.
datapoint = Datapoint.from_dict(dataset[150])
datapoint

Datapoint(claim='Zimbabwe gets more than half of the African Export Import Bank loans.', claim_id=150, claim_date='7-10-2020', speaker='Tendai Biti', original_claim_url='https://web.archive.org/web/20210129100929im_/https://lh4.googleusercontent.com/WwmjzhYNC1g84X0JmQzruGRgW4tjrB8rAPT9Zx6x-W6xBhQpVIwGW1zw7wImZ6R6JAyQywo-CiCE3_AZtJcxB74aev7jVG1PFobnm3PKanSX0bwWP3Z8E6z-6tDwuPgrSa6mhuz7', reporting_source='Twitter', location_ISO_code='ZW', label='Refuted', metadata={'required_reannotation': False, 'justification': 'Seems the numbers is no where near half. Therefore the claim is refuted', 'fact_checking_article': 'https://web.archive.org/web/20210129100929/https://zimfact.org/does-zimbabwe-get-more-than-half-afreximbank-loans/', 'claim_types': ['Numerical Claim'], 'fact_checking_strategies': ['Numerical Comparison'], 'questions': [{'question': 'How much has Zimbabwe gotten from the Africa Export Import Bank', 'answers': [{'answer': 'Zimbabwe got US$2,78 billion.', 'answer_type': 'Abstracti

In [11]:
# Retriever over the "6k" vector store of the active split.
# Alternative kept for reference: SimpleFaissRetriever(path=...) on the same store.
vecstore_path = f"/mnt/data/factcheck/averitec-data/data_store/vecstore/{split}/6k"
retriever = MmrFaissRetriever(path=vecstore_path)

# Retrieve documents for the example claim and display the result.
retrieval_result = retriever(datapoint)
retrieval_result

RetrievalResult(documents=[Document(metadata={'url': 'https://www.voanews.com/a/afreximbank-pledges-to-post-mugabe-zimbabwe/4161541.html', 'context_before': '', 'context_after': 'The southern African nation, which dumped its hyperinflation-hit currency in 2009, is struggling with a severe dollar crunch that has seen banks fail to avail cash to customers while importers struggle to pay for imports. Finance Minister Patrick Chinamasa promised in a budget speech last week to re-engage with international lenders, curb spending and attract investors to revive the economy. '}, page_content='The African Export and Import Bank has pledged up to $1.5 billion in new loans and financial guarantees to Zimbabwe in a major boost for new President Emmerson Mnangagwa\'s government, the bank\'s president and chairman said Tuesday. Mnangagwa, who took over last month after veteran autocrat Robert Mugabe quit following a de facto military coup, has vowed to focus on reviving the struggling economy and pr

In [12]:
# Switch the running example to record 2 and show its claim text, date and speaker.
# NOTE(review): this rebinds `datapoint`, while `retrieval_result` computed earlier still
# belongs to record 150 — re-run retrieval before pairing the two.
datapoint = Datapoint.from_dict(dataset[2])
datapoint.claim, datapoint.claim_date, datapoint.speaker

("Due to Imran Khan's criticism of Macron's comments on Islam, French authorities cancelled the visas of 183 Pakistani citizens and deported 118 from the country.",
 '31-10-2020',
 'Consulate General Of Pakistan France')

In [38]:
path = "/mnt/data/factcheck/averitec-data/"
# Vector-store location for the active split (was hard-coded to "dev").
target = f"{path}data_store/vecstore/{split}/6k"
PIPELINE_NAME = "mmr+gpt4o-dfewshot-gpttie-10ev"
# Both output files share this stem; only the extension differs.
submission_stem = f"{path}data_store/submissions/{split}_{PIPELINE_NAME}"

pipeline = Pipeline(
    retriever,  # alternative: SubqueryRetriever(retriever)
    evidence_generator=DynamicFewShotBatchedEvidenceGenerator(),
    classifier=DefaultClassifier(),
)

# `submission` holds the serialisable form, `dump` the full result objects
# (the latter are reused when the batch outputs are merged back in).
submission = []
dump = []

for dp in tqdm(datapoints):
    pipeline_result = pipeline(dp)
    submission.append(pipeline_result.to_submission())
    dump.append(pipeline_result)

# Persist the first-pass results: JSON for the evaluation script, pickle for later cells.
with open(f"{submission_stem}.json", "w") as f:
    json.dump(submission, f, indent=4)
with open(f"{submission_stem}.pkl", "wb") as f:
    pickle.dump(dump, f)

100%|██████████| 500/500 [00:58<00:00,  8.51it/s]


In [39]:
# Write the queued requests as batch files of at most 75 requests each and keep their paths.
batch_dir = f"/mnt/data/factcheck/averitec-data/data_store/batch_jobs/{split}_{PIPELINE_NAME}"
files = pipeline.evidence_generator.get_batch_files(path=batch_dir, batch_size=75)

In [40]:
# Show the paths of the generated batch request files.
files

['/mnt/data/factcheck/averitec-data/data_store/batch_jobs/dev_mmr+gpt4o-dfewshot-gpttie-10ev/batch_1.jsonl',
 '/mnt/data/factcheck/averitec-data/data_store/batch_jobs/dev_mmr+gpt4o-dfewshot-gpttie-10ev/batch_2.jsonl',
 '/mnt/data/factcheck/averitec-data/data_store/batch_jobs/dev_mmr+gpt4o-dfewshot-gpttie-10ev/batch_3.jsonl',
 '/mnt/data/factcheck/averitec-data/data_store/batch_jobs/dev_mmr+gpt4o-dfewshot-gpttie-10ev/batch_4.jsonl',
 '/mnt/data/factcheck/averitec-data/data_store/batch_jobs/dev_mmr+gpt4o-dfewshot-gpttie-10ev/batch_5.jsonl',
 '/mnt/data/factcheck/averitec-data/data_store/batch_jobs/dev_mmr+gpt4o-dfewshot-gpttie-10ev/batch_6.jsonl',
 '/mnt/data/factcheck/averitec-data/data_store/batch_jobs/dev_mmr+gpt4o-dfewshot-gpttie-10ev/batch_7.jsonl']

In [41]:
# Submit the batch files one after another and wait for each to complete.
# The second argument is a path inside the batch job folder — presumably where the
# combined output is stored; confirm in the generator implementation.
batch_output_path = f"/mnt/data/factcheck/averitec-data/data_store/batch_jobs/{split}_{PIPELINE_NAME}/output.jsonl"
batch_results = pipeline.evidence_generator.submit_and_await_batches(files, batch_output_path)

  0%|          | 0/7 [00:00<?, ?it/s]

Batch(id='batch_aWpnXWIxJXhadga0wZmETJnq', completion_window='24h', created_at=1722000435, endpoint='/v1/chat/completions', input_file_id='file-sYLy4YbbA5vBrghmVNy84l4K', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722086835, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 1'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_aWpnXWIxJXhadga0wZmETJnq
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=75) batch_aWpnXWIxJXhadga0wZmETJnq
waiting for batch to complete BatchRequestCounts(completed=59, failed=0, total=75) batch_aWpnXWIxJXhadga0wZmETJnq
waiting for batch to complete BatchRequestCounts(completed=74, failed=0, total=75) batch_aWpnXWIxJXhadga0wZmETJnq
batch 1 compl

 14%|█▍        | 1/7 [00:44<04:29, 44.99s/it]

Batch(id='batch_S0TgAcnSKrSqxIZylmIkXgFs', completion_window='24h', created_at=1722000480, endpoint='/v1/chat/completions', input_file_id='file-PkYhkfaV0g7g7aq2UXpMjbAm', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722086880, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 2'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_S0TgAcnSKrSqxIZylmIkXgFs
waiting for batch to complete BatchRequestCounts(completed=2, failed=0, total=75) batch_S0TgAcnSKrSqxIZylmIkXgFs
waiting for batch to complete BatchRequestCounts(completed=59, failed=0, total=75) batch_S0TgAcnSKrSqxIZylmIkXgFs
waiting for batch to complete BatchRequestCounts(completed=75, failed=0, total=75) batch_S0TgAcnSKrSqxIZylmIkXgFs
batch 2 compl

 29%|██▊       | 2/7 [01:29<03:43, 44.80s/it]

Batch(id='batch_Pg3w2fvgVBSKWfAjX64hXWfn', completion_window='24h', created_at=1722000524, endpoint='/v1/chat/completions', input_file_id='file-ruQXBT8ftkWwG3v3InlnI91O', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722086924, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 3'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_Pg3w2fvgVBSKWfAjX64hXWfn
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=75) batch_Pg3w2fvgVBSKWfAjX64hXWfn
waiting for batch to complete BatchRequestCounts(completed=66, failed=0, total=75) batch_Pg3w2fvgVBSKWfAjX64hXWfn
batch 3 completed


 43%|████▎     | 3/7 [02:03<02:39, 39.80s/it]

Batch(id='batch_6KYON29A4J72vbc9ulV1erLm', completion_window='24h', created_at=1722000558, endpoint='/v1/chat/completions', input_file_id='file-B7Dz2uMd1KozdPE3qeZOpupg', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722086958, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 4'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_6KYON29A4J72vbc9ulV1erLm
waiting for batch to complete BatchRequestCounts(completed=2, failed=0, total=75) batch_6KYON29A4J72vbc9ulV1erLm
waiting for batch to complete BatchRequestCounts(completed=62, failed=0, total=75) batch_6KYON29A4J72vbc9ulV1erLm
waiting for batch to complete BatchRequestCounts(completed=73, failed=0, total=75) batch_6KYON29A4J72vbc9ulV1erLm
batch 4 compl

 57%|█████▋    | 4/7 [02:46<02:03, 41.22s/it]

Batch(id='batch_sgCTnpKTxFuvSAM4ii9StdN0', completion_window='24h', created_at=1722000602, endpoint='/v1/chat/completions', input_file_id='file-lfL17eHqKtUnp1LQWpODQFdx', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722087002, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 5'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_sgCTnpKTxFuvSAM4ii9StdN0
waiting for batch to complete BatchRequestCounts(completed=2, failed=0, total=75) batch_sgCTnpKTxFuvSAM4ii9StdN0
waiting for batch to complete BatchRequestCounts(completed=56, failed=0, total=75) batch_sgCTnpKTxFuvSAM4ii9StdN0
waiting for batch to complete BatchRequestCounts(completed=73, failed=0, total=75) batch_sgCTnpKTxFuvSAM4ii9StdN0
batch 5 compl

 71%|███████▏  | 5/7 [03:31<01:24, 42.27s/it]

Batch(id='batch_wtIRKdG4mlOcGCeVaQhuWIzQ', completion_window='24h', created_at=1722000645, endpoint='/v1/chat/completions', input_file_id='file-oMMkE2oo2vWaSVK9VkrfDUO4', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722087045, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 6'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_wtIRKdG4mlOcGCeVaQhuWIzQ
waiting for batch to complete BatchRequestCounts(completed=2, failed=0, total=75) batch_wtIRKdG4mlOcGCeVaQhuWIzQ
waiting for batch to complete BatchRequestCounts(completed=64, failed=0, total=75) batch_wtIRKdG4mlOcGCeVaQhuWIzQ
batch 6 completed


 86%|████████▌ | 6/7 [04:04<00:39, 39.22s/it]

Batch(id='batch_FET6ak9Awwkckyk1K8gBPLu5', completion_window='24h', created_at=1722000679, endpoint='/v1/chat/completions', input_file_id='file-ATIzw0q9WLXOElOz5r9rh3Gd', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722087079, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 7'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_FET6ak9Awwkckyk1K8gBPLu5
waiting for batch to complete BatchRequestCounts(completed=2, failed=0, total=50) batch_FET6ak9Awwkckyk1K8gBPLu5
waiting for batch to complete BatchRequestCounts(completed=44, failed=0, total=50) batch_FET6ak9Awwkckyk1K8gBPLu5
waiting for batch to complete BatchRequestCounts(completed=48, failed=0, total=50) batch_FET6ak9Awwkckyk1K8gBPLu5
waiting for b

100%|██████████| 7/7 [04:58<00:00, 42.67s/it]


In [19]:
# Pretty-print one raw completion (a fenced JSON object with questions, per-label
# veracity scores and a verdict) so the escaped newlines become readable.
print("```json\n{\n    \"questions\":\n        [\n            {\"question\": \"Did the Trump administration plan a public health campaign featuring celebrities?\", \"answer\": \"Yes, the Trump administration planned a public health campaign featuring numerous celebrities to help 'defeat despair and inspire hope' amid the pandemic.\", \"source\": \"1\"},\n            {\"question\": \"Was Billie Eilish considered for the Trump administration's public health campaign?\", \"answer\": \"Yes, Billie Eilish was considered for the campaign but was ultimately disqualified.\", \"source\": \"1\"},\n            {\"question\": \"Why was Billie Eilish disqualified from the Trump administration's public health campaign?\", \"answer\": \"Billie Eilish was disqualified because she was 'not a Trump supporter' and had made political statements against Trump.\", \"source\": \"1\"},\n            {\"question\": \"Did the Trump administration accuse Billie Eilish of 'destroying our country and everything we care about'?\", \"answer\": \"No, the Washington Post misattributed a quote from Billie Eilish to the Trump administration. The documents actually stated that Eilish had said Trump 'is destroying our country and everything we care about.'\", \"source\": \"3\"},\n            {\"question\": \"Did the Washington Post issue a correction regarding the misattributed quote?\", \"answer\": \"Yes, the Washington Post issued a correction on the story.\", \"source\": \"3\"}\n        ],\n    \"claim_veracity\": {\n        \"Supported\": \"1\",\n        \"Refuted\": \"5\",\n        \"Not Enough Evidence\": \"1\",\n        \"Conflicting Evidence/Cherrypicking\": \"1\"\n    },\n    \"veracity_vedict\": \"Refuted\"\n}\n```")

```json
{
    "questions":
        [
            {"question": "Did the Trump administration plan a public health campaign featuring celebrities?", "answer": "Yes, the Trump administration planned a public health campaign featuring numerous celebrities to help 'defeat despair and inspire hope' amid the pandemic.", "source": "1"},
            {"question": "Was Billie Eilish considered for the Trump administration's public health campaign?", "answer": "Yes, Billie Eilish was considered for the campaign but was ultimately disqualified.", "source": "1"},
            {"question": "Why was Billie Eilish disqualified from the Trump administration's public health campaign?", "answer": "Billie Eilish was disqualified because she was 'not a Trump supporter' and had made political statements against Trump.", "source": "1"},
            {"question": "Did the Trump administration accuse Billie Eilish of 'destroying our country and everything we care about'?", "answer": "No, the Washington Post misat

In [42]:
# Merge each batch output into its first-pass pipeline result.
batched_generator = pipeline.evidence_generator
# Temperature for the fallback GPT client (the fallback is reported in the cell
# output when a batch answer cannot be parsed).
batched_generator.fallback_gpt_generator.client.temperature = .5

new_dump = []
# Pair results positionally; only as many first-pass results as there are batch outputs.
for pipeline_result, batch_result in zip(dump[:len(batch_results)], batch_results):
    new_dump.append(
        batched_generator.update_pipeline_result(pipeline_result, batch_result, pipeline.classifier)
    )


Error parsing JSON for EvidenceGenerator.
 
{
    "questions":
        [
            {"question": "What is the CDC's stance on the effectiveness of ordinary masks against wildfire smoke?", "answer": "The CDC states that
list indices must be integers or slices, not str
failed, using fallback gpt


In [43]:
# Inspect the second merged result: the full record plus its evidence and classification parts.
second_result = new_dump[1]
(
    second_result,
    second_result.evidence_generation_result,
    second_result.classification_result,
)

(PipelineResult(datapoint=Datapoint(claim='Trump Administration claimed songwriter Billie Eilish Is Destroying Our Country In Leaked Documents', claim_id=1, claim_date='31-10-2020', speaker=None, original_claim_url=None, reporting_source='Instagram', location_ISO_code='US', label='Refuted', metadata={'required_reannotation': False, 'justification': 'Seems that the Wzshington post accused the singer wrong lyrics. Therefore the claim is refuted', 'fact_checking_article': 'https://web.archive.org/web/20201103001419/https://leadstories.com/hoax-alert/2020/11/fact-check-trump-administration-did-not-claim-songwriter-billie-eilish-was-destroying-country-in-leaked-documents.html', 'claim_types': ['Position Statement', 'Event/Property Claim'], 'fact_checking_strategies': ['Written Evidence'], 'questions': [{'question': 'Has the Trump administration voiced that Billie Eilish was destroying the country', 'answers': [{'answer': 'A Washington Post story wrongly claimed the Trump administration accu

In [44]:
# Overwrite the first-pass submission files with the batch-updated results.
submission_base = f"/mnt/data/factcheck/averitec-data/data_store/submissions/{split}_{PIPELINE_NAME}"

# JSON: serialisable submission entries.
with open(submission_base + ".json", "w") as json_file:
    json.dump([result.to_submission() for result in new_dump], json_file, indent=4)

# Pickle: the full result objects.
with open(submission_base + ".pkl", "wb") as pickle_file:
    pickle.dump(new_dump, pickle_file)

In [45]:
print(f"{split}_{PIPELINE_NAME}")
%run src/prediction/evaluate_veracity.py --label_file /mnt/data/factcheck/averitec-data/data/dev.json --prediction_file /mnt/data/factcheck/averitec-data/data_store/submissions/{split}_{PIPELINE_NAME}.json

dev_mmr+gpt4o-dfewshot-gpttie-10ev
Question-only score (HU-meteor):             0.44806149103778475
Question-answer score (HU-meteor):           0.28123074910922286
Veracity F1 scores:
 * Supported:                                0.7293233082706767
 * Refuted:                                  0.8165289256198347
 * Not Enough Evidence:                      0.19672131147540983
 * Conflicting Evidence/Cherrypicking:       0.14705882352941177
 * macro:                                    0.47240809222383323
 * acc:                                      0.71
--------------------
AVeriTeC scores:
 * Veracity scores (meteor @ 0.1):           0.69
 * Veracity scores (meteor @ 0.2):           0.544
 * Veracity scores (meteor @ 0.25):          0.402
 * Veracity scores (meteor @ 0.3):           0.27
 * Veracity scores (meteor @ 0.4):           0.124
 * Veracity scores (meteor @ 0.5):           0.038
--------------------
AVeriTeC scores by type @ 0.25:
 * Veracity scores (Event/Property Claim):   0.

In [22]:
%run src/prediction/evaluate_veracity.py --label_file /mnt/data/factcheck/averitec-data/data/dev.json --prediction_file /mnt/data/factcheck/averitec-data/data_store/submissions/{split}_{PIPELINE_NAME}.json

Question-only score (HU-meteor):             0.44559202247101487
Question-answer score (HU-meteor):           0.27486491800924373
Veracity F1 scores:
 * Supported:                                0.6566037735849056
 * Refuted:                                  0.8098360655737705
 * Not Enough Evidence:                      0.1891891891891892
 * Conflicting Evidence/Cherrypicking:       0.0784313725490196
 * macro:                                    0.43351510022422124
 * acc:                                      0.686
--------------------
AVeriTeC scores:
 * Veracity scores (meteor @ 0.1):           0.672
 * Veracity scores (meteor @ 0.2):           0.532
 * Veracity scores (meteor @ 0.25):          0.394
 * Veracity scores (meteor @ 0.3):           0.256
 * Veracity scores (meteor @ 0.4):           0.106
 * Veracity scores (meteor @ 0.5):           0.03
--------------------
AVeriTeC scores by type @ 0.25:
 * Veracity scores (Event/Property Claim):   0.18721511530819013
 * Veracity score

In [21]:
%run src/prediction/evaluate_veracity.py --label_file /mnt/data/factcheck/averitec-data/data/dev.json --prediction_file /mnt/data/factcheck/averitec-data/data_store/submissions/dev_{PIPELINE_NAME}.json

Question-only score (HU-meteor):             0.4505443758501816
Question-answer score (HU-meteor):           0.2848630013774468
Veracity F1 scores:
 * Supported:                                0.7003610108303249
 * Refuted:                                  0.831973898858075
 * Not Enough Evidence:                      0.29508196721311475
 * Conflicting Evidence/Cherrypicking:       0.08163265306122448
 * macro:                                    0.47726238249068476
 * acc:                                      0.726
--------------------
AVeriTeC scores:
 * Veracity scores (meteor @ 0.1):           0.71
 * Veracity scores (meteor @ 0.2):           0.574
 * Veracity scores (meteor @ 0.25):          0.42
 * Veracity scores (meteor @ 0.3):           0.304
 * Veracity scores (meteor @ 0.4):           0.122
 * Veracity scores (meteor @ 0.5):           0.044
--------------------
AVeriTeC scores by type @ 0.25:
 * Veracity scores (Event/Property Claim):   0.19985145249552058
 * Veracity scores 

## Collapsible section begin

In [None]:
from IPython.display import display, Markdown, Latex

In [None]:
# Plain nearest-neighbour retrieval, shown for comparison with the MMR and subquery cells.
# `retriever` is an MmrFaissRetriever, so calling it here would only repeat the MMR output;
# build a SimpleFaissRetriever over the same vector store instead.
knn_retriever = SimpleFaissRetriever(path=retriever.path)
knn_retrieval_result = knn_retriever(datapoint)
display(Markdown("### 🗯️ " + datapoint.claim))
display(Markdown("*Retrieved by knn*\n\n"))
# Render every retrieved document: source URL in bold, then the first 256 characters of its text.
for r in knn_retrieval_result:
    display(Markdown(f"**{r.metadata['url']}**\n\n{r.page_content[:256]}"))

In [None]:
# MMR retrieval over the same vector store as `retriever`.
# MmrFaissRetriever is already imported in the notebook's first cell, so no re-import here.
mmr_retriever = MmrFaissRetriever(retriever.path)
mmr_retrieval_result = mmr_retriever(datapoint)
display(Markdown("### 🗯️ " + datapoint.claim))
display(Markdown("*Retrieved by MMR*\n\n"))
# Render every retrieved document: source URL in bold, then the first 256 characters of its text.
for r in mmr_retrieval_result:
    display(Markdown(f"**{r.metadata['url']}**\n\n{r.page_content[:256]}"))

In [None]:
# Subquery retrieval: wraps the base `retriever`.
subquery_retriever = SubqueryRetriever(retriever)
subquery_retrieval_result = subquery_retriever(datapoint)
display(Markdown("### 🗯️ " + datapoint.claim))
display(Markdown("*Retrieved by subqueries*\n\n"))
# Render every retrieved document: URL in bold, the ';'-joined queries stored in its
# metadata in italics, then the first 256 characters of its text.
for r in subquery_retrieval_result:
    display(Markdown(f"**{r.metadata['url']}**\n\n*{';'.join(r.metadata['queries'])}*\n\n{r.page_content[:256]}"))

In [None]:
# Show the metadata attached to the subquery retrieval result as a whole.
subquery_retrieval_result.metadata

## Collapsible section end

In [None]:
# Generate evidence for the current example claim with a "gpt-4o" evidence generator.
evidence_generator = GptBatchedEvidenceGenerator("gpt-4o")
# Re-run retrieval for the *current* `datapoint`: it was rebound after `retrieval_result`
# was first computed, so reusing the old result would pair documents with the wrong claim.
retrieval_result = retriever(datapoint)
evidence_generation_result = evidence_generator(datapoint, retrieval_result)
evidence_generation_result

In [None]:
# Label stored under "suggested_label" in the evidence generation metadata.
evidence_generation_result.metadata["suggested_label"]

In [None]:
# Annotated label of the current example claim, for comparison with the suggestion.
datapoint.label

In [None]:
# Classify the example claim from its generated evidence and retrieved documents.
classifier = DefaultClassifier()
classification_result = classifier(datapoint, evidence_generation_result, retrieval_result)
# Display both the string form and the full result object.
str(classification_result), classification_result

In [None]:
# End-to-end run on a single claim (record 16) using the components built above.
datapoint2 = Datapoint.from_dict(dataset[16])
# Use a separate name so the batch `pipeline` used by the earlier cells is not clobbered
# when cells are re-run; keyword arguments mirror the earlier Pipeline construction.
single_claim_pipeline = Pipeline(
    retriever,
    evidence_generator=evidence_generator,
    classifier=classifier,
)
pipeline_result = single_claim_pipeline(datapoint2)
pipeline_result

In [None]:
# String form of the classification result next to the annotated label of the same claim.
str(pipeline_result.classification_result), datapoint2.label

In [None]:
# Preview the submission entry produced for this pipeline result.
pipeline_result.to_submission()

In [None]:
# Persist the single-claim pipeline result as a pickle file (path relative to the working directory).
import pickle

result_path = 'data/pipeline_result.pkl'
with open(result_path, 'wb') as out_file:
    pickle.dump(pipeline_result, out_file)

In [None]:
# Evaluate a fixed, previously written dev submission (submission_dev_avg_clf.json);
# this path does not depend on `split` or `PIPELINE_NAME`.
%run src/prediction/evaluate_veracity.py --label_file /mnt/data/factcheck/averitec-data/data/dev.json --prediction_file /mnt/data/factcheck/averitec-data/data_store/submission_dev_avg_clf.json