# Pipeline 

In [1]:
%load_ext autoreload
%autoreload 2

from averitec import Datapoint
from evidence_generation import GptEvidenceGenerator, GptBatchedEvidenceGenerator, DynamicFewShotBatchedEvidenceGenerator
from classification import DefaultClassifier, HuggingfaceClassifier, AverageEnsembleClassifier, LogRegEnsembleClassifier
from retrieval import SimpleFaissRetriever, Retriever, MmrFaissRetriever, SubqueryRetriever
from pipeline import Pipeline, MockPipeline
import pickle
from labels import label2id, id2label
import numpy as np
from sklearn.metrics import classification_report
import random
from tqdm import tqdm
random.seed(111)

import json

## Pipeline test

In [25]:
split = "test"
path = "/mnt/data/factcheck/averitec-data/"

# Load the raw claim records of the selected split. The encoding is pinned to
# UTF-8 (the JSON interchange encoding) instead of inheriting open()'s
# platform/locale default.
with open(path + f"data/{split}.json", encoding="utf-8") as f:
    dataset = json.load(f)

# Tag every record with its position in the file as "claim_id"
# (any existing value under that key is overwritten).
for claim_id, record in enumerate(dataset):
    record["claim_id"] = claim_id

# Typed view of the same records, in file order.
datapoints = [Datapoint.from_dict(record) for record in dataset]
    

In [26]:
# Build a Datapoint from one raw record (index 150) and display it; it is the
# sample input for the retriever call in the next cell.
datapoint = Datapoint.from_dict(dataset[150])
datapoint

Datapoint(claim="A research organisation has confirmed Philippine Vice President Leni Robredo received zero votes in the country's Mindanao island.", claim_id=150, claim_date='11-3-2021', speaker='Showbiz Fanaticz', original_claim_url='https://www.youtube.com/watch?v=ErL5IzJkqh8&t=6s', reporting_source='Youtube', location_ISO_code='PH', label=None, metadata={})

In [27]:
# Alternative kept for reference: SimpleFaissRetriever over the dev "6k" vector store.
# retriever = SimpleFaissRetriever(path="/mnt/data/factcheck/averitec-data/data_store/vecstore/dev/6k")
# Active retriever: MmrFaissRetriever over the "2k" vector store of the current split.
# NOTE(review): "Mmr" presumably stands for maximal marginal relevance -- confirm in retrieval.py.
retriever = MmrFaissRetriever(path=f"/mnt/data/factcheck/averitec-data/data_store/vecstore/{split}/2k")
# Run retrieval for the sample datapoint and display the returned RetrievalResult.
retrieval_result = retriever(datapoint)
retrieval_result

RetrievalResult(documents=[Document(metadata={'url': 'https://newsinfo.inquirer.net/1396547/the-4-year-robredo-marcos-vice-presidential-case', 'context_before': '', 'context_after': 'READ: Marcos asks SC to set aside Robredo proclamation, files protest Despite Marcos’ appeal to postpone the proclamation, Robredo still takes her oath of office and is inaugurated as vice president. READ: FULL TEXT: Vice President Leni Robredo inaugural speech In a two-page resolution, the SC summons Robredo to respond in ten days on Marcos’ election protest. The high court also asks the Commission Elections (Comelec) to preserve election returns (ERs) that are subject to protest through a Precautionary Protection Order (PPO). The Marcos camp files a three-page letter-manifestation questioning the Comelec’s decision to issue a resolution approving the stripping of the vote-counting machines (VCMs) and laptops used in the May 9 polls, despite the PPO from PET. The stripping activity involves the backup of 

In [28]:
# Rebind the sample to record 2 and show the fields used in the next cell.
# In the saved output this record has no speaker (None).
datapoint = Datapoint.from_dict(dataset[2])
datapoint.claim, datapoint.claim_date, datapoint.speaker

('South African hospital found that traces of the novel coronavirus had survived on the surface of fresh food items for 12 hours during lab tests.',
 '3-12-2021',
 None)

In [29]:
# Preview a {"question", "context"} JSON payload built from the claim text and
# its speaker; a missing speaker (None) is serialized as JSON null.
json.dumps({"question": datapoint.claim, "context": datapoint.speaker})

'{"question": "South African hospital found that traces of the novel coronavirus had survived on the surface of fresh food items for 12 hours during lab tests.", "context": null}'

In [30]:
# NOTE(review): this repeats the classifier imports from the first cell and adds
# RandomForestClassifier; it would be better placed in the top import cell.
from classification import DefaultClassifier, HuggingfaceClassifier, AverageEnsembleClassifier, LogRegEnsembleClassifier, RandomForestClassifier

path = "/mnt/data/factcheck/averitec-data/"
# Vector-store location of the current split (follows `split`, like the
# retriever path). Not used further in this cell: the already constructed
# `retriever` is what gets passed to the pipeline.
target = path + f"data_store/vecstore/{split}/2k"
PIPELINE_NAME = "mmr+gpt4o-dfewshot-tiebrk-atype"
# Shared stem of both output files so the .json and .pkl names cannot drift apart.
submission_stem = path + f"data_store/submissions/{split}_{PIPELINE_NAME}"

classifier = DefaultClassifier()
pipeline = Pipeline(
    # Alternatives kept for reference:
    #dumps = "/mnt/data/factcheck/averitec-data/data_store/submissions/dev_mmr+gpt4o-dfewshot.pkl",
    #SubqueryRetriever(retriever),
    retriever,
    evidence_generator=DynamicFewShotBatchedEvidenceGenerator(),
    classifier=classifier
)

submission = []  # per-claim entries from to_submission(), written as JSON
dump = []        # full pipeline results, pickled and reused by later cells

# Run the whole pipeline once per claim, in dataset order.
for dp in tqdm(datapoints):
    pipeline_result = pipeline(dp)
    submission.append(pipeline_result.to_submission())
    dump.append(pipeline_result)

# Persist both views of the run under the same stem.
with open(submission_stem + ".json", "w") as f:
    json.dump(submission, f, indent=4)
with open(submission_stem + ".pkl", "wb") as f:
    pickle.dump(dump, f)

100%|██████████| 2215/2215 [02:25<00:00, 15.27it/s]


In [32]:
# Ask the evidence generator for its batch request files, placed under the
# batch_jobs/<split>_<PIPELINE_NAME> directory with batch_size=90 (the batch
# status output further down reports 90 requests per full batch).
# NOTE(review): presumably the requests stem from the pipeline run above --
# confirm in evidence_generation.py.
files = pipeline.evidence_generator.get_batch_files(path=f"/mnt/data/factcheck/averitec-data/data_store/batch_jobs/{split}_{PIPELINE_NAME}", batch_size=90)

In [33]:
# Display the list of batch file paths returned above.
files

['/mnt/data/factcheck/averitec-data/data_store/batch_jobs/test_mmr+gpt4o-dfewshot-tiebrk-atype/batch_1.jsonl',
 '/mnt/data/factcheck/averitec-data/data_store/batch_jobs/test_mmr+gpt4o-dfewshot-tiebrk-atype/batch_2.jsonl',
 '/mnt/data/factcheck/averitec-data/data_store/batch_jobs/test_mmr+gpt4o-dfewshot-tiebrk-atype/batch_3.jsonl',
 '/mnt/data/factcheck/averitec-data/data_store/batch_jobs/test_mmr+gpt4o-dfewshot-tiebrk-atype/batch_4.jsonl',
 '/mnt/data/factcheck/averitec-data/data_store/batch_jobs/test_mmr+gpt4o-dfewshot-tiebrk-atype/batch_5.jsonl',
 '/mnt/data/factcheck/averitec-data/data_store/batch_jobs/test_mmr+gpt4o-dfewshot-tiebrk-atype/batch_6.jsonl',
 '/mnt/data/factcheck/averitec-data/data_store/batch_jobs/test_mmr+gpt4o-dfewshot-tiebrk-atype/batch_7.jsonl',
 '/mnt/data/factcheck/averitec-data/data_store/batch_jobs/test_mmr+gpt4o-dfewshot-tiebrk-atype/batch_8.jsonl',
 '/mnt/data/factcheck/averitec-data/data_store/batch_jobs/test_mmr+gpt4o-dfewshot-tiebrk-atype/batch_9.jsonl',
 

In [34]:
# Submit the batch files and wait for them to finish; the saved log shows the
# batches being created and polled one after another.
# NOTE(review): the second argument (output.jsonl in the job directory) is
# presumably where the raw batch output is stored -- confirm in evidence_generation.py.
batch_results = pipeline.evidence_generator.submit_and_await_batches(files, f"/mnt/data/factcheck/averitec-data/data_store/batch_jobs/{split}_{PIPELINE_NAME}/output.jsonl")

  0%|          | 0/25 [00:00<?, ?it/s]

Batch(id='batch_b71LCLk3tzgAAta78Nh7WRbF', completion_window='24h', created_at=1722202477, endpoint='/v1/chat/completions', input_file_id='file-EuxPiUbVE6m9dtLLQ6cQ9PlC', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722288877, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 1'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_b71LCLk3tzgAAta78Nh7WRbF
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=90) batch_b71LCLk3tzgAAta78Nh7WRbF
waiting for batch to complete BatchRequestCounts(completed=1, failed=0, total=90) batch_b71LCLk3tzgAAta78Nh7WRbF
waiting for batch to complete BatchRequestCounts(completed=7, failed=0, total=90) batch_b71LCLk3tzgAAta78Nh7WRbF
waiting for bat

  4%|▍         | 1/25 [07:23<2:57:32, 443.85s/it]

Batch(id='batch_T99qudNnmff3vpynk4iBkiBo', completion_window='24h', created_at=1722202921, endpoint='/v1/chat/completions', input_file_id='file-eXaqjVCmr6DH0HQbE1OSj0RK', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722289321, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 2'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_T99qudNnmff3vpynk4iBkiBo
waiting for batch to complete BatchRequestCounts(completed=13, failed=0, total=90) batch_T99qudNnmff3vpynk4iBkiBo
waiting for batch to complete BatchRequestCounts(completed=89, failed=0, total=90) batch_T99qudNnmff3vpynk4iBkiBo
waiting for batch to complete BatchRequestCounts(completed=89, failed=0, total=90) batch_T99qudNnmff3vpynk4iBkiBo
waiting for 

  8%|▊         | 2/25 [08:48<1:29:13, 232.77s/it]

Batch(id='batch_fs8JDMzi9LKoV2MTTLCPqwJz', completion_window='24h', created_at=1722203006, endpoint='/v1/chat/completions', input_file_id='file-C5sxyxa3gKSjciwq3VQo5V29', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722289406, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 3'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_fs8JDMzi9LKoV2MTTLCPqwJz
waiting for batch to complete BatchRequestCounts(completed=7, failed=0, total=90) batch_fs8JDMzi9LKoV2MTTLCPqwJz
waiting for batch to complete BatchRequestCounts(completed=90, failed=0, total=90) batch_fs8JDMzi9LKoV2MTTLCPqwJz
batch 3 completed


 12%|█▏        | 3/25 [09:22<52:02, 141.95s/it]  

Batch(id='batch_tW75oHDYC8m2IRqX9mgqdfoY', completion_window='24h', created_at=1722203040, endpoint='/v1/chat/completions', input_file_id='file-XfTu5kydNbVbeDjyHgM3Hbvb', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722289440, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 4'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_tW75oHDYC8m2IRqX9mgqdfoY
waiting for batch to complete BatchRequestCounts(completed=14, failed=0, total=90) batch_tW75oHDYC8m2IRqX9mgqdfoY
waiting for batch to complete BatchRequestCounts(completed=89, failed=0, total=90) batch_tW75oHDYC8m2IRqX9mgqdfoY
batch 4 completed


 16%|█▌        | 4/25 [09:56<34:45, 99.33s/it] 

Batch(id='batch_MPiMAadnGyyqqylhUfm4tQlS', completion_window='24h', created_at=1722203074, endpoint='/v1/chat/completions', input_file_id='file-q4iI6bVyz7suG0BRENbSUKSB', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722289474, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 5'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_MPiMAadnGyyqqylhUfm4tQlS
waiting for batch to complete BatchRequestCounts(completed=9, failed=0, total=90) batch_MPiMAadnGyyqqylhUfm4tQlS
waiting for batch to complete BatchRequestCounts(completed=88, failed=0, total=90) batch_MPiMAadnGyyqqylhUfm4tQlS
waiting for batch to complete BatchRequestCounts(completed=89, failed=0, total=90) batch_MPiMAadnGyyqqylhUfm4tQlS
waiting for b

 20%|██        | 5/25 [11:21<31:24, 94.24s/it]

Batch(id='batch_821MUkzWS0mdATtr5ZehbXbw', completion_window='24h', created_at=1722203159, endpoint='/v1/chat/completions', input_file_id='file-qZs7snTXwqga0IKNQcGBCpao', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722289559, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 6'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_821MUkzWS0mdATtr5ZehbXbw
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=90) batch_821MUkzWS0mdATtr5ZehbXbw
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=90) batch_821MUkzWS0mdATtr5ZehbXbw
waiting for batch to complete BatchRequestCounts(completed=79, failed=0, total=90) batch_821MUkzWS0mdATtr5ZehbXbw
batch 6 comple

 24%|██▍       | 6/25 [12:06<24:28, 77.29s/it]

Batch(id='batch_eTA3cTlDTZMlHLYOWnKx095J', completion_window='24h', created_at=1722203203, endpoint='/v1/chat/completions', input_file_id='file-jzYsw3x8H0o2rur24w75fveD', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722289603, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 7'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_eTA3cTlDTZMlHLYOWnKx095J
waiting for batch to complete BatchRequestCounts(completed=11, failed=0, total=90) batch_eTA3cTlDTZMlHLYOWnKx095J
waiting for batch to complete BatchRequestCounts(completed=90, failed=0, total=90) batch_eTA3cTlDTZMlHLYOWnKx095J
batch 7 completed


 28%|██▊       | 7/25 [12:40<18:55, 63.06s/it]

Batch(id='batch_u1Bp9nebvMqaQ0hJvdtDGxHg', completion_window='24h', created_at=1722203237, endpoint='/v1/chat/completions', input_file_id='file-HkG3tgKo4Aam6OJ2vE4c5YEt', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722289637, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 8'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_u1Bp9nebvMqaQ0hJvdtDGxHg
waiting for batch to complete BatchRequestCounts(completed=13, failed=0, total=90) batch_u1Bp9nebvMqaQ0hJvdtDGxHg
waiting for batch to complete BatchRequestCounts(completed=88, failed=0, total=90) batch_u1Bp9nebvMqaQ0hJvdtDGxHg
waiting for batch to complete BatchRequestCounts(completed=88, failed=0, total=90) batch_u1Bp9nebvMqaQ0hJvdtDGxHg
batch 8 comp

 32%|███▏      | 8/25 [13:24<16:08, 56.99s/it]

Batch(id='batch_N4huWPezCq1ZWbowBK0QuTyn', completion_window='24h', created_at=1722203281, endpoint='/v1/chat/completions', input_file_id='file-VrtNYedfsjXZH7vBVE4uEcj4', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722289681, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 9'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_N4huWPezCq1ZWbowBK0QuTyn
waiting for batch to complete BatchRequestCounts(completed=10, failed=0, total=90) batch_N4huWPezCq1ZWbowBK0QuTyn
waiting for batch to complete BatchRequestCounts(completed=90, failed=0, total=90) batch_N4huWPezCq1ZWbowBK0QuTyn
batch 9 completed


 36%|███▌      | 9/25 [13:57<13:15, 49.70s/it]

Batch(id='batch_hKeynZgHOoRCfNDsR7mD8aPO', completion_window='24h', created_at=1722203314, endpoint='/v1/chat/completions', input_file_id='file-TJ5IR7ychOR6b05airFyaJaZ', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722289714, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 10'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_hKeynZgHOoRCfNDsR7mD8aPO
waiting for batch to complete BatchRequestCounts(completed=10, failed=0, total=90) batch_hKeynZgHOoRCfNDsR7mD8aPO
waiting for batch to complete BatchRequestCounts(completed=89, failed=0, total=90) batch_hKeynZgHOoRCfNDsR7mD8aPO
waiting for batch to complete BatchRequestCounts(completed=89, failed=0, total=90) batch_hKeynZgHOoRCfNDsR7mD8aPO
waiting for

 40%|████      | 10/25 [15:11<14:19, 57.27s/it]

Batch(id='batch_AH5uEb0p4AgKXMX2wqveKh5n', completion_window='24h', created_at=1722203389, endpoint='/v1/chat/completions', input_file_id='file-WdIl87ZCH1qC2FhBCaBqXVWz', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722289789, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 11'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_AH5uEb0p4AgKXMX2wqveKh5n
waiting for batch to complete BatchRequestCounts(completed=18, failed=0, total=90) batch_AH5uEb0p4AgKXMX2wqveKh5n
waiting for batch to complete BatchRequestCounts(completed=90, failed=0, total=90) batch_AH5uEb0p4AgKXMX2wqveKh5n
batch 11 completed


 44%|████▍     | 11/25 [15:46<11:42, 50.19s/it]

Batch(id='batch_H46vcemfsY4mDRxKiwbQ2rn2', completion_window='24h', created_at=1722203423, endpoint='/v1/chat/completions', input_file_id='file-EnYjl3UspDQJV1j5BFD3bMGt', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722289823, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 12'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_H46vcemfsY4mDRxKiwbQ2rn2
waiting for batch to complete BatchRequestCounts(completed=12, failed=0, total=90) batch_H46vcemfsY4mDRxKiwbQ2rn2
waiting for batch to complete BatchRequestCounts(completed=90, failed=0, total=90) batch_H46vcemfsY4mDRxKiwbQ2rn2
batch 12 completed


 48%|████▊     | 12/25 [16:19<09:47, 45.16s/it]

Batch(id='batch_Jg0hIaLg3ck663VdYu1sR7HW', completion_window='24h', created_at=1722203457, endpoint='/v1/chat/completions', input_file_id='file-QXxCTQVPOmVw6BDTs8PnfVaH', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722289857, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 13'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_Jg0hIaLg3ck663VdYu1sR7HW
waiting for batch to complete BatchRequestCounts(completed=11, failed=0, total=90) batch_Jg0hIaLg3ck663VdYu1sR7HW
waiting for batch to complete BatchRequestCounts(completed=90, failed=0, total=90) batch_Jg0hIaLg3ck663VdYu1sR7HW
batch 13 completed


 52%|█████▏    | 13/25 [16:53<08:21, 41.77s/it]

Batch(id='batch_bbyz2QIKVSa18BCZgSyR2G3N', completion_window='24h', created_at=1722203491, endpoint='/v1/chat/completions', input_file_id='file-tdhU9crk2NIfZ7YzniRNoRNv', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722289891, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 14'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_bbyz2QIKVSa18BCZgSyR2G3N
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=90) batch_bbyz2QIKVSa18BCZgSyR2G3N
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=90) batch_bbyz2QIKVSa18BCZgSyR2G3N
waiting for batch to complete BatchRequestCounts(completed=4, failed=0, total=90) batch_bbyz2QIKVSa18BCZgSyR2G3N
waiting for ba

 56%|█████▌    | 14/25 [18:18<10:03, 54.84s/it]

Batch(id='batch_AGig79l2vnGfx7WSveZHnHQC', completion_window='24h', created_at=1722203575, endpoint='/v1/chat/completions', input_file_id='file-kpUu04GzCPxQH70w2jDCt0A8', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722289975, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 15'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_AGig79l2vnGfx7WSveZHnHQC
waiting for batch to complete BatchRequestCounts(completed=14, failed=0, total=90) batch_AGig79l2vnGfx7WSveZHnHQC
waiting for batch to complete BatchRequestCounts(completed=89, failed=0, total=90) batch_AGig79l2vnGfx7WSveZHnHQC
batch 15 completed


 60%|██████    | 15/25 [18:52<08:03, 48.39s/it]

Batch(id='batch_bXSc3A8hhoOjNWK7VvcFlBux', completion_window='24h', created_at=1722203609, endpoint='/v1/chat/completions', input_file_id='file-VDKBaBuPlhXGG4oDRlYN4sg6', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722290009, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 16'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_bXSc3A8hhoOjNWK7VvcFlBux
waiting for batch to complete BatchRequestCounts(completed=9, failed=0, total=90) batch_bXSc3A8hhoOjNWK7VvcFlBux
waiting for batch to complete BatchRequestCounts(completed=88, failed=0, total=90) batch_bXSc3A8hhoOjNWK7VvcFlBux
waiting for batch to complete BatchRequestCounts(completed=88, failed=0, total=90) batch_bXSc3A8hhoOjNWK7VvcFlBux
waiting for 

 64%|██████▍   | 16/25 [20:17<08:54, 59.36s/it]

Batch(id='batch_SrW81WzYHaP0O2qAPc1lDZqW', completion_window='24h', created_at=1722203694, endpoint='/v1/chat/completions', input_file_id='file-1ikoFzo9q0ivfT0RaVbZ1LQ2', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722290094, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 17'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_SrW81WzYHaP0O2qAPc1lDZqW
waiting for batch to complete BatchRequestCounts(completed=7, failed=0, total=90) batch_SrW81WzYHaP0O2qAPc1lDZqW
waiting for batch to complete BatchRequestCounts(completed=20, failed=0, total=90) batch_SrW81WzYHaP0O2qAPc1lDZqW
waiting for batch to complete BatchRequestCounts(completed=20, failed=0, total=90) batch_SrW81WzYHaP0O2qAPc1lDZqW
waiting for 

 68%|██████▊   | 17/25 [21:11<07:41, 57.73s/it]

Batch(id='batch_cCf1KD5vvFUqDLtgAezaBowa', completion_window='24h', created_at=1722203748, endpoint='/v1/chat/completions', input_file_id='file-iMIRgLOsr63kmGslvzwZBG9f', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722290148, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 18'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_cCf1KD5vvFUqDLtgAezaBowa
waiting for batch to complete BatchRequestCounts(completed=13, failed=0, total=90) batch_cCf1KD5vvFUqDLtgAezaBowa
waiting for batch to complete BatchRequestCounts(completed=89, failed=0, total=90) batch_cCf1KD5vvFUqDLtgAezaBowa
batch 18 completed


 72%|███████▏  | 18/25 [21:45<05:54, 50.61s/it]

Batch(id='batch_GO5iIBaov4moVy1JojdZdX8v', completion_window='24h', created_at=1722203782, endpoint='/v1/chat/completions', input_file_id='file-zvxet35M0BEXDy0cothw5a4M', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722290182, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 19'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_GO5iIBaov4moVy1JojdZdX8v
waiting for batch to complete BatchRequestCounts(completed=12, failed=0, total=90) batch_GO5iIBaov4moVy1JojdZdX8v
waiting for batch to complete BatchRequestCounts(completed=88, failed=0, total=90) batch_GO5iIBaov4moVy1JojdZdX8v
waiting for batch to complete BatchRequestCounts(completed=89, failed=0, total=90) batch_GO5iIBaov4moVy1JojdZdX8v
waiting for

 76%|███████▌  | 19/25 [23:09<06:05, 60.90s/it]

Batch(id='batch_k5fHxsPTd6wY0fE6AwQ7bqvH', completion_window='24h', created_at=1722203867, endpoint='/v1/chat/completions', input_file_id='file-4oZ72ytQgH6GmxQC8Y5nhF6u', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722290267, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 20'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_k5fHxsPTd6wY0fE6AwQ7bqvH
waiting for batch to complete BatchRequestCounts(completed=13, failed=0, total=90) batch_k5fHxsPTd6wY0fE6AwQ7bqvH
waiting for batch to complete BatchRequestCounts(completed=89, failed=0, total=90) batch_k5fHxsPTd6wY0fE6AwQ7bqvH
batch 20 completed


 80%|████████  | 20/25 [23:43<04:23, 52.75s/it]

Batch(id='batch_mUFl4vELjXGPRVRKcSNMwAiB', completion_window='24h', created_at=1722203900, endpoint='/v1/chat/completions', input_file_id='file-WSHjGeOGVkxhu4IhraeQbcCp', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722290300, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 21'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_mUFl4vELjXGPRVRKcSNMwAiB
waiting for batch to complete BatchRequestCounts(completed=11, failed=0, total=90) batch_mUFl4vELjXGPRVRKcSNMwAiB
waiting for batch to complete BatchRequestCounts(completed=90, failed=0, total=90) batch_mUFl4vELjXGPRVRKcSNMwAiB
batch 21 completed


 84%|████████▍ | 21/25 [24:17<03:08, 47.02s/it]

Batch(id='batch_3IfMPcZGPejqL596SgMxmpBQ', completion_window='24h', created_at=1722203934, endpoint='/v1/chat/completions', input_file_id='file-PNQXstJFn3BeCtLq7edRehlh', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722290334, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 22'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_3IfMPcZGPejqL596SgMxmpBQ
waiting for batch to complete BatchRequestCounts(completed=12, failed=0, total=90) batch_3IfMPcZGPejqL596SgMxmpBQ
waiting for batch to complete BatchRequestCounts(completed=88, failed=0, total=90) batch_3IfMPcZGPejqL596SgMxmpBQ
batch 22 completed


 88%|████████▊ | 22/25 [24:51<02:09, 43.04s/it]

Batch(id='batch_yJxeI6TFQ0PlOMmMWKy3M8lL', completion_window='24h', created_at=1722203968, endpoint='/v1/chat/completions', input_file_id='file-ok1vjddj5yZKMZUrI2UU8nIy', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722290368, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 23'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_yJxeI6TFQ0PlOMmMWKy3M8lL
waiting for batch to complete BatchRequestCounts(completed=10, failed=0, total=90) batch_yJxeI6TFQ0PlOMmMWKy3M8lL
waiting for batch to complete BatchRequestCounts(completed=89, failed=0, total=90) batch_yJxeI6TFQ0PlOMmMWKy3M8lL
batch 23 completed


 92%|█████████▏| 23/25 [25:24<01:20, 40.18s/it]

Batch(id='batch_qg2uAr2Z2LtGedLaP0A1f2sq', completion_window='24h', created_at=1722204001, endpoint='/v1/chat/completions', input_file_id='file-Wi1l0bgvMLdnapj4QGZKntoh', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722290401, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 24'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_qg2uAr2Z2LtGedLaP0A1f2sq
waiting for batch to complete BatchRequestCounts(completed=7, failed=0, total=90) batch_qg2uAr2Z2LtGedLaP0A1f2sq
waiting for batch to complete BatchRequestCounts(completed=88, failed=0, total=90) batch_qg2uAr2Z2LtGedLaP0A1f2sq
batch 24 completed


 96%|█████████▌| 24/25 [25:57<00:38, 38.14s/it]

Batch(id='batch_W5Dvza64qLxLkUGxNMNoALQj', completion_window='24h', created_at=1722204034, endpoint='/v1/chat/completions', input_file_id='file-nOe62hpBh7Juy5khBhONH6pb', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722290434, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'dev-set job, batch 25'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
waiting for batch to complete BatchRequestCounts(completed=0, failed=0, total=0) batch_W5Dvza64qLxLkUGxNMNoALQj
waiting for batch to complete BatchRequestCounts(completed=9, failed=0, total=55) batch_W5Dvza64qLxLkUGxNMNoALQj
waiting for batch to complete BatchRequestCounts(completed=55, failed=0, total=55) batch_W5Dvza64qLxLkUGxNMNoALQj
batch 25 completed


100%|██████████| 25/25 [26:31<00:00, 63.66s/it]


In [35]:
# Merge the batch results back into the pipeline results of the earlier run.
new_dump = []
# Temperature for the fallback GPT generator's client.
# NOTE(review): presumably the fallback is used when a batch answer cannot be
# parsed (see the "Error parsing JSON" output) -- confirm in evidence_generation.py.
pipeline.evidence_generator.fallback_gpt_generator.client.temperature = .5

# zip() stops at the shorter input, so pipeline results without a batch result
# are left out of new_dump. Report that instead of truncating silently, because
# the following cell overwrites the submission files with new_dump.
if len(batch_results) != len(dump):
    print(
        f"WARNING: {len(dump)} pipeline results but {len(batch_results)} batch results; "
        f"only the first {min(len(dump), len(batch_results))} will be updated"
    )

for pipeline_result, batch_result in zip(dump, batch_results):
    new_result = pipeline.evidence_generator.update_pipeline_result(pipeline_result, batch_result, pipeline.classifier)
    new_dump.append(new_result)


Error parsing JSON for EvidenceGenerator.
 
{
    "questions":
        [
            {"question": "Did the Supreme Court deny a lawsuit challenging Biden's victory in Pennsylvania?", "answer": "Yes, the Supreme Court denied the lawsuit.", "source": "1", "answer_type":"Boolean"},
            {"question": "Who filed the lawsuit challenging Biden's victory in Pennsylvania?", "answer": "The lawsuit was filed by Rep. Mike Kelly and other Republican challengers.", "source": "1", "answer_type":"Extractive"},
            {"question": "What was the basis of the lawsuit challenging Biden's victory in Pennsylvania?", "answer": "The lawsuit challenged a Pennsylvania law that had expanded mail-in voting in the state.", "source": "1", "answer_type":"Extractive"},
            {"question": "What was the Supreme Court's response to the lawsuit?", "answer": "The Supreme Court denied the application for injunctive relief.", "source": "1", "answer_type":"Extractive"},
            {"question": "Did the Sup

In [23]:
# Spot-check one updated result (index 1): the full PipelineResult together
# with its evidence-generation and classification parts.
# NOTE(review): the saved output shows a labelled claim and an earlier execution
# count, so it appears to come from a previous run on a labelled split -- re-run to refresh.
(
    new_dump[1],
    new_dump[1].evidence_generation_result,
    new_dump[1].classification_result
)

(PipelineResult(datapoint=Datapoint(claim='Trump Administration claimed songwriter Billie Eilish Is Destroying Our Country In Leaked Documents', claim_id=1, claim_date='31-10-2020', speaker=None, original_claim_url=None, reporting_source='Instagram', location_ISO_code='US', label='Refuted', metadata={'required_reannotation': False, 'justification': 'Seems that the Wzshington post accused the singer wrong lyrics. Therefore the claim is refuted', 'fact_checking_article': 'https://web.archive.org/web/20201103001419/https://leadstories.com/hoax-alert/2020/11/fact-check-trump-administration-did-not-claim-songwriter-billie-eilish-was-destroying-country-in-leaked-documents.html', 'claim_types': ['Position Statement', 'Event/Property Claim'], 'fact_checking_strategies': ['Written Evidence'], 'questions': [{'question': 'Has the Trump administration voiced that Billie Eilish was destroying the country', 'answers': [{'answer': 'A Washington Post story wrongly claimed the Trump administration accu

In [36]:
# Persist the run in two forms: a JSON submission (one `to_submission()` entry per result)
# and a pickle of the complete result objects.
# The submission rows are built BEFORE the JSON file is opened: opening with "w" truncates
# the file, so a failure inside `to_submission()` would otherwise destroy an existing submission.
submission_rows = [d.to_submission() for d in new_dump]
with open(f"/mnt/data/factcheck/averitec-data/data_store/submissions/{split}_{PIPELINE_NAME}.json", "w") as f:
    json.dump(submission_rows, f, indent=4)
with open(f"/mnt/data/factcheck/averitec-data/data_store/submissions/{split}_{PIPELINE_NAME}.pkl", "wb") as f:
    pickle.dump(new_dump, f)

In [21]:
print(f"{split}_{PIPELINE_NAME}")
%run src/prediction/evaluate_veracity.py --label_file /mnt/data/factcheck/averitec-data/data/dev.json --prediction_file /mnt/data/factcheck/averitec-data/data_store/submissions/{split}_{PIPELINE_NAME}.json

dev_mmr+gpt4o-dfewshot-tiebrk-atype
Question-only score (HU-meteor):             0.45651171638681776
Question-answer score (HU-meteor):           0.2875847086574023
Veracity F1 scores:
 * Supported:                                0.7368421052631579
 * Refuted:                                  0.813953488372093
 * Not Enough Evidence:                      0.2222222222222222
 * Conflicting Evidence/Cherrypicking:       0.0784313725490196
 * macro:                                    0.46286229710162313
 * acc:                                      0.708
--------------------
AVeriTeC scores:
 * Veracity scores (meteor @ 0.1):           0.688
 * Veracity scores (meteor @ 0.2):           0.54
 * Veracity scores (meteor @ 0.25):          0.412
 * Veracity scores (meteor @ 0.3):           0.29
 * Veracity scores (meteor @ 0.4):           0.132
 * Veracity scores (meteor @ 0.5):           0.052
--------------------
AVeriTeC scores by type @ 0.25:
 * Veracity scores (Event/Property Claim):   0.20

In [45]:
print(f"{split}_{PIPELINE_NAME}")
%run src/prediction/evaluate_veracity.py --label_file /mnt/data/factcheck/averitec-data/data/dev.json --prediction_file /mnt/data/factcheck/averitec-data/data_store/submissions/{split}_{PIPELINE_NAME}.json

dev_mmr+gpt4o-dfewshot-gpttie-10ev
Question-only score (HU-meteor):             0.44806149103778475
Question-answer score (HU-meteor):           0.28123074910922286
Veracity F1 scores:
 * Supported:                                0.7293233082706767
 * Refuted:                                  0.8165289256198347
 * Not Enough Evidence:                      0.19672131147540983
 * Conflicting Evidence/Cherrypicking:       0.14705882352941177
 * macro:                                    0.47240809222383323
 * acc:                                      0.71
--------------------
AVeriTeC scores:
 * Veracity scores (meteor @ 0.1):           0.69
 * Veracity scores (meteor @ 0.2):           0.544
 * Veracity scores (meteor @ 0.25):          0.402
 * Veracity scores (meteor @ 0.3):           0.27
 * Veracity scores (meteor @ 0.4):           0.124
 * Veracity scores (meteor @ 0.5):           0.038
--------------------
AVeriTeC scores by type @ 0.25:
 * Veracity scores (Event/Property Claim):   0.

In [22]:
%run src/prediction/evaluate_veracity.py --label_file /mnt/data/factcheck/averitec-data/data/dev.json --prediction_file /mnt/data/factcheck/averitec-data/data_store/submissions/{split}_{PIPELINE_NAME}.json

Question-only score (HU-meteor):             0.44559202247101487
Question-answer score (HU-meteor):           0.27486491800924373
Veracity F1 scores:
 * Supported:                                0.6566037735849056
 * Refuted:                                  0.8098360655737705
 * Not Enough Evidence:                      0.1891891891891892
 * Conflicting Evidence/Cherrypicking:       0.0784313725490196
 * macro:                                    0.43351510022422124
 * acc:                                      0.686
--------------------
AVeriTeC scores:
 * Veracity scores (meteor @ 0.1):           0.672
 * Veracity scores (meteor @ 0.2):           0.532
 * Veracity scores (meteor @ 0.25):          0.394
 * Veracity scores (meteor @ 0.3):           0.256
 * Veracity scores (meteor @ 0.4):           0.106
 * Veracity scores (meteor @ 0.5):           0.03
--------------------
AVeriTeC scores by type @ 0.25:
 * Veracity scores (Event/Property Claim):   0.18721511530819013
 * Veracity score

In [21]:
# Score the dev submission of PIPELINE_NAME: both the label file and the prediction file
# are pinned to the dev split here, independent of the `split` variable.
%run src/prediction/evaluate_veracity.py --label_file /mnt/data/factcheck/averitec-data/data/dev.json --prediction_file /mnt/data/factcheck/averitec-data/data_store/submissions/dev_{PIPELINE_NAME}.json

Question-only score (HU-meteor):             0.4505443758501816
Question-answer score (HU-meteor):           0.2848630013774468
Veracity F1 scores:
 * Supported:                                0.7003610108303249
 * Refuted:                                  0.831973898858075
 * Not Enough Evidence:                      0.29508196721311475
 * Conflicting Evidence/Cherrypicking:       0.08163265306122448
 * macro:                                    0.47726238249068476
 * acc:                                      0.726
--------------------
AVeriTeC scores:
 * Veracity scores (meteor @ 0.1):           0.71
 * Veracity scores (meteor @ 0.2):           0.574
 * Veracity scores (meteor @ 0.25):          0.42
 * Veracity scores (meteor @ 0.3):           0.304
 * Veracity scores (meteor @ 0.4):           0.122
 * Veracity scores (meteor @ 0.5):           0.044
--------------------
AVeriTeC scores by type @ 0.25:
 * Veracity scores (Event/Property Claim):   0.19985145249552058
 * Veracity scores 

## Collapsible section begin

In [None]:
from IPython.display import display, Markdown, Latex

In [None]:
# Render what `retriever` returns for the current claim: for each retrieved item, its source URL
# and the first 256 characters of its text.
# NOTE(review): the caption says "knn", but the result comes from whatever `retriever` is currently
# bound to — confirm it is the plain kNN retriever before relying on the label.
knn_retrieval_result = retriever(datapoint)
display(Markdown("### 🗯️ " + datapoint.claim))
display(Markdown("*Retrieved by knn*\n\n"))
# Every retrieved item is shown; no sampling is applied.
for r in knn_retrieval_result:
    display(Markdown(f"**{r.metadata['url']}**\n\n{r.page_content[:256]}"))

In [None]:
# Build an MMR retriever from the `path` attribute of the existing retriever and render its
# results for the same claim: source URL plus the first 256 characters of each item's text.
# MmrFaissRetriever is already imported in the notebook's first cell, so it is not re-imported here.
mmr_retriever = MmrFaissRetriever(retriever.path)
mmr_retrieval_result = mmr_retriever(datapoint)
display(Markdown("### 🗯️ " + datapoint.claim))
display(Markdown("*Retrieved by MMR*\n\n"))
# Every retrieved item is shown; no sampling is applied.
for r in mmr_retrieval_result:
    display(Markdown(f"**{r.metadata['url']}**\n\n{r.page_content[:256]}"))

In [None]:
# Wrap the existing retriever in a SubqueryRetriever and render its results for the claim.
# Each item shows its source URL, the ';'-joined entries of its metadata['queries'],
# and the first 256 characters of its text.
subquery_retriever = SubqueryRetriever(retriever)
subquery_retrieval_result = subquery_retriever(datapoint)
display(Markdown("### 🗯️ " + datapoint.claim))
display(Markdown("*Retrieved by subqueries*\n\n"))
# Every retrieved item is shown; no sampling is applied.
for r in subquery_retrieval_result:
    display(Markdown(f"**{r.metadata['url']}**\n\n*{';'.join(r.metadata['queries'])}*\n\n{r.page_content[:256]}"))

In [None]:
# Show the metadata attached to the subquery retrieval result as a whole.
subquery_retrieval_result.metadata

## Collapsible section end

In [None]:
# Generate evidence for the single datapoint from its retrieval result, using the
# batched GPT generator constructed with "gpt-4o", and display the result.
evidence_generator = GptBatchedEvidenceGenerator("gpt-4o")
evidence_generation_result = evidence_generator(datapoint, retrieval_result)
evidence_generation_result

In [None]:
# Value stored under the "suggested_label" key of the evidence generation metadata.
evidence_generation_result.metadata["suggested_label"]

In [None]:
# Label carried by the datapoint itself (None when the loaded split has no labels,
# as in the test datapoint displayed earlier in the notebook).
datapoint.label

In [None]:
# Classify the datapoint with DefaultClassifier, given the generated evidence and the retrieval result.
classifier = DefaultClassifier()
classification_result = classifier(datapoint, evidence_generation_result, retrieval_result)
# Display both the string form and the raw result object.
str(classification_result), classification_result

In [None]:
# Run retriever, evidence generator and classifier together through Pipeline
# on a second example (dataset index 16) and display the combined result.
datapoint2 = Datapoint.from_dict(dataset[16])
pipeline = Pipeline(retriever, evidence_generator, classifier)
pipeline_result = pipeline(datapoint2)
pipeline_result

In [None]:
# Pair the pipeline's classification (string form) with the label stored on datapoint2.
str(pipeline_result.classification_result), datapoint2.label

In [None]:
# Show the submission-format representation of the single pipeline result.
pipeline_result.to_submission()

In [None]:
# Serialize the single pipeline result to data/pipeline_result.pkl (path relative to the working directory).
import pickle
with open('data/pipeline_result.pkl', mode='wb') as f:
    pickle.dump(pipeline_result, file=f)

In [None]:
# Score a previously saved submission file (submission_dev_avg_clf.json) against the dev label file.
%run src/prediction/evaluate_veracity.py --label_file /mnt/data/factcheck/averitec-data/data/dev.json --prediction_file /mnt/data/factcheck/averitec-data/data_store/submission_dev_avg_clf.json