# Pipeline 

In [26]:
%load_ext autoreload
%autoreload 2

from averitec import Datapoint
from evidence_generation import GptEvidenceGenerator
from classification import DefaultClassifier, HuggingfaceClassifier, AverageEnsembleClassifier, LogRegEnsembleClassifier
from retrieval import SimpleFaissRetriever
from pipeline import Pipeline, MockPipeline
import pickle
from labels import label2id, id2label
import numpy as np
from sklearn.metrics import classification_report
import random
from tqdm import tqdm
random.seed(111)

import json

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Pipeline test

In [2]:
with open('/mnt/data/factcheck/averitec-data/data/dev.json') as f:
    dataset = json.load(f)
    claim_id = 0
    for el in dataset:
        el['claim_id'] = claim_id
        claim_id += 1

In [4]:
datapoint = Datapoint.from_dict(dataset[150])
datapoint

Datapoint(claim='Zimbabwe gets more than half of the African Export Import Bank loans.', claim_id=150, claim_date='7-10-2020', speaker='Tendai Biti', original_claim_url='https://web.archive.org/web/20210129100929im_/https://lh4.googleusercontent.com/WwmjzhYNC1g84X0JmQzruGRgW4tjrB8rAPT9Zx6x-W6xBhQpVIwGW1zw7wImZ6R6JAyQywo-CiCE3_AZtJcxB74aev7jVG1PFobnm3PKanSX0bwWP3Z8E6z-6tDwuPgrSa6mhuz7', reporting_source='Twitter', location_ISO_code='ZW', label='Refuted', split='dev', metadata={'required_reannotation': False, 'justification': 'Seems the numbers is no where near half. Therefore the claim is refuted', 'fact_checking_article': 'https://web.archive.org/web/20210129100929/https://zimfact.org/does-zimbabwe-get-more-than-half-afreximbank-loans/', 'claim_types': ['Numerical Claim'], 'fact_checking_strategies': ['Numerical Comparison'], 'questions': [{'question': 'How much has Zimbabwe gotten from the Africa Export Import Bank', 'answers': [{'answer': 'Zimbabwe got US$2,78 billion.', 'answer_type

In [11]:
retriever = SimpleFaissRetriever(k=5, path="/mnt/data/factcheck/averitec-data/data_store/vecstore/dev/6k")
retrieval_result = retriever(datapoint)
retrieval_result

  from tqdm.autonotebook import tqdm, trange


RetrievalResult(documents=[Document(metadata={'url': 'https://www.voanews.com/a/afreximbank-pledges-to-post-mugabe-zimbabwe/4161541.html', 'context_before': '', 'context_after': 'The southern African nation, which dumped its hyperinflation-hit currency in 2009, is struggling with a severe dollar crunch that has seen banks fail to avail cash to customers while importers struggle to pay for imports. Finance Minister Patrick Chinamasa promised in a budget speech last week to re-engage with international lenders, curb spending and attract investors to revive the economy. '}, page_content='The African Export and Import Bank has pledged up to $1.5 billion in new loans and financial guarantees to Zimbabwe in a major boost for new President Emmerson Mnangagwa\'s government, the bank\'s president and chairman said Tuesday. Mnangagwa, who took over last month after veteran autocrat Robert Mugabe quit following a de facto military coup, has vowed to focus on reviving the struggling economy and pr

In [12]:
evidence_generator = GptEvidenceGenerator("gpt-4o")
evidence_generation_result = evidence_generator(datapoint, retrieval_result)
evidence_generation_result

EvidenceGenerationResult(evidences=[Evidence(question='What is the total amount of loans provided by the African Export Import Bank (Afreximbank) to African countries?', answer='Since 1994, Afreximbank has approved more than $51 billion in credit facilities for African businesses.', url='https://www.afreximbank.com/afreximbank-tackles-zimbabwes-financial-sector-liquidity-challenges-new-facility-trade-finance-banks/', scraped_text='Cairo, 11 Feb. 2014– To boost access to trade finance, the African Export-Import Bank (Afreximbank) has entered into an agreement to introduce a facility and associated instruments to alleviate the liquidity challenges confronting the financial sector in Zimbabwe. The Afreximbank Trade Debt-backed Securities (AFTRADES) will be provided to participating banks as debt securities that could be used as collateral for interbank funds placements in order to promote interbank dealings among Zimbabwean Banks active in trade finance, according to the terms of a memora

In [6]:
evidence_generation_result.metadata["suggested_label"]

array([0.92045574, 0.01685873, 0.04582679, 0.01685873])

In [7]:
datapoint.label

'Refuted'

In [8]:
classifier = DefaultClassifier()
classification_result = classifier(datapoint, evidence_generation_result, retrieval_result)
str(classification_result), classification_result

('Refuted',
 ClassificationResult(probs=array([0.92045574, 0.01685873, 0.04582679, 0.01685873]), metadata=None))

In [9]:
datapoint2 = Datapoint.from_dict(dataset[16])
pipeline = Pipeline(retriever, evidence_generator, classifier)
pipeline_result = pipeline(datapoint2)
pipeline_result

PipelineResult(datapoint=Datapoint(claim='The plentiful evidence we have indicates that masks would not meaningfully help with aerosol transmission of COVID 19.', claim_id=16, claim_date='29-10-2020', speaker='Yinon Weiss', original_claim_url='https://thefederalist.com/2020/10/29/these-12-graphs-show-mask-mandates-do-nothing-to-stop-covid/', reporting_source='The Federalist', location_ISO_code='US', label='Refuted', split='dev', metadata={'required_reannotation': True, 'justification': 'Evidence finds that surgical face masks could prevent transmission of human coronaviruses and influenza viruses from symptomatic individuals. The CDC and the World Health Organization (WHO), at the time of the claim, also recommended cloth masks for the general public.', 'fact_checking_article': 'https://web.archive.org/web/20210711073635/https://healthfeedback.org/claimreview/wearing-face-masks-helps-reduce-the-spread-of-covid-19-but-it-only-confers-partial-protection-and-should-be-combined-with-additi

In [10]:
str(pipeline_result.classification_result), datapoint2.label

('Refuted', 'Refuted')

In [11]:
pipeline_result.to_submission()

{'claim_id': 16,
 'claim': 'The plentiful evidence we have indicates that masks would not meaningfully help with aerosol transmission of COVID 19.',
 'evidence': [{'question': 'What is the primary mode of COVID-19 transmission?',
   'answer': 'COVID-19 primarily spreads through respiratory droplets and aerosols.',
   'url': 'https://time.com/5861295/masks-covid19-spread-fighting/',
   'scraped_text': '\nAt long last, we have made a truly game-changing scientific breakthrough in preventing the spread of COVID-19. The impact of this breakthrough seems almost too good to be true. We have found a disease control tool that, when used properly, can dramatically reduce the person-to-person transmission of SARS-CoV-2, the virus that causes COVID-19. Studies have shown that this tool could reduce transmission by somewhere between 50% and 85%. The tool is cheap and remarkably low-tech. You can even make one at home. It has no significant side effects. And with each passing day, the scientific re

In [12]:
# pickle dump pipeline result
import pickle
with open('data/pipeline_result.pkl', 'wb') as f:
    pickle.dump(pipeline_result, f)

## Whole dev set example
not really whole, but we will show a sampling :)

In [5]:
import json, random, pickle
from tqdm.notebook import tqdm
sample_size = 2
random.seed(111)

In [86]:
retriever = SimpleFaissRetriever(k=5, path="/mnt/data/factcheck/averitec-data/data_store/vecstore/dev/6k")
evidence_generator = GptEvidenceGenerator("gpt-4o")
classifier = DefaultClassifier()
pipeline = Pipeline(retriever, evidence_generator, classifier)

with open("/mnt/data/factcheck/averitec-data/data/dev.json") as f:
    dataset = json.load(f)
    seq = 0
    for d in dataset:
        d["claim_id"] = seq
        seq += 1


sample = random.sample(dataset, sample_size)

with open(f"/mnt/data/factcheck/averitec-data/data/dev{sample_size}.json", "w") as f:
    json.dump(sample, f)
    
datapoints = [Datapoint.from_dict(d) for d in sample]
submission = []
dump = []

for dp in tqdm(datapoints):
    pipeline_result = pipeline(dp)
    submission.append(pipeline_result.to_submission())
    dump.append(pipeline_result)
    with open(f"/mnt/data/factcheck/averitec-data/data_store/submission_dev{sample_size}.json", "w") as f:
        json.dump(submission, f, indent=4)
    with open(f"/mnt/data/factcheck/averitec-data/data_store/pipeline_result_dev{sample_size}.pkl", "wb") as f:
        pickle.dump(dump, f)

  0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
dump[-1].evidence_generation_result.metadata

{'suggested_label': array([0.1469628 , 0.3994863 , 0.3994863 , 0.05406459]),
 'llm_type': 'gpt-4o',
 'llm_output': {'questions': [{'question': 'Did Donald Trump sign executive orders aimed at lowering prescription drug costs?',
    'answer': 'Yes, Donald Trump signed multiple executive orders aimed at lowering the cost of prescription drugs.',
    'source': '0'},
   {'question': 'When did Donald Trump sign these executive orders?',
    'answer': 'Donald Trump signed these executive orders on a Friday, but the exact date is not specified in the sources.',
    'source': '1'},
   {'question': 'What were the main objectives of the executive orders signed by Trump?',
    'answer': 'The executive orders aimed to allow imports of cheaper medicines, direct drugmaker rebates straight to patients, provide insulin and EpiPens at steep discounts to low-income people, and use lower international prices to pay for some Medicare drugs.',
    'source': '1'},
   {'question': 'Did Trump claim that these

### now lets see if our submission fits the eval script

In [15]:
%run src/prediction/evaluate_veracity.py --label_file data/dev{sample_size}.json --prediction_file data_store/submission_dev{sample_size}.json

Question-only score (HU-meteor):             0.7057333276522711
Question-answer score (HU-meteor):           0.36226711909299614
Veracity F1 scores:
 * Supported:                                0.6666666666666666
 * Refuted:                                  0.0
 * Not Enough Evidence:                      0.0
 * Conflicting Evidence/Cherrypicking:       0.0
 * macro:                                    0.16666666666666666
 * acc:                                      0.5
--------------------
AVeriTeC scores:
 * Veracity scores (meteor @ 0.1):           0.5
 * Veracity scores (meteor @ 0.2):           0.5
 * Veracity scores (meteor @ 0.25):          0.5
 * Veracity scores (meteor @ 0.3):           0.5
 * Veracity scores (meteor @ 0.4):           0.0
 * Veracity scores (meteor @ 0.5):           0.0
--------------------
AVeriTeC scores by type @ 0.25:
 * Veracity scores (Position Statement):     0.36226711909299614
 * Veracity scores (Quote Verification):     0.32760357724321804
 * Veracity

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Load Pickles

In [30]:
DUMP_PATH = "/mnt/data/factcheck/averitec-data/data_store/pipeline_result_dev.pkl"
DUMP_PATH_CLAUDE = "/mnt/data/factcheck/averitec-data/data_store/pipeline_result_dev_claude.pkl"
# dump = pickle.load(open(f"/mnt/data/factcheck/averitec-data/data_store/pipeline_result_dev{sample_size}.pkl", "rb"))
dump = pickle.load(open(DUMP_PATH, "rb")) #whole dev set

retrieval_results = [d.retrieval_result for d in dump]
evidence_generation_results = [d.evidence_generation_result for d in dump]
datapoints = [d.datapoint for d in dump]

dump_claude = pickle.load(open(DUMP_PATH_CLAUDE, "rb")) #whole dev set

retrieval_results_claude = [d.retrieval_result for d in dump_claude]
evidence_generation_results_claude = [d.evidence_generation_result for d in dump_claude]
datapoints_claude = [d.datapoint for d in dump_claude]

In [28]:
#split dump in dev_train and dev_test respecting the labels
#divide dump according to the labels
dump_labels = {0: [], 1: [], 2: [], 3: []}

for d in dump:
    dump_labels[label2id[d.datapoint.label]].append(d)

dump_dev_train = []
dump_dev_test = []

for label, data in dump_labels.items():
    split = int(len(data)*0.7)
    dump_dev_train.extend(data[:split])
    dump_dev_test.extend(data[split:])

random.shuffle(dump_dev_train)

retrieval_results_dev_train = [d.retrieval_result for d in dump_dev_train]
evidence_generation_results_dev_train = [d.evidence_generation_result for d in dump_dev_train]
datapoints_dev_train = [d.datapoint for d in dump_dev_train]

retrieval_results_dev_test = [d.retrieval_result for d in dump_dev_test]
evidence_generation_results_dev_test = [d.evidence_generation_result for d in dump_dev_test]
datapoints_dev_test = [d.datapoint for d in dump_dev_test]

## Classifiers testing

In [29]:
def test_classifier(pipeline_results, datapoints, verbose=False):
    pipeline_labels = [np.argmax(pr.classification_result.probs) for pr in pipeline_results]
    
    true_labels = [label2id[d.label] for d in datapoints]
    if verbose:
        print("y_pred", np.unique(pipeline_labels, return_counts=True))
        print("y_true", np.unique(true_labels, return_counts=True))

    print(classification_report(true_labels, pipeline_labels, target_names=id2label.values()))

### GPT4o Likert scale based classifier

In [31]:
#GPTClassifier example
gpt_likert_classifier = DefaultClassifier(evidence_generation_results=evidence_generation_results)

mock_pipeline = MockPipeline(
    classifier=gpt_likert_classifier,
    dumps=DUMP_PATH,
)

pipeline_result = mock_pipeline(dump_dev_test[0].datapoint)
pipeline_result.classification_result, str(pipeline_result.classification_result), str(pipeline_result.evidence_generation_result.evidences[0])

(ClassificationResult(probs=array([0.94791499, 0.01736167, 0.01736167, 0.01736167]), metadata=None),
 'Refuted',
 'Evidence(question=\'Was Reed Hastings, the CEO of Netflix, arrested by the FBI?\', answer=\'No, Reed Hastings was not arrested by the FBI.\', url=\'https://www.boomlive.in/world/posts-claiming-netflix-ceo-was-arrested-for-child-pornography-are-false-9919\', scraped_text=\'\\nPosts Claiming Netflix CEO Was Arrested For Child Pornography Are False Law enforcement officials said that there was no arrest and the rumour was published by a fringe news site, where it has since been removed. "The CEO of a Netflix – Reed Hastings – has been caught by FBI agents with 13,000 files of child pornography. He was arrested at his home in California," reads a Facebook post, archived here, shared more than 400 times since it was published on September 12, 2020. Screenshot taken on September 22, 2020, showing the false claim on Facebook The same claim also circulated on social media in Spani

In [32]:
pipeline_results_gpt_likert = [mock_pipeline(d.datapoint) for d in dump_dev_test]

test_classifier(pipeline_results_gpt_likert, datapoints_dev_test, verbose=True)


y_pred (array([0, 1, 2, 3]), array([94, 48,  9,  1]))
y_true (array([0, 1, 2, 3]), array([92, 37, 11, 12]))
                                    precision    recall  f1-score   support

                           Refuted       0.81      0.83      0.82        92
                         Supported       0.58      0.76      0.66        37
               Not Enough Evidence       0.11      0.09      0.10        11
Conflicting Evidence/Cherrypicking       0.00      0.00      0.00        12

                          accuracy                           0.69       152
                         macro avg       0.38      0.42      0.39       152
                      weighted avg       0.64      0.69      0.66       152



In [33]:
#whole dev
pipeline_results_gpt_likert = [mock_pipeline(d.datapoint) for d in dump]

test_classifier(pipeline_results_gpt_likert, datapoints, verbose=True)

y_pred (array([0, 1, 2, 3]), array([305, 143,  39,  13]))
y_true (array([0, 1, 2, 3]), array([305, 122,  35,  38]))
                                    precision    recall  f1-score   support

                           Refuted       0.81      0.81      0.81       305
                         Supported       0.61      0.71      0.66       122
               Not Enough Evidence       0.18      0.20      0.19        35
Conflicting Evidence/Cherrypicking       0.15      0.05      0.08        38

                          accuracy                           0.69       500
                         macro avg       0.44      0.44      0.43       500
                      weighted avg       0.67      0.69      0.67       500



### Claude Likert scale based classifier

In [34]:
#ClaudeClassifier example
claude_likert_classifier = DefaultClassifier(evidence_generation_results=evidence_generation_results_claude)

mock_pipeline = MockPipeline(
    classifier=claude_likert_classifier,
    dumps=DUMP_PATH,
)

pipeline_result = mock_pipeline(dump_dev_test[0].datapoint)
pipeline_result.classification_result, str(pipeline_result.classification_result), str(
    pipeline_result.evidence_generation_result.evidences[0])

(ClassificationResult(probs=array([0.94791499, 0.01736167, 0.01736167, 0.01736167]), metadata=None),
 'Refuted',
 'Evidence(question=\'Was Reed Hastings, the CEO of Netflix, arrested by the FBI?\', answer=\'No, Reed Hastings was not arrested by the FBI.\', url=\'https://www.boomlive.in/world/posts-claiming-netflix-ceo-was-arrested-for-child-pornography-are-false-9919\', scraped_text=\'\\nPosts Claiming Netflix CEO Was Arrested For Child Pornography Are False Law enforcement officials said that there was no arrest and the rumour was published by a fringe news site, where it has since been removed. "The CEO of a Netflix – Reed Hastings – has been caught by FBI agents with 13,000 files of child pornography. He was arrested at his home in California," reads a Facebook post, archived here, shared more than 400 times since it was published on September 12, 2020. Screenshot taken on September 22, 2020, showing the false claim on Facebook The same claim also circulated on social media in Spani

In [36]:
pipeline_results_claude_likert = [mock_pipeline(d.datapoint) for d in dump_dev_test]

test_classifier(pipeline_results_claude_likert, datapoints_dev_test, verbose=True)

y_pred (array([0, 1, 2, 3]), array([67, 35, 13, 37]))
y_true (array([0, 1, 2, 3]), array([92, 37, 11, 12]))
                                    precision    recall  f1-score   support

                           Refuted       0.87      0.63      0.73        92
                         Supported       0.74      0.70      0.72        37
               Not Enough Evidence       0.23      0.27      0.25        11
Conflicting Evidence/Cherrypicking       0.16      0.50      0.24        12

                          accuracy                           0.61       152
                         macro avg       0.50      0.53      0.49       152
                      weighted avg       0.73      0.61      0.65       152



In [37]:
#whole dev
pipeline_results_claude_likert = [mock_pipeline(d.datapoint) for d in dump]

test_classifier(pipeline_results_claude_likert, datapoints, verbose=True)

y_pred (array([0, 1, 2, 3]), array([247, 110,  34, 109]))
y_true (array([0, 1, 2, 3]), array([305, 122,  35,  38]))
                                    precision    recall  f1-score   support

                           Refuted       0.87      0.70      0.78       305
                         Supported       0.74      0.66      0.70       122
               Not Enough Evidence       0.21      0.20      0.20        35
Conflicting Evidence/Cherrypicking       0.18      0.53      0.27        38

                          accuracy                           0.64       500
                         macro avg       0.50      0.52      0.49       500
                      weighted avg       0.74      0.64      0.68       500



### Huggingface based classifier

In [55]:
#HuggingfaceClassifier example

#hf_classifier = HuggingfaceClassifier(model_path="/mnt/data/factcheck/averitec-data/pretrained_models/nli_4concat/deberta-v3-large/checkpoint-576")
hf_classifier = HuggingfaceClassifier(model_path="/mnt/data/factcheck/averitec-data/pretrained_models/nli_4concat/deberta-v3-large/checkpoint-576", rand_order_evidence=True, num_orders=10)

mock_pipeline = MockPipeline(
    classifier=hf_classifier,
    dumps=DUMP_PATH_CLAUDE,
)

pipeline_result = mock_pipeline(dump_dev_test[0].datapoint)
pipeline_result.classification_result, str(pipeline_result.classification_result), str(
    pipeline_result.evidence_generation_result.evidences[0]
)

(ClassificationResult(probs=array([9.9754179e-01, 3.8506641e-04, 1.2324582e-03, 8.4070157e-04],
       dtype=float32), metadata={'logits': array([[ 5.451454 , -2.3421342, -1.2933868, -1.6085335],
        [ 5.404509 , -2.4541886, -1.2605273, -1.549625 ],
        [ 5.505223 , -2.3594985, -1.2050164, -1.6430886],
        [ 5.5156126, -2.4180639, -1.1979843, -1.6767354],
        [ 5.4588323, -2.4269962, -1.2276417, -1.6143471],
        [ 5.4715605, -2.350417 , -1.3035946, -1.5991894],
        [ 5.421331 , -2.4247289, -1.103447 , -1.6727502],
        [ 5.4672832, -2.384836 , -1.3410959, -1.5654705],
        [ 5.4723916, -2.392237 , -1.2000624, -1.6302536],
        [ 5.4460998, -2.428934 , -1.2157843, -1.6138378]], dtype=float32), 'mean_logits': array([ 5.4614296, -2.3982036, -1.234854 , -1.6173832], dtype=float32)}),
 'Refuted',
 'Evidence(question=\'Was Netflix CEO Reed Hastings arrested?\', answer=\'No, Reed Hastings was not arrested. This claim is false.\', url=\'https://www.boomlive.in/

In [26]:
#whole dev
pipeline_results_hf = [mock_pipeline(d.datapoint) for d in dump]
test_classifier(pipeline_results_hf, datapoints, verbose=True)

y_pred (array([0, 1, 2, 3]), array([265, 115,   5, 115]))
y_true (array([0, 1, 2, 3]), array([305, 122,  35,  38]))
                                    precision    recall  f1-score   support

                           Refuted       0.84      0.73      0.78       305
                         Supported       0.65      0.61      0.63       122
               Not Enough Evidence       0.40      0.06      0.10        35
Conflicting Evidence/Cherrypicking       0.10      0.29      0.14        38

                          accuracy                           0.62       500
                         macro avg       0.50      0.42      0.41       500
                      weighted avg       0.71      0.62      0.65       500



In [52]:
pipeline_results_hf = [mock_pipeline(d.datapoint) for d in dump_dev_test]

test_classifier(pipeline_results_hf, datapoints_dev_test)


                                    precision    recall  f1-score   support

                           Refuted       0.86      0.77      0.81        92
                         Supported       0.60      0.57      0.58        37
               Not Enough Evidence       0.50      0.09      0.15        11
Conflicting Evidence/Cherrypicking       0.16      0.42      0.23        12

                          accuracy                           0.64       152
                         macro avg       0.53      0.46      0.44       152
                      weighted avg       0.71      0.64      0.66       152



In [54]:
#with claude evidence
pipeline_results_hf = [mock_pipeline(d.datapoint) for d in dump_dev_test]

test_classifier(pipeline_results_hf, datapoints_dev_test)


                                    precision    recall  f1-score   support

                           Refuted       0.84      0.77      0.80        92
                         Supported       0.73      0.59      0.66        37
               Not Enough Evidence       0.67      0.36      0.47        11
Conflicting Evidence/Cherrypicking       0.13      0.33      0.19        12

                          accuracy                           0.66       152
                         macro avg       0.59      0.52      0.53       152
                      weighted avg       0.74      0.66      0.69       152



In [30]:
pipeline_results_hf_rand = [mock_pipeline(d.datapoint) for d in dump_dev_test]
test_classifier(pipeline_results_hf_rand, datapoints_dev_test)

                                    precision    recall  f1-score   support

                           Refuted       0.84      0.76      0.80        92
                         Supported       0.61      0.59      0.60        37
               Not Enough Evidence       0.00      0.00      0.00        11
Conflicting Evidence/Cherrypicking       0.12      0.33      0.18        12

                          accuracy                           0.63       152
                         macro avg       0.39      0.42      0.40       152
                      weighted avg       0.67      0.63      0.65       152



In [56]:
#with Claude evidence
pipeline_results_hf_rand = [mock_pipeline(d.datapoint) for d in dump_dev_test]
test_classifier(pipeline_results_hf_rand, datapoints_dev_test)

                                    precision    recall  f1-score   support

                           Refuted       0.85      0.77      0.81        92
                         Supported       0.73      0.59      0.66        37
               Not Enough Evidence       0.67      0.36      0.47        11
Conflicting Evidence/Cherrypicking       0.16      0.42      0.23        12

                          accuracy                           0.67       152
                         macro avg       0.60      0.54      0.54       152
                      weighted avg       0.75      0.67      0.70       152



### Simple average ensemble classifier

In [61]:
# AverageEnsemble classifier example
from pipeline import MockPipeline
from classification import AverageEnsembleClassifier, HuggingfaceClassifier, DefaultClassifier
%reload_ext autoreload

hf_classifier = HuggingfaceClassifier(model_path="/mnt/data/factcheck/averitec-data/pretrained_models/nli_4concat/deberta-v3-large/checkpoint-576")
hf_classifier_order = HuggingfaceClassifier(model_path="/mnt/data/factcheck/averitec-data/pretrained_models/nli_4concat/deberta-v3-large/checkpoint-576", rand_order_evidence=True, num_orders=10)
gpt_likert_classifier = DefaultClassifier(evidence_generation_results=evidence_generation_results)
claude_likert_classifier = DefaultClassifier(evidence_generation_results=evidence_generation_results_claude)

mock_pipeline = MockPipeline(
    classifier=AverageEnsembleClassifier([hf_classifier, claude_likert_classifier]),
    dumps=DUMP_PATH_CLAUDE,
)

pipeline_result = mock_pipeline(dump_dev_test[0].datapoint)
pipeline_result.classification_result, str(pipeline_result.classification_result), str(
    pipeline_result.evidence_generation_result.evidences[0]
)

(ClassificationResult(probs=array([0.97268544, 0.00886898, 0.00935958, 0.00908605]), metadata={'clf_probs': [array([9.9745589e-01, 3.7628258e-04, 1.3574883e-03, 8.1042910e-04],
       dtype=float32), array([0.94791499, 0.01736167, 0.01736167, 0.01736167])], 'weights': array([1., 1.])}),
 'Refuted',
 'Evidence(question=\'Was Netflix CEO Reed Hastings arrested?\', answer=\'No, Reed Hastings was not arrested. This claim is false.\', url=\'https://www.boomlive.in/world/posts-claiming-netflix-ceo-was-arrested-for-child-pornography-are-false-9919\', scraped_text=\'\\nPosts Claiming Netflix CEO Was Arrested For Child Pornography Are False Law enforcement officials said that there was no arrest and the rumour was published by a fringe news site, where it has since been removed. "The CEO of a Netflix – Reed Hastings – has been caught by FBI agents with 13,000 files of child pornography. He was arrested at his home in California," reads a Facebook post, archived here, shared more than 400 times 

In [32]:
pipeline_results_avg_ensemble = [mock_pipeline(d.datapoint) for d in dump_dev_test]
test_classifier(pipeline_results_avg_ensemble, datapoints_dev_test)

                                    precision    recall  f1-score   support

                           Refuted       0.84      0.80      0.82        92
                         Supported       0.61      0.73      0.67        37
               Not Enough Evidence       0.50      0.09      0.15        11
Conflicting Evidence/Cherrypicking       0.11      0.17      0.13        12

                          accuracy                           0.68       152
                         macro avg       0.52      0.45      0.44       152
                      weighted avg       0.70      0.68      0.68       152



In [50]:
#hf with Claude
pipeline_results_avg_ensemble = [mock_pipeline(d.datapoint) for d in dump_dev_test]
test_classifier(pipeline_results_avg_ensemble, datapoints_dev_test)

                                    precision    recall  f1-score   support

                           Refuted       0.86      0.78      0.82        92
                         Supported       0.66      0.68      0.67        37
               Not Enough Evidence       0.00      0.00      0.00        11
Conflicting Evidence/Cherrypicking       0.18      0.42      0.25        12

                          accuracy                           0.67       152
                         macro avg       0.42      0.47      0.43       152
                      weighted avg       0.69      0.67      0.68       152



In [39]:
# gpt, claude, hf
pipeline_results_avg_ensemble = [mock_pipeline(d.datapoint) for d in dump_dev_test]
test_classifier(pipeline_results_avg_ensemble, datapoints_dev_test)

                                    precision    recall  f1-score   support

                           Refuted       0.85      0.79      0.82        92
                         Supported       0.67      0.70      0.68        37
               Not Enough Evidence       0.25      0.09      0.13        11
Conflicting Evidence/Cherrypicking       0.17      0.33      0.23        12

                          accuracy                           0.68       152
                         macro avg       0.48      0.48      0.47       152
                      weighted avg       0.71      0.68      0.69       152



In [41]:
# claude, gpt
pipeline_results_avg_ensemble = [mock_pipeline(d.datapoint) for d in dump_dev_test]
test_classifier(pipeline_results_avg_ensemble, datapoints_dev_test)

                                    precision    recall  f1-score   support

                           Refuted       0.79      0.76      0.77        92
                         Supported       0.57      0.73      0.64        37
               Not Enough Evidence       0.12      0.09      0.11        11
Conflicting Evidence/Cherrypicking       0.12      0.08      0.10        12

                          accuracy                           0.65       152
                         macro avg       0.40      0.42      0.41       152
                      weighted avg       0.63      0.65      0.64       152



In [62]:
# claude, hf - claude ev
pipeline_results_avg_ensemble = [mock_pipeline(d.datapoint) for d in dump_dev_test]
test_classifier(pipeline_results_avg_ensemble, datapoints_dev_test)

                                    precision    recall  f1-score   support

                           Refuted       0.83      0.77      0.80        92
                         Supported       0.74      0.68      0.70        37
               Not Enough Evidence       0.38      0.27      0.32        11
Conflicting Evidence/Cherrypicking       0.17      0.33      0.22        12

                          accuracy                           0.68       152
                         macro avg       0.53      0.51      0.51       152
                      weighted avg       0.72      0.68      0.69       152



### Weighted average ensemble classifier

In [59]:
# AverageEnsemble classifier fiting example
hf_classifier = HuggingfaceClassifier(model_path="/mnt/data/factcheck/averitec-data/pretrained_models/nli_4concat/deberta-v3-large/checkpoint-576")
gpt_likert_classifier = DefaultClassifier(evidence_generation_results=evidence_generation_results)
claude_likert_classifier = DefaultClassifier(evidence_generation_results=evidence_generation_results_claude)

avg_ensemble_classifier = AverageEnsembleClassifier([claude_likert_classifier, hf_classifier])

# fit the weights TODO using only part of the data -> to be able to validate results! 
#prepared for univariate (used for 2 classifiers) and multivariate (used for 3+ classifiers) optimization and 2 metrics f1 and cross-entropy
avg_ensemble_classifier.fit_weights(datapoints_dev_train, evidence_generation_results_dev_train, retrieval_results_dev_train, metric="f1")

mock_pipeline = MockPipeline(
    classifier=avg_ensemble_classifier,
    dumps=DUMP_PATH_CLAUDE,
)

pipeline_result = mock_pipeline(dump_dev_test[0].datapoint)
pipeline_result.classification_result, str(pipeline_result.classification_result), pipeline_result.datapoint.label, str(
    pipeline_result.evidence_generation_result.evidences[0]
)

 message: Solution found.
 success: True
  status: 0
     fun: -0.48539516995108256
       x: 0.7639364807041505
     nit: 24
    nfev: 24


(ClassificationResult(probs=array([0.95960979, 0.01335204, 0.01358367, 0.01345452]), metadata={'clf_probs': [array([0.94791499, 0.01736167, 0.01736167, 0.01736167]), array([9.9745589e-01, 3.7628258e-04, 1.3574883e-03, 8.1042910e-04],
       dtype=float32)], 'weights': array([0.76393648, 0.23606352])}),
 'Refuted',
 'Refuted',
 'Evidence(question=\'Was Netflix CEO Reed Hastings arrested?\', answer=\'No, Reed Hastings was not arrested. This claim is false.\', url=\'https://www.boomlive.in/world/posts-claiming-netflix-ceo-was-arrested-for-child-pornography-are-false-9919\', scraped_text=\'\\nPosts Claiming Netflix CEO Was Arrested For Child Pornography Are False Law enforcement officials said that there was no arrest and the rumour was published by a fringe news site, where it has since been removed. "The CEO of a Netflix – Reed Hastings – has been caught by FBI agents with 13,000 files of child pornography. He was arrested at his home in California," reads a Facebook post, archived here,

In [37]:
#cross entropy
pipeline_results_avg_w_ensemble = [mock_pipeline(d.datapoint) for d in dump_dev_test]
test_classifier(pipeline_results_avg_w_ensemble, datapoints_dev_test)

                                    precision    recall  f1-score   support

                           Refuted       0.80      0.80      0.80        92
                         Supported       0.60      0.76      0.67        37
               Not Enough Evidence       0.11      0.09      0.10        11
Conflicting Evidence/Cherrypicking       0.25      0.08      0.12        12

                          accuracy                           0.68       152
                         macro avg       0.44      0.43      0.42       152
                      weighted avg       0.66      0.68      0.67       152



In [34]:
#f1 deberta with GPT
pipeline_results_avg_w_ensemble = [mock_pipeline(d.datapoint) for d in dump_dev_test]
test_classifier(pipeline_results_avg_w_ensemble, datapoints_dev_test)

                                    precision    recall  f1-score   support

                           Refuted       0.80      0.84      0.82        92
                         Supported       0.59      0.73      0.65        37
               Not Enough Evidence       0.33      0.09      0.14        11
Conflicting Evidence/Cherrypicking       0.14      0.08      0.11        12

                          accuracy                           0.70       152
                         macro avg       0.47      0.44      0.43       152
                      weighted avg       0.66      0.70      0.67       152



In [20]:
#f1 deberta with Claude
pipeline_results_avg_w_ensemble = [mock_pipeline(d.datapoint) for d in dump_dev_test]
test_classifier(pipeline_results_avg_w_ensemble, datapoints_dev_test)

                                    precision    recall  f1-score   support

                           Refuted       0.81      0.75      0.78        92
                         Supported       0.76      0.70      0.73        37
               Not Enough Evidence       0.25      0.18      0.21        11
Conflicting Evidence/Cherrypicking       0.12      0.25      0.16        12

                          accuracy                           0.66       152
                         macro avg       0.49      0.47      0.47       152
                      weighted avg       0.71      0.66      0.68       152



In [43]:
#f1 gpt with Claude
pipeline_results_avg_w_ensemble = [mock_pipeline(d.datapoint) for d in dump_dev_test]
test_classifier(pipeline_results_avg_w_ensemble, datapoints_dev_test)

                                    precision    recall  f1-score   support

                           Refuted       0.87      0.63      0.73        92
                         Supported       0.74      0.70      0.72        37
               Not Enough Evidence       0.23      0.27      0.25        11
Conflicting Evidence/Cherrypicking       0.16      0.50      0.24        12

                          accuracy                           0.61       152
                         macro avg       0.50      0.53      0.49       152
                      weighted avg       0.73      0.61      0.65       152



In [46]:
#ce gpt with Claude with hf
pipeline_results_avg_w_ensemble = [mock_pipeline(d.datapoint) for d in dump_dev_test]
test_classifier(pipeline_results_avg_w_ensemble, datapoints_dev_test)

                                    precision    recall  f1-score   support

                           Refuted       0.79      0.78      0.79        92
                         Supported       0.61      0.73      0.67        37
               Not Enough Evidence       0.20      0.09      0.12        11
Conflicting Evidence/Cherrypicking       0.08      0.08      0.08        12

                          accuracy                           0.66       152
                         macro avg       0.42      0.42      0.42       152
                      weighted avg       0.65      0.66      0.65       152



In [58]:
#ce gpt with Claude with hf - claude ev
pipeline_results_avg_w_ensemble = [mock_pipeline(d.datapoint) for d in dump_dev_test]
test_classifier(pipeline_results_avg_w_ensemble, datapoints_dev_test)

                                    precision    recall  f1-score   support

                           Refuted       0.80      0.77      0.78        92
                         Supported       0.63      0.73      0.68        37
               Not Enough Evidence       0.29      0.18      0.22        11
Conflicting Evidence/Cherrypicking       0.08      0.08      0.08        12

                          accuracy                           0.66       152
                         macro avg       0.45      0.44      0.44       152
                      weighted avg       0.66      0.66      0.66       152



In [60]:
#f1 Claude with hf - claude ev
pipeline_results_avg_w_ensemble = [mock_pipeline(d.datapoint) for d in dump_dev_test]
test_classifier(pipeline_results_avg_w_ensemble, datapoints_dev_test)

                                    precision    recall  f1-score   support

                           Refuted       0.86      0.62      0.72        92
                         Supported       0.79      0.70      0.74        37
               Not Enough Evidence       0.17      0.18      0.17        11
Conflicting Evidence/Cherrypicking       0.20      0.67      0.30        12

                          accuracy                           0.61       152
                         macro avg       0.50      0.54      0.49       152
                      weighted avg       0.74      0.61      0.65       152



### Logistic regression based stacking classifier

In [47]:
#LogRegEnsemble classifier example
hf_classifier = HuggingfaceClassifier(model_path="/mnt/data/factcheck/averitec-data/pretrained_models/nli_4concat/deberta-v3-large/checkpoint-576")
gpt_likert_classifier = DefaultClassifier(evidence_generation_results=evidence_generation_results)
claude_likert_classifier = DefaultClassifier(evidence_generation_results=evidence_generation_results_claude)

logreg_ensemble_classifier = LogRegEnsembleClassifier([claude_likert_classifier, gpt_likert_classifier])

#need all labels to present in the data
logreg_ensemble_classifier.fit(datapoints_dev_train, evidence_generation_results_dev_train, retrieval_results_dev_train)

mock_pipeline = MockPipeline(
    classifier=logreg_ensemble_classifier,
    dumps=DUMP_PATH_CLAUDE,
)

pipeline_result = mock_pipeline(dump_dev_test[0].datapoint)
pipeline_result.classification_result, str(pipeline_result.classification_result), str(
    pipeline_result.evidence_generation_result.evidences[0]
)

(ClassificationResult(probs=array([0.91284996, 0.03829509, 0.02086459, 0.02799035]), metadata={'clf_probs': [array([0.94791499, 0.01736167, 0.01736167, 0.01736167]), array([0.94791499, 0.01736167, 0.01736167, 0.01736167])]}),
 'Refuted',
 'Evidence(question=\'Was Netflix CEO Reed Hastings arrested?\', answer=\'No, Reed Hastings was not arrested. This claim is false.\', url=\'https://www.boomlive.in/world/posts-claiming-netflix-ceo-was-arrested-for-child-pornography-are-false-9919\', scraped_text=\'\\nPosts Claiming Netflix CEO Was Arrested For Child Pornography Are False Law enforcement officials said that there was no arrest and the rumour was published by a fringe news site, where it has since been removed. "The CEO of a Netflix – Reed Hastings – has been caught by FBI agents with 13,000 files of child pornography. He was arrested at his home in California," reads a Facebook post, archived here, shared more than 400 times since it was published on September 12, 2020. Screenshot taken

In [39]:
pipeline_results_logreg_ensemble = [mock_pipeline(d.datapoint) for d in dump_dev_test]
test_classifier(pipeline_results_logreg_ensemble, datapoints_dev_test, verbose=True)

y_pred (array([0, 1]), array([108,  44]))
y_true (array([0, 1, 2, 3]), array([92, 37, 11, 12]))
                                    precision    recall  f1-score   support

                           Refuted       0.79      0.92      0.85        92
                         Supported       0.61      0.73      0.67        37
               Not Enough Evidence       0.00      0.00      0.00        11
Conflicting Evidence/Cherrypicking       0.00      0.00      0.00        12

                          accuracy                           0.74       152
                         macro avg       0.35      0.41      0.38       152
                      weighted avg       0.63      0.74      0.68       152



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [22]:
#hf with Claude
pipeline_results_logreg_ensemble = [mock_pipeline(d.datapoint) for d in dump_dev_test]
test_classifier(pipeline_results_logreg_ensemble, datapoints_dev_test, verbose=True)

y_pred (array([0, 1]), array([120,  32]))
y_true (array([0, 1, 2, 3]), array([92, 37, 11, 12]))
                                    precision    recall  f1-score   support

                           Refuted       0.73      0.96      0.83        92
                         Supported       0.78      0.68      0.72        37
               Not Enough Evidence       0.00      0.00      0.00        11
Conflicting Evidence/Cherrypicking       0.00      0.00      0.00        12

                          accuracy                           0.74       152
                         macro avg       0.38      0.41      0.39       152
                      weighted avg       0.63      0.74      0.68       152



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [48]:
#gpt with Claude
pipeline_results_logreg_ensemble = [mock_pipeline(d.datapoint) for d in dump_dev_test]
test_classifier(pipeline_results_logreg_ensemble, datapoints_dev_test, verbose=True)

y_pred (array([0, 1]), array([108,  44]))
y_true (array([0, 1, 2, 3]), array([92, 37, 11, 12]))
                                    precision    recall  f1-score   support

                           Refuted       0.77      0.90      0.83        92
                         Supported       0.64      0.76      0.69        37
               Not Enough Evidence       0.00      0.00      0.00        11
Conflicting Evidence/Cherrypicking       0.00      0.00      0.00        12

                          accuracy                           0.73       152
                         macro avg       0.35      0.41      0.38       152
                      weighted avg       0.62      0.73      0.67       152



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Full Eval of the Best Classifier

### Just Likert GPT

In [45]:


classifier = DefaultClassifier()

pipeline = MockPipeline(classifier=classifier, dumps=DUMP_PATH)

with open("/mnt/data/factcheck/averitec-data/data/dev.json") as f:
    dataset = json.load(f)
    seq = 0
    for d in dataset:
        d["claim_id"] = seq
        seq += 1
    
datapoints = [Datapoint.from_dict(d) for d in dataset]
submission = []
dump = []

for dp in tqdm(datapoints):
    pipeline_result = pipeline(dp)
    submission.append(pipeline_result.to_submission())
    dump.append(pipeline_result)
    with open(f"/mnt/data/factcheck/averitec-data/data_store/submission_dev_gpt_likert.json", "w") as f:
        json.dump(submission, f, indent=4)
    with open(f"/mnt/data/factcheck/averitec-data/data_store/pipeline_result_dev_gpt_likert.pkl", "wb") as f:
        pickle.dump(dump, f)

[0.37138793 0.62861207]


100%|██████████| 500/500 [00:52<00:00,  9.50it/s]


In [46]:
%run src/prediction/evaluate_veracity.py --label_file /mnt/data/factcheck/averitec-data/data/dev.json --prediction_file /mnt/data/factcheck/averitec-data/data_store/submission_dev_gpt_likert.json

Question-only score (HU-meteor):             0.44559202247101487
Question-answer score (HU-meteor):           0.27486491800924373
Veracity F1 scores:
 * Supported:                                0.6566037735849056
 * Refuted:                                  0.8098360655737705
 * Not Enough Evidence:                      0.1891891891891892
 * Conflicting Evidence/Cherrypicking:       0.0784313725490196
 * macro:                                    0.43351510022422124
 * acc:                                      0.686
--------------------
AVeriTeC scores:
 * Veracity scores (meteor @ 0.1):           0.672
 * Veracity scores (meteor @ 0.2):           0.532
 * Veracity scores (meteor @ 0.25):          0.394
 * Veracity scores (meteor @ 0.3):           0.256
 * Veracity scores (meteor @ 0.4):           0.106
 * Veracity scores (meteor @ 0.5):           0.03
--------------------
AVeriTeC scores by type @ 0.25:
 * Veracity scores (Event/Property Claim):   0.18721511530819013
 * Veracity score

### Just Claude

In [24]:
classifier = DefaultClassifier()
pipeline = MockPipeline(classifier=classifier, dumps=DUMP_PATH_CLAUDE)

with open("/mnt/data/factcheck/averitec-data/data/dev.json") as f:
    dataset = json.load(f)
    seq = 0
    for d in dataset:
        d["claim_id"] = seq
        seq += 1
    
datapoints = [Datapoint.from_dict(d) for d in dataset]
submission = []
dump = []

for dp in tqdm(datapoints):
    pipeline_result = pipeline(dp)
    submission.append(pipeline_result.to_submission())
    dump.append(pipeline_result)
    with open(f"/mnt/data/factcheck/averitec-data/data_store/submission_dev_claude_likert.json", "w") as f:
        json.dump(submission, f, indent=4)
    with open(f"/mnt/data/factcheck/averitec-data/data_store/pipeline_result_dev_claude_likert.pkl", "wb") as f:
        pickle.dump(dump, f)

100%|██████████| 500/500 [00:42<00:00, 11.68it/s]


In [25]:
%run src/prediction/evaluate_veracity.py --label_file /mnt/data/factcheck/averitec-data/data/dev.json --prediction_file /mnt/data/factcheck/averitec-data/data_store/submission_dev_claude_likert.json

Question-only score (HU-meteor):             0.43356239977995653
Question-answer score (HU-meteor):           0.2756433945181975
Veracity F1 scores:
 * Supported:                                0.6982758620689655
 * Refuted:                                  0.7753623188405797
 * Not Enough Evidence:                      0.2028985507246377
 * Conflicting Evidence/Cherrypicking:       0.272108843537415
 * macro:                                    0.4871613937928995
 * acc:                                      0.644
--------------------
AVeriTeC scores:
 * Veracity scores (meteor @ 0.1):           0.63
 * Veracity scores (meteor @ 0.2):           0.496
 * Veracity scores (meteor @ 0.25):          0.35
 * Veracity scores (meteor @ 0.3):           0.234
 * Veracity scores (meteor @ 0.4):           0.094
 * Veracity scores (meteor @ 0.5):           0.038
--------------------
AVeriTeC scores by type @ 0.25:
 * Veracity scores (Event/Property Claim):   0.18399764934948373
 * Veracity scores (P

### Deberta only

In [63]:
from tqdm import tqdm
from classification import HuggingfaceClassifier
classifier = HuggingfaceClassifier(model_path="/mnt/data/factcheck/averitec-data/pretrained_models/nli_4concat/deberta-v3-large/checkpoint-576")

pipeline = MockPipeline(classifier=classifier, dumps=DUMP_PATH_CLAUDE)

with open("/mnt/data/factcheck/averitec-data/data/dev.json") as f:
    dataset = json.load(f)
    seq = 0
    for d in dataset:
        d["claim_id"] = seq
        seq += 1
    
datapoints = [Datapoint.from_dict(d) for d in dataset]
submission = []
dump = []

for dp in tqdm(datapoints):
    pipeline_result = pipeline(dp)
    submission.append(pipeline_result.to_submission())
    dump.append(pipeline_result)
    with open(f"/mnt/data/factcheck/averitec-data/data_store/submission_dev_hf_dc.json", "w") as f:
        json.dump(submission, f, indent=4)
    with open(f"/mnt/data/factcheck/averitec-data/data_store/pipeline_result_dev_hf_dc.pkl", "wb") as f:
        pickle.dump(dump, f)

100%|██████████| 500/500 [01:08<00:00,  7.32it/s]


In [48]:
%run src/prediction/evaluate_veracity.py --label_file /mnt/data/factcheck/averitec-data/data/dev.json --prediction_file /mnt/data/factcheck/averitec-data/data_store/submission_dev_hf.json

Question-only score (HU-meteor):             0.44559202247101487
Question-answer score (HU-meteor):           0.27486491800924373
Veracity F1 scores:
 * Supported:                                0.6329113924050633
 * Refuted:                                  0.7824561403508772
 * Not Enough Evidence:                      0.1
 * Conflicting Evidence/Cherrypicking:       0.1437908496732026
 * macro:                                    0.4147895956072858
 * acc:                                      0.622
--------------------
AVeriTeC scores:
 * Veracity scores (meteor @ 0.1):           0.606
 * Veracity scores (meteor @ 0.2):           0.474
 * Veracity scores (meteor @ 0.25):          0.358
 * Veracity scores (meteor @ 0.3):           0.232
 * Veracity scores (meteor @ 0.4):           0.098
 * Veracity scores (meteor @ 0.5):           0.028
--------------------
AVeriTeC scores by type @ 0.25:
 * Veracity scores (Event/Property Claim):   0.18721511530819013
 * Veracity scores (Position Sta

In [64]:
%run src/prediction/evaluate_veracity.py --label_file /mnt/data/factcheck/averitec-data/data/dev.json --prediction_file /mnt/data/factcheck/averitec-data/data_store/submission_dev_hf_dc.json

Question-only score (HU-meteor):             0.43356239977995653
Question-answer score (HU-meteor):           0.2756433945181975
Veracity F1 scores:
 * Supported:                                0.6666666666666666
 * Refuted:                                  0.7952218430034129
 * Not Enough Evidence:                      0.19607843137254902
 * Conflicting Evidence/Cherrypicking:       0.19047619047619047
 * macro:                                    0.46211078287970475
 * acc:                                      0.648
--------------------
AVeriTeC scores:
 * Veracity scores (meteor @ 0.1):           0.634
 * Veracity scores (meteor @ 0.2):           0.482
 * Veracity scores (meteor @ 0.25):          0.334
 * Veracity scores (meteor @ 0.3):           0.222
 * Veracity scores (meteor @ 0.4):           0.088
 * Veracity scores (meteor @ 0.5):           0.03
--------------------
AVeriTeC scores by type @ 0.25:
 * Veracity scores (Event/Property Claim):   0.18399764934948373
 * Veracity scor

### Average Ensemble

In [65]:
hf_classifier = HuggingfaceClassifier(model_path="/mnt/data/factcheck/averitec-data/pretrained_models/nli_4concat/deberta-v3-large/checkpoint-576")
claude_likert_classifier = DefaultClassifier(evidence_generation_results=evidence_generation_results_claude)

pipeline = MockPipeline(
    classifier=AverageEnsembleClassifier([hf_classifier, claude_likert_classifier]),
    dumps=DUMP_PATH_CLAUDE,
)

with open("/mnt/data/factcheck/averitec-data/data/dev.json") as f:
    dataset = json.load(f)
    seq = 0
    for d in dataset:
        d["claim_id"] = seq
        seq += 1
    
datapoints = [Datapoint.from_dict(d) for d in dataset]
submission = []
dump = []

for dp in tqdm(datapoints):
    pipeline_result = pipeline(dp)
    submission.append(pipeline_result.to_submission())
    dump.append(pipeline_result)
    with open(f"/mnt/data/factcheck/averitec-data/data_store/submission_dev_avg_hf_claude_dc.json", "w") as f:
        json.dump(submission, f, indent=4)
    with open(f"/mnt/data/factcheck/averitec-data/data_store/pipeline_result_dev_avg_hf_claude_dc.pkl", "wb") as f:
        pickle.dump(dump, f)

[0.76393648 0.23606352]


100%|██████████| 500/500 [01:08<00:00,  7.34it/s]


In [66]:
%run src/prediction/evaluate_veracity.py --label_file /mnt/data/factcheck/averitec-data/data/dev.json --prediction_file /mnt/data/factcheck/averitec-data/data_store/submission_dev_avg_hf_claude_dc.json

Question-only score (HU-meteor):             0.43356239977995653
Question-answer score (HU-meteor):           0.2756433945181975
Veracity F1 scores:
 * Supported:                                0.7053571428571429
 * Refuted:                                  0.8108108108108109
 * Not Enough Evidence:                      0.1568627450980392
 * Conflicting Evidence/Cherrypicking:       0.22556390977443608
 * macro:                                    0.4746486521351072
 * acc:                                      0.676
--------------------
AVeriTeC scores:
 * Veracity scores (meteor @ 0.1):           0.662
 * Veracity scores (meteor @ 0.2):           0.512
 * Veracity scores (meteor @ 0.25):          0.356
 * Veracity scores (meteor @ 0.3):           0.238
 * Veracity scores (meteor @ 0.4):           0.094
 * Veracity scores (meteor @ 0.5):           0.034
--------------------
AVeriTeC scores by type @ 0.25:
 * Veracity scores (Event/Property Claim):   0.18399764934948373
 * Veracity score

### Weighted Average Ensemble

In [43]:
from tqdm import tqdm

pipeline = MockPipeline(classifier=avg_ensemble_classifier, dumps=DUMP_PATH)
print(avg_ensemble_classifier.weights)

with open("/mnt/data/factcheck/averitec-data/data/dev.json") as f:
    dataset = json.load(f)
    seq = 0
    for d in dataset:
        d["claim_id"] = seq
        seq += 1
    
datapoints = [Datapoint.from_dict(d) for d in dataset]
submission = []
dump = []

for dp in tqdm(datapoints):
    pipeline_result = pipeline(dp)
    submission.append(pipeline_result.to_submission())
    dump.append(pipeline_result)
    with open(f"/mnt/data/factcheck/averitec-data/data_store/submission_dev_avg_clf.json", "w") as f:
        json.dump(submission, f, indent=4)
    with open(f"/mnt/data/factcheck/averitec-data/data_store/pipeline_result_dev_avg_clf.pkl", "wb") as f:
        pickle.dump(dump, f)

[0.37138793 0.62861207]


100%|██████████| 500/500 [01:15<00:00,  6.61it/s]


In [44]:
%run src/prediction/evaluate_veracity.py --label_file /mnt/data/factcheck/averitec-data/data/dev.json --prediction_file /mnt/data/factcheck/averitec-data/data_store/submission_dev_avg_clf.json

Question-only score (HU-meteor):             0.44559202247101487
Question-answer score (HU-meteor):           0.27486491800924373
Veracity F1 scores:
 * Supported:                                0.6666666666666666
 * Refuted:                                  0.8181818181818182
 * Not Enough Evidence:                      0.1509433962264151
 * Conflicting Evidence/Cherrypicking:       0.136986301369863
 * macro:                                    0.44319454561119076
 * acc:                                      0.694
--------------------
AVeriTeC scores:
 * Veracity scores (meteor @ 0.1):           0.678
 * Veracity scores (meteor @ 0.2):           0.528
 * Veracity scores (meteor @ 0.25):          0.392
 * Veracity scores (meteor @ 0.3):           0.252
 * Veracity scores (meteor @ 0.4):           0.104
 * Veracity scores (meteor @ 0.5):           0.032
--------------------
AVeriTeC scores by type @ 0.25:
 * Veracity scores (Event/Property Claim):   0.18721511530819013
 * Veracity score