In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to the project sources are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

# Init

In [2]:
import sys
# Put "../src" first on the import path so the project-local imports further
# down this cell (constants, evaluate, gen.util) can resolve — presumably they
# live in that directory; verify. The path is relative, so it depends on the
# kernel's working directory.
sys.path.insert(0, "../src")

import json
import random
import pickle as pkl
import numpy as np
import scipy
from pathlib import Path
from collections import Counter, defaultdict

import pandas as pd
from joblib import Parallel, delayed

import constants
from evaluate import score
from gen.util import read_data, write_jsonl

# FEVER

In [3]:
# Reference ("actual") FEVER records for the train and dev splits. These are
# handed to the scorer as its first argument in the IR evaluation cells.
fever_actual = Path("/users/k21190024/study/fact-checking-repos/fever/teamathene/data/fever-data")
fever_train_actual = read_data(fever_actual.joinpath("train.jsonl"))
fever_dev_actual = read_data(fever_actual.joinpath("dev.jsonl"))

## IR Evaluation

In [4]:
# FEVER records to be scored against the reference set.
# NOTE(review): the "p7.s5" part of the file names presumably encodes retrieval
# settings (pages / sentences?) — confirm.
fever_pages = Path("/users/k21190024/study/fact-checking-repos/fever/teamathene/data/fever")
fever_train = read_data(fever_pages.joinpath("train.p7.s5.jsonl"))
fever_dev = read_data(fever_pages.joinpath("dev.p7.s5.jsonl"))

In [5]:
# Score the FEVER train records against the reference train split and print
# the scorer's text summary. oracle_rte=True goes with the recorded accuracy
# of 100.0, so the remaining numbers presumably reflect retrieval only —
# confirm against the scorer implementation.
fever_train_scored = score.FEVERScorer(
    fever_train_actual,
    fever_train,
    oracle_ir=False,
    oracle_rte=True,
    max_evidence=None,
    score_name="FEVER train",
)
print(fever_train_scored)


            FEVER train
            Oracle IR: False
            Oracle RTE: True
            Max Evidences: None

            FEVER Score: 90.3
            Accuracy: 100.0
            Macro Precision: 26.1
            Macro Recall: 87.16
            Macro F1: 40.18
            


In [6]:
# Show the scorer's document metric for FEVER train; the recorded output is a
# dict with 'recall' and 'precision' entries.
fever_train_scored.get_document_metric()

{'recall': 0.8963470421138264, 'precision': 0.32361337977789684}

In [7]:
# Score the FEVER dev records against the reference dev split and print the
# scorer's text summary (same scorer settings as the train run).
fever_dev_scored = score.FEVERScorer(
    fever_dev_actual,
    fever_dev,
    oracle_ir=False,
    oracle_rte=True,
    max_evidence=None,
    score_name="FEVER Dev",
)
print(fever_dev_scored)


            FEVER Dev
            Oracle IR: False
            Oracle RTE: True
            Max Evidences: None

            FEVER Score: 90.68
            Accuracy: 100.0
            Macro Precision: 23.81
            Macro Recall: 86.03
            Macro F1: 37.3
            


In [8]:
# Show the scorer's document metric for FEVER dev; the recorded output is a
# dict with 'recall' and 'precision' entries.
fever_dev_scored.get_document_metric()

{'recall': 0.9044265668449961, 'precision': 0.3068520285351291}

# Climate-FEVER

In [9]:
# Reference ("actual") Climate-FEVER records for the train and dev splits.
# These are handed to the scorer as its first argument in the IR evaluation
# cells.
cfever_actual = Path("/users/k21190024/study/fact-checking-repos/fever/teamathene/data/cfever-data")
cfever_train_actual = read_data(cfever_actual.joinpath("train.jsonl"))
cfever_dev_actual = read_data(cfever_actual.joinpath("dev.jsonl"))

## IR Evaluation

In [10]:
# Climate-FEVER records to be scored against the reference set.
# NOTE(review): the "p7.s5" part of the file names presumably encodes retrieval
# settings (pages / sentences?) — confirm.
cfever_pages = Path("/users/k21190024/study/fact-checking-repos/fever/teamathene/data/climatefever")
cfever_train = read_data(cfever_pages.joinpath("train.p7.s5.jsonl"))
cfever_dev = read_data(cfever_pages.joinpath("dev.p7.s5.jsonl"))

In [11]:
# Score the Climate-FEVER train records against the reference train split
# using the Climate-FEVER-specific scorer, then print its text summary.
cfever_train_scored = score.ClimateFEVERScorer(
    cfever_train_actual,
    cfever_train,
    oracle_ir=False,
    oracle_rte=True,
    max_evidence=None,
    score_name="Climate-FEVER Train",
)
print(cfever_train_scored)


            Climate-FEVER Train
            Oracle IR: False
            Oracle RTE: True
            Max Evidences: None

            FEVER Score: 24.22
            Accuracy: 100.0
            Macro Precision: 11.24
            Macro Recall: 24.22
            Macro F1: 15.36
            


In [12]:
# Show the scorer's document metric for Climate-FEVER train; the recorded
# output is a dict with 'recall' and 'precision' entries.
cfever_train_scored.get_document_metric()

{'recall': 0.18693926846100758, 'precision': 0.08108311141557731}

In [13]:
# Score the Climate-FEVER dev records against the reference dev split using
# the Climate-FEVER-specific scorer, then print its text summary.
cfever_dev_scored = score.ClimateFEVERScorer(
    cfever_dev_actual,
    cfever_dev,
    oracle_ir=False,
    oracle_rte=True,
    max_evidence=None,
    score_name="Climate-FEVER Dev",
)
print(cfever_dev_scored)


            Climate-FEVER Dev
            Oracle IR: False
            Oracle RTE: True
            Max Evidences: None

            FEVER Score: 22.66
            Accuracy: 100.0
            Macro Precision: 10.31
            Macro Recall: 22.66
            Macro F1: 14.17
            


In [14]:
# Show the scorer's document metric for Climate-FEVER dev; the recorded output
# is a dict with 'recall' and 'precision' entries.
cfever_dev_scored.get_document_metric()

{'recall': 0.2223021582733813, 'precision': 0.09046754544045565}

# SciFact

In [16]:
# Reference ("actual") SciFact records, read from a single combined file
# rather than separate train/dev splits.
sf_actual_p = Path("/users/k21190024/study/fact-checking-repos/fever/teamathene/data/scifact-data")
sf_actual = read_data(sf_actual_p.joinpath("scifact_all_titleid.jsonl"))

## IR Evaluation

In [27]:
# SciFact records to be scored against the reference set.
# NOTE(review): this file is named "test.*" while the reference file is
# "scifact_all_*" — confirm both cover the same set of claims.
sf_pages = Path("/users/k21190024/study/fact-checking-repos/fever/teamathene/data/scifact")
sf_all = read_data(sf_pages.joinpath("test.p7.s5.jsonl"))

In [29]:
# Score the SciFact records against the reference set with the plain FEVER
# scorer and print its text summary.
# NOTE(review): the recorded output reports Macro Recall 0.0 and Macro F1 0.0
# next to a non-zero Macro Precision. That combination looks inconsistent —
# verify that evidence/page identifiers use the same format in both inputs
# before trusting these numbers.
sf_scored = score.FEVERScorer(
    sf_actual,
    sf_all,
    oracle_ir=False,
    oracle_rte=True,
    max_evidence=None,
    score_name="SciFact All",
)
print(sf_scored)


            SciFact All
            Oracle IR: False
            Oracle RTE: True
            Max Evidences: None

            FEVER Score: 37.51
            Accuracy: 100.0
            Macro Precision: 5.19
            Macro Recall: 0.0
            Macro F1: 0.0
            


In [30]:
# Show the scorer's document metric for SciFact.
# NOTE(review): the recorded output is recall 0.0 / precision 0.0, i.e. no
# document matched at all — likely an identifier mismatch between the two
# inputs rather than a genuine retrieval result; confirm.
sf_scored.get_document_metric()

{'recall': 0.0, 'precision': 0.0}