In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Init

In [2]:
import sys
# Put ../src (relative to the notebook's working directory) first on the import
# path; presumably that is where `constants`, `evaluate` and `gen` live —
# confirm against the repository layout.
sys.path.insert(0, "../src")

import json
import random
import pickle as pkl
import numpy as np
import scipy
from pathlib import Path
from collections import Counter, defaultdict

import pandas as pd
from joblib import Parallel, delayed

import constants
from evaluate import score
from gen.util import read_data, write_jsonl

# FEVER

In [3]:
# Gold FEVER annotations, train and dev splits.
# NOTE(review): absolute, user-specific path — the notebook only runs on a
# machine that has this exact directory layout.
fever_actual = Path("/users/k21190024/study/fact-checking-repos/fever/teamathene/data/fever-data")
fever_train_actual = read_data(fever_actual.joinpath("train.jsonl"))
fever_dev_actual = read_data(fever_actual.joinpath("dev.jsonl"))

## IR Evaluation

In [4]:
# Retrieval output for FEVER (files named *.p7.s5.jsonl), one file per split.
# NOTE(review): absolute, user-specific path.
fever_pages = Path("/users/k21190024/study/fact-checking-repos/fever/teamathene/data/fever")
fever_train = read_data(fever_pages.joinpath("train.p7.s5.jsonl"))
fever_dev = read_data(fever_pages.joinpath("dev.p7.s5.jsonl"))

In [5]:
# Score the FEVER train retrieval output against the gold annotations.
# oracle_rte=True / oracle_ir=False: presumably gold labels are used so the
# result reflects retrieval quality only — confirm in evaluate.score.
fever_train_scored = score.FEVERScorer(
    fever_train_actual,
    fever_train,
    oracle_ir=False,
    oracle_rte=True,
    max_evidence=None,
    score_name="FEVER train",
)
print(fever_train_scored)


            FEVER train
            Oracle IR: False
            Oracle RTE: True
            Max Evidences: None

            FEVER Score: 90.3
            Accuracy: 100.0
            Macro Precision: 26.1
            Macro Recall: 87.16
            Macro F1: 40.18
            


In [6]:
# Recall / precision / F1 from get_document_metric() for the FEVER train split.
fever_train_scored.get_document_metric()

{'recall': 0.8963470421138264,
 'precision': 0.32361337977789684,
 'f1': 0.475539846288753}

In [7]:
# Score the FEVER dev retrieval output against the gold annotations, using the
# same scorer settings as for the train split.
fever_dev_scored = score.FEVERScorer(
    fever_dev_actual,
    fever_dev,
    oracle_ir=False,
    oracle_rte=True,
    max_evidence=None,
    score_name="FEVER Dev",
)
print(fever_dev_scored)


            FEVER Dev
            Oracle IR: False
            Oracle RTE: True
            Max Evidences: None

            FEVER Score: 90.68
            Accuracy: 100.0
            Macro Precision: 23.81
            Macro Recall: 86.03
            Macro F1: 37.3
            


In [8]:
# Recall / precision / F1 from get_document_metric() for the FEVER dev split.
fever_dev_scored.get_document_metric()

{'recall': 0.9044265668449961,
 'precision': 0.3068520285351291,
 'f1': 0.45823500515231386}

# Climate-FEVER

In [9]:
# Gold Climate-FEVER annotations, train and dev splits.
# NOTE(review): absolute, user-specific path.
cfever_actual = Path("/users/k21190024/study/fact-checking-repos/fever/teamathene/data/cfever-data")
cfever_train_actual = read_data(cfever_actual.joinpath("train.jsonl"))
cfever_dev_actual = read_data(cfever_actual.joinpath("dev.jsonl"))

## IR Evaluation

In [10]:
# Retrieval output for Climate-FEVER (files named *.p7.s5.jsonl), one per split.
# NOTE(review): absolute, user-specific path.
cfever_pages = Path("/users/k21190024/study/fact-checking-repos/fever/teamathene/data/climatefever")
cfever_train = read_data(cfever_pages.joinpath("train.p7.s5.jsonl"))
cfever_dev = read_data(cfever_pages.joinpath("dev.p7.s5.jsonl"))

In [11]:
# Score the Climate-FEVER train retrieval output against the gold annotations
# with the dataset-specific scorer.
cfever_train_scored = score.ClimateFEVERScorer(
    cfever_train_actual,
    cfever_train,
    oracle_ir=False,
    oracle_rte=True,
    max_evidence=None,
    score_name="Climate-FEVER Train",
)
print(cfever_train_scored)


            Climate-FEVER Train
            Oracle IR: False
            Oracle RTE: True
            Max Evidences: None

            FEVER Score: 24.22
            Accuracy: 100.0
            Macro Precision: 11.24
            Macro Recall: 24.22
            Macro F1: 15.36
            


In [12]:
# Recall / precision / F1 from get_document_metric() for the Climate-FEVER train split.
cfever_train_scored.get_document_metric()

{'recall': 0.18693926846100758,
 'precision': 0.08108311141557731,
 'f1': 0.11310710351538523}

In [13]:
# Score the Climate-FEVER dev retrieval output against the gold annotations,
# using the same scorer settings as for the train split.
cfever_dev_scored = score.ClimateFEVERScorer(
    cfever_dev_actual,
    cfever_dev,
    oracle_ir=False,
    oracle_rte=True,
    max_evidence=None,
    score_name="Climate-FEVER Dev",
)
print(cfever_dev_scored)


            Climate-FEVER Dev
            Oracle IR: False
            Oracle RTE: True
            Max Evidences: None

            FEVER Score: 22.66
            Accuracy: 100.0
            Macro Precision: 10.31
            Macro Recall: 22.66
            Macro F1: 14.17
            


In [14]:
# Recall / precision / F1 from get_document_metric() for the Climate-FEVER dev split.
cfever_dev_scored.get_document_metric()

{'recall': 0.2223021582733813,
 'precision': 0.09046754544045565,
 'f1': 0.12860024718704088}

# SciFact

In [15]:
# Gold SciFact annotations (a single file, no train/dev split is loaded here).
# NOTE(review): absolute, user-specific path.
sf_actual_p = Path("/users/k21190024/study/fact-checking-repos/fever/teamathene/data/scifact-data")
sf_actual = read_data(sf_actual_p.joinpath("scifact_all_titleid.jsonl"))

## IR Evaluation

In [16]:
# Retrieval output for SciFact (test.p7.s5.jsonl).
# NOTE(review): absolute, user-specific path.
sf_pages = Path("/users/k21190024/study/fact-checking-repos/fever/teamathene/data/scifact")
sf_all = read_data(sf_pages.joinpath("test.p7.s5.jsonl"))

In [17]:
# Score the SciFact retrieval output against the gold annotations. The generic
# FEVER scorer is reused here with the same settings as the FEVER cells.
sf_scored = score.FEVERScorer(
    sf_actual,
    sf_all,
    oracle_ir=False,
    oracle_rte=True,
    max_evidence=None,
    score_name="SciFact All",
)
print(sf_scored)


            SciFact All
            Oracle IR: False
            Oracle RTE: True
            Max Evidences: None

            FEVER Score: 37.51
            Accuracy: 100.0
            Macro Precision: 5.19
            Macro Recall: 0.0
            Macro F1: 0.0
            


In [18]:
# NOTE(review): for SciFact both recall and precision come out as 0.0, so the F1
# formula divides 0 by 0 and reports nan (with a runtime warning). Possibly the
# gold and predicted page identifiers do not line up for this dataset — TODO confirm.
sf_scored.get_document_metric()

  metrics["f1"] = 2.0 * metrics["precision"] * metrics["recall"] / (metrics["precision"] + metrics["recall"])


{'recall': 0.0, 'precision': 0.0, 'f1': nan}

# Analyse wiki results

## FEVER

In [19]:
# Per-claim frame from the dev scorer; the first returned element is discarded.
_, fever_match_matrix = fever_dev_scored.get_document_metric(return_df=True)

# One row per dev claim holding the de-duplicated wiki search results.
fever_dev_wiki = pd.DataFrame(
    [
        {"claim_id": record["id"], "wiki_results": set(record["wiki_results"])}
        for record in fever_dev
    ]
).set_index("claim_id")

# Attach the wiki results to the per-claim frame, keyed on claim_id.
fever_match_matrix = fever_match_matrix.set_index("claim_id").join(fever_dev_wiki, how="left")

# Show only claims whose doc_recall is below 1.
fever_match_matrix.query("doc_recall < 1")

Unnamed: 0_level_0,claim_label,n_predicted_evidences,n_total_evidences,full_hit,evidence_sent_miss,irrelevant,evidence_page,predicted_page,_tp,doc_recall,doc_precision,wiki_results
claim_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
111897,REFUTES,5,5,3.0,1,{Television_network},"{Telemundo, Hispanic_and_Latino_Americans}","{Television_network, Telemundo}",1,0.500000,0.500000,{2019–20_United_States_network_television_sche...
105095,SUPPORTS,5,4,1.0,2,"{Character_-LRB-arts-RRB-, Homeland, Homeland_...","{Carrie_Mathison, Nicholas_Brody}","{Character_-LRB-arts-RRB-, Homeland, Homeland_...",1,0.500000,0.100000,"{Homeland_-LRB-TV_series-RRB-, Character, Nich..."
227362,REFUTES,5,2,1.0,3,"{Home, Giada_-LRB-disambiguation-RRB-, DVD, Gi...","{Food_Network, Giada_at_Home}","{Giada_-LRB-disambiguation-RRB-, DVD, Giada, H...",1,0.500000,0.200000,"{The_History_of_Middle-earth, HOME_Investment_..."
86175,REFUTES,5,2,0.0,0,"{Songwriter, Singer-songwriter, Hourglass}","{James_Taylor, Hourglass_-LRB-James_Taylor_alb...","{Songwriter, Singer-songwriter, Hourglass}",0,0.000000,0.000000,"{Songwriter, Hourglass_corset, Slava_-LRB-sing..."
71959,SUPPORTS,5,7,3.0,2,"{Single_-LRB-music-RRB-, Single}","{Jennifer_Lopez, J.Lo_-LRB-album-RRB-, Como_Am...","{Single_-LRB-music-RRB-, Jennifer_Lopez, Single}",1,0.333333,0.333333,"{Single_&_Single, Single_parent, Single_-LRB-m..."
...,...,...,...,...,...,...,...,...,...,...,...,...
5483,SUPPORTS,5,1,0.0,0,"{The_Hit, DVD, Hit, The_DVD}",{The_Hit_-LRB-1984_film-RRB-},"{The_Hit, DVD, Hit, The_DVD}",0,0.000000,0.000000,"{Hit_the_Road_Jack, DVD_recordable, BTS_videog..."
63731,REFUTES,5,34,4.0,1,"{Space, Space_-LRB-disambiguation-RRB-}","{Rwanda, Tanzania, African_Great_Lakes, Uganda...","{Space, Uganda, Space_-LRB-disambiguation-RRB-}",1,0.100000,0.333333,"{Uganda_Scheme, International_Space_Station, U..."
44864,SUPPORTS,5,2,1.0,1,"{Stanford_Prison_Experiment_-LRB-band-RRB-, Or...","{Stanford_prison_experiment, Office_of_Naval_R...","{Stanford_prison_experiment, Stanford_Prison_E...",1,0.500000,0.200000,"{History_of_Stanford_University, Das_Experimen..."
144200,REFUTES,5,2,0.0,0,"{Heroes_-LRB-season_2-RRB-, Heroes_-LRB-season...",{Heroes_-LRB-TV_series-RRB-},"{Heroes_-LRB-season_2-RRB-, Heroes_-LRB-season...",0,0.000000,0.000000,{List_of_Legend_of_the_Galactic_Heroes_episode...


In [20]:
# Average number of distinct wiki results per FEVER dev claim.
fever_match_matrix["wiki_results"].map(len).mean()

25.594659465946595

## Climate-FEVER

In [21]:
# Per-claim frame from the dev scorer; the first returned element is discarded.
_, cfever_match_matrix = cfever_dev_scored.get_document_metric(return_df=True)

# One row per dev claim holding the de-duplicated wiki search results.
cfever_dev_wiki = pd.DataFrame(
    [
        {"claim_id": record["id"], "wiki_results": set(record["wiki_results"])}
        for record in cfever_dev
    ]
).set_index("claim_id")

# Attach the wiki results to the per-claim frame, keyed on claim_id.
cfever_match_matrix = cfever_match_matrix.set_index("claim_id").join(cfever_dev_wiki, how="left")

# Show only claims whose doc_recall is below 1.
cfever_match_matrix.query("doc_recall < 1")

Unnamed: 0_level_0,claim,claim_label,n_predicted_evidences,n_total_evidences,full_hit,other_evidence_full_hit,evidence_sent_miss,other_evidence_sent_miss,irrelevant,evidence_page,predicted_page,_tp,doc_recall,doc_precision,wiki_results
claim_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
42,Unprecedented climate change has caused sea le...,REFUTES,5,2,0.0,0.0,0,5.0,"{Sea_Level_-LRB-band-RRB-, Sydney, Climate_cha...","{Sea_level, Global_warming}","{Sydney, Sea_level, Climate_change, Sea_Level_...",1,0.5,0.166667,"{Sydney, Pioneer_0, Sydney_Harbour_Tunnel, Syd..."
44,Human-produced carbon might be one of the fact...,REFUTES,5,5,0.0,,0,,"{Factoring, There_There, It, Factor, One_-LRB-...","{Global_warming_controversy, Scientific_consen...","{Factoring, There_There, It, Factor, One_-LRB-...",0,0.0,0.000000,"{South_Korea, Effects_of_climate_change_on_agr..."
74,The models predicted about three times the amo...,SUPPORTS,1,1,0.0,0.0,0,0.0,"{Warming, We_-LRB-disambiguation-RRB-, Model_-...",{Global_warming_controversy},"{Warming, Model_-LRB-person-RRB-, World, We, T...",0,0.0,0.000000,"{UEFA_Euro_1988_squads, Terraforming_of_Mars, ..."
76,Burping cows are more damaging to the climate ...,NOT ENOUGH INFO,5,5,0.0,,0,,"{Climate, Burping, Cars_-LRB-franchise-RRB-, P...","{Climate_change_mitigation, Earth}","{Climate, Burping, Cars_-LRB-franchise-RRB-, P...",0,0.0,0.000000,"{Mercury_-LRB-planet-RRB-, Oceanic_climate, Th..."
79,The rate of warming according to the data is m...,REFUTES,1,1,0.0,0.0,0,0.0,"{Warming, IPCC_-LRB-disambiguation-RRB-, Model...",{Intergovernmental_Panel_on_Climate_Change},"{Warming, IPCC_-LRB-disambiguation-RRB-, Model...",0,0.0,0.000000,"{IPCC_Fifth_Assessment_Report, Greenhouse_gas,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3048,There is no question whatsoever that the CO2 i...,SUPPORTS,1,4,0.0,0.0,0,0.0,"{Question, There_There, No_Question, There}","{Greenhouse_gas, Carbon_dioxide, Carbon_dioxid...","{Question, There_There, No_Question, There}",0,0.0,0.000000,"{Greenhouse_gas, Rhetorical_question, The_Weal..."
3069,Continued greenhouse gas emissions at or above...,SUPPORTS,5,1,0.0,0.0,0,4.0,"{Climate_system, Global_Climate_Observing_Syst...",{Global_warming},"{Climate_system, Global_Climate_Observing_Syst...",0,0.0,0.000000,"{21st_Century_Breakdown, 20th_Century_Animatio..."
3104,the satellite sensors show less warming in the...,SUPPORTS,5,1,0.0,1.0,0,1.0,"{Troposphere, Temperature}",{Global_warming},"{Troposphere, Earth, Temperature}",0,0.0,0.000000,"{Madden–Julian_oscillation, Internal_structure..."
3111,Ljungqvist's millennial temperature reconstruc...,NOT ENOUGH INFO,1,5,0.0,,0,,{Ljungqvist},"{North_Report, Hockey_stick_controversy, List_...",{Ljungqvist},0,0.0,0.000000,"{Porites, Clark_-LRB-TV_series-RRB-, Hockey_st..."


In [23]:
# Average number of distinct wiki results per Climate-FEVER dev claim.
cfever_match_matrix["wiki_results"].map(len).mean()

49.37410071942446

In [35]:
# Spot-check a single claim (id 42): full claim text.
cfever_match_matrix.loc[42, "claim"]

'Unprecedented climate change has caused sea level at Sydney Harbour to rise approximately 0.0 cm over the past 140 years.'

In [36]:
# Same claim (id 42): the set of wiki results joined onto the match matrix.
cfever_match_matrix.loc[42, "wiki_results"]

{'2017_in_science',
 '2022_Hunga_Tonga–Hunga_Haʻapai_eruption_and_tsunami',
 'Atmospheric_pressure',
 'Attack_on_Sydney_Harbour',
 'Cephalopod_eye',
 'City_status_in_the_United_Kingdom',
 'Climate_change',
 'Climate_change_adaptation',
 'Climate_change_denial',
 'Climate_change_in_Australia',
 'Climate_change_in_the_United_States',
 'Climate_variability_and_change',
 'Cubic_metre',
 'DOCSIS',
 'Effects_of_climate_change',
 'Geography_of_Sydney',
 'Height_above_mean_sea_level',
 'IERS_Reference_Meridian',
 'Inch',
 'Indus_River_Delta',
 'List_of_places_on_land_with_elevations_below_sea_level',
 'List_of_presidents_of_Wellesley_College',
 'Litre',
 'Maximilian_I,_Holy_Roman_Emperor',
 'Nightcap_National_Park',
 'One_Sydney_Harbour',
 'Past_sea_level',
 'Pioneer_0',
 'Port_Jackson',
 'Scientific_consensus_on_climate_change',
 'Sea_Level_-LRB-band-RRB-',
 'Sea_level',
 'Sea_level_rise',
 'Selters_-LRB-Lahn-RRB-',
 'Spinach',
 "Stott's_College",
 'Sydney',
 'Sydney_Ferries',
 'Sydney_Harbou

In [37]:
# Raw Climate-FEVER dev record(s) with id 42, as loaded from the retrieval output.
[record for record in cfever_dev if record["id"] == 42]

[{'id': 42,
  'claim': 'Unprecedented climate change has caused sea level at Sydney Harbour to rise approximately 0.0 cm over the past 140 years.',
  'label': 'REFUTES',
  'elab': ['REFUTES', 'REFUTES'],
  'evidence': [[[None, None, 'Global_warming', 98]],
   [[None, None, 'Sea_level', 6]]],
  'other_elab': ['NOT ENOUGH INFO', 'NOT ENOUGH INFO', 'NOT ENOUGH INFO'],
  'other_evidence': [[[None, None, 'Global_warming', 99]],
   [[None, None, 'Sea_level_rise', 1]],
   [[None, None, 'Sea_level_rise', 113]]],
  'verifiable': 'VERIFIABLE',
  'noun_phrases': ['sea level at Sydney Harbour',
   'Sydney Harbour',
   'approximately 0.0 cm',
   'Unprecedented climate change has caused sea level at Sydney Harbour to rise approximately 0.0 cm over the past 140 years.',
   'the past 140 years',
   'sea level',
   'Unprecedented climate change'],
  'predicted_pages': ['Climate_change',
   'Past_sea_level',
   'Sea_Level_-LRB-band-RRB-',
   'Sea_level_rise',
   'Sea_level',
   'Sydney'],
  'wiki_result