# Metrics
-> write as simple cells to be converted into scripts with args

## Decontextualization

In [5]:
from os import path
import json
import tensorflow as tf, pandas as pd
import tensorflow_text  # Required to run exported model.
import argparse
from tqdm import tqdm

DATASET_BUCKET = "/mnt/personal/ullriher/models/tf/decontext_dataset"
SAVED_MODEL_PATH = path.join("/mnt/personal/ullriher/models/tf/decontext_dataset", "t5_base/1611267950")


def load_predict_fn(model_path):
    print("Loading SavedModel in eager mode.")
    imported = tf.saved_model.load(model_path, ["serve"])
    return lambda x: imported.signatures["serving_default"](tf.constant(x))["outputs"].numpy()

predict_fn = load_predict_fn(SAVED_MODEL_PATH)

def decontextualize(input):
    return predict_fn([input])[0].decode("utf-8")

dp = {}
def decontextualize_with_dp(input):
    if input in dp:
        return dp[input]
    result = decontextualize(input)
    dp[input] = result
    return result

def create_input(paragraph, target, page_title="", section_title=""):
    prefix = paragraph
    return " [SEP] ".join((page_title, section_title, prefix, target, ""))

def same_alphabetic_chars(generated, decontext_proposed):
    a = "".join(filter(str.isalpha, generated)).lower()
    b = "".join(filter(str.isalpha, decontext_proposed)).lower()
    return  bool(len(a)) and a == b

# if running from command line, parse argument as model name
if False:
    parser = argparse.ArgumentParser()
    parser.add_argument("model_name", type=str, help="Name of the model to use")
    args = parser.parse_args()
    model_name = args.model_name
else:
    model_name = "t5_small_multiclaim"

df = pd.read_json("/mnt/data/factcheck/claim_extraction/feversum/hf_multiclaim/test.jsonl", lines=True)
df["generated"]=None
predictions = f"/home/ullriher/ullriher/data/_paper/predictions/{model_name}.jsonl"
outfile = f"/home/ullriher/ullriher/data/_paper/metrics/decontextualization/{model_name}.jsonl"
skip_title_from_context = True

with open(predictions, "r") as f:
    for i, line in enumerate(f):
        df.at[i, "generated"] = json.loads(line)
        # if context starts with source\n, remove it
        if skip_title_from_context:
            if df.at[i, "sentence_context"].startswith(df.at[i, "source"]+"\n"):
                df.at[i, "sentence_context"] = df.at[i, "sentence_context"][len(df.at[i, "source"])+1:]

# expand df by generated, one row per element of generated list
df = df.explode("generated").reset_index(drop=True)
df.drop(columns=["source_text"], inplace=True)
# remove leading source\n from sentence_context
df["decontext_result"] = None
df["decontext_label"] = None
df["decontext_proposed"] = None

if path.exists(outfile):
    df = pd.read_json(outfile, lines=True)
    print(f"Loaded checkpoint from {outfile}")

# if claims column, drop it
if "claims" in df.columns:
    df.drop(columns=["claims"], inplace=True)
    
print("predicting")
for index, row in tqdm(df.iterrows()):
    if row["decontext_label"] is not None:
        continue
    
    page_title = row["source"]
    section_title = ""
    input = create_input(row["sentence_context"], row["generated"], page_title, section_title)
    decontextualized = decontextualize(input)
    row["decontext_label"], row["decontext_proposed"] = [s.strip() for s in decontextualized.split("####", 1)]
    
    if same_alphabetic_chars(row["generated"], row["decontext_proposed"]):
        row["decontext_label"] = "UNNECESSARY"
        
    # only preserve alphabetic and turn to uppercase
    df.at[index, "decontext_result"] = decontextualized
    df.at[index, "decontext_label"] = "".join(filter(str.isalpha, row["decontext_label"])).upper()
    df.at[index, "decontext_proposed"] = row["decontext_proposed"]
    # break at 100
    # save df to outfile
    if True or index % 100 == 0:
        df.to_json(outfile, lines=True, orient="records")

Loading SavedModel in eager mode.


KeyboardInterrupt: 

Loaded checkpoint from /home/ullriher/ullriher/data/_paper/metrics/decontextualization/t5_small_multiclaim.jsonl


In [None]:
%%sh
#!/bin/bash

#SBATCH --partition amdgpu
#SBATCH --nodes 1
#SBATCH --ntasks-per-node 1
#SBATCH --mem-per-cpu 64G
#SBATCH --gres gpu:1
#SBATCH --time 24:00:00
#SBATCH --job-name acl_feversum
#SBATCH --output /home/ullriher/ullriher/logs/expts/amdsum.%j.out



ml Python/3.10.4-GCCcore-11.3.0-bare
module unload OpenSSL/1.1

source ~/venvs/2023feb/bin/activate
cd ~/ullriher/src

export PYTHONPATH=/home/ullriher/ullriher/src:$PYTHONPATH
export PATH=/home/ullriher/nodejs-latest/node-v15.14.0:/home/ullriher/venv_amd/bin:$PATH

#deepspeed --num_gpus=1 
~/venvs/2023feb/bin/python metric_atomicity.py qacg

sh: 14: ml: not found
sh: 15: module: not found
sh: 17: source: not found
2024-02-14 01:04:40.777072: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /mnt/appl/software/Python/3.10.4-GCCcore-11.3.0-bare/lib:/mnt/appl/software/libffi/3.4.2-GCCcore-11.3.0/lib64:/mnt/appl/software/XZ/5.2.5-GCCcore-11.3.0/lib:/mnt/appl/software/SQLite/3.38.3-GCCcore-11.3.0/lib:/mnt/appl/software/Tcl/8.6.12-GCCcore-11.3.0/lib:/mnt/appl/software/libreadline/8.1.2-GCCcore-11.3.0/lib:/mnt/appl/software/ncurses/6.3-GCCcore-11.3.0/lib:/mnt/appl/software/bzip2/1.0.8-GCCcore-11.3.0/lib:/mnt/appl/software/binutils/2.38-GCCcore-11.3.0/lib:/mnt/appl/software/zlib/1.2.12-GCCcore-11.3.0/lib:/mnt/appl/software/GCCcore/11.3.0/lib64:/mnt/appl/software/Python/3.10.4-GCCcore-11.3.0/lib:/mnt/appl/software/GMP/6.2.1-GCCcore-11.3.0/lib
2024-02-14 01

[2024-02-14 01:04:49,285] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Computing metric: atomicity
Using model qacg
predicting
Loading Named Entity Recognition Pipeline[33m...[0m
2024-02-14 01:05:11,768 SequenceTagger predicts: Dictionary with 75 tags: O, S-PERSON, B-PERSON, E-PERSON, I-PERSON, S-GPE, B-GPE, E-GPE, I-GPE, S-ORG, B-ORG, E-ORG, I-ORG, S-DATE, B-DATE, E-DATE, I-DATE, S-CARDINAL, B-CARDINAL, E-CARDINAL, I-CARDINAL, S-NORP, B-NORP, E-NORP, I-NORP, S-MONEY, B-MONEY, E-MONEY, I-MONEY, S-PERCENT, B-PERCENT, E-PERCENT, I-PERCENT, S-ORDINAL, B-ORDINAL, E-ORDINAL, I-ORDINAL, S-LOC, B-LOC, E-LOC, I-LOC, S-TIME, B-TIME, E-TIME, I-TIME, S-WORK_OF_ART, B-WORK_OF_ART, E-WORK_OF_ART, I-WORK_OF_ART, S-FAC
Loading Relation Extraction Pipeline[33m...[0m


0it [00:00, ?it/s]Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at studio-ousia/luke-large-finetuned-tacred were not used when initializing LukeForEntityPairClassification: ['luke.embeddings.position_ids']
- This IS expected if you are initializing LukeForEntityPairClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeForEntityPairClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Process is interrupted.


10it [00:53,  5.38s/it]
Traceback (most recent call last):
  File "/home/ullriher/ullriher/src/metric_atomicity.py", line 94, in <module>


    df.at[index, "rebel_facts"] = rebel_solve(row["generated"])
  File "/home/ullriher/ullriher/src/metric_atomicity.py", line 24, in rebel_solve
    preds = rebel(input_text, return_tensors=True, return_text=False)
  File "/home/ullriher/venvs/2023feb/lib/python3.10/site-packages/transformers/pipelines/text2text_generation.py", line 167, in __call__
    result = super().__call__(*args, **kwargs)
  File "/home/ullriher/venvs/2023feb/lib/python3.10/site-packages/transformers/pipelines/base.py", line 1162, in __call__
    return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)
  File "/home/ullriher/venvs/2023feb/lib/python3.10/site-packages/transformers/pipelines/base.py", line 1169, in run_single
    model_outputs = self.forward(model_inputs, **forward_params)
  File "/home/ullriher/venvs/2023feb/lib/python3.10/site-packages/transformers/pipelines/base.py", line 1068, in forward
    model_outputs = self._forward(model_inputs, **forward_params)
  File "/hom

In [None]:
# drop where generated is null
df = df.dropna(subset=["generated"])

In [None]:
df

Unnamed: 0,source,sentence_id,sentence,sentence_context,generated,rebel,factsumm,rebel_facts,factsumm_facts
0,R. Kelly,6140,"In 2002 and 2004, Kelly released collaboration albums with rapper Jay-Z and has been a guest vocalist for other hip hop artists like Nas, Sean Combs, and The Notorious B.I.G. The Recording Industry Association of America (RIAA) has recognized R. Kelly as one of the best-selling music artists in the United States with 40 million albums sold as well as only the fifth black artist to crack the top 50 of the same list.","In 1996, Kelly was nominated for a Grammy for writing Michael Jackson's song ""You Are Not Alone"".\nIn 2002 and 2004, Kelly released collaboration albums with rapper Jay-Z and has been a guest vocalist for other hip hop artists like Nas, Sean Combs, and The Notorious B.I.G. The Recording Industry Association of America (RIAA) has recognized R. Kelly as one of the best-selling music artists in the United States with 40 million albums sold as well as only the fifth black artist to crack the top 50 of the...",R. Kelly has been a guest vocalist for Nas.,1,1,"[(R. Kelly, member of, Nas), (Nas, has part, R. Kelly)]",{}
1,R. Kelly,6140,"In 2002 and 2004, Kelly released collaboration albums with rapper Jay-Z and has been a guest vocalist for other hip hop artists like Nas, Sean Combs, and The Notorious B.I.G. The Recording Industry Association of America (RIAA) has recognized R. Kelly as one of the best-selling music artists in the United States with 40 million albums sold as well as only the fifth black artist to crack the top 50 of the same list.","In 1996, Kelly was nominated for a Grammy for writing Michael Jackson's song ""You Are Not Alone"".\nIn 2002 and 2004, Kelly released collaboration albums with rapper Jay-Z and has been a guest vocalist for other hip hop artists like Nas, Sean Combs, and The Notorious B.I.G. The Recording Industry Association of America (RIAA) has recognized R. Kelly as one of the best-selling music artists in the United States with 40 million albums sold as well as only the fifth black artist to crack the top 50 of the...",R. Kelly has been a guest vocalist for Sean Combs.,1,1,"[(R. Kelly, member of, Sean Combs), (Sean Combs, has part, R. Kelly)]",{}
2,R. Kelly,6140,"In 2002 and 2004, Kelly released collaboration albums with rapper Jay-Z and has been a guest vocalist for other hip hop artists like Nas, Sean Combs, and The Notorious B.I.G. The Recording Industry Association of America (RIAA) has recognized R. Kelly as one of the best-selling music artists in the United States with 40 million albums sold as well as only the fifth black artist to crack the top 50 of the same list.","In 1996, Kelly was nominated for a Grammy for writing Michael Jackson's song ""You Are Not Alone"".\nIn 2002 and 2004, Kelly released collaboration albums with rapper Jay-Z and has been a guest vocalist for other hip hop artists like Nas, Sean Combs, and The Notorious B.I.G. The Recording Industry Association of America (RIAA) has recognized R. Kelly as one of the best-selling music artists in the United States with 40 million albums sold as well as only the fifth black artist to crack the top 50 of the...",R. Kelly has been a guest vocalist for The Notorious B.I.G.,1,1,"[(R. Kelly, member of, The Notorious B.I.G.), (The Notorious B.I.G., has part, R. Kelly)]",{}
3,R. Kelly,6140,"In 2002 and 2004, Kelly released collaboration albums with rapper Jay-Z and has been a guest vocalist for other hip hop artists like Nas, Sean Combs, and The Notorious B.I.G. The Recording Industry Association of America (RIAA) has recognized R. Kelly as one of the best-selling music artists in the United States with 40 million albums sold as well as only the fifth black artist to crack the top 50 of the same list.","In 1996, Kelly was nominated for a Grammy for writing Michael Jackson's song ""You Are Not Alone"".\nIn 2002 and 2004, Kelly released collaboration albums with rapper Jay-Z and has been a guest vocalist for other hip hop artists like Nas, Sean Combs, and The Notorious B.I.G. The Recording Industry Association of America (RIAA) has recognized R. Kelly as one of the best-selling music artists in the United States with 40 million albums sold as well as only the fifth black artist to crack the top 50 of the...",R. Kelly has been recognized by the Recording Industry Association of America.,1,1,"[(R. Kelly, member of, Recording Industry Association of America)]",{}
4,R. Kelly,6140,"In 2002 and 2004, Kelly released collaboration albums with rapper Jay-Z and has been a guest vocalist for other hip hop artists like Nas, Sean Combs, and The Notorious B.I.G. The Recording Industry Association of America (RIAA) has recognized R. Kelly as one of the best-selling music artists in the United States with 40 million albums sold as well as only the fifth black artist to crack the top 50 of the same list.","In 1996, Kelly was nominated for a Grammy for writing Michael Jackson's song ""You Are Not Alone"".\nIn 2002 and 2004, Kelly released collaboration albums with rapper Jay-Z and has been a guest vocalist for other hip hop artists like Nas, Sean Combs, and The Notorious B.I.G. The Recording Industry Association of America (RIAA) has recognized R. Kelly as one of the best-selling music artists in the United States with 40 million albums sold as well as only the fifth black artist to crack the top 50 of the...",R. Kelly has been recognized as one of the best-selling music artists in the United States.,1,2,"[(R. Kelly, country of citizenship, United States)]","{(R. Kelly, per:countries_of_residence, the United States), (one, per:countries_of_residence, the United States)}"
...,...,...,...,...,...,...,...,...,...
1972,Adrianne Palicki,3541,"Adrianne Lee Palicki (born May 6, 1983) is an American actress best known for her roles as Tyra Collette in the television series Friday Night Lights (2006 - 2011) and supporting roles in the films Legion (2010), Red Dawn (2012), G.I. Joe: Retaliation (2013), and John Wick (2014).","Adrianne Lee Palicki (born May 6, 1983) is an American actress best known for her roles as Tyra Collette in the television series Friday Night Lights (2006 - 2011) and supporting roles in the films Legion (2010), Red Dawn (2012), G.I. Joe: Retaliation (2013), and John Wick (2014).\nShe played Barbara ""Bobbi"" Morse on the ABC series Agents of S.H.I.E.L.D. (2014 - 2016).",Adrianne Palicki was born in 1983.,,,,
1973,Adrianne Palicki,3541,"Adrianne Lee Palicki (born May 6, 1983) is an American actress best known for her roles as Tyra Collette in the television series Friday Night Lights (2006 - 2011) and supporting roles in the films Legion (2010), Red Dawn (2012), G.I. Joe: Retaliation (2013), and John Wick (2014).","Adrianne Lee Palicki (born May 6, 1983) is an American actress best known for her roles as Tyra Collette in the television series Friday Night Lights (2006 - 2011) and supporting roles in the films Legion (2010), Red Dawn (2012), G.I. Joe: Retaliation (2013), and John Wick (2014).\nShe played Barbara ""Bobbi"" Morse on the ABC series Agents of S.H.I.E.L.D. (2014 - 2016).",Adrianne Palicki is an American.,,,,
1974,Adrianne Palicki,3541,"Adrianne Lee Palicki (born May 6, 1983) is an American actress best known for her roles as Tyra Collette in the television series Friday Night Lights (2006 - 2011) and supporting roles in the films Legion (2010), Red Dawn (2012), G.I. Joe: Retaliation (2013), and John Wick (2014).","Adrianne Lee Palicki (born May 6, 1983) is an American actress best known for her roles as Tyra Collette in the television series Friday Night Lights (2006 - 2011) and supporting roles in the films Legion (2010), Red Dawn (2012), G.I. Joe: Retaliation (2013), and John Wick (2014).\nShe played Barbara ""Bobbi"" Morse on the ABC series Agents of S.H.I.E.L.D. (2014 - 2016).",Adrianne Palicki is best known for her role as Tyra Collette.,,,,
1975,Adrianne Palicki,3541,"Adrianne Lee Palicki (born May 6, 1983) is an American actress best known for her roles as Tyra Collette in the television series Friday Night Lights (2006 - 2011) and supporting roles in the films Legion (2010), Red Dawn (2012), G.I. Joe: Retaliation (2013), and John Wick (2014).","Adrianne Lee Palicki (born May 6, 1983) is an American actress best known for her roles as Tyra Collette in the television series Friday Night Lights (2006 - 2011) and supporting roles in the films Legion (2010), Red Dawn (2012), G.I. Joe: Retaliation (2013), and John Wick (2014).\nShe played Barbara ""Bobbi"" Morse on the ABC series Agents of S.H.I.E.L.D. (2014 - 2016).",Adrianne Palicki is best known for her role in Legion.,,,,


## Generate batch files

In [41]:
batch_folder = "/home/ullriher/ullriher/slurm/tmp"
metric = "qags"
i=1
for model in ['qacg', 'qlora-mistral-instruct-v0.2','gpt-4-turbo-3-shot', 't5_small_multiclaim','t5_small_diverse_7_beam_search']:
                with open(f"{batch_folder}/{i}.batch","w") as f:
                    print(f"""#!/bin/bash
#SBATCH --partition amdgpu
#SBATCH --nodes 1
#SBATCH --ntasks-per-node 1
#SBATCH --mem-per-cpu 64G
#SBATCH --gres gpu:1
#SBATCH --time 24:00:00
#SBATCH --job-name  {metric}
#SBATCH --output /home/ullriher/ullriher/logs/metrics/amdsum.%j.out



ml Python/3.10.4-GCCcore-11.3.0-bare
module unload OpenSSL/1.1

source ~/venvs/2023feb/bin/activate
cd ~/ullriher/src

export PYTHONPATH=/home/ullriher/ullriher/src:$PYTHONPATH
export PATH=/home/ullriher/nodejs-latest/node-v15.14.0:/home/ullriher/venv_amd/bin:$PATH

#deepspeed --num_gpus=1 
~/venvs/2023feb/bin/python metric_{metric}.py {model}""",file=f)
                    print(f"sbatch {batch_folder}/{i}.batch")
                    i+=1

sbatch /home/ullriher/ullriher/slurm/tmp/1.batch
sbatch /home/ullriher/ullriher/slurm/tmp/2.batch
sbatch /home/ullriher/ullriher/slurm/tmp/3.batch
sbatch /home/ullriher/ullriher/slurm/tmp/4.batch
sbatch /home/ullriher/ullriher/slurm/tmp/5.batch


## Atomicity

In [None]:
#use importlib and reload utils.datautils.extract_triplets
import importlib
import utils.datautils
importlib.reload(utils.datautils)

<module 'utils.datautils' from '/home/ullriher/ullriher/src/utils/datautils.py'>

In [None]:
from alignscore import AlignScore

scorer = AlignScore(model='roberta-base', batch_size=32, device="cuda:0", ckpt_path='/home/ullriher/ullriher/models/alignscore/AlignScore-base.ckpt', evaluation_mode='nli_sp')
score = scorer.score(contexts=['hello world.'], claims=['hello world.'])

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Lightning automatically upgraded your loaded checkpoint from v1.7.7 to v1.9.3. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file models/alignscore/AlignScore-base.ckpt`
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 32/32 [00:00<00:00, 65.12it/s]


In [None]:
score

[0.9947293400764465,
 0.9947293400764465,
 0.9947293400764465,
 0.9947293400764465,
 0.9947293400764465,
 0.9947293400764465,
 0.9947293400764465,
 0.9947293400764465,
 0.9947293400764465,
 0.9947293400764465,
 0.9947293400764465,
 0.9947293400764465,
 0.9947293400764465,
 0.9947293400764465,
 0.9947293400764465,
 0.013116993941366673,
 0.013116993941366673,
 0.013116993941366673,
 0.013116993941366673,
 0.013116993941366673,
 0.013116993941366673,
 0.013116993941366673,
 0.013116993941366673,
 0.013116993941366673,
 0.013116993941366673,
 0.013116993941366673,
 0.013116993941366673,
 0.013116993941366673,
 0.013116993941366673,
 0.013116993941366673,
 0.013116993941366673,
 0.013116993941366673]

In [148]:
from os import path
import json
import bert_score
import sys
from tqdm import tqdm
from utils.datautils import extract_triplets
from transformers import pipeline
from utils.ntbutils import load_user_libs
import pysbd
from utils.datautils import avg_top_n
import pandas as pd

load_user_libs("/home/ullriher/lib", ".path_include")
from alignscore import AlignScore

#segmenter 
segmenter = pysbd.Segmenter(language="en", clean=False)
sent_tokenize = segmenter.segment

align_large = AlignScore(model='roberta-large', batch_size=32, device="cuda:0", ckpt_path='/home/ullriher/ullriher/models/alignscore/AlignScore-large.ckpt', evaluation_mode='nli_sp')
align_base = AlignScore(model='roberta-base', batch_size=32, device="cuda:0", ckpt_path='/home/ullriher/ullriher/models/alignscore/AlignScore-base.ckpt', evaluation_mode='nli_sp')


def bertscore_solve(premise, claim):
    text_sentencewise = sent_tokenize(premise)
    P, R, F = bert_score.score([claim] * len(text_sentencewise), text_sentencewise, model_type="roberta-base")

    return (float(avg_top_n(P, 2)),float(avg_top_n(R, 2)),float(avg_top_n(F, 2)))


# arse first argument as model name
if False:
    model_name = sys.argv[1]
else:
    model_name = "t5_small_multiclaim"

metric = "faithfulness"

print(f"Computing metric: {metric}")
print(f"Using model {model_name}")


df = pd.read_json("/mnt/data/factcheck/claim_extraction/feversum/hf_multiclaim/test.jsonl", lines=True)
df["generated"] = None
predictions = f"/home/ullriher/ullriher/data/_paper/predictions/{model_name}.jsonl"
outfile = f"/home/ullriher/ullriher/data/_paper/metrics/{metric}/{model_name}.jsonl"
skip_title_from_context = False

with open(predictions, "r") as f:
    for i, line in enumerate(f):
        df.at[i, "generated"] = json.loads(line)
        # if context starts with source\n, remove it
        if skip_title_from_context:
            if df.at[i, "sentence_context"].startswith(df.at[i, "source"] + "\n"):
                df.at[i, "sentence_context"] = df.at[i, "sentence_context"][len(df.at[i, "source"]) + 1 :]

# expand df by generated, one row per element of generated list
df = df.explode("generated").reset_index(drop=True)
df.drop(columns=["source_text"], inplace=True)
# remove leading source\n from sentence_context
df["bertscore"] = None
df["bertscore_avgtop2"] = None
df["alignscore_base"] = None
df["alignscore_large"] = None

if path.exists(outfile):
    df = pd.read_json(outfile, lines=True)
    print(f"Loaded checkpoint from {outfile}")

# if claims column, drop it
if "claims" in df.columns:
    df.drop(columns=["claims"], inplace=True)

print("predicting")
for index, row in tqdm(df.iterrows()):
    if row["alignscore_large"] is not None:
        continue

    premise = row["sentence_context"]
    claim = row["generated"]
    
    df.at[index, "bertscore"] = tuple(map(float, bert_score.score([premise], [claim], model_type="roberta-base")))
    df.at[index, "bertscore_avgtop2"] = bertscore_solve(premise, claim)
    df.at[index, "alignscore_base"] = align_base.score(contexts=[premise], claims=[claim])[0]
    df.at[index, "alignscore_large"] = align_large.score(contexts=[premise], claims=[claim])[0]
    
    # break at 100
    # save df to outfile
    if True or index % 100 == 0:
        df.to_json(outfile, lines=True, orient="records")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Lightning automatically upgraded your loaded checkpoint from v1.8.0.post1 to v1.9.3. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file models/alignscore/AlignScore-large.ckpt`
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  rank_zero_warn(
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'rober

Computing metric: faithfulness
Using model t5_small_multiclaim
Loaded checkpoint from /home/ullriher/ullriher/data/_paper/metrics/faithfulness/t5_small_multiclaim.jsonl
predicting


0it [00:00, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 45.32it/s]
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 17.61it/s]
20it [00:02,  8.73it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it f

KeyboardInterrupt: 

In [145]:
s = bert_score.score([premise], [claim], model_type="roberta-base")
# (tensor([0.7974]), tensor([0.9319]), tensor([0.8594]))
#convert to tuple of floats
s = tuple(map(float, s))
s

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


(0.7973551750183105, 0.9318747520446777, 0.859382688999176)

In [108]:
preds

[{'generated_token_ids': tensor([    0, 50267, 18289,  1840,   605,  9554,  1437, 50266,  2370,  8453,
           1437, 50265,   737,   547,     2])}]

In [106]:
extracted_preds

['<s><triplet> Harold Godwinson <subj> English king <obj> position held</s>']

In [107]:
facts = extract_triplets(extracted_preds[0])
facts

[('Harold Godwinson', 'position held', 'English king')]

In [95]:
count_facts(facts)

6

In [57]:
extracted_preds

['<s><triplet> R. Kelly <subj> Nas <obj> member of <triplet> Nas <subj> R. Kelly <obj> has part</s>']

In [65]:
def extract_triplets(text):
    triplets = []
    subject, object_, relation = '', '', ''
    text = text.strip()
    current = 'x'
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                triplets.append((subject.strip(), relation.strip(), object_.strip()))
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                triplets.append((subject.strip(), relation.strip(), object_.strip()))
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append((subject.strip(), relation.strip(), object_.strip()))
    return triplets

In [118]:
get_facts("Richard III of England was the first English king to die in battle since Harold Godwinson.")

2024-02-14 00:45:52,928 SequenceTagger predicts: Dictionary with 75 tags: O, S-PERSON, B-PERSON, E-PERSON, I-PERSON, S-GPE, B-GPE, E-GPE, I-GPE, S-ORG, B-ORG, E-ORG, I-ORG, S-DATE, B-DATE, E-DATE, I-DATE, S-CARDINAL, B-CARDINAL, E-CARDINAL, I-CARDINAL, S-NORP, B-NORP, E-NORP, I-NORP, S-MONEY, B-MONEY, E-MONEY, I-MONEY, S-PERCENT, B-PERCENT, E-PERCENT, I-PERCENT, S-ORDINAL, B-ORDINAL, E-ORDINAL, I-ORDINAL, S-LOC, B-LOC, E-LOC, I-LOC, S-TIME, B-TIME, E-TIME, I-TIME, S-WORK_OF_ART, B-WORK_OF_ART, E-WORK_OF_ART, I-WORK_OF_ART, S-FAC


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at studio-ousia/luke-large-finetuned-tacred were not used when initializing LukeForEntityPairClassification: ['luke.embeddings.position_ids']
- This IS expected if you are initializing LukeForEntityPairClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeForEntityPairClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{('England', 'per:origin', 'English'),
 ('English', 'org:country_of_headquarters', 'England'),
 ('Harold Godwinson', 'per:countries_of_residence', 'England'),
 ('Harold Godwinson', 'per:origin', 'English'),
 ('Richard III', 'per:countries_of_residence', 'England'),
 ('Richard III', 'per:origin', 'English'),
 ('first', 'per:countries_of_residence', 'England'),
 ('first', 'per:origin', 'English')}

In [121]:
get_facts("The Battle of Bosworth Field took place in Leicestershire.")

{('The Battle of Bosworth Field',
  'org:stateorprovince_of_headquarters',
  'Leicestershire')}

In [59]:
#autoreload
%load_ext autoreload
%autoreload 2

## Focus, covfefe

In [176]:
from os import path
import json
import bert_score
import sys
from tqdm import tqdm
from utils.datautils import extract_triplets
from transformers import pipeline
from utils.ntbutils import load_user_libs
import pysbd
from utils.datautils import avg_top_n
import pandas as pd
import numpy as np

load_user_libs("/home/ullriher/lib", ".path_include")
from alignscore import AlignScore
from factsumm import FactSumm
from sentence_transformers import CrossEncoder
deberta = CrossEncoder('cross-encoder/nli-deberta-v3-small')

#segmenter 
factsumm = FactSumm()
segmenter = pysbd.Segmenter(language="en", clean=False)
sent_tokenize = segmenter.segment

align_large = AlignScore(model='roberta-large', batch_size=32, device="cuda:0", ckpt_path='/home/ullriher/ullriher/models/alignscore/AlignScore-large.ckpt', evaluation_mode='nli_sp', verbose=False)
align_base = AlignScore(model='roberta-base', batch_size=32, device="cuda:0", ckpt_path='/home/ullriher/ullriher/models/alignscore/AlignScore-base.ckpt', evaluation_mode='nli_sp', verbose=False)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Lightning automatically upgraded your loaded checkpoint from v1.8.0.post1 to v1.9.3. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file models/alignscore/AlignScore-large.ckpt`
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  rank_zero_warn(
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'rober

In [177]:
df[["claims","generated"]].head(3)

Unnamed: 0,claims,generated
0,R. Kelly sold 40 million albums.\nR. Kelly is a musician.\nR. Kelly was recognized as one of the best-selling music artists in the United States.,"[R. Kelly has been a guest vocalist for Nas., R. Kelly has been a guest vocalist for Sean Combs., R. Kelly has been a guest vocalist for The Notorious B.I.G., R. Kelly has been recognized by the Recording Industry Association of America., R. Kelly has been recognized as one of the best-selling music artists in the United States.]"
1,Greenpeace is focused on the issues of over-fishing.\nGreenpeace focuses on multiple environmental issues.\nGreenpeace is an organization.\nGreenpeace is focused on the issues of whaling.\nGreenpeace is focused on the issues of deforestation.,"[Greenpeace is a non-governmental environmental organization., Greenpeace is a US ex-pat environmental activists., Greenpeace is a nonprofit organization., Greenpeace is a non-governmental environmental organization.]"
2,Girls' Generation is a South Korean girl group.\nGirls' Generation was formed by S.M. Entertainment.\nGirls' Generation is a music act.\nGirls' Generation was formed by a South Korean entertainment company.\nGirls' Generation is also known as SNSD.,"[Girls' Generation is a South Korean girl group., Girls' Generation is formed by S.M. Entertainment., Girls' Generation is a South Korean girl group., Girls' Generation is formed by S.M. Entertainment., Girls' Generation is a group.]"


In [194]:
def focus(gold_claims, predicted_claims, same=False):
    if not isinstance(gold_claims, list):
        gold_claims = gold_claims.split("\n")
    if not isinstance(predicted_claims, list):
        predicted_claims = predicted_claims.split("\n")
    
    result = []
    for claim in predicted_claims:
        gold_claims_copy = gold_claims.copy()
        if same: # pop claim from gold_pairs
            # copy gold claims
            for j in range(len(gold_claims_copy)):
                if claim == gold_claims_copy[j]:
                    gold_claims_copy.pop(j)
                    break
        scores=deberta.predict(list(zip(gold_claims_copy, [claim]*len(gold_claims_copy))), apply_softmax=True, show_progress_bar=False)[:,1]
        result.append(np.max(scores))
    return result, np.mean(result) 

focus(df.iloc[0]["claims"], df.iloc[0]["generated"])

([0.00022816141, 0.00032590298, 0.0006033575, 0.00035612102, 0.9962369],
 0.19955009)

In [195]:
def focus_alignscore(gold_claims, predicted_claims, align=align_base, same=False):
    if not isinstance(gold_claims, list):
        gold_claims = gold_claims.split("\n")
    if not isinstance(predicted_claims, list):
        predicted_claims = predicted_claims.split("\n")
    align.verbose = False
    result = []
    for claim in predicted_claims:
        gold_claims_copy = gold_claims.copy()
        if same: # pop claim from gold_pairs
            # copy gold claims
            for j in range(len(gold_claims_copy)):
                if claim == gold_claims_copy[j]:
                    gold_claims_copy.pop(j)
                    break
        score=align.score(["\n".join(gold_claims_copy)], [claim])[0]
        result.append(score)
    return result, np.mean(result) 

focus_alignscore(df.iloc[0]["claims"], df.iloc[0]["generated"])

([0.0008317941683344543,
  0.0013024344807490706,
  0.0024810044560581446,
  0.050148364156484604,
  0.9872347712516785],
 0.20839967370266094)

In [196]:
def bertscore_solve(premise, claim):
    text_sentencewise = sent_tokenize(premise)
    P, R, F = bert_score.score([claim] * len(text_sentencewise), text_sentencewise, model_type="roberta-base")

    return (float(avg_top_n(P, 2)), float(avg_top_n(R, 2)), float(avg_top_n(F, 2)))


# arse first argument as model name
if False:
    model_name = sys.argv[1]
else:
    model_name = "t5_small_multiclaim"

metric = "multiclaim"

print(f"Computing metric: {metric}")
print(f"Using model {model_name}")


df = pd.read_json("/mnt/data/factcheck/claim_extraction/feversum/hf_multiclaim/test.jsonl", lines=True)
df["generated"] = None
predictions = f"/home/ullriher/ullriher/data/_paper/predictions/{model_name}.jsonl"
outfile = f"/home/ullriher/ullriher/data/_paper/metrics/{metric}/{model_name}.jsonl"
skip_title_from_context = False

with open(predictions, "r") as f:
    for i, line in enumerate(f):
        df.at[i, "generated"] = json.loads(line)
        # if context starts with source\n, remove it
        if skip_title_from_context:
            if df.at[i, "sentence_context"].startswith(df.at[i, "source"] + "\n"):
                df.at[i, "sentence_context"] = df.at[i, "sentence_context"][len(df.at[i, "source"]) + 1 :]

# expand df by generated, one row per element of generated list
# df = df.explode("generated").reset_index(drop=True)
df.drop(columns=["source_text"], inplace=True)
# remove leading source\n from sentence_context
df["claims"] = df["claims"].str.split("\n")

df["f_deberta"]=None
df["c_deberta"]=None
df["r_deberta"]=None
df["f_align"]=None
df["c_align"]=None
df["r_align"]=None
df["f_align_mean"]=None
df["f_deberta_mean"]=None
df["c_deberta_mean"]=None
df["c_align_mean"]=None
df["r_deberta_mean"]=None
df["r_align_mean"]=None

if path.exists(outfile):
    df = pd.read_json(outfile, lines=True)
    print(f"Loaded checkpoint from {outfile}")

df = df.dropna(subset=["generated"])
print("predicting")

Computing metric: multiclaim
Using model t5_small_multiclaim
Loaded checkpoint from /home/ullriher/ullriher/data/_paper/metrics/multiclaim/t5_small_multiclaim.jsonl
predicting


In [197]:
for index, row in tqdm(df.iterrows()):
    if row["r_align"] is not None:
        continue
    claims = row["generated"]
    # if claims or generated is empty, skip
    df.at[index, "f_deberta"], df.at[index, "f_deberta_mean"] = focus(row["claims"], claims)
    df.at[index, "c_deberta"], df.at[index, "c_deberta_mean"] = focus(claims, row["claims"])
    df.at[index, "r_deberta"], df.at[index, "r_deberta_mean"] = focus(claims, claims, same=True)
    df.at[index, "f_align"], df.at[index, "f_align_mean"] = focus_alignscore(row["claims"], claims)
    df.at[index, "c_align"], df.at[index, "c_align_mean"] = focus_alignscore(claims, row["claims"])
    df.at[index, "r_align"], df.at[index, "r_align_mean"] = focus_alignscore(claims, claims, same=True)
    
    # break at 100
    # save df to outfile
    if True or index % 100 == 0:
        df.to_json(outfile, lines=True, orient="records")

277it [00:45,  6.07it/s] 


KeyboardInterrupt: 

In [21]:
from os import path
import json
import sys
from tqdm import tqdm
from utils.datautils import extract_triplets
from transformers import pipeline
from utils.ntbutils import load_user_libs
import pysbd
from utils.datautils import avg_top_n
import pandas as pd
import numpy as np

load_user_libs("/home/ullriher/lib", ".path_include")
from alignscore import AlignScore
from factsumm import FactSumm
from sentence_transformers import CrossEncoder
deberta = CrossEncoder('cross-encoder/nli-deberta-v3-small')

#segmenter 
#factsumm = FactSumm()
segmenter = pysbd.Segmenter(language="en", clean=False)
sent_tokenize = segmenter.segment

# align_large = AlignScore(model='roberta-large', batch_size=32, device="cuda:0", ckpt_path='/home/ullriher/ullriher/models/alignscore/AlignScore-large.ckpt', evaluation_mode='nli_sp', verbose=False)
align_base = AlignScore(model='roberta-base', batch_size=32, device="cuda:0", ckpt_path='/home/ullriher/ullriher/models/alignscore/AlignScore-base.ckpt', evaluation_mode='nli_sp', verbose=False)


def focus(gold_claims, predicted_claims, same=False):
    if not isinstance(gold_claims, list):
        gold_claims = gold_claims.split("\n")
    if not isinstance(predicted_claims, list):
        predicted_claims = predicted_claims.split("\n")
    if len(gold_claims) == 0 or len(predicted_claims) == 0:
        return [], 0
    
    result = []
    for claim in predicted_claims:
        gold_claims_copy = gold_claims.copy()
        if same: # pop claim from gold_pairs
            # copy gold claims
            for j in range(len(gold_claims_copy)):
                if claim == gold_claims_copy[j]:
                    gold_claims_copy.pop(j)
                    break
        scores=deberta.predict(list(zip(gold_claims_copy, [claim]*len(gold_claims_copy))), apply_softmax=True, show_progress_bar=False)[:,1]
        result.append(np.max(scores))
    return result, np.mean(result) 

def focus_alignscore(gold_claims, predicted_claims, align=align_base, same=False):
    if not isinstance(gold_claims, list):
        gold_claims = gold_claims.split("\n")
    if not isinstance(predicted_claims, list):
        predicted_claims = predicted_claims.split("\n")
    if len(gold_claims) == 0 or len(predicted_claims) == 0:
        return [], 0
    align.verbose = False
    result = []
    for claim in predicted_claims:
        gold_claims_copy = gold_claims.copy()
        if same: # pop claim from gold_pairs
            # copy gold claims
            for j in range(len(gold_claims_copy)):
                if claim == gold_claims_copy[j]:
                    gold_claims_copy.pop(j)
                    break
        score=align.score(["\n".join(gold_claims_copy)], [claim])[0]
        result.append(score)
    return result, np.mean(result) 


# arse first argument as model name
if False:
    model_name = sys.argv[1]
else:
    model_name = "t5_small_multiclaim"

metric = "multiclaim"

print(f"Computing metric: {metric}")
print(f"Using model {model_name}")


df = pd.read_json("/mnt/data/factcheck/claim_extraction/feversum/hf_multiclaim/test.jsonl", lines=True)
df["generated"] = None
predictions = f"/home/ullriher/ullriher/data/_paper/predictions/{model_name}.jsonl"
outfile = f"/home/ullriher/ullriher/data/_paper/metrics/{metric}/{model_name}.jsonl"
skip_title_from_context = False

with open(predictions, "r") as f:
    for i, line in enumerate(f):
        df.at[i, "generated"] = json.loads(line)
        # if context starts with source\n, remove it
        if skip_title_from_context:
            if df.at[i, "sentence_context"].startswith(df.at[i, "source"] + "\n"):
                df.at[i, "sentence_context"] = df.at[i, "sentence_context"][len(df.at[i, "source"]) + 1 :]



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Lightning automatically upgraded your loaded checkpoint from v1.7.7 to v1.9.3. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file models/alignscore/AlignScore-base.ckpt`
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  rank_zero_warn(


Computing metric: multiclaim
Using model t5_small_multiclaim


In [22]:
df.head(3)

Unnamed: 0,source,sentence_id,claims,source_text,sentence,sentence_context,generated
0,R. Kelly,6140,R. Kelly sold 40 million albums.\nR. Kelly is ...,R. Kelly\nRobert Sylvester Kelly (born January...,"In 2002 and 2004, Kelly released collaboration...","R. Kelly\nIn 1996, Kelly was nominated for a G...","[R. Kelly has been a guest vocalist for Nas., ..."
1,Greenpeace,4799,Greenpeace is focused on the issues of over-fi...,Greenpeace\nGreenpeace is a non-governmental e...,Founded by Canadian and US ex-pat environmenta...,Greenpeace\nGreenpeace is a non-governmental e...,[Greenpeace is a non-governmental environmenta...
2,Girls' Generation,4773,Girls' Generation is a South Korean girl group...,"Girls' Generation\nGirls' Generation, also kno...","Girls' Generation, also known as SNSD, is a So...","Girls' Generation\nGirls' Generation, also kno...",[Girls' Generation is a South Korean girl grou...


In [23]:
# show where generated is null
df[df["generated"].isna()]

Unnamed: 0,source,sentence_id,claims,source_text,sentence,sentence_context,generated


In [24]:
df.reset_index(drop=True, inplace=True)

In [25]:
# expand df by generated, one row per element of generated list
# df = df.explode("generated").reset_index(drop=True)
if "source_text" in df.columns: 
    df.drop(columns=["source_text"], inplace=True)
# remove leading source\n from sentence_context
df["claims"] = df["claims"].str.split("\n")

df["f_deberta"]=None
df["c_deberta"]=None
df["r_deberta"]=None
df["f_align"]=None
df["c_align"]=None
df["r_align"]=None
df["f_align_mean"]=None
df["f_deberta_mean"]=None
df["c_deberta_mean"]=None
df["c_align_mean"]=None
df["r_deberta_mean"]=None
df["r_align_mean"]=None

if path.exists(outfile):
    df = pd.read_json(outfile, lines=True)
    print(f"Loaded checkpoint from {outfile}")

df = df.dropna(subset=["generated"])
print("predicting")

Loaded checkpoint from /home/ullriher/ullriher/data/_paper/metrics/multiclaim/t5_small_multiclaim.jsonl
predicting


In [26]:
df

Unnamed: 0,source,sentence_id,claims,sentence,sentence_context,generated,f_deberta,c_deberta,r_deberta,f_align,c_align,r_align,f_align_mean,f_deberta_mean,c_deberta_mean,c_align_mean,r_deberta_mean,r_align_mean
0,R. Kelly,6140,"[R. Kelly sold 40 million albums., R. Kelly is...","In 2002 and 2004, Kelly released collaboration...","R. Kelly\nIn 1996, Kelly was nominated for a G...","[R. Kelly has been a guest vocalist for Nas., ...","[0.0002281614, 0.000325903, 0.0006033575000000...","[0.0001285921, 0.9811982512, 0.995229125]","[0.00027731990000000003, 0.0001378678000000000...","[0.0008317942, 0.0013024345, 0.0024810045, 0.0...","[0.0022423693000000002, 0.9961805344, 0.987279...","[0.0048222994000000005, 0.0043610968, 0.010591...",0.208400,0.199550,0.658852,0.661901,0.000200,0.022772
1,Greenpeace,4799,[Greenpeace is focused on the issues of over-f...,Founded by Canadian and US ex-pat environmenta...,Greenpeace\nGreenpeace is a non-governmental e...,[Greenpeace is a non-governmental environmenta...,"[0.001277686, 0.0001793168, 0.0019212369, 0.00...","[0.00021571100000000002, 0.10045540330000001, ...","[0.9955329895, 9.41527e-05, 0.9884605408, 0.99...","[0.0733137056, 0.0035160400000000002, 0.022341...","[0.0006854869, 0.1337228417, 0.9841062427, 0.0...","[0.9759155512000001, 0.0031969612, 0.133909627...",0.043121,0.001164,0.219137,0.223968,0.744905,0.522234
2,Girls' Generation,4773,[Girls' Generation is a South Korean girl grou...,"Girls' Generation, also known as SNSD, is a So...","Girls' Generation\nGirls' Generation, also kno...",[Girls' Generation is a South Korean girl grou...,"[0.9857805371, 0.9966781139, 0.9857805371, 0.9...","[0.9857805371, 0.9961774349, 0.007004799300000...","[0.9857805371, 0.9911391139000001, 0.985780537...","[0.9809049368, 0.9545260668000001, 0.980904936...","[0.9862440228, 0.9725143909, 0.6093394756, 0.9...","[0.9858044982, 0.9702588320000001, 0.985804498...",0.971409,0.990029,0.593890,0.704504,0.987814,0.981001
3,Lisa Lopes,617,[Lisa Lopes collaborated with rapper Lil' Kim....,"Lisa Nicole Lopes (May 27, 1971 - April 25, 20...","Lisa Lopes\nLisa Nicole Lopes (May 27, 1971 - ...","[Lisa Lopes was a hip hop singer., Lisa Lopes ...","[0.3414902091, 0.9561726451, 0.0172328167, 0.0...","[0.00041509500000000004, 0.0137727642, 0.01636...","[0.9940065145, 0.7346350551, 0.0008511407, 0.1...","[0.0144883571, 0.0024146722, 0.0859003961, 0.0...","[0.0033907159, 0.0025693241, 0.0064581046, 0.0...","[0.938795507, 0.0538470112, 0.0028041604, 0.01...",0.021860,0.279670,0.006249,0.003524,0.473433,0.326412
4,Linkin Park,5387,[Linkin Park released the album Hybrid Theory....,"Formed in 1996, the band rose to international...",Linkin Park\nLinkin Park is an American rock b...,"[Linkin Park is a rock band., Linkin Park's de...","[0.8129991889, 0.0013062379000000001, 0.000167...","[0.9977391958, 0.9891306162000001]","[0.8427196145, 0.00021641180000000001, 0.00015...","[0.0634528026, 0.0574352629, 0.002694009100000...","[0.9855245948, 0.9902856946]","[0.3731473386, 0.0019890338, 0.000768648100000...",0.032827,0.203682,0.993435,0.987905,0.210814,0.095506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
439,Angela Lansbury,6003,[Angela Lansbury contributed to animated films...,"She also moved into voice work, thereby contri...","Angela Lansbury\nThrough Corymore Productions,...",[Angela Lansbury contributed to Disney's Beaut...,"[0.000523449, 0.9968160987, 0.0004051966]","[0.9968160987, 0.9545787573000001]","[0.058683347000000004, 0.9861628413, 0.9956840...","[0.0018639604000000001, 0.9938004613, 0.001941...","[0.9853401780000001, 0.06975777450000001]","[0.47333866360000004, 0.9294198155000001, 0.98...",0.332535,0.332582,0.975697,0.527549,0.680177,0.796507
440,Seinfeld,4679,"[Seinfeld was named the ""number 1 reason the '...","E! named the series the ""number 1 reason the'9...","Seinfeld\nIn 2013, the Writers Guild of Americ...","[Seinfeld is a TV series., Seinfeld is a serie...","[0.004850402, 0.0220304672, 0.0220304672]","[0.0001621146, 0.0006575976]","[0.9433251023, 0.9953304529, 0.9953304529]","[0.1863470674, 0.2847853005, 0.2847853005]","[0.0048092376, 0.0224094186]","[0.9391465187, 0.9925837517, 0.9925837517]",0.251973,0.016304,0.000410,0.013609,0.977995,0.974771
441,Sherlock (TV series),5302,"[Sherlock has thirteen episodes., Sherlock is ...","Thirteen episodes have been produced, with thr...",Sherlock (TV series)\nCreated by Steven Moffat...,"[Sherlock is a television series., Sherlock is...","[0.9980496168, 0.9957154393000001, 0.000801315...","[0.1388856322, 0.9979077578000001]","[0.9918445945000001, 0.9954407215000001, 0.000...","[0.9894272685000001, 0.9855688214, 0.004765541...","[0.0025653131, 0.9859529734]","[0.9809195399, 0.9895373583, 0.004161926000000...",0.741333,0.747570,0.568397,0.494259,0.745783,0.741039
442,Christopher Lloyd,4911,[Christopher Lloyd voiced a character name The...,He has also done extensive voiceover work for ...,Christopher Lloyd\nHe earned a third Emmy for ...,"[Christopher Lloyd voicing The Hacker., Christ...","[0.9894266129, 0.9683456421000001, 0.982550323...","[0.9775460362, 0.9943634272, 0.002574031, 0.98...","[0.00020294560000000002, 0.9619967937, 0.00039...","[0.9046993256, 0.9577310085, 0.43554458020000003]","[0.9536542892000001, 0.9738487005, 0.370740354...","[0.0044621211, 0.28494578600000003, 0.00279645...",0.765992,0.980108,0.566172,0.605810,0.320866,0.097401


In [27]:

for index, row in tqdm(df.iterrows()):
    if row["r_align"] is not None:
        continue
    claims = list(row["generated"].copy())
    gold_claims = list(row["claims"].copy())
    
    # if claims or generated is empty, skip
    df.at[index, "f_deberta"], df.at[index, "f_deberta_mean"] = focus(gold_claims, claims)
    df.at[index, "c_deberta"], df.at[index, "c_deberta_mean"] = focus(claims, gold_claims)
    df.at[index, "r_deberta"], df.at[index, "r_deberta_mean"] = focus(claims, claims, same=True)
    df.at[index, "f_align"], df.at[index, "f_align_mean"] = focus_alignscore(gold_claims, claims)
    df.at[index, "c_align"], df.at[index, "c_align_mean"] = focus_alignscore(claims, gold_claims)
    df.at[index, "r_align"], df.at[index, "r_align_mean"] = focus_alignscore(claims, claims, same=True)
    
    # break at 100
    # save df to outfile
    if True or index % 100 == 0:
        df.to_json(outfile, lines=True, orient="records")

444it [00:00, 23800.51it/s]


In [37]:
from factsumm import FactSumm
from factsumm.utils.utils import qags_score

factsumm = FactSumm()

def focus(gold_claims: list[str], pred_claims: list[str], verbose: bool = False, same=False) -> float:
    try:
        if isinstance(factsumm.qg, str) or isinstance(factsumm.qa, str) or isinstance(factsumm.ner, str):
            factsumm.extract_qas("b", " ".join(pred_claims), verbose=False, device="cuda:0")
                
        # gold_entities = factsumm.ner(gold_claims)
        pred_entities = factsumm.ner(pred_claims)
        Q = factsumm.qg(pred_claims, pred_entities)

        gold_answers = factsumm.qa(" ".join(gold_claims), Q)
        pred_answers = factsumm.qa(" ".join(pred_claims), Q)

        
        if verbose:
            factsumm._print_qas("gold", gold_answers)
            factsumm._print_qas("pred", pred_answers)

        focus = qags_score(gold_answers, pred_answers)
        if verbose:
            print(f"QAGS Score: {focus}\n")

        return focus, pred_entities, Q, gold_answers, pred_answers
    except Exception as e:
        print(e)
        return np.nan, [], [], [], []

In [40]:
from factsumm.utils.utils import qags_score
focus(df["generated"][0],df["claims"][0])

(0.5,
 [[{'word': 'R. Kelly', 'entity': 'PERSON', 'start': 0, 'end': 8},
   {'word': '40 million', 'entity': 'CARDINAL', 'start': 14, 'end': 24}],
  [{'word': 'R. Kelly', 'entity': 'PERSON', 'start': 0, 'end': 8}],
  [{'word': 'R. Kelly', 'entity': 'PERSON', 'start': 0, 'end': 8},
   {'word': 'one', 'entity': 'CARDINAL', 'start': 27, 'end': 30},
   {'word': 'the United States', 'entity': 'GPE', 'start': 68, 'end': 85}]],
 [{'question': 'Who sold 40 million albums?', 'answer': 'R. Kelly'},
  {'question': 'How many albums did Kelly sell?', 'answer': '40 million'},
  {'question': 'Who is a musician?', 'answer': 'R. Kelly'},
  {'question': 'Who was one of the best selling artists in the US?',
   'answer': 'R. Kelly'},
  {'question': 'How many times has Kelly been rated as a best selling artist in the US?',
   'answer': 'one'},
  {'question': 'Where was R. Kelly a well-known artist?',
   'answer': 'the United States'}],
 [{'question': 'Who sold 40 million albums?',
   'answer': 'R. Kelly',
