In [1]:
import numpy as np
from tqdm import tqdm

from collections import defaultdict, OrderedDict, Counter
from dataclasses import dataclass
import datetime as dt
from itertools import chain
import os
import pathlib
from pathlib import Path
import pandas as pd
import unicodedata as ud
from time import time
from typing import Dict, Type, Callable, List, Union
import sys
import ujson

import torch
import transformers
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
from datasets import load_dataset

from aic_nlp_utils.json import read_jsonl, read_json, write_json, write_jsonl
from aic_nlp_utils.encoding import nfc
from aic_nlp_utils.fever import fever_detokenize

%load_ext autoreload
%autoreload 2

**TODO** move elsewhere NLI models should be covered in own package. Currently it is here for convenience only.

In [16]:
# LANG = "cs"
# # DATE = "20230220" # wrong contexts (titles) here!
# DATE = "20230801"
# TRAIN_DATA = "claim"
# NER_DIR = "PAV-ner-CNEC"
# QG_DIR = "mt5-large-cp59k"
# QACG_DIR = "mbart-large-cc25_cp26k"

LANG = "en"
DATE = "20230220" # wrong contexts (titles) here!
TRAIN_DATA = "claim"
NER_DIR = "stanza"
QG_DIR = "mt5-large-cp64k"
QACG_DIR = "mbart-large-cc25_cp30k"

# BELOW configuration is language-agnostic

WIKI_ROOT = f"/mnt/data/factcheck/wiki/{LANG}/{DATE}"
WIKI_CORPUS = f"{WIKI_ROOT}/paragraphs/{LANG}wiki-{DATE}-paragraphs.jsonl"

QACG_ROOT = f"{WIKI_ROOT}/qacg"

CLAIM_DIR = Path(TRAIN_DATA, NER_DIR, QG_DIR, QACG_DIR)
CLAIM_ROOT = Path(QACG_ROOT, CLAIM_DIR)

TRAIN_FILES = {"s": Path(CLAIM_ROOT, "train_support.json"), 
               "r": Path(CLAIM_ROOT, "train_refute.json"),
               "n": Path(CLAIM_ROOT, "train_nei.json")}

DEV_FILES = {"s": Path(CLAIM_ROOT, "dev_support.json"), 
               "r": Path(CLAIM_ROOT, "dev_refute.json"),
               "n": Path(CLAIM_ROOT, "dev_nei.json")}

TEST_FILES = {"s": Path(CLAIM_ROOT, "test_support.json"), 
               "r": Path(CLAIM_ROOT, "test_refute.json"),
               "n": Path(CLAIM_ROOT, "test_nei.json")}


TRAIN_FILES_SR = {"s": Path(CLAIM_ROOT, "train_support.json"), 
               "r": Path(CLAIM_ROOT, "train_refute.json")}

DEV_FILES_SR = {"s": Path(CLAIM_ROOT, "dev_support.json"), 
               "r": Path(CLAIM_ROOT, "dev_refute.json")}

TEST_FILES_SR = {"s": Path(CLAIM_ROOT, "test_support.json"), 
               "r": Path(CLAIM_ROOT, "test_refute.json")}

In [31]:
def import_corpus(corpus_file):
    # it already has correct format
    raw = read_jsonl(corpus_file, show_progress=True)
    for e in raw:
        e["id"] = nfc(e["id"])
        e["did"] = nfc(e["did"])
        e["text"] = nfc(e["text"])
    return raw


def generate_original_id2pid_mapping(corpus):
    original_id2pid = {}
    for pid, r in enumerate(corpus):
        original_id = r["id"]
        # assert original_id not in original_id2pid, f"original ID not unique! {original_id}"
        if original_id in original_id2pid:
            print(f"original ID not unique! {pid} {original_id}, previous pid: {original_id2pid[original_id]}")
        original_id2pid[original_id] = pid
    return original_id2pid

corpus = import_corpus(WIKI_CORPUS)
original_id2pid = generate_original_id2pid_mapping(corpus)

0.00it [00:00, ?it/s]

In [44]:
def prepare_nli_data(src_files, dst_file, corpus, original_id2pid, seed=1234):
    rng = np.random.RandomState(seed)
    recs = []
    counts = Counter()
    for label, fname in src_files.items():
        data = read_json(fname)
        for k, v in tqdm(data.items()):
            context = corpus[original_id2pid[k]]["text"]
            for ner, claim in v.items():
                recs.append({"claim": claim, "context": context, "label": label})
                counts[label] += 1
    rng.shuffle(recs)
    print(f"exporting {len(recs)}, label counts: {counts} to:\n {str(dst_file)}")
    write_jsonl(dst_file, recs)

# prepare_nli_data(TRAIN_FILES, Path(CLAIM_ROOT, "train_nli.jsonl"), corpus, original_id2pid, seed=1234)
# prepare_nli_data(DEV_FILES, Path(CLAIM_ROOT, "dev_nli.jsonl"), corpus, original_id2pid, seed=1235)
# prepare_nli_data(TEST_FILES, Path(CLAIM_ROOT, "test_nli.jsonl"), corpus, original_id2pid, seed=1236)

# prepare_nli_data(TRAIN_FILES_SR, Path(CLAIM_ROOT, "train_nli_sr.jsonl"), corpus, original_id2pid, seed=1234)
# prepare_nli_data(DEV_FILES_SR, Path(CLAIM_ROOT, "dev_nli_sr.jsonl"), corpus, original_id2pid, seed=1235)
# prepare_nli_data(TEST_FILES_SR, Path(CLAIM_ROOT, "test_nli_sr.jsonl"), corpus, original_id2pid, seed=1236)

# Test Models

In [17]:
raw_nli = load_dataset("json", data_files={
    "train": str(Path(CLAIM_ROOT, "train_nli.jsonl")),
    "dev": str(Path(CLAIM_ROOT, "dev_nli.jsonl")),
    "test": str(Path(CLAIM_ROOT, "test_nli.jsonl"))
    })

# raw_nli = load_dataset("json", data_files={
#     "train": str(Path(CLAIM_ROOT, "train_nli_sr.jsonl")),
#     "dev": str(Path(CLAIM_ROOT, "dev_nli_sr.jsonl")),
#     "test": str(Path(CLAIM_ROOT, "test_nli_sr.jsonl"))
#     })

Found cached dataset json (/home/drchajan/.cache/huggingface/datasets/json/default-44857ba4e44e6e2b/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/3 [00:00<?, ?it/s]

In [18]:
for i, c in enumerate(raw_nli["train"]["context"]):
    if len(c) < 30:
        print(i, c)

29 SS Arthur M. Anderson
33 Tracteurs FAR
35 Frederick Mair
40 Ralph B. DeWitt
49 Tsurumi, Ōita
61 Vili Matula
84 Terry and Me
144 Carrie Ingalls
161 Franz Karl of Auersperg
162 Christoph Betzl
170 Iowa Auditor of State
211 Marsden Manfred
223 Abbie Cornish
224 One true church
229 Nick Warren
232 1994–95 QMJHL season
233 K.R.C. Gent
240 Killing of Ronnie Paris
247 Drake Maye
251 Zhenxun
273 Shure SM7
290 Pampaqucha (Ayacucho)
295 Edward Hewetson
360 Sai Kiran
363 Hailey Armstrong
367 Taylor Leach
396 Marcin Jędrusiński
412 Plomo Mummy
427 Stephen Adams (politician)
454 Gillan (band)
465 Forfar Town and County Hall
490 Anceya giraudi
493 Harvey Watkins Jr.
535 Randallstown, Maryland
541 Vulcan changeup
586 Edgar Smith Wigg
605 Cornmill Stream
612 Longworth Powers
617 Academy for German Law
628 Antaeotricha theoretica
640 Vitamin Smith
665 Matthew Paul Moyle
667 Revekka Galperina
677 Socialism in Italy
715 Pensnett Chase
719 Palestine (2011 book)
724 Vkusno i tochka
734 Clint Bolton
749 

In [56]:
# model_id = "/home/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/nli/google/flan-t5-base_cs_CZ/checkpoint-896"
# model_id = "/home/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/nli/google/flan-t5-large_cs_CZ/checkpoint-1568"
# model_id = "/home/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/nli/google/flan-t5-large_cs_CZ-20230801/checkpoint-6144"
# model_id = "/home/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/nli/google/flan-t5-large_cs_CZ-20230801/checkpoint-23936"

# SUPPORT/REFUTE only models
# model_id = "/home/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/nli/google/umt5-base_cs_CZ-20230801_sr/checkpoint-23936"
# model_id = "/home/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/nli/google/flan-t5-large_cs_CZ-20230801_sr/checkpoint-256"

# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map="auto")

# Encoder (Softmax) models
# model_id = "ctu-aic/xlm-roberta-large-xnli-csfever"
# model_id = "/home/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/nli/deepset/xlm-roberta-large-squad2_cs_CZ-20230801_lr1e-6/checkpoint-41792"
# model_id = "/home/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/nli/deepset/xlm-roberta-large-squad2_en_US-20230220_lr1e-6/checkpoint-48416"
model_id = "/home/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/nli_fever/deepset/xlm-roberta-large-squad2_en_US_lr1e-6/checkpoint-132864"

id2label = {0: "s", 1: "r", 2: "n"}
label2id = {"s": 0, "r": 1, "n": 1}
tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForSequenceClassification.from_pretrained(model_id, device_map="auto", id2label=id2label, label2id=label2id)
model = AutoModelForSequenceClassification.from_pretrained(model_id, device_map="auto")


In [57]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from aic_nlp_utils.batch import batch_apply

def split_predict(model, split, batch_size=128, device="cuda", max_length=128):
    def predict(inputs):
        X = tokenizer(inputs, max_length=max_length, padding=True, truncation=True, return_tensors="pt")
        input_ids = X["input_ids"].to(device)
        attention_mask = X["attention_mask"].to(device)
        with torch.no_grad():
            Y = model(input_ids=input_ids, attention_mask=attention_mask).logits
            return Y
        
    inputs = [[claim, context] for claim, context in zip(split["claim"],  split["context"])]
    # inputs = [[context, claim] for claim, context in zip(split["claim"],  split["context"])] # SWITCHED CTX and CLAIM!!!
    Ys = batch_apply(predict, inputs, batch_size=batch_size, show_progress=True)
    Y = torch.vstack(Ys)
    C = [model.config.id2label[id_.item()] for id_ in Y.argmax(dim=1)]
    T = [l for l in split["label"]]
    return Y, C, T

# Y, C, T = split_predict(model, raw_nli["test"], device="cuda", max_length=128) # FAST
Y, C, T = split_predict(model, raw_nli["test"], device="cuda", max_length=512) # CORRECT, set to model maximum input length

  0%|          | 0/174 [00:00<?, ?it/s]

In [58]:
print(f"acc: {accuracy_score(T, C)}")
print(f"F1: {f1_score(T, C, average='macro')}")
# print(f"cm:\n{confusion_matrix(T, C)}")
print()
print(f"cm:\n{confusion_matrix(T, C, labels=['s', 'r', 'n'])}")
print()
print(f"C={Counter(C)}")
print(f"T={Counter(T)}")

acc: 0.6492715709710883
F1: 0.5975708877212096

cm:
[[7657  233  210]
 [3508 1488 1310]
 [ 591 1924 5250]]

C=Counter({'s': 11756, 'n': 6770, 'r': 3645})
T=Counter({'s': 8100, 'n': 7765, 'r': 6306})


In [53]:
print(f"acc: {accuracy_score(T, C)}")
print(f"F1: {f1_score(T, C, average='macro')}")
# print(f"cm:\n{confusion_matrix(T, C)}")
print()
print(f"cm:\n{confusion_matrix(T, C, labels=['s', 'r', 'n'])}")
print()
print(f"C={Counter(C)}")
print(f"T={Counter(T)}")

acc: 0.9010419015831491
F1: 0.8978291959432556

cm:
[[7504  568   28]
 [1120 5171   15]
 [ 195  268 7302]]

C=Counter({'s': 8819, 'n': 7345, 'r': 6007})
T=Counter({'s': 8100, 'n': 7765, 'r': 6306})


In [7]:
def preprocess_function(examples):
    max_length = 128
    claims = examples["claim"]
    contexts = examples["context"]
    targets = examples["label"]
    inputs = [claim + "</s>" + context for claim, context in zip(claims, contexts)]
    # inputs = [context + "</s>" + claim for claim, context in zip(claims, contexts)]
    model_inputs = tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
    # labels = tokenizer(targets, max_length=3, padding="max_length", truncation=True, return_tensors="pt")
    # labels = labels["input_ids"]
    # labels[labels == tokenizer.pad_token_id] = -100
    # model_inputs["labels"] = labels
    return model_inputs

# tokenized_nli = raw_nli.map(preprocess_function, batched=True,  
#                     #   remove_columns=raw_nli["train"].column_names,
#                       load_from_cache_file=False)
# tokenized_nli.set_format(type='torch', columns=['input_ids', 'attention_mask'])

In [37]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

def split_predict(model, split, batch_size=256):
    def predict_batch(inputs):
        Y = model(input_ids=inputs["input_ids"].to("cuda"), attention_mask=inputs["attention_mask"].to("cuda")).logits.argmax(dim=1)
        C = [model.config.id2label[id_.item()] for id_ in Y]
        out = {"pred": C}
        return out
    
    split = split.map(predict_batch, batch_size=batch_size, batched=True)
    C = split["pred"]
    T = tokenized_nli["dev"]["label"]
    return C, T

def split_predict_generate(model, split, batch_size=256):
    def predict_batch(inputs):
        Y = model.generate(input_ids=inputs["input_ids"].to("cuda"), attention_mask=inputs["attention_mask"].to("cuda"))
        out = {"pred": tokenizer.batch_decode(Y, skip_special_tokens=True)}
        return out
    
    split = split.map(predict_batch, batch_size=batch_size, batched=True)
    C = split["pred"]
    T = tokenized_nli["dev"]["label"]
    return C, T

# C, T = split_predict(model, tokenized_nli["dev"], batch_size=64)
# C, T = split_predict_generate(model, tokenized_nli["dev"])

In [47]:
# old type of models used in FactSearch
from prediction.nli import SupportRefuteNEIModel # Make OBSOLETE
from sentence_transformers.cross_encoder import CrossEncoder

model_id = "ctu-aic/xlm-roberta-large-xnli-csfever"
id2label = {0: "s", 1: "r", 2: "n"}
label2id = {"s": 0, "r": 1, "n": 1}
model = CrossEncoder(model_id, device="cuda")

In [49]:
from sklearn.metrics import accuracy_score, confusion_matrix
from aic_nlp_utils.batch import batch_apply

def split_predict_generate(model, split, batch_size=128):
    def predict(inputs):
        X = tokenizer(inputs, max_length=128, padding="max_length", truncation=True, return_tensors="pt")
        input_ids = X["input_ids"].to("cuda")
        attention_mask = X["attention_mask"].to("cuda")
        Y = model.generate(input_ids=input_ids, attention_mask=attention_mask)
        C = tokenizer.batch_decode(Y, skip_special_tokens=True)
        return C

def split_predict_crossencoder(model, split, batch_size=10*128):
    def predict(inputs):
        print(len(inputs))
        Y = model.predict(inputs).argmax(axis=1)
        C = [id2label[id_.item()] for id_ in Y]
        return C
        
    # SWITCHED CTX and CLAIM!!!
    inputs = [[context, claim] for claim, context in zip(split["claim"],  split["context"])]
    C = batch_apply(predict, inputs, batch_size=batch_size)
    T = [l for l in split["label"]]
    return C, T

# C, T = split_predict_generate(model, raw_nli["dev"])
C, T = split_predict_crossencoder(model, raw_nli["dev"])
print(f"acc: {accuracy_score(T, C)}")
print(f"cm:\n{confusion_matrix(T, C)}")

1280
1280
1280
1280
1280
1280
1280
1280
1280
1280
1280
1280
1280
1280
1280
1280
1280
1280
1280
647
acc: 0.446749709616694
cm:
[[   0    0    0]
 [2359 1251 8553]
 [2504  397 9903]]


In [31]:
C, T = split_predict(model, raw_nli["test"])
print(f"acc: {accuracy_score(T, C)}")
print(f"cm:\n{confusion_matrix(T, C)}")

acc: 0.5060786993741457
cm:
[[4183   68  188]
 [2349  826 1184]
 [2531  546 2026]]


In [26]:
C, T = split_predict(model, raw_nli["train"])
print(f"acc: {accuracy_score(T, C)}")
print(f"cm:\n{confusion_matrix(T, C)}")

acc: 0.49263804969579006
cm:
[[    0     0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0     0]
 [    0     0     1     0     0     0     0     0 41839  1438  4126]
 [    1     0     0     1     0     0     0     0 19317  7391 17376]
 [    1     1     1     0     1     1     2     1 23257  6941 21133]]
