In [1]:
from collections import Counter, OrderedDict
import numpy as np
from pathlib import Path
import pandas as pd
import sys
import textwrap
from tqdm import tqdm
import torch
from typing import Dict, List, Set, Union

import evaluate
from transformers import Seq2SeqTrainingArguments
import bert_score

import unicodedata
import uuid

from aic_nlp_utils.batch import batch_apply
from aic_nlp_utils.encoding import nfc
from aic_nlp_utils.json import read_jsonl, read_json, write_json, write_jsonl
from aic_nlp_utils.fever import fever_detokenize, import_fever_corpus_from_sqlite
from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs
import stanza
# stanza.download("en")

from zshot_fact_verify.models.arguments import ModelArguments, DataTrainingArguments
from zshot_fact_verify.models.load import load_tokenizer_and_model, find_last_checkpoint

%load_ext autoreload
%autoreload 2

# Combine and Fix QA2D Datasets

In [8]:
def combine_qa2ds(split_files, out_file, seed=1234):
    """Merge per-language QA2D splits into one shuffled JSONL file.

    Args:
        split_files: Iterable of single-entry dicts, each mapping a language
            tag to the path of that language's JSONL split.
        out_file: Path of the combined JSONL file to write.
        seed: Seed of the ``np.random.RandomState`` used for shuffling.

    Each record gets a "lang" field with its language tag, and the first
    character of its "turker_answer" is upper-cased. The shuffled records are
    written with ``write_jsonl(out_file, ..., mkdir=True)``.

    Raises:
        AssertionError: If an entry of ``split_files`` does not hold exactly
            one language -> path pair.
    """
    rng = np.random.RandomState(seed)
    data = []
    for sfile in split_files:
        assert len(sfile) == 1, f"expected exactly one lang -> path entry, got: {sfile}"
        # Explicit unpacking instead of a `for ...: pass` loop that relied on
        # the leaked loop variables; it also fails loudly on a malformed entry
        # when assertions are disabled.
        ((lang, path_),) = sfile.items()
        split = read_jsonl(path_)
        for s in split:
            ta = s["turker_answer"]
            # Temporary fix for answers starting with a lower-case letter;
            # the per-language files themselves are repaired by fix_qa2ds.
            s["turker_answer"] = ta[:1].upper() + ta[1:]
            s["lang"] = lang
        data += split
    rng.shuffle(data)
    print(f"writing {len(data)} records to {out_file}")
    write_jsonl(out_file, data, mkdir=True)

# Merge the per-language dev splits (cs, en, pl, sk) into one shuffled
# multilingual dev file.
combine_qa2ds([
    {"cs": "/mnt/data/factcheck/qa2d/cs/dev.jsonl"},
    {"en": "/mnt/data/factcheck/qa2d/en/dev.jsonl"},
    {"pl": "/mnt/data/factcheck/qa2d/pl/dev.jsonl"},
    {"sk": "/mnt/data/factcheck/qa2d/sk/dev.jsonl"}], 
    "/mnt/data/factcheck/qa2d/cs_en_pl_sk/dev.jsonl")

# Same for the train splits.
combine_qa2ds([
    {"cs": "/mnt/data/factcheck/qa2d/cs/train.jsonl"},
    {"en": "/mnt/data/factcheck/qa2d/en/train.jsonl"},
    {"pl": "/mnt/data/factcheck/qa2d/pl/train.jsonl"},
    {"sk": "/mnt/data/factcheck/qa2d/sk/train.jsonl"}], 
    "/mnt/data/factcheck/qa2d/cs_en_pl_sk/train.jsonl")

writing 41376 records to /mnt/data/factcheck/qa2d/cs_en_pl_sk/dev.jsonl
writing 242840 records to /mnt/data/factcheck/qa2d/cs_en_pl_sk/train.jsonl


In [20]:
def fix_qa2ds(split_files):
    """Upper-case the first letter of the answers in QA2D JSONL files, in place.

    Answers sometimes started with lower-case letters. For each file the
    original is renamed to "<file>.orig" as a backup, then the records are
    written back under the original name with the first character of
    "rule-based" and "turker_answer" upper-cased.

    Args:
        split_files: Iterable of JSONL file paths (``str`` or ``pathlib.Path``).

    Raises:
        AssertionError: If "<file>.orig" already exists. The check runs before
            that file is read or renamed; files earlier in ``split_files``
            have already been processed at that point.
    """
    for sfile in split_files:
        # str() keeps pathlib.Path arguments working (Path + str is a TypeError).
        backup = Path(str(sfile) + ".orig")
        # Fail fast: check for an existing backup before loading the file.
        assert not backup.is_file(), f"already exists: '{sfile}.orig'"
        lines = read_jsonl(sfile)
        Path(sfile).rename(backup)  # backup
        for r in lines:
            rb = r["rule-based"]
            ta = r["turker_answer"]
            # Slicing with [:1] keeps empty strings intact.
            r["rule-based"] = rb[:1].upper() + rb[1:]
            r["turker_answer"] = ta[:1].upper() + ta[1:]
        print(f"writing {len(lines)} records to {sfile}")
        write_jsonl(sfile, lines, mkdir=True)

# One-off, in-place repair of every per-language split. fix_qa2ds renames each
# original to "<file>.orig" and asserts that no such backup exists yet, so
# re-running this cell on already-fixed files fails on that assertion.
fix_qa2ds([
    "/mnt/data/factcheck/qa2d/cs/dev.jsonl",
    "/mnt/data/factcheck/qa2d/en/dev.jsonl",
    "/mnt/data/factcheck/qa2d/pl/dev.jsonl",
    "/mnt/data/factcheck/qa2d/sk/dev.jsonl", 
    "/mnt/data/factcheck/qa2d/cs/train.jsonl",
    "/mnt/data/factcheck/qa2d/en/train.jsonl",
    "/mnt/data/factcheck/qa2d/pl/train.jsonl",
    "/mnt/data/factcheck/qa2d/sk/train.jsonl", 
])

writing 10344 records to /mnt/data/factcheck/qa2d/cs/dev.jsonl
writing 10344 records to /mnt/data/factcheck/qa2d/en/dev.jsonl
writing 10344 records to /mnt/data/factcheck/qa2d/pl/dev.jsonl
writing 10344 records to /mnt/data/factcheck/qa2d/sk/dev.jsonl
writing 60710 records to /mnt/data/factcheck/qa2d/cs/train.jsonl
writing 60710 records to /mnt/data/factcheck/qa2d/en/train.jsonl
writing 60710 records to /mnt/data/factcheck/qa2d/pl/train.jsonl
writing 60710 records to /mnt/data/factcheck/qa2d/sk/train.jsonl


# Test Models

In [2]:
# Model directories / checkpoints used as cfg["model"] in the evaluation cells.
# NOTE(review): the "_all" vs. "_cs_CZ" / "_en_US" suffixes presumably mark
# multilingual vs. single-language training data — confirm.
MODEL_NAME_ALL = "/mnt/personal/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/qa2d/google/umt5-base_all"
MODEL_NAME_ALL2 = "/mnt/personal/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/qa2d/google/mt5-large_all/BKP/checkpoint-121000"
MODEL_NAME_CS1 = "/home/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/qa2d/google/mt5-large_cs_CZ/checkpoint-76000"
MODEL_NAME_CS2 = "/home/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/qa2d/facebook/mbart-large-cc25_cs_CZ/checkpoint-26000"
MODEL_NAME_EN1 = "/home/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/qa2d/facebook/mbart-large-cc25_en_US/checkpoint-30000"
MODEL_NAME_EN2 = "/home/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/qa2d/google/mt5-large_en_US/checkpoint-94000"
# Polish / Slovak single-language checkpoints are currently disabled.
# MODEL_NAME_PL1 = "/home/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/qa2d/facebook/mbart-large-cc25_pl_PL/checkpoint-43000"
# MODEL_NAME_SK1 = "/home/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/qa2d/facebook/mbart-large-cc25_sk_SK/checkpoint-37000"

# Dev sets: the combined file written by combine_qa2ds and the per-language files.
DEV_FILE_ALL = "/mnt/data/factcheck/qa2d/cs_en_pl_sk/dev.jsonl"
DEV_FILE_CS = "/mnt/data/factcheck/qa2d/cs/dev.jsonl"
DEV_FILE_EN = "/mnt/data/factcheck/qa2d/en/dev.jsonl"
DEV_FILE_PL = "/mnt/data/factcheck/qa2d/pl/dev.jsonl"
DEV_FILE_SK = "/mnt/data/factcheck/qa2d/sk/dev.jsonl"

In [None]:
from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs

def predict_split_original(model, data):
    """Run the original QA2D model over all samples of a split.

    Every sample becomes the input "<answer>[SEP]<question>", passed through
    ``nfc``; the whole input list is handed to ``model.predict`` in one call.

    Args:
        model: Object exposing ``predict(list_of_str)``.
        data: Sequence of dicts with "answer", "question" and "turker_answer".

    Returns:
        Tuple ``(Y, T)``: the value returned by ``model.predict`` and the list
        of "turker_answer" references, in sample order.
    """
    inputs = []
    for sample in data:
        inputs.append(nfc(sample["answer"] + "[SEP]" + sample["question"]))
    predictions = model.predict(inputs)
    references = []
    for sample in data:
        references.append(sample["turker_answer"])
    return predictions, references

def evaluate_original_model(cfgs, out_json):
    """Score the original QA2D model on every configured data file.

    The model is loaded once from the hard-coded "dependencies/QA2D_model"
    directory as a simpletransformers ``Seq2SeqModel`` (type "bart",
    ``max_length`` 64, CUDA device 0).

    Args:
        cfgs: Iterable of dicts with at least "lang" and "data_file".
        out_json: JSONL path; each result is written as soon as it is computed
            via ``write_jsonl(out_json, [res], append=True)``.

    Returns:
        List with one dict per cfg: a copy of the cfg extended by "eval"
        (ROUGE scores plus mean BERTScore P/R/F1), "Y" (predictions) and
        "T" (references).
    """
    rouge = evaluate.load("rouge")
    seq2seq_args = Seq2SeqArgs()
    seq2seq_args.max_length = 64
    original_model = Seq2SeqModel(
        encoder_decoder_type="bart",
        encoder_decoder_name="/home/drchajan/devel/python/FC/Zero-shot-Fact-Verification/dependencies/QA2D_model",
        cuda_device=0,
        args=seq2seq_args,
    )

    results = []
    for cfg in cfgs:
        lang, data_file = cfg["lang"], cfg["data_file"]
        print(f"lang: {lang}, data file: {data_file}")

        data = read_jsonl(data_file)
        print(f"  loaded {len(data)} samples")

        Y, T = predict_split_original(original_model, data)

        scores = rouge.compute(predictions=Y, references=T)
        precision, recall, f1 = bert_score.score(Y, T, model_type="bert-base-multilingual-cased")
        # BERTScore returns per-sample values; store their means.
        scores.update({
            "bert_score_P": precision.mean().item(),
            "bert_score_R": recall.mean().item(),
            "bert_score_F1": f1.mean().item(),
        })

        print(f"  EVAL = {scores}")
        print("---------------------------------------")

        res = cfg.copy()
        res.update({"eval": scores, "Y": Y, "T": T})
        results.append(res)
        # Write right away so finished configurations are kept if a later one fails.
        write_jsonl(out_json, [res], append=True)
    return results

# Evaluate the original QA2D model (loaded inside evaluate_original_model)
# on each per-language dev set.
cfgs = [
    {"lang": lang, "model": "original", "data_file": dev_file}
    for lang, dev_file in (
        ("cs_CZ", DEV_FILE_CS),
        ("en_US", DEV_FILE_EN),
        ("pl_PL", DEV_FILE_PL),
        ("sk_SK", DEV_FILE_SK),
    )
]

results = evaluate_original_model(
    cfgs,
    "/mnt/personal/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/qa2d/results.jsonl",
)

In [5]:
def predict(model, tokenizer, inputs, max_source_length=1024, padding=True, device="cuda", max_new_tokens=768):
    """Generate and decode model outputs for one batch of input strings.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer used both to encode ``inputs`` and to decode the
            generated sequences.
        inputs: List of input strings forming one batch.
        max_source_length: ``max_length`` passed to the tokenizer; truncation
            is enabled.
        padding: ``padding`` argument passed to the tokenizer.
        device: Device the input tensors are moved to; the model is expected
            to be on that device already.
        max_new_tokens: Generation limit passed to ``model.generate``
            (previously hard-coded to 768).

    Returns:
        The decoded predictions returned by ``tokenizer.batch_decode`` with
        special tokens skipped.
    """
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True, return_tensors="pt")
    input_ids = model_inputs["input_ids"].to(device)
    attention_mask = model_inputs["attention_mask"].to(device)
    # Inference only: no autograd bookkeeping is needed during generation.
    with torch.no_grad():
        Y = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=max_new_tokens)
    return tokenizer.batch_decode(
        Y, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )

def predict_split(model, tokenizer, data, batch_size=32):
    """Predict declarative sentences for a whole split, batch by batch.

    Each sample becomes the input "<answer></s><question>", passed through
    ``nfc``. The inputs go through ``predict`` via ``batch_apply`` in batches
    of ``batch_size`` with a progress bar.

    Returns:
        Tuple ``(Y, T)``: the predictions returned by ``batch_apply`` and the
        references, i.e. each "turker_answer" with its first character
        upper-cased and passed through ``nfc``.
    """
    def predict_batch(batch):
        return predict(model, tokenizer, batch)

    sources = [nfc(sample["answer"] + "</s>" + sample["question"]) for sample in data]
    predictions = batch_apply(predict_batch, sources, batch_size=batch_size, show_progress=True)

    # Some turker answers start with a lower-case letter; ROUGE ignores case,
    # but the references are normalized anyway.
    references = []
    for sample in data:
        answer = sample["turker_answer"]
        references.append(nfc(answer[0:1].upper() + answer[1:]))
    return predictions, references

def evaluate_quality(cfgs, out_json):
    """Evaluate fine-tuned QA2D models with ROUGE and BERTScore.

    Args:
        cfgs: Iterable of dicts with "lang" (passed to
            ``load_tokenizer_and_model``), "model" (model path) and
            "data_file" (JSONL file to evaluate on).
        out_json: JSONL path; each result is written as soon as it is computed
            via ``write_jsonl(out_json, [res], append=True)``.

    Returns:
        List with one dict per cfg: a copy of the cfg extended by "eval"
        (ROUGE scores plus mean BERTScore P/R/F1), "Y" (predictions) and
        "T" (references).
    """
    rouge = evaluate.load("rouge")
    results = []
    for cfg in cfgs:
        lang = cfg["lang"]
        data_file = cfg["data_file"]
        model_name = cfg["model"]

        # Short name for logging only. Anchor on the "experiments" directory
        # rather than a fixed component index, so paths below /mnt/personal/...
        # and /home/... are shortened the same way.
        parts = Path(model_name).parts
        if "experiments" in parts:
            model_short = "/".join(parts[parts.index("experiments"):])
        else:
            model_short = str(model_name)
        print(f"lang: {lang}, model: {model_short}, data file: {data_file}")

        data = read_jsonl(data_file)
        print(f"  loaded {len(data)} samples")

        model_args = ModelArguments(model_name_or_path=model_name)
        # The third return value (data collator) is not needed for inference.
        tokenizer, model, _ = load_tokenizer_and_model(model_args, lang=lang, fp16=True)
        model.to("cuda")
        model.eval()

        Y, T = predict_split(model, tokenizer, data, batch_size=32)

        # Drop this model before scoring and before the next one is loaded;
        # otherwise the old model stays referenced while the new one loads.
        del model, tokenizer
        torch.cuda.empty_cache()

        ev = rouge.compute(predictions=Y, references=T)

        bsP, bsR, bsF1 = bert_score.score(Y, T, model_type="bert-base-multilingual-cased")
        # BERTScore returns per-sample values; store their means.
        ev["bert_score_P"] = bsP.mean().item()
        ev["bert_score_R"] = bsR.mean().item()
        ev["bert_score_F1"] = bsF1.mean().item()

        print(f"  EVAL = {ev}")
        print("---------------------------------------")
        res = cfg.copy()
        res["eval"] = ev
        res["Y"] = Y
        res["T"] = T
        results.append(res)
        # Write right away so finished configurations are kept if a later one fails.
        write_jsonl(out_json, [res], append=True)
    return results

In [None]:
# Evaluate the single-language checkpoints, each on the dev set of its own
# language. The pl/sk entries stay disabled because their MODEL_NAME_* constants
# are commented out.
cfgs = [
    {"lang": "cs_CZ", "model": MODEL_NAME_CS2, "data_file": DEV_FILE_CS},
    {"lang": "cs_CZ", "model": MODEL_NAME_CS1, "data_file": DEV_FILE_CS},
    {"lang": "en_US", "model": MODEL_NAME_EN1, "data_file": DEV_FILE_EN},
    {"lang": "en_US", "model": MODEL_NAME_EN2, "data_file": DEV_FILE_EN},
    # {"lang": "pl_PL", "model": MODEL_NAME_PL1, "data_file": DEV_FILE_PL},
    # {"lang": "sk_SK", "model": MODEL_NAME_SK1, "data_file": DEV_FILE_SK},
]

# Results go to the shared results.jsonl (written with append=True).
results = evaluate_quality(cfgs, 
                         "/mnt/personal/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/qa2d/results.jsonl")

In [6]:
# Evaluate MODEL_NAME_ALL on the combined dev set and on every per-language dev set.
cfgs = [
    {"lang": lang, "model": MODEL_NAME_ALL, "data_file": dev_file}
    for lang, dev_file in (
        ("all", DEV_FILE_ALL),
        ("cs_CZ", DEV_FILE_CS),
        ("en_US", DEV_FILE_EN),
        ("pl_PL", DEV_FILE_PL),
        ("sk_SK", DEV_FILE_SK),
    )
]

results = evaluate_quality(
    cfgs,
    "/mnt/personal/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/qa2d/results.jsonl",
)

lang: all, model: experiments/qa2d/google/umt5-base_all, data file: /mnt/data/factcheck/qa2d/cs_en_pl_sk/dev.jsonl
  loaded 41376 samples


  0%|          | 0/1293 [00:00<?, ?it/s]

  EVAL = {'rouge1': 0.8122218120177676, 'rouge2': 0.6774367151770639, 'rougeL': 0.7498164591304859, 'rougeLsum': 0.7497761148519413, 'bert_score_P': 0.9321883916854858, 'bert_score_R': 0.9293510317802429, 'bert_score_F1': 0.930552065372467}
---------------------------------------
lang: cs_CZ, model: experiments/qa2d/google/umt5-base_all, data file: /mnt/data/factcheck/qa2d/cs/dev.jsonl
  loaded 10344 samples


  0%|          | 0/324 [00:00<?, ?it/s]

  EVAL = {'rouge1': 0.7781002659870636, 'rouge2': 0.6232363149620266, 'rougeL': 0.7038499291502314, 'rougeLsum': 0.7038628248956882, 'bert_score_P': 0.9212839007377625, 'bert_score_R': 0.9181382060050964, 'bert_score_F1': 0.9194667339324951}
---------------------------------------
lang: en_US, model: experiments/qa2d/google/umt5-base_all, data file: /mnt/data/factcheck/qa2d/en/dev.jsonl
  loaded 10344 samples


  0%|          | 0/324 [00:00<?, ?it/s]

  EVAL = {'rouge1': 0.9332151559207333, 'rouge2': 0.8545758468590733, 'rougeL': 0.885467944163314, 'rougeLsum': 0.8854070582375144, 'bert_score_P': 0.9651930332183838, 'bert_score_R': 0.9628877639770508, 'bert_score_F1': 0.9638970494270325}
---------------------------------------
lang: pl_PL, model: experiments/qa2d/google/umt5-base_all, data file: /mnt/data/factcheck/qa2d/pl/dev.jsonl
  loaded 10344 samples


  0%|          | 0/324 [00:00<?, ?it/s]

  EVAL = {'rouge1': 0.7577914458960934, 'rouge2': 0.6038776767322465, 'rougeL': 0.7010437934418678, 'rougeLsum': 0.7009633554413504, 'bert_score_P': 0.9213378429412842, 'bert_score_R': 0.9185487031936646, 'bert_score_F1': 0.9197044372558594}
---------------------------------------
lang: sk_SK, model: experiments/qa2d/google/umt5-base_all, data file: /mnt/data/factcheck/qa2d/sk/dev.jsonl
  loaded 10344 samples


  0%|          | 0/324 [00:00<?, ?it/s]

  EVAL = {'rouge1': 0.7794955482700204, 'rouge2': 0.6281938885964595, 'rougeL': 0.7089312577856803, 'rougeLsum': 0.7086990889936273, 'bert_score_P': 0.9209386706352234, 'bert_score_R': 0.9178295731544495, 'bert_score_F1': 0.9191399812698364}
---------------------------------------


In [None]:
# Evaluate MODEL_NAME_ALL2 on the combined dev set and on every per-language dev set.
cfgs = [
    {"lang": lang, "model": MODEL_NAME_ALL2, "data_file": dev_file}
    for lang, dev_file in (
        ("all", DEV_FILE_ALL),
        ("cs_CZ", DEV_FILE_CS),
        ("en_US", DEV_FILE_EN),
        ("pl_PL", DEV_FILE_PL),
        ("sk_SK", DEV_FILE_SK),
    )
]

results = evaluate_quality(
    cfgs,
    "/mnt/personal/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/qa2d/results.jsonl",
)

In [7]:
def compare_results_qa2d(result_jsonls):
    """Build a comparison table from stored evaluation results.

    Args:
        result_jsonls: Iterable of JSONL files whose records hold "lang",
            "model" and an "eval" dict with the ROUGE / BERTScore metrics.

    Returns:
        A pandas DataFrame with the columns "lang", "model" (shortened to the
        last three "/"-separated components) and one column per metric, sorted
        by "lang". If the files hold no records, an empty frame with the same
        columns is returned.
    """
    metric_names = [
        "rouge1", "rouge2", "rougeL", "rougeLsum",
        "bert_score_P", "bert_score_R", "bert_score_F1",
    ]
    columns = ["lang", "model"] + metric_names

    records = []
    for rjsonl in result_jsonls:
        records.extend(read_jsonl(rjsonl))
    if not records:
        # Without records there is no "model" column to shorten.
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(records)
    df["model"] = ["/".join(m.split("/")[-3:]) for m in df["model"]]
    # Lift each metric out of the nested "eval" dict into its own column.
    for name in metric_names:
        df[name] = [e[name] for e in df["eval"]]
    # Return a new sorted frame rather than sorting a column subset in place.
    return df[columns].sort_values("lang")

# Collect every result stored in results.jsonl into one table sorted by language.
df = compare_results_qa2d([
    "/mnt/personal/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/qa2d/results.jsonl"
])

# Last expression of the cell: rendered as the rich table output.
df

Unnamed: 0,lang,model,rouge1,rouge2,rougeL,rougeLsum,bert_score_P,bert_score_R,bert_score_F1
0,all,mt5-large_all/BKP/checkpoint-121000,0.817812,0.685361,0.756357,0.756424,0.933643,0.931691,0.932459
13,all,qa2d/google/umt5-base_all,0.812222,0.677437,0.749816,0.749776,0.932188,0.929351,0.930552
1,cs_CZ,original,0.645022,0.445025,0.530238,0.530233,0.835602,0.841203,0.837771
2,cs_CZ,facebook/mbart-large-cc25_cs_CZ/checkpoint-26000,0.773603,0.61628,0.696729,0.696732,0.917696,0.915932,0.916566
3,cs_CZ,google/mt5-large_cs_CZ/checkpoint-76000,0.785204,0.635334,0.712915,0.712964,0.923145,0.920847,0.921758
4,cs_CZ,mt5-large_all/BKP/checkpoint-121000,0.784969,0.633072,0.711863,0.712106,0.923036,0.920931,0.921752
14,cs_CZ,qa2d/google/umt5-base_all,0.7781,0.623236,0.70385,0.703863,0.921284,0.918138,0.919467
15,en_US,qa2d/google/umt5-base_all,0.933215,0.854576,0.885468,0.885407,0.965193,0.962888,0.963897
8,en_US,mt5-large_all/BKP/checkpoint-121000,0.935298,0.858313,0.887878,0.887918,0.965979,0.963956,0.964828
7,en_US,google/mt5-large_en_US/checkpoint-94000,0.935031,0.858347,0.887431,0.887499,0.965914,0.963958,0.964799


In [None]:
# Records used by the single-sample demo cell further down (sample = data[6]).
# NOTE(review): this file lives under experiments/qacg, not qa2d — confirm it
# holds records with "question" / "answer" keys.
data = read_jsonl("/mnt/personal/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/qacg/google/mt5_results.jsonl")

In [None]:
[{'lang': 'all',
  'model': '/mnt/personal/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/qa2d/google/umt5-base_all',
  'data_file': '/mnt/data/factcheck/qa2d/cs_en_pl_sk/dev.jsonl',
  'rouge': {'rouge1': 0.8121849974992059,
   'rouge2': 0.6774419510196882,
   'rougeL': 0.7497730778117646,
   'rougeLsum': 0.7497340248321841}},
 {'lang': 'cs_CZ',
  'model': '/mnt/personal/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/qa2d/google/umt5-base_all',
  'data_file': '/mnt/data/factcheck/qa2d/cs/dev.jsonl',
  'rouge': {'rouge1': 0.7783607331568722,
   'rouge2': 0.6231575608427122,
   'rougeL': 0.7039034412670073,
   'rougeLsum': 0.703985133472832}},
 {'lang': 'en_US',
  'model': '/mnt/personal/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/qa2d/google/umt5-base_all',
  'data_file': '/mnt/data/factcheck/qa2d/en/dev.jsonl',
  'rouge': {'rouge1': 0.9333374676443688,
   'rouge2': 0.8544504708653158,
   'rougeL': 0.88537866791236,
   'rougeLsum': 0.8855242981238776}},
 {'lang': 'pl_PL',
  'model': '/mnt/personal/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/qa2d/google/umt5-base_all',
  'data_file': '/mnt/data/factcheck/qa2d/pl/dev.jsonl',
  'rouge': {'rouge1': 0.7579418551444153,
   'rouge2': 0.6037844541036594,
   'rougeL': 0.70104065486047,
   'rougeLsum': 0.7011065700120231}},
 {'lang': 'sk_SK',
  'model': '/mnt/personal/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/qa2d/google/umt5-base_all',
  'data_file': '/mnt/data/factcheck/qa2d/sk/dev.jsonl',
  'rouge': {'rouge1': 0.7796807605964733,
   'rouge2': 0.6282711920363941,
   'rougeL': 0.7087832206401827,
   'rougeLsum': 0.7089273080145163}}]

In [3]:
# Load one checkpoint for the interactive single-sample demo below.
# NOTE(review): this path has a BEST/ component that MODEL_NAME_CS2 lacks —
# confirm which location is current.
model_args = ModelArguments(model_name_or_path="/home/drchajan/devel/python/FC/Zero-shot-Fact-Verification/experiments/qa2d/facebook/mbart-large-cc25_cs_CZ/BEST/checkpoint-26000")
tokenizer, model, data_collator = load_tokenizer_and_model(model_args, lang="cs_CZ", fp16=True)

In [11]:
# Move the loaded model to the GPU; the trailing ";" suppresses the cell output.
model.to("cuda");

In [20]:
def predict(model, tokenizer, inputs, max_source_length=1024, padding=True, device="cuda", max_new_tokens=768):
    """Generate and decode model outputs for one batch of input strings.

    This cell redefines ``predict``, so its signature and defaults match the
    earlier definition: ``predict_split`` looks the name up at call time, and
    the former ``padding=False`` default could not build a tensor from a batch
    of differently sized inputs. For a single input, ``padding=True`` changes
    nothing.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer used both to encode ``inputs`` and to decode the
            generated sequences.
        inputs: List of input strings forming one batch.
        max_source_length: ``max_length`` passed to the tokenizer; truncation
            is enabled.
        padding: ``padding`` argument passed to the tokenizer.
        device: Device every tokenizer output tensor is moved to (previously
            hard-coded to "cuda").
        max_new_tokens: Generation limit passed to ``model.generate``
            (previously hard-coded to 768).

    Returns:
        The decoded predictions returned by ``tokenizer.batch_decode`` with
        special tokens skipped.
    """
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True, return_tensors="pt")
    # Every tensor the tokenizer returned is moved and forwarded to generate().
    model_inputs = {k: model_inputs[k].to(device) for k in model_inputs.keys()}
    # Inference only: no autograd bookkeeping is needed during generation.
    with torch.no_grad():
        Y = model.generate(**model_inputs, max_new_tokens=max_new_tokens)
    return tokenizer.batch_decode(
        Y, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )


# Qualitative check on a single sample. Relies on kernel state: `data` comes
# from the earlier read_jsonl cell, `model` / `tokenizer` from the model-loading
# cell.
sample = data[6]
question = sample["question"]
answer = sample["answer"]
# Hand-written alternative input (Czech), kept for quick experiments:
# question = "V kolika letech zemřel Petr?"
# answer = "25"
print(textwrap.fill(question))
print(answer)
# Same "<answer></s><question>" input format as predict_split builds.
predict(model, tokenizer, [answer + "</s>" + question])

Proč organismy dědí vlastnosti svých rodičů?
buňky potomků obsahují kopie genů z buněk jejich rodičů


['organismy dědí vlastnosti svých rodičů, protože buňky potomků obsahují kopie genů z buněk jejich rodičů.']