In [1]:
import numpy as np
from tqdm import tqdm

from collections import defaultdict, OrderedDict, Counter
from dataclasses import dataclass
import datetime as dt
from itertools import chain
import math
import os
import pathlib
from pathlib import Path
import pandas as pd
import unicodedata as ud
from time import time
from typing import Dict, Type, Callable, List, Union
import sys
import ujson

import torch
import transformers
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
from datasets import load_dataset

from aic_nlp_utils.json import read_jsonl, read_json, write_json, write_jsonl
from aic_nlp_utils.encoding import nfc
from aic_nlp_utils.fever import fever_detokenize

%load_ext autoreload
%autoreload 2

**TODO:** Move this elsewhere. NLI models should be covered in their own package; the code currently lives here for convenience only.

This notebook is for the news data sources: CTK, cRO, Parlamentni Listy and DenikN.

Splits are done in ColBERT notebooks. See: `prepare_data_news.ipynb`

In [6]:
# Variant of the generated data to process (keep exactly one line active):
# APPROACH = "full"  # all generated data
# APPROACH = "balanced"  # balanced classes
# APPROACH = "fever_size"  # QACG data subsampled to Cs/EnFEVER dataset size
APPROACH = "balanced_shuf"  # balanced classes, shuffled

LANG = "cs"
NER_DIR = "PAV-ner-CNEC"
QG_DIR = "mt5-large_all-cp126k"
QACG_DIR = "mt5-large_all-cp156k"

# Everything BELOW is language-agnostic.

# Corpus selection: keep exactly one DATA_ROOT / DATA_CORPUS pair active.
# cRO:
# DATA_ROOT = f"/mnt/data/cro/factcheck/v1"
# DATA_CORPUS = Path(DATA_ROOT, "interim", "cro_paragraphs_filtered.jsonl")
# CTK:
# DATA_ROOT = f"/mnt/data/ctknews/factcheck/par6"
# DATA_CORPUS = Path(DATA_ROOT, "interim", "jsonl", "ctk_filtered.jsonl")
# DenikN:
# DATA_ROOT = f"/mnt/data/factcheck/denikn/v1"
# DATA_CORPUS = Path(DATA_ROOT, "interim", "denikn_paragraphs.jsonl")
# Parlamentni Listy (active):
DATA_ROOT = "/mnt/data/newton/parlamentni_listy/factcheck/v1"
DATA_CORPUS = Path(DATA_ROOT) / "interim" / "plisty_paragraphs.jsonl"

# Root of all QACG artefacts for the selected corpus.
QACG_ROOT = Path(DATA_ROOT) / "qacg"

# NLI exports: relative sub-directory and its absolute location.
NLI_DIR = Path("nli") / NER_DIR / QG_DIR / QACG_DIR
NLI_ROOT = QACG_ROOT / NLI_DIR

# Retrieval splits: relative sub-directory and its absolute location.
SPLIT_DIR = Path("splits") / NER_DIR / QG_DIR / QACG_DIR
SPLIT_ROOT = QACG_ROOT / SPLIT_DIR

In [7]:
# Display the resolved split directory as a quick check of the configuration.
SPLIT_ROOT

PosixPath('/mnt/data/newton/parlamentni_listy/factcheck/v1/qacg/splits/PAV-ner-CNEC/mt5-large_all-cp126k/mt5-large_all-cp156k')

In [8]:
def import_corpus(corpus_file):
    """Read a JSONL corpus and pass its ``id``, ``did`` and ``text`` fields through ``nfc``.

    The file is expected to be in the final corpus format already. Records
    without a ``did`` key get ``bid`` and ``did`` derived by splitting the
    processed ``id`` on ``"_"``; such an id must contain exactly one
    underscore, otherwise the tuple unpacking raises ``ValueError``.

    Args:
        corpus_file: location of the JSONL corpus, handed to ``read_jsonl``.

    Returns:
        The records returned by ``read_jsonl``, updated in place.
    """
    records = read_jsonl(corpus_file, show_progress=True)
    for record in records:
        clean_id = nfc(record["id"])
        record["id"] = clean_id
        if "did" not in record:
            # id has the form "<did>_<bid>"
            doc_part, block_part = clean_id.split("_")
            record["bid"] = block_part
            record["did"] = doc_part
        # did may be stored as a non-string value, hence the str() conversion
        record["did"] = nfc(str(record["did"]))
        record["text"] = nfc(record["text"])
    return records


def generate_original_id2pid_mapping(corpus):
    """Build a lookup from each record's ``id`` to its position (pid) in ``corpus``.

    Duplicate ids are not an error: a message is printed and the later
    position replaces the earlier one.

    Args:
        corpus: iterable of records that each have an ``id`` key.

    Returns:
        dict mapping ``id`` -> index of the last record carrying that id.
    """
    id_to_position = {}
    for position, record in enumerate(corpus):
        key = record["id"]
        earlier = id_to_position.get(key)
        # positions are ints, so None reliably means "not seen before"
        if earlier is not None:
            print(f"original ID not unique! {position} {key}, previous pid: {earlier}")
        id_to_position[key] = position
    return id_to_position

# Load the corpus selected by DATA_CORPUS and index its records by original id.
corpus = import_corpus(DATA_CORPUS)
original_id2pid = generate_original_id2pid_mapping(corpus)

0.00it [00:00, ?it/s]

In [18]:
def prepare_nli_data(src_file, dst_file, corpus, original_id2pid, seed=1234):
    """Convert a retrieval split into shuffled NLI records and write them as JSONL.

    Every sample of ``src_file`` must reference exactly one evidence id. That
    id is resolved through ``original_id2pid`` to a ``corpus`` record whose
    ``text`` becomes the ``context`` of the exported record.

    Args:
        src_file: JSONL split whose samples have ``claim``, ``label`` and
            ``evidence`` fields.
        dst_file: output path, written with ``write_jsonl(..., mkdir=True)``.
        corpus: indexable collection of records with a ``text`` field.
        original_id2pid: mapping from evidence id to index in ``corpus``.
        seed: seed for the ``np.random.RandomState`` that shuffles the output.

    Raises:
        AssertionError: if a sample does not have exactly one evidence id.
    """
    # input comes from the evidence-retrieval data preparation
    # (ColBERTv2:prepare_data_wiki.ipynb)
    shuffler = np.random.RandomState(seed)
    samples = read_jsonl(src_file)
    nli_records = []
    for item in tqdm(samples):
        claim, label, evidence_ids = item["claim"], item["label"], item["evidence"]
        assert len(evidence_ids) == 1, "More than single evidence not impemented (yet)"
        paragraph = corpus[original_id2pid[evidence_ids[0]]]
        nli_records.append({"claim": claim, "context": paragraph["text"], "label": label})
    # tally labels before shuffling so the counter sees them in input order
    label_counts = Counter(entry["label"] for entry in nli_records)
    shuffler.shuffle(nli_records)
    print(f"exporting {len(nli_records)}, label counts: {label_counts} to:\n {str(dst_file)}")
    write_jsonl(dst_file, nli_records, mkdir=True)

# Export the NLI version of every split; each split uses its own shuffle seed.
for split_file, split_seed in [
    (f"train_{APPROACH}.jsonl", 1234),
    (f"dev_{APPROACH}.jsonl", 1235),
    (f"test_{APPROACH}.jsonl", 1236),
]:
    prepare_nli_data(
        Path(SPLIT_ROOT, split_file),
        Path(NLI_ROOT, split_file),
        corpus,
        original_id2pid,
        seed=split_seed,
    )

100%|██████████| 57938/57938 [00:00<00:00, 422070.43it/s]


exporting 57938, label counts: Counter({'n': 19477, 'r': 19393, 's': 19068}) to:
 /mnt/data/newton/parlamentni_listy/factcheck/v1/qacg/nli/PAV-ner-CNEC/mt5-large_all-cp126k/mt5-large_all-cp156k/train_balanced_shuf.jsonl


100%|██████████| 5903/5903 [00:00<00:00, 521077.06it/s]


exporting 5903, label counts: Counter({'n': 1993, 'r': 1958, 's': 1952}) to:
 /mnt/data/newton/parlamentni_listy/factcheck/v1/qacg/nli/PAV-ner-CNEC/mt5-large_all-cp126k/mt5-large_all-cp156k/dev_balanced_shuf.jsonl


100%|██████████| 5867/5867 [00:00<00:00, 573759.74it/s]

exporting 5867, label counts: Counter({'n': 1987, 'r': 1941, 's': 1939}) to:
 /mnt/data/newton/parlamentni_listy/factcheck/v1/qacg/nli/PAV-ner-CNEC/mt5-large_all-cp126k/mt5-large_all-cp156k/test_balanced_shuf.jsonl





# Sum Sources
Currently combines: CTK, cRO, PListy and DenikN

In [14]:
def create_sum_split(src_files, dst_files, rng):
    """Merge several per-source split files into one shuffled JSONL file.

    Each record is tagged with the ``source`` it came from and with
    ``orig_idx``, its index within that source's file, before everything is
    shuffled together and written out.

    Args:
        src_files: sequence of ``(source_name, path)`` entries.
        dst_files: path of the single output file (despite the plural name).
        rng: random generator whose ``shuffle`` reorders the merged list in
            place (the notebook passes an ``np.random.RandomState``).
    """
    loaded = [read_jsonl(entry[1]) for entry in src_files]
    names = [entry[0] for entry in src_files]
    merged = []
    for name, rows in zip(names, loaded):
        for position, row in enumerate(rows):
            row["source"] = name
            row["orig_idx"] = position  # index in the original per-source claim file
            merged.append(row)
    rng.shuffle(merged)
    write_jsonl(dst_files, merged, mkdir=True)

# Build the combined ("news_sum") version of every split. A single generator is
# shared by all splits, so the order of the split list affects the shuffles.
APPROACH = "balanced_shuf"
rng = np.random.RandomState(1234)
for split in [
    f"train_{APPROACH}.jsonl",
    f"dev_{APPROACH}.jsonl",
    f"test_{APPROACH}.jsonl",
    f"train_{APPROACH}_no_nei.jsonl",
    f"dev_{APPROACH}_no_nei.jsonl",
    f"test_{APPROACH}_no_nei.jsonl",
]:
    create_sum_split(
        [
            (tag, Path(split_dir, split))
            for tag, split_dir in (
                ("cro", "/mnt/data/cro/factcheck/v1/qacg/splits/PAV-ner-CNEC/mt5-large_all-cp126k/mt5-large_all-cp156k"),
                ("ctk", "/mnt/data/ctknews/factcheck/par6/qacg/splits/PAV-ner-CNEC/mt5-large_all-cp126k/mt5-large_all-cp156k"),
                ("denikn", "/mnt/data/factcheck/denikn/v1/qacg/splits/PAV-ner-CNEC/mt5-large_all-cp126k/mt5-large_all-cp156k"),
                ("plisty", "/mnt/data/newton/parlamentni_listy/factcheck/v1/qacg/splits/PAV-ner-CNEC/mt5-large_all-cp126k/mt5-large_all-cp156k"),
            )
        ],
        Path("/mnt/data/factcheck/qacg/news_sum/qacg/splits", split),
        rng=rng,
    )

In [12]:
def prepare_nli_data_combined(src_files, dst_files, src2fcorpus, seed=1234):
    """Add the evidence text ("context") to combined multi-source splits and export them.

    Every sample carries a ``source`` tag naming the corpus its evidence id
    refers to. The corpora are loaded one at a time and the samples of the
    matching source receive the text of their single evidence record as
    ``context``. Sample order inside each split is left untouched.

    Args:
        src_files: paths of the JSONL splits to read; samples need ``source``
            and ``evidence`` fields.
        dst_files: output paths, paired positionally with ``src_files``.
        src2fcorpus: mapping from source tag to the JSONL corpus file of that
            source (loaded with ``import_corpus``).
        seed: unused by this function; kept so existing calls keep working.

    Raises:
        ValueError: if ``src_files`` and ``dst_files`` differ in length, or if
            some sample has a source without an entry in ``src2fcorpus``.
        AssertionError: if a sample does not have exactly one evidence id.
    """
    src_files = list(src_files)
    dst_files = list(dst_files)
    # zip() below would otherwise silently skip the unmatched splits
    if len(src_files) != len(dst_files):
        raise ValueError(
            f"src_files ({len(src_files)}) and dst_files ({len(dst_files)}) must have the same length"
        )

    # imports data created for Evidence retrieval (ColBERTv2:prepare_data_news.ipynb)
    srcs = [read_jsonl(src_file) for src_file in tqdm(src_files, desc="reading sources")]

    # Group the samples by source once instead of rescanning all splits per corpus.
    # The grouped lists hold the very same dict objects as `srcs`, so filling in
    # the context below also updates the records that get exported.
    samples_by_source = defaultdict(list)
    for src in srcs:
        for sample in src:
            samples_by_source[sample["source"]].append(sample)

    # Fail before the corpus loading: samples of an unknown source would
    # otherwise be exported without any "context" field.
    unknown = sorted(str(source) for source in samples_by_source if source not in src2fcorpus)
    if unknown:
        raise ValueError(f"no corpus given in src2fcorpus for sample sources: {unknown}")

    for source, fcorpus in src2fcorpus.items():
        print(f"loading corpus for {source.upper()} from '{fcorpus}'")
        corpus = import_corpus(fcorpus)
        original_id2pid = generate_original_id2pid_mapping(corpus)
        for sample in samples_by_source.get(source, ()):
            evidence_bids = sample["evidence"]
            assert len(evidence_bids) == 1, "More than single evidence not impemented (yet)"
            sample["context"] = corpus[original_id2pid[evidence_bids[0]]]["text"]

    for src, dst_file in zip(srcs, dst_files):
        print(f"exporting {len(src)} to:\n {str(dst_file)}")
        write_jsonl(dst_file, src, mkdir=True)

In [13]:
# Add evidence contexts to the combined ("news_sum") splits; src_files and
# dst_files are paired by position.
prepare_nli_data_combined(
    src_files=[
        "/mnt/data/factcheck/qacg/news_sum/qacg/splits/dev_balanced_shuf.jsonl",
        "/mnt/data/factcheck/qacg/news_sum/qacg/splits/test_balanced_shuf.jsonl",
        "/mnt/data/factcheck/qacg/news_sum/qacg/splits/train_balanced_shuf.jsonl",
    ],
    dst_files=[
        "/mnt/data/factcheck/qacg/news_sum/qacg/nli/dev_balanced_shuf.jsonl",
        "/mnt/data/factcheck/qacg/news_sum/qacg/nli/test_balanced_shuf.jsonl",
        "/mnt/data/factcheck/qacg/news_sum/qacg/nli/train_balanced_shuf.jsonl",
    ],
    src2fcorpus=dict(
        cro="/mnt/data/cro/factcheck/v1/interim/cro_paragraphs_filtered.jsonl",
        ctk="/mnt/data/ctknews/factcheck/par6/interim/jsonl/ctk_filtered.jsonl",
        denikn="/mnt/data/factcheck/denikn/v1/interim/denikn_paragraphs.jsonl",
        plisty="/mnt/data/newton/parlamentni_listy/factcheck/v1/interim/plisty_paragraphs.jsonl",
    ),
)

reading sources: 100%|██████████| 3/3 [00:02<00:00,  1.44it/s]

loading corpus for CRO from '/mnt/data/cro/factcheck/v1/interim/cro_paragraphs_filtered.jsonl'





0.00it [00:00, ?it/s]

loading corpus for CTK from '/mnt/data/ctknews/factcheck/par6/interim/jsonl/ctk_filtered.jsonl'


0.00it [00:00, ?it/s]

loading corpus for DENIKN from '/mnt/data/factcheck/denikn/v1/interim/denikn_paragraphs.jsonl'


0.00it [00:00, ?it/s]

loading corpus for PLISTY from '/mnt/data/newton/parlamentni_listy/factcheck/v1/interim/plisty_paragraphs.jsonl'


0.00it [00:00, ?it/s]

exporting 27601 to:
 /mnt/data/factcheck/qacg/news_sum/qacg/nli/dev_balanced_shuf.jsonl
exporting 27612 to:
 /mnt/data/factcheck/qacg/news_sum/qacg/nli/test_balanced_shuf.jsonl
exporting 273300 to:
 /mnt/data/factcheck/qacg/news_sum/qacg/nli/train_balanced_shuf.jsonl
