# 🇨🇿🌡 CsFEVER Dataset
Shared notebook version 1.0

## 📂 Add `src` to path

In [1]:
import sys

sys.path.append("../src")

## 📑 Import Clauses

In [6]:
import json
import os
import pickle
import requests
from os.path import join as pjoin

import datautils

## 🌡 Loading the CsFEVER Data into single file

In [3]:
all_splits = [datautils.load_jsonl(f"../data/nfd/{s}_deepl.jsonl") for s in ["train","dev","test"]]

In [4]:
concat = all_splits[0] + all_splits[1] + all_splits[2]

## ✂️ Dataset Splitting

In [5]:
concat[1]

{'id': 56360,
 'verifiable': 'VERIFIABLE',
 'label': 'REFUTES',
 'claim': 'Nelson Mandela zavedl bezplatnou zdravotní péči.',
 'evidence': [[[72715, 83763, 'Nelson Mandela', 21, 'Nelson Mandela']]],
 'claim_en': 'Nelson Mandela introduced free healthcare.'}

In [11]:
for datapoint in concat[:5]:
    response = requests.get("http://lindat.mff.cuni.cz/services/nametag/api/recognize", {"data": datapoint["claim"]})
    print(response.json())


{'model': 'czech-cnec2.0-200831', 'acknowledgements': ['http://ufal.mff.cuni.cz/nametag/2#acknowledgements', 'http://ufal.mff.cuni.cz/nametag/2/models#czech-cnec2_acknowledgements'], 'result': '<sentence><ne type="oa"><token>Útěk</token> <token>z</token> <token>planety</token> <ne type="g_"><token>Země</token></ne></ne> <token>je</token> <token>vědecký</token> <token>dokumentární</token> <token>film</token><token>.</token></sentence>'}
{'model': 'czech-cnec2.0-200831', 'acknowledgements': ['http://ufal.mff.cuni.cz/nametag/2#acknowledgements', 'http://ufal.mff.cuni.cz/nametag/2/models#czech-cnec2_acknowledgements'], 'result': '<sentence><ne type="P"><ne type="pf"><token>Nelson</token></ne> <ne type="ps"><token>Mandela</token></ne></ne> <token>zavedl</token> <token>bezplatnou</token> <token>zdravotní</token> <token>péči</token><token>.</token></sentence>'}
{'model': 'czech-cnec2.0-200831', 'acknowledgements': ['http://ufal.mff.cuni.cz/nametag/2#acknowledgements', 'http://ufal.mff.cuni.cz

In [14]:
groups = []
i = 1
for datapoint in concat:
    datapoint_articles = set()
    for evset in datapoint['evidence']:
        for evidence in evset:
            datapoint_articles.add(evidence[2])

    datapoint_group = [datapoint]
    groups2 = []

    for group_articles, group in groups:
        if datapoint_articles & group_articles:
            datapoint_articles |= group_articles
            datapoint_group.extend(group)
        else:
            groups2.append((group_articles, group))

    groups2.append((datapoint_articles, datapoint_group))
    groups = groups2

    print("#", i, "out of", len(concat), datapoint)
    i += 1

print(groups[-5:])


finished 1 out of 127328 {'id': 188153, 'verifiable': 'NOT VERIFIABLE', 'label': 'NOT ENOUGH INFO', 'claim': 'Útěk z planety Země je vědecký dokumentární film.', 'evidence': [], 'claim_en': 'Escape from Planet Earth is a science documentary film.'}
finished 1 out of 127328 {'id': 56360, 'verifiable': 'VERIFIABLE', 'label': 'REFUTES', 'claim': 'Nelson Mandela zavedl bezplatnou zdravotní péči.', 'evidence': [[[72715, 83763, 'Nelson Mandela', 21, 'Nelson Mandela']]], 'claim_en': 'Nelson Mandela introduced free healthcare.'}
finished 1 out of 127328 {'id': 188525, 'verifiable': 'VERIFIABLE', 'label': 'SUPPORTS', 'claim': 'Eddie Vedder je známý především jako člen skupiny Pearl Jam.', 'evidence': [[[220058, 225927, 'Eddie Vedder', 0, 'Eddie Vedder']]], 'claim_en': 'Eddie Vedder is best recognized as a portion of Pearl Jam.'}
finished 1 out of 127328 {'id': 11580, 'verifiable': 'NOT VERIFIABLE', 'label': 'NOT ENOUGH INFO', 'claim': 'Sojuz je stále ve vesmíru.', 'evidence': [], 'claim_en': 'S

KeyboardInterrupt: 

In [9]:
len(train)+len(validation)+len(test),len(train),len(validation),len(test)

(4666, 3626, 482, 558)

### 💧 Leakage Prevention Levels
1. *source* - No source article can occur in two different splits
2. *mutated_from* - No extracted claim could have been used to mutate claims in two distinct splits
3. *id* - **Weakest, least admissible prevention!** - No pair of different evidence sets for the same claim can occur in two different splits

### 🎛️ Other params with defaults
TODO: explain

test_size=.12, validation_size=.12, skip_ids=PREVIOUSLY_USED, leakage_prevention_level="source", seed=1234

### (optional) Convert splits to single-evidence format

In [6]:
single_evidence=1
train,validation,test = (datautils.expand_by_evidence(split) for split in (train,validation,test))
print(datautils.counter(train), "\n", datautils.counter(validation), "\n", datautils.counter(test),)

[('NOT ENOUGH INFO', 1021, 0.2815774958632101), ('REFUTES', 851, 0.23469387755102042), ('SUPPORTS', 1754, 0.48372862658576943)] 
 [('NOT ENOUGH INFO', 177, 0.36721991701244816), ('REFUTES', 114, 0.23651452282157676), ('SUPPORTS', 191, 0.3962655601659751)] 
 [('NOT ENOUGH INFO', 183, 0.3279569892473118), ('REFUTES', 115, 0.2060931899641577), ('SUPPORTS', 260, 0.4659498207885305)]


### 💾 Save DS's to folder

In [31]:
datautils.save_splits(train, validation, test, f"/mnt/data/factcheck/CTK/dataset/splits2_{format}_{evidence_format}_s{simulate_nei_evidence}_si{single_evidence}_t{test_size}_v{validation_size}_{leakage_prevention_level}_{seed}")

### 🔌 Convert to SentenceTransformers trainingExamples

In [7]:
trn_examples, val_examples, tst_examples = (
    datautils.to_examples(split) for split in (train, validation, test)
)

In [8]:
output_path = "../data/demo_splits/pickle"
os.makedirs(output_path, exist_ok=True)
pickle.dump(trn_examples, open(pjoin(output_path, "trn_examples.p"), "wb"))
pickle.dump(tst_examples, open(pjoin(output_path, "tst_examples.p"), "wb"))
pickle.dump(val_examples, open(pjoin(output_path, "val_examples.p"), "wb"))

🎉 Congratulations! You now have a set of training examples ready to be used in the train_rte or in training pipeline sheets of May-July '21!

Plug'n'play!
## 🧭 Exploratory Analysis
TODO