# 📰 ČTK Dataset
Shared notebook version 1.0

## 📂 Add `src` to path

In [1]:
import sys

# Put the sibling ../src directory on the import path so that project modules
# can be imported by the cells below.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
if "../src" not in sys.path:
    sys.path.append("../src")

## 📑 Import Clauses

In [2]:
import json
import os
import pickle
from os.path import join as pjoin

import datautils

## 🌐 Loading the ČTK data

In [3]:
# --- Export options: passed to datautils.load_api_export and embedded in the
# --- name of the saved splits folder.
# NOTE(review): `format` shadows the Python builtin of the same name. It is kept
# as-is because later cells refer to it by this name.
format = "nli"
evidence_format = "text"
simulate_nei_evidence = 1
single_evidence = 0

# --- Split options: passed to datautils.split.
leakage_prevention_level = "source"
test_size = 0.095
validation_size = 0.12
seed = 77

### 1. ⚡ From Live API

In [4]:
# Option 1: build `dataset` through datautils.load_api_export (the live API,
# per the section heading), forwarding the four export options chosen in the
# configuration cell unchanged.
dataset = datautils.load_api_export(
    format=format,
    evidence_format=evidence_format,
    simulate_nei_evidence=simulate_nei_evidence,
    single_evidence=single_evidence,
)

### 2. 💾 ... Or from JSONL Dump

In [11]:
# Option 2: read `dataset` from a previously exported JSONL snapshot instead.
# This assigns to the same `dataset` name as the live-API cell, so whichever of
# the two cells ran last decides what gets split.
# NOTE(review): on "Restart & Run All" both loader cells execute and this one
# wins; skip the loader you do not want. Only the path is passed here, so none
# of the export options from the configuration cell are applied by this call.
location = "../export-snapshots/export_08-30-2021_0200am_173.jsonl"
dataset = datautils.load_jsonl(location)

## ✂️ Dataset Splitting

In [5]:
# Split the loaded dataset into train / validation / test. The leakage
# prevention level, the split sizes and the seed all come from the
# configuration cell; no ids are skipped (empty skip_ids).
train, validation, test = datautils.split(
    dataset,
    leakage_prevention_level=leakage_prevention_level,
    test_size=test_size,
    validation_size=validation_size,
    seed=seed,
    skip_ids=[],
)

# Print one datautils.counter summary per split, each on its own line.
# sep="\n" replaces the former "\n" positional arguments, which made print()
# insert a stray space before and after every line break.
print(*(datautils.counter(split) for split in (train, validation, test)), sep="\n")

[('NOT ENOUGH INFO', 723, 0.3033990767939572), ('REFUTES', 556, 0.23331934536298782), ('SUPPORTS', 1104, 0.46328157784305496)] 
 [('NOT ENOUGH INFO', 105, 0.31626506024096385), ('REFUTES', 85, 0.2560240963855422), ('SUPPORTS', 142, 0.42771084337349397)] 
 [('NOT ENOUGH INFO', 127, 0.3324607329842932), ('REFUTES', 79, 0.20680628272251309), ('SUPPORTS', 176, 0.4607329842931937)]


In [9]:
# Sanity check on split sizes: (total, train, validation, test).
# NOTE(review): the saved output of this cell (execution count 9) equals the
# single-evidence counts, i.e. it was produced after the optional conversion
# cell further down, not directly after the split above.
(
    len(train) + len(validation) + len(test),
    len(train),
    len(validation),
    len(test),
)

(4666, 3626, 482, 558)

### 💧 Leakage Prevention Levels
1. *source* - No source article can occur in two different splits
2. *mutated_from* - No extracted claim could have been used to mutate claims in two distinct splits
3. *id* - **Weakest, least admissible prevention!** - No pair of different evidence sets for the same claim can occur in two different splits

### 🎛️ Other params with defaults
TODO: explain

`test_size=.12, validation_size=.12, skip_ids=PREVIOUSLY_USED, leakage_prevention_level="source", seed=1234`

### (optional) Convert splits to single-evidence format

In [6]:
# Record that the splits are now in single-evidence form; this value is also
# embedded in the folder name built by the save cell.
single_evidence = 1

# NOTE(review): the three splits are overwritten with their expanded versions,
# so re-running this cell passes already-expanded data to expand_by_evidence
# again. Re-run the splitting cell first if this cell has to be repeated.
train, validation, test = map(datautils.expand_by_evidence, (train, validation, test))

# Print one datautils.counter summary per split, each on its own line.
# sep="\n" replaces the former "\n" positional arguments, which made print()
# insert a stray space before and after every line break.
print(*(datautils.counter(split) for split in (train, validation, test)), sep="\n")

[('NOT ENOUGH INFO', 1021, 0.2815774958632101), ('REFUTES', 851, 0.23469387755102042), ('SUPPORTS', 1754, 0.48372862658576943)] 
 [('NOT ENOUGH INFO', 177, 0.36721991701244816), ('REFUTES', 114, 0.23651452282157676), ('SUPPORTS', 191, 0.3962655601659751)] 
 [('NOT ENOUGH INFO', 183, 0.3279569892473118), ('REFUTES', 115, 0.2060931899641577), ('SUPPORTS', 260, 0.4659498207885305)]


### 💾 Save datasets to folder

In [31]:
# Persist the three splits into a folder whose name encodes every setting from
# the configuration cell. The adjacent string literals below concatenate into
# exactly the same path as before; they are split only for readability.
# NOTE(review): the target is an absolute, machine-specific path under
# /mnt/data — it has to exist and be writable wherever this notebook runs.
datautils.save_splits(
    train,
    validation,
    test,
    "/mnt/data/factcheck/CTK/dataset/"
    f"splits2_{format}_{evidence_format}"
    f"_s{simulate_nei_evidence}_si{single_evidence}"
    f"_t{test_size}_v{validation_size}"
    f"_{leakage_prevention_level}_{seed}",
)

### 🔌 Convert to SentenceTransformers trainingExamples

In [7]:
# Run datautils.to_examples over each split, in train / validation / test
# order, and unpack the three results into their own names.
trn_examples, val_examples, tst_examples = map(
    datautils.to_examples, (train, validation, test)
)

In [8]:
output_path = "../data/demo_splits/pickle"
os.makedirs(output_path, exist_ok=True)

# Write each example set to its own pickle file, in the same order as before
# (train, test, validation). Each file is opened in a `with` block so its
# handle is closed, and its contents flushed, as soon as the dump finishes.
# The previous bare open(...) calls never closed their handles explicitly.
for file_name, split_examples in (
    ("trn_examples.p", trn_examples),
    ("tst_examples.p", tst_examples),
    ("val_examples.p", val_examples),
):
    with open(pjoin(output_path, file_name), "wb") as pickle_file:
        pickle.dump(split_examples, pickle_file)

🎉 Congratulations! You now have pickled sets of training, validation and test examples, ready to be used in `train_rte` or in the training pipeline sheets of May–July '21!

Plug'n'play!
## 🧭 Exploratory Analysis
TODO