# 📰 ČTK Dataset
Shared notebook version 1.0

## 📂 Add `src` to path

In [1]:
import sys
# Make modules under ../src importable (presumably where `datautils` lives — confirm).
# The path is relative, so it depends on the kernel's working directory.
sys.path.append("../src")

## 📑 Import Clauses

In [2]:
import os
import pickle
import json

from os.path import join as pjoin
import datautils

## 🌐 Loading the ČTK data
### 1. ⚡ From Live API

In [3]:
# Pull the current export from the live API as NLI-format records with textual evidence.
# NOTE(review): simulate_nei_evidence / single_evidence are passed as 1 / 0 — presumably
# on/off flags; confirm against datautils.load_api_export.
dataset = datautils.load_api_export(
    format="nli",
    evidence_format="text",
    simulate_nei_evidence=1,
    single_evidence=0,
)

### 2. 💾 ... Or from JSONL Dump

In [11]:
# Snapshot file to load instead of querying the live API (relative path).
location = "../export-snapshots/export_08-30-2021_0200am_173.jsonl"
# Rebinds `dataset`, replacing whatever the live-API cell loaded.
dataset = datautils.load_jsonl(location)

## ✂️ Dataset Splitting

In [4]:
# Convert the multi-evidence-set records to the single-evidence-set format.
# NOTE(review): this rebinds `dataset` to its own expansion, so re-running the cell
# expands the already-expanded data — confirm expand_by_evidence tolerates that.
dataset = datautils.expand_by_evidence(dataset)

# Partition into train / validation / test using the "source" leakage-prevention level.
train, validation, test = datautils.split(dataset, leakage_prevention_level="source")

# Print the label counts of each split, one split per line.
print(*map(datautils.counter, (train, validation, test)), sep=" \n ")

[('NOT ENOUGH INFO', 1074, 0.27716129032258063), ('REFUTES', 928, 0.23948387096774193), ('SUPPORTS', 1873, 0.48335483870967744)] 
 [('NOT ENOUGH INFO', 87, 0.258160237388724), ('REFUTES', 85, 0.2522255192878338), ('SUPPORTS', 165, 0.4896142433234421)] 
 [('NOT ENOUGH INFO', 139, 0.3726541554959786), ('REFUTES', 67, 0.17962466487935658), ('SUPPORTS', 167, 0.4477211796246649)]


In [15]:
# NOTE(review): the saved output of this cell is an empty list and its execution
# count (15) is out of sequence with the surrounding cells — confirm the notebook
# still reproduces under Restart Kernel -> Run All.
dataset

[]

### 💧 Leakage Prevention Levels
1. *source* - No source article can occur in two different splits
2. *mutated_from* - No extracted claim can be the origin of mutated claims that end up in two different splits
3. *id* - **Weakest, least admissible prevention!** - No pair of different evidence sets for the same claim can occur in two different splits

### 🎛️ Other params with defaults
TODO: explain

`test_size=.12, validation_size=.12, skip_ids=PREVIOUSLY_USED, leakage_prevention_level="source", seed=1234`

### 💾 Save the dataset splits to a folder

In [5]:
# Save the three splits under ../data/demo_splits (file layout is decided by datautils.save_splits).
datautils.save_splits(train, validation, test, "../data/demo_splits")

### 🔌 Convert to SentenceTransformers trainingExamples

In [None]:
# Run datautils.to_examples over each split, in train / validation / test order,
# and unpack the three results.
trn_examples, val_examples, tst_examples = map(
    datautils.to_examples, (train, validation, test)
)

In [None]:
# Pickle each split's examples into its own file under output_path.
output_path = "../data/demo_splits/pickle"
os.makedirs(output_path, exist_ok=True)  # no error if the folder already exists

for split_file_name, split_examples in (
    ("trn_examples.p", trn_examples),
    ("tst_examples.p", tst_examples),
    ("val_examples.p", val_examples),
):
    # `with` guarantees the handle is flushed and closed even if pickling fails;
    # the bare open(...) used before left the files open until garbage collection.
    with open(pjoin(output_path, split_file_name), "wb") as out_file:
        pickle.dump(split_examples, out_file)

🎉 Congratulations! You now have a set of training examples ready to be used in `train_rte` or in the training pipeline sheets of May–July '21!

Plug'n'play!
## 🧭 Exploratory Analysis
TODO