In [1]:
import argparse
import csv
import datetime as dt
import json
import logging
import math
import os
import pickle
import string
import unicodedata as ud
import urllib.request
from collections import Counter, OrderedDict, defaultdict
from itertools import chain, product
from os.path import join as pjoin

import numpy as np
import sklearn
import torch
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CESoftmaxAccuracyEvaluator

from sentence_transformers.evaluation import (
    SequentialEvaluator,
)
from sentence_transformers.readers import InputExample
from torch.utils.data import DataLoader

from models.rte import (
    convert_to_sentence_pairs,
    create_splits,
    import_csv_dataset,
    import_csv_dataset_nei,
    split_evidence_sets,
    unique_evidence,
)
from utils.dbcache import DBCache
from utils.tokenization import detokenize, detokenize2

logger = logging.getLogger(__name__)

In [2]:
# Read the annotation export: one JSON object per line (JSON Lines).
# The encoding is passed explicitly because the records are expected to contain
# non-ASCII (Czech) text and the platform default encoding need not be UTF-8.
dataset = []
with open("export.jsonl", encoding="utf-8") as file:
    for line in file:
        if line.strip():  # json.loads() raises on blank lines (e.g. a trailing one)
            dataset.append(json.loads(line))


def import_ctk_data_stratified(reverse_pairs=True):
    """Turn the module-level ``dataset`` into stratified train/val/test text pairs.

    One pair is produced per evidence set of every record: the sentences of the
    set are passed through ``detokenize2``, NFD-normalised and joined with
    spaces; the claim is NFD-normalised as well. Evidence sets whose joined
    text is blank are dropped.

    Args:
        reverse_pairs: when true each pair is ``(evidence, claim)``, otherwise
            ``(claim, evidence)``.

    Returns:
        ``(trn_texts, val_texts, tst_texts, trn_labels, val_labels, tst_labels)``
        where the labels are the raw ``row["label"]`` values.
    """
    pairs = []
    pair_labels = []
    for record in dataset:
        claim, label = record["claim"], record["label"]
        for evidence_set in record["evidence"]:
            evidence = " ".join(
                ud.normalize("NFD", detokenize2(sentence)) for sentence in evidence_set
            )
            claim_nfd = ud.normalize("NFD", claim)
            if not evidence.strip():
                continue  # nothing to pair the claim with
            pairs.append((evidence, claim_nfd) if reverse_pairs else (claim_nfd, evidence))
            pair_labels.append(label)

    split = sklearn.model_selection.train_test_split

    # First cut: 80 % stays for training/validation, 20 % becomes the test set.
    trn_texts, tst_texts, trn_labels, tst_labels = split(
        pairs, pair_labels, train_size=0.8, random_state=1234, stratify=pair_labels
    )
    # Second cut: 83 % of the remainder is training data, the rest validation.
    trn_texts, val_texts, trn_labels, val_labels = split(
        trn_texts, trn_labels, train_size=0.83, random_state=1234, stratify=trn_labels
    )

    # Log the class balance of each split in train, validation, test order.
    for split_labels in (trn_labels, val_labels, tst_labels):
        logger.info(Counter(split_labels))
    return trn_texts, val_texts, tst_texts, trn_labels, val_labels, tst_labels


def load_examples_from_nli_format():
    """Build ``InputExample`` lists from ``import_ctk_data_stratified()``.

    Every (evidence, claim) pair is NFC-normalised and its string label is
    mapped to an integer id (SUPPORTS=0, REFUTES=1, NOT ENOUGH INFO=2).

    NOTE(review): a later cell of this notebook defines a function with the
    same name; whichever cell ran last is the one in effect.

    Returns:
        ``(trn_examples, tst_examples, val_examples)`` - note the order differs
        from the train/val/test order returned by the importer.
    """
    label2id = {"REFUTES": 1, "SUPPORTS": 0, "NOT ENOUGH INFO": 2}

    def convert_to_examples(txts, labels):
        # One InputExample per (pair, label); raises KeyError on an unknown label.
        return [
            InputExample(
                texts=[ud.normalize("NFC", evidence), ud.normalize("NFC", claim)],
                label=label2id[label],
            )
            for (evidence, claim), label in zip(txts, labels)
        ]

    splits = import_ctk_data_stratified()
    trn_texts, val_texts, tst_texts, trn_labels, val_labels, tst_labels = splits

    trn_examples = convert_to_examples(trn_texts, trn_labels)
    tst_examples = convert_to_examples(tst_texts, tst_labels)
    val_examples = convert_to_examples(val_texts, val_labels)
    return trn_examples, tst_examples, val_examples

In [46]:
# Integer class ids used for every InputExample built from the split files.
label2id = {"REFUTES": 1, "SUPPORTS": 0, "NOT ENOUGH INFO": 2}


def load_examples_from_splits(folder):
    """Load pre-split examples from ``train.jsonl``, ``test.jsonl`` and ``val.jsonl``.

    Each non-blank line must be a JSON object with the keys ``id``, ``claim``,
    ``label`` and ``evidence`` (a list of strings, joined with spaces here).
    Texts are NFC-normalised and stored as ``[evidence, claim]``.

    Args:
        folder: directory that contains the three ``.jsonl`` files.

    Returns:
        ``(train_examples, test_examples, val_examples)`` as lists of
        ``InputExample``.

    Raises:
        KeyError: if a record is missing a key or has a label not in ``label2id``.
    """
    examples = {}
    for filename in ("train", "test", "val"):
        # Explicit UTF-8: the texts are non-ASCII and must not depend on the locale.
        with open(pjoin(folder, filename + ".jsonl"), "r", encoding="utf-8") as f:
            examples[filename] = []
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines instead of failing in json.loads
                example = json.loads(line)
                examples[filename].append(
                    InputExample(
                        texts=[
                            ud.normalize("NFC", " ".join(example["evidence"])),
                            ud.normalize("NFC", example["claim"]),
                        ],
                        label=label2id[example["label"]],
                        guid=example["id"],
                    )
                )
    return examples["train"], examples["test"], examples["val"]

In [48]:
# Load the train / test / validation examples stored in the "ctkclean2" folder.
trn_examples, tst_examples, val_examples = load_examples_from_splits("ctkclean2")

In [52]:
# Inspect the integer label id of the second training example.
trn_examples[1].label

0

In [53]:
# Inspect the raw string label at the same index.
# NOTE(review): trn_labels is only assigned in cells that appear further down in
# the notebook, so this cell depends on kernel state from out-of-order execution.
trn_labels[1]

'SUPPORTS'

In [4]:
# Load every exported record (one JSON object per line), then cut stratified
# test and validation sets out of it.
with open("export.jsonl") as file:
    trn_texts = [json.loads(line) for line in file]

trn_labels = [record["label"] for record in trn_texts]

# 80 % train+validation / 20 % test, stratified by label.
(
    trn_texts,
    tst_texts,
    trn_labels,
    tst_labels,
) = sklearn.model_selection.train_test_split(
    trn_texts,
    trn_labels,
    train_size=0.8,
    random_state=1234,
    stratify=trn_labels,
)

# 83 % of the remainder for training, the rest for validation.
(
    trn_texts,
    val_texts,
    trn_labels,
    val_labels,
) = sklearn.model_selection.train_test_split(
    trn_texts,
    trn_labels,
    train_size=0.83,
    random_state=1234,
    stratify=trn_labels,
)

In [11]:
# Ten placeholder (None, None) slots, one per prospective test split.
tst_splits = [(None, None)] * 10

In [5]:
# NOTE(review): the recorded output is a NameError - this cell (In [5]) was run
# before the cell that assigns tst_splits (In [11]).
tst_splits

NameError: name 'tst_splits' is not defined

In [6]:
# Sizes of the train / validation / test splits (the test size is read from
# tst_labels, which comes from the same train_test_split call as tst_texts).
len(trn_texts), len(val_texts), len(tst_labels)

(2009, 412, 606)

In [14]:
# Number of ids collected so far.
# NOTE(review): used_ids is not assigned in any cell above this one; it relies
# on state left in the kernel by a cell that was run earlier.
len(used_ids)

98

In [15]:
# Add the id of every record in the current test split to used_ids.
# NOTE(review): .append() requires used_ids to be a list, yet the only visible
# assignment of used_ids is a set literal and another cell calls used_ids.add();
# on a set this loop raises AttributeError - confirm which type is intended.
for text in tst_texts:
    used_ids.append(text["id"])

In [16]:
# Number of collected ids after the test-split ids were added.
len(used_ids)

704

In [24]:
used_ids={39, 60, 69, 70, 71, 72, 82, 83, 89, 94, 115, 116, 126, 138, 139, 147, 149, 152, 153, 157, 159, 186, 197, 198, 202, 208, 209, 222, 226, 235, 237, 240, 243, 256, 260, 262, 263, 270, 274, 280, 283, 298, 326, 330, 349, 360, 367, 371, 373, 378, 383, 392, 394, 395, 400, 408, 411, 418, 419, 428, 429, 432, 433, 434, 441, 455, 457, 462, 472, 481, 484, 500, 505, 508, 518, 519, 521, 522, 536, 537, 542, 556, 558, 560, 570, 586, 598, 600, 605, 607, 610, 623, 624, 639, 647, 651, 657, 659, 661, 663, 666, 671, 675, 682, 687, 689, 713, 715, 720, 723, 733, 735, 737, 740, 750, 751, 756, 768, 769, 770, 779, 790, 792, 794, 795, 804, 818, 822, 824, 825, 828, 829, 833, 838, 878, 881, 908, 921, 922, 928, 957, 958, 967, 976, 982, 983, 989, 992, 997, 1006, 1008, 1029, 1037, 1041, 1043, 1071, 1072, 1074, 1080, 1088, 1095, 1106, 1107, 1115, 1116, 1125, 1126, 1138, 1139, 1140, 1144, 1148, 1149, 1158, 1160, 1162, 1163, 1170, 1174, 1177, 1189, 1190, 1199, 1204, 1215, 1220, 1228, 1235, 1239, 1243, 1246, 1259, 1261, 1270, 1274, 1297, 1306, 1313, 1315, 1338, 1350, 1353, 1361, 1362, 1365, 1366, 1379, 1382, 1388, 1396, 1399, 1400, 1410, 1411, 1413, 1423, 1444, 1446, 1447, 1453, 1504, 1505, 1522, 1534, 1541, 1542, 1543, 1554, 1556, 1570, 1574, 1578, 1599, 1610, 1623, 1624, 1630, 1632, 1634, 1651, 1663, 1671, 1676, 1677, 1683, 1691, 1695, 1720, 1723, 1724, 1732, 1733, 1739, 1744, 1751, 1755, 1761, 1762, 1769, 1779, 1807, 1817, 1818, 1821, 1825, 1830, 1834, 1835, 1839, 1850, 1856, 1862, 1876, 1897, 1898, 1900, 1906, 1927, 1939, 1942, 1944, 1952, 1955, 1956, 1966, 1970, 1971, 1973, 1975, 1981, 1983, 2006, 2015, 2016, 2060, 2067, 2071, 2083, 2092, 2093, 2094, 2097, 2100, 2107, 2112, 2120, 2122, 2123, 2126, 2129, 2142, 2152, 2165, 2167, 2171, 2173, 2187, 2195, 2197, 2204, 2218, 2229, 2233, 2234, 2239, 2242, 2254, 2272, 2275, 2291, 2295, 2298, 2318, 2324, 2334, 2365, 2372, 2375, 2378, 2383, 2389, 2394, 2397, 2405, 2410, 2418, 2425, 2443, 2452, 2453, 2462, 2466, 2469, 2494, 2497, 2498, 2516, 2525, 
2538, 2544, 2549, 2550, 2558, 2569, 2573, 2578, 2584, 2585, 2597, 2608, 2613, 2618, 2621, 2627, 2628, 2632, 2646, 2658, 2666, 2672, 2676, 2678, 2680, 2687, 2688, 2695, 2703, 2710, 2711, 2715, 2716, 2717, 2719, 2721, 2725, 2730, 2737, 2742, 2744, 2753, 2754, 2766, 2769, 2770, 2775, 2776, 2777, 2778, 2786, 2796, 2806, 2810, 2834, 2846, 2847, 2852, 2858, 2890, 2891, 2900, 2902, 2903, 2907, 2915, 2923, 2932, 2935, 2938, 2951, 2980, 2993, 2994, 2996, 2997, 2999, 3000, 3005, 3006, 3011, 3012, 3013, 3032, 3053, 3063, 3069, 3070, 3071, 3073, 3075, 3076, 3093, 3095, 3097, 3105, 3108, 3110, 3112, 3114, 3118, 3124, 3125, 3131, 3139, 3158, 3159, 3161, 3164, 3168, 3174, 3178, 3181, 3205, 3211, 3214, 3225, 3233, 3234, 3235, 3237, 3238, 3247, 3255, 3256, 3273, 3277, 3278, 3285, 3286, 3288, 3289, 3290, 3296, 3302, 3304, 3305, 3306, 3310, 3311, 3321, 3338, 3351, 3354, 3365, 3379, 3382, 3388, 3393, 3400, 3403, 3421, 3423, 3441, 3445, 3450, 3458, 3471, 3487, 3494, 3500, 3501, 3523, 3529, 3530, 3541, 3544, 3549, 3552, 3553, 3559, 3567, 3569, 3570, 3572, 3575, 3576, 3584, 3585, 3591, 3602, 3605, 3609, 3615, 3620, 3621, 3633, 3638, 3649, 3676, 3677, 3687, 3703, 3704, 3705, 3707, 3717, 3718, 3720, 3721, 3727, 3738, 3740, 3744, 3745, 3747, 3748, 3759, 3764, 3769, 3779, 3782, 3798, 3823, 3824, 3831, 3833, 3844, 3845, 3852, 3855, 3856, 3874, 3876, 3889, 3892, 3912, 3936, 3940, 3943, 3956, 3959, 3965, 3985, 3993, 3996, 4004, 4012, 4029, 4034, 4049, 4064, 4076, 4081, 4084, 4093, 4098, 4102, 4110, 4112, 4118, 4129, 4139, 4141, 4148, 4154, 4182, 4194, 4209, 4227, 4236, 4256, 4258, 4261, 4265, 4269, 4270, 4272, 4276, 4277, 4280, 4281, 4282, 4283, 4284, 4285, 4288, 4306, 4310, 4313, 4326, 4327, 4329, 4331, 4332, 4338, 4343, 4346, 4353, 4355, 4364, 4367, 4368, 4380, 4383, 4394, 4396, 4402, 4404, 4407, 4409, 4414, 4416, 4419, 4421, 4426, 4438, 4440, 4442, 4447, 4449, 4451, 4455, 4457, 4459, 4465, 4466, 4469, 4474, 4476}

In [44]:
# Search 10000 seeded, stratified 89/11 splits of the newer export for the one
# whose test part shares the fewest ids with `used_ids`, then cut a validation
# set out of the remaining training data. The cell's displayed value is the
# size of that smallest overlap.
trn_texts = []
# Explicit UTF-8: the export holds non-ASCII text and must not depend on the locale.
with open("export_08-25-2021_1109pm_173.jsonl", encoding="utf-8") as file:
    for line in file:
        if line.strip():  # json.loads() raises on blank lines
            trn_texts.append(json.loads(line))


trn_labels = [datapoint["label"] for datapoint in trn_texts]
print(len(trn_texts), len(trn_labels))
rand_state = 1  # not read by the search below; kept in case cells outside this one use it
# Start from +inf so the first candidate is always accepted, whatever the dataset size.
min_intersection, min_split = float("inf"), None
for i in range(10000):
    # hash(i) == i for these small ints, so the loop index is used as the seed directly.
    trn2_texts, tst_texts, trn2_labels, tst_labels = sklearn.model_selection.train_test_split(
        trn_texts, trn_labels, train_size=0.89, random_state=i, stratify=trn_labels
    )
    intersect = used_ids.intersection(set(datapoint["id"] for datapoint in tst_texts))
    if len(intersect) < min_intersection:
        min_intersection, min_split = len(intersect), (trn2_texts, tst_texts, trn2_labels, tst_labels)
        if min_intersection == 0:
            break  # cannot be improved; later seeds would never replace this split

# Bind ALL four parts of the chosen split. The test labels must be rebound too,
# otherwise tst_labels would still belong to the last candidate tried by the loop.
trn_texts, tst_texts, trn_labels, tst_labels = min_split

# 87 % of the remaining data for training, the rest for validation.
trn_texts, val_texts, trn_labels, val_labels = sklearn.model_selection.train_test_split(
    trn_texts, trn_labels, train_size=0.87, random_state=1234, stratify=trn_labels
)
min_intersection

4555 4555


152

In [42]:
# Mark the ids of the current test split as used, then print how many ids are
# known and the ids themselves as one comma-separated line.
used_ids.update(datapoint["id"] for datapoint in tst_texts)
print(len(used_ids))
print(",".join(map(str, used_ids)))

1071
4098,4099,4102,7,2058,2060,4110,4112,4114,2067,4118,2071,28,29,4126,4129,2083,39,4137,4139,2092,2093,2094,4141,4140,2097,2091,2100,4148,4149,4153,4154,2107,60,2112,4160,66,2114,67,69,70,71,72,2120,2122,2123,4170,4173,2126,79,2129,82,83,4182,87,89,94,2142,2143,2145,4194,2152,4202,2154,108,4209,115,116,2165,2167,4216,4217,122,2171,2173,126,4223,2177,4227,134,135,138,139,2187,4236,140,146,147,2195,149,2197,152,153,154,4251,2204,157,4254,159,4256,2207,4258,161,4261,4265,2218,4266,170,4269,4270,4271,4272,4276,2229,4277,2228,4280,2233,186,2234,4281,4282,4283,2239,4284,4285,2242,4288,4289,197,198,2238,196,200,202,201,4294,2254,208,209,4306,4304,2259,2261,4310,2257,4311,4313,2266,222,2272,225,226,2275,4326,4327,4329,234,235,4331,237,4332,236,240,2281,4338,243,2291,239,4342,2295,4343,2296,2298,4346,2303,256,4353,4355,260,2309,262,263,264,4363,4364,270,2318,4367,4368,274,2323,2324,276,280,282,283,4380,2334,4383,2337,4386,291,4389,4392,298,4394,4396,300,301,4402,4403,4404,4407,4409,2363,4412

In [34]:
# Size of the smallest overlap between a candidate test split and used_ids
# found by the split search.
min_intersection

82

In [45]:
# Persist the three splits as JSON Lines files under ctkclean2/.
# open(..., "w") does not create missing directories, so make sure it exists.
os.makedirs("ctkclean2", exist_ok=True)
# ensure_ascii=False emits raw non-ASCII characters, so the files are written
# as UTF-8 explicitly instead of relying on the platform default encoding.
with open("ctkclean2/train.jsonl", "w", encoding="utf-8") as t, open(
    "ctkclean2/val.jsonl", "w", encoding="utf-8"
) as v, open("ctkclean2/test.jsonl", "w", encoding="utf-8") as ts:
    for (fi, ex) in [(t, trn_texts), (v, val_texts), (ts, tst_texts)]:
        for e in ex:
            # One record per line.
            print(json.dumps(e, ensure_ascii=False), file=fi)

In [5]:
# NOTE(review): the recorded output is a NameError - in the visible cells
# `labels` only exists as a local variable inside functions, never at module level.
labels

NameError: name 'labels' is not defined

In [None]:
2021-07-28 18:02:01,696 - root:21 - INFO - TRN: Counter({'SUPPORTS': 1058, 'REFUTES': 502, 'NEI': 245})
2021-07-28 18:02:01,698 - root:23 - INFO - TST: Counter({'SUPPORTS': 395, 'REFUTES': 200, 'NEI': 89})
2021-07-28 18:02:01,699 - root:25 - INFO - VAL: Counter({'SUPPORTS': 311, 'REFUTES': 145, 'NEI': 68})

In [19]:
trn_examples[3].texts

['Kyjev 14. srpna (ČTK) - Odvolací soud v Kyjevě dnes propustil na svobodu vězněného ukrajinského exministra obrany Valerije Ivaščenka, když jeho pětiletý trest vězení změnil na roční podmínku. Bývalý člen vlády někdejší premiérky Julije Tymošenkové byl letos v dubnu odsouzen za zneužití pravomocí. Soud dnes přihlédl k tomu, že odsouzený částečně přiznal vinu.',
 'Ivaščenko spolupracoval s Julijí Tymošenkovéóu.']

In [4]:
# Root logging setup: INFO level; each record shows timestamp, logger name,
# line number, level and message.
log_fmt = "%(asctime)s - %(name)s:%(lineno)d - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=log_fmt)

# Base directory of the data; the database file is looked up beneath it.
fdir = "/mnt/data/factcheck/CTK/par4"
# db_name = pjoin(fdir, "interim/ctk_filtered10000.db")
db_name = pjoin(fdir, "interim/ctk_filtered.db")
# Semicolon-separated keywords handed to DBCache as `excludekw`.
# NOTE(review): judging by the recorded log output DBCache lower-cases them and
# drops matching blocks while loading - confirm against utils.dbcache.
excludekw = "souhrn;sport;kolo;fotbal;hokej;Volejbal;Atletika;Lyžování;Cyklistika;Tenis;stolní tenis;Olympijské;AVÍZO;TABULKA;UPOZORNĚNÍ;PROTEXT;Deník"
excludekw2 = "basketbal"
# Both keyword lists are joined into a single semicolon-separated string.
db = DBCache(db_name, excludekw=excludekw + ";" + excludekw2)

2021-07-28 17:53:21,885 - utils.dbcache:26 - INFO - reading database to RAM
2021-07-28 17:53:21,886 - utils.dbcache:27 - INFO - excluding keywords: ['souhrn', 'sport', 'kolo', 'fotbal', 'hokej', 'volejbal', 'atletika', 'lyžování', 'cyklistika', 'tenis', 'stolní tenis', 'olympijské', 'avízo', 'tabulka', 'upozornění', 'protext', 'deník', 'basketbal']
2021-07-28 17:54:29,832 - utils.dbcache:37 - INFO - processing total 15032152 rows
100%|██████████| 15032152/15032152 [03:12<00:00, 77972.07it/s]
2021-07-28 17:58:44,440 - utils.dbcache:73 - INFO - blocks imported: 11134727, excluded based on keywords: 3897425


In [5]:
# Display the DBCache object created in the setup cell.
db

<utils.dbcache.DBCache at 0x7f0b63575b50>

In [13]:
# Mirror INFO-and-above records of the root logger into logs/utery.log.
# From here on `logger` refers to the root logger.
logger = logging.getLogger()
# FileHandler does not create missing parent directories.
os.makedirs(os.path.dirname("logs/utery.log"), exist_ok=True)
# Attach the handler only once: re-running this cell would otherwise stack
# another FileHandler on the root logger and write every record repeatedly.
if not any(
    isinstance(handler, logging.FileHandler)
    and handler.baseFilename == os.path.abspath("logs/utery.log")
    for handler in logger.handlers
):
    fh = logging.FileHandler(
        "logs/utery.log",
    )
    fh.setLevel(logging.INFO)
    fh.setFormatter(
        logging.Formatter("%(asctime)s - %(name)s:%(lineno)d - %(levelname)s - %(message)s")
    )
    logger.addHandler(fh)
# Smoke-test message to confirm the handler is wired up.
logging.info("Hello world")

In [7]:
def load_examples_from_raw_csvs():
    """Build train/test/validation sentence-pair examples from the raw annotation CSVs.

    Steps, in order:
      1. import the base and the NEI annotations from the May 2021 raw folder
         and merge them into one mapping (NEI entries win on equal keys);
      2. pass the merged values through ``unique_evidence`` and split them with
         ``create_splits`` using ``np.random.RandomState(1234)``;
      3. run ``split_evidence_sets`` on each part and log its label counts;
      4. convert each part with ``convert_to_sentence_pairs`` using the
         module-level ``db`` and ``reverse_pairs=True``.

    Returns:
        ``(trn_examples, tst_examples, val_examples)``.
    """
    # Earlier annotation round, kept for reference:
    #   "/mnt/data/factcheck/CTK/par4-dec2020-annotations_drchajan/raw" (NEI argument 336)
    raw_dir = "/mnt/data/factcheck/CTK/par4-may2021-annotations_drchajan/raw"

    merged = dict(import_csv_dataset(raw_dir))
    merged.update(import_csv_dataset_nei(raw_dir, 2175))
    dataset1 = list(merged.values())
    logger.info(f"dataset1 size: {len(dataset1)}")

    rng = np.random.RandomState(1234)
    dataset1_trn, dataset1_tst, dataset1_val = create_splits(
        unique_evidence(dataset1), rng
    )

    # Split the evidence sets of each part and log its class balance.
    per_split = []
    for tag, part in (("TRN", dataset1_trn), ("TST", dataset1_tst), ("VAL", dataset1_val)):
        evidence_split = split_evidence_sets(part)
        label_counts = Counter(sample["label"] for sample in evidence_split)
        logger.info(f"{tag}: {label_counts}")
        per_split.append(evidence_split)

    # Conversion happens in train, test, validation order.
    trn_examples, tst_examples, val_examples = (
        convert_to_sentence_pairs(db, evidence_split, reverse_pairs=True)
        for evidence_split in per_split
    )
    return trn_examples, tst_examples, val_examples


def load_examples_from_nli_format():
    """Build ``InputExample`` lists from the v2.1 NLI export of the dataset.

    Every (evidence, claim) pair is NFC-normalised and its string label is
    mapped to an integer id (SUPPORTS=0, REFUTES=1, NOT ENOUGH INFO=2).

    NOTE(review): this replaces an earlier definition of the same name in this
    notebook, and ``import_nli_ctk`` is not among the imports visible at the
    top - confirm where it is defined before running.

    Returns:
        ``(trn_examples, tst_examples, val_examples)`` - note the order differs
        from the train/val/test order returned by the importer.
    """
    label2id = {"REFUTES": 1, "SUPPORTS": 0, "NOT ENOUGH INFO": 2}

    def convert_to_examples(txts, labels):
        # One InputExample per (pair, label); raises KeyError on an unknown label.
        converted = []
        for (evidence, claim), label in zip(txts, labels):
            nfc_pair = [ud.normalize("NFC", evidence), ud.normalize("NFC", claim)]
            converted.append(InputExample(texts=nfc_pair, label=label2id[label]))
        return converted

    splits = import_nli_ctk(datadir="/mnt/data/factcheck/CTK/dataset/v2.1nli/nfc")
    trn_texts, val_texts, tst_texts, trn_labels, val_labels, tst_labels = splits

    trn_examples = convert_to_examples(trn_texts, trn_labels)
    tst_examples = convert_to_examples(tst_texts, tst_labels)
    val_examples = convert_to_examples(val_texts, val_labels)
    return trn_examples, tst_examples, val_examples


# Build the examples from the raw CSVs under separate "*2" names so that the
# existing trn/tst/val_examples are kept (the commented-out variant would
# overwrite them).
# trn_examples, tst_examples, val_examples = load_examples_from_raw_csvs()
trn_examples2, tst_examples2, val_examples2 = load_examples_from_raw_csvs()

2021-07-28 18:02:01,658 - models.rte:160 - INFO - round_cnt 608 -> new round 1
2021-07-28 18:02:01,665 - models.rte:160 - INFO - round_cnt 585 -> new round 2
2021-07-28 18:02:01,671 - models.rte:160 - INFO - round_cnt 516 -> new round 3
2021-07-28 18:02:01,675 - models.rte:160 - INFO - round_cnt 389 -> new round 4
2021-07-28 18:02:01,678 - models.rte:160 - INFO - round_cnt 68 -> new round 5
2021-07-28 18:02:01,682 - root:13 - INFO - dataset1 size: 2387
2021-07-28 18:02:01,696 - root:21 - INFO - TRN: Counter({'SUPPORTS': 1058, 'REFUTES': 502, 'NEI': 245})
2021-07-28 18:02:01,698 - root:23 - INFO - TST: Counter({'SUPPORTS': 395, 'REFUTES': 200, 'NEI': 89})
2021-07-28 18:02:01,699 - root:25 - INFO - VAL: Counter({'SUPPORTS': 311, 'REFUTES': 145, 'NEI': 68})


original pararagraphs #, TRN: 169, TST: 54, VAL: 43


In [11]:
# Compare the first training pair and the test-set sizes of the two example
# sets (trn/tst_examples vs. the "*2" variants built from the raw CSVs).
trn_examples[0].texts, trn_examples2[0].texts, len(tst_examples), len(tst_examples2)

(['Jedním ze čtveřice vítězů, kteří titul Young Architect Award 2011 letos získali, byla například Ida Čapounová, která navrhla urbanistická řešení pro český venkov, konkrétně pro benešovský Měchnov. Další z laureátů, mladý architekt Jakub Jílek, se věnoval řece Vltavě v české metropoli, architektka Pavlína Macháčková a Klára Makovcová ze studia MMM Architekti pak zpracovaly studii týkající se zemědělského učiliště. Jediným zahraničním autorem mezi vítězi pak je slovinská architektka Polonca Kastelicová, která navrhla revitalizaci hlavního náměstí v jedné ze slovinských obcí.',
  'Ida Čapounová prohrála v soutěži Young Architect Award.'],
 ['Podle Punčocháře je mýtem, že kdyby bylo více vody ve vodním profilu, tak se zajistí plnění nádrží i průtoky řek. Podle něj je v ČR kolem 165 nádrží s objemem 3360 milionů metrů krychlových s plochou hladiny 30.000 hektarů. V rybnících, kterých je zhruba 23.000, je pak 500 milionů metrů krychlových, tedy násobně méně. Ale jejich plocha je 51.000 he

In [None]:
# Fine-tuning configuration for a single run.
# Alternative checkpoints noted alongside the original assignment:
#   "DeepPavlov/bert-base-multilingual-cased-sentence", "bert-base-multilingual-cased"
bert_name_short = "deepset/xlm-roberta-large-squad2"
bert_name = bert_name_short  # both names refer to the same checkpoint string
batch_size, num_epochs = 5, 30
max_length = None
post = ""  # optional suffix appended to the model name
# The checkpoint name contains "/", so output_path is a nested directory.
model_name = "{}_bs{}{}".format(bert_name_short, batch_size, post)
output_path = pjoin("FINAL_MODELS", model_name)

In [None]:
# Create the model output directory (including parents); no error if it exists.
os.makedirs(output_path, exist_ok=True)

In [54]:
# Evaluation results collected per model, keyed by the model's output path.
evals = dict()

In [9]:
# Evaluate the current model on the test set and store the result under its
# output path.
# NOTE(review): tst_evaluator and model are only created inside the training
# loop cell further down, so this cell depends on that cell having run first.
evals[output_path] = tst_evaluator(model, output_path=output_path)

In [56]:
# Fine-tune one 3-class CrossEncoder per cleaned split folder and store its
# test-set evaluation result in `evals`, keyed by the model's output path.

# Candidate hyper-parameter sets, picked by index in the inner loop below.
# Alternative checkpoints noted for "bert_name":
#   "DeepPavlov/bert-base-multilingual-cased-sentence", "bert-base-multilingual-cased"
run_configs = [
    {
        "bert_name": "deepset/xlm-roberta-large-squad2",
        "max_length": None,
        "batch_size": 2,
        "num_epochs": 30,
        "post": "",
    },
    {
        "bert_name": "deepset/xlm-roberta-large-squad2",
        "max_length": None,
        "batch_size": 8,
        "num_epochs": 30,
        "post": "",
    },
]

for folder in ["ctkclean2", "ctkclean1"]:
    # Only run_configs[0] is trained; widen the range to include run_configs[1].
    for i in range(0, 1):
        run_config = run_configs[i]
        bert_name = bert_name_short = run_config["bert_name"]
        max_length = run_config["max_length"]
        batch_size = run_config["batch_size"]
        num_epochs = run_config["num_epochs"]
        post = run_config["post"]
        # The checkpoint name contains "/", so output_path is a nested directory.
        model_name = f"{bert_name_short}_bs{batch_size}{post}{folder}"
        output_path = pjoin("models_clean", model_name)

        trn_examples, tst_examples, val_examples = load_examples_from_splits(folder)
        os.makedirs(output_path, exist_ok=True)
        logger.info(f"output path: {output_path}")

        # Snapshot the exact examples next to the model. The context manager
        # closes (and flushes) each pickle file before training starts.
        for pickle_name, split_examples in (
            ("trn_examples.p", trn_examples),
            ("tst_examples.p", tst_examples),
            ("val_examples.p", val_examples),
        ):
            with open(pjoin(output_path, pickle_name), "wb") as pickle_file:
                pickle.dump(split_examples, pickle_file)

        cfg = OrderedDict(
            [
                ("bert_name", bert_name),
                ("bert_name_short", bert_name_short),
                ("batch_size", batch_size),
                ("max_length", max_length),
            ]
        )

        with open(pjoin(output_path, "rteconfig.json"), "w") as outfile:
            outfile.write(json.dumps(cfg, indent=3))

        # Only the training loader is passed to fit(); the evaluators below are
        # built directly from the example lists.
        trn_dataloader = DataLoader(trn_examples, shuffle=True, batch_size=batch_size)
        val_dataloader = DataLoader(val_examples, shuffle=False, batch_size=batch_size)
        tst_dataloader = DataLoader(tst_examples, shuffle=False, batch_size=batch_size)

        trn_evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(
            trn_examples, name="train"
        )
        val_evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(
            val_examples, name="validation"
        )
        tst_evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(
            tst_examples, name="test"
        )

        # Warm up over 10 % of all training steps (batches per epoch * epochs).
        warmup_steps = math.ceil(len(trn_dataloader) * num_epochs * 0.1)
        logger.info(f"warmup_steps: {warmup_steps}")

        model = CrossEncoder(bert_name, num_labels=3, max_length=max_length)

        def cb(score, epoch, steps):
            """Log the evaluation score and flag it when it exceeds model.best_score
            as it stands at the time the callback runs."""
            logger.info(f"E{epoch}: score: {score}")
            if score > model.best_score:
                logger.info(f"new best model for score: {score}")

        model.fit(
            train_dataloader=trn_dataloader,
            epochs=num_epochs,
            warmup_steps=warmup_steps,
            evaluator=SequentialEvaluator([trn_evaluator, val_evaluator]),
            output_path=output_path,
            callback=cb,
            save_best_model=True,
        )

        # Reload the checkpoint saved under output_path and score it on the test set.
        model = CrossEncoder(output_path, max_length=max_length)
        evals[output_path] = tst_evaluator(model, output_path=output_path)

Some weights of the model checkpoint at deepset/xlm-roberta-large-squad2 were not used when initializing XLMRobertaForSequenceClassification: ['qa_outputs.weight', 'qa_outputs.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at deepset/xlm-roberta-large-squad2 and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classi

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=30.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…





Some weights of the model checkpoint at deepset/xlm-roberta-large-squad2 were not used when initializing XLMRobertaForSequenceClassification: ['qa_outputs.weight', 'qa_outputs.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at deepset/xlm-roberta-large-squad2 and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classi

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=30.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1763.0, style=ProgressStyle(description_w…





In [17]:
# Fine-tuning run definitions, keyed by run index. Only the indices produced
# by the `range(...)` below are actually trained. Looking the settings up in a
# table means an unknown index fails loudly with KeyError instead of silently
# reusing whatever globals an earlier run left behind.
run_configs = {
    0: {
        "bert_name": "deepset/xlm-roberta-large-squad2",
        "max_length": None,
        "batch_size": 12,
    },
    1: {
        "bert_name": "DeepPavlov/bert-base-multilingual-cased-sentence",
        "max_length": 512,
        "batch_size": 7,
    },
    2: {
        "bert_name": "deepset/xlm-roberta-large-squad2",
        "max_length": None,
        "batch_size": 8,
    },
    3: {
        "bert_name": "DeepPavlov/bert-base-bg-cs-pl-ru-cased",
        "max_length": 512,
        "batch_size": 2,
    },
}

for i in range(1):
    run_cfg = run_configs[i]
    bert_name = bert_name_short = run_cfg["bert_name"]
    max_length = run_cfg["max_length"]
    batch_size = run_cfg["batch_size"]
    num_epochs = 30
    post = ""
    # The hub name contains "/", so the output directory ends up nested
    # (e.g. models_clean/deepset/xlm-roberta-large-squad2_bs12).
    model_name = f"{bert_name_short}_bs{batch_size}{post}"
    output_path = pjoin("models_clean", model_name)

    os.makedirs(output_path, exist_ok=True)
    logger.info(f"output path: {output_path}")

    # Store the exact data splits next to the model; `with` makes sure every
    # file is flushed and closed before training starts.
    for split_name, split_examples in (
        ("trn", trn_examples),
        ("tst", tst_examples),
        ("val", val_examples),
    ):
        with open(pjoin(output_path, f"{split_name}_examples.p"), "wb") as split_file:
            pickle.dump(split_examples, split_file)

    # Minimal run description saved alongside the model weights.
    cfg = OrderedDict(
        [
            ("bert_name", bert_name),
            ("bert_name_short", bert_name_short),
            ("batch_size", batch_size),
            ("max_length", max_length),
        ]
    )

    with open(pjoin(output_path, "rteconfig.json"), "w") as outfile:
        outfile.write(json.dumps(cfg, indent=3))

    trn_dataloader = DataLoader(trn_examples, shuffle=True, batch_size=batch_size)
    # Not consumed in this cell (the evaluators below are built from the
    # example lists); kept because they are notebook-level names.
    val_dataloader = DataLoader(val_examples, shuffle=False, batch_size=batch_size)
    tst_dataloader = DataLoader(tst_examples, shuffle=False, batch_size=batch_size)

    trn_evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(
        trn_examples, name="train"
    )
    val_evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(
        val_examples, name="validation"
    )
    tst_evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(
        tst_examples, name="test"
    )

    # Warm up the learning rate over the first 10% of all optimisation steps
    # (batches per epoch * number of epochs).
    warmup_steps = math.ceil(len(trn_dataloader) * num_epochs * 0.1)
    logger.info(f"warmup_steps: {warmup_steps}")

    model = CrossEncoder(bert_name, num_labels=3, max_length=max_length)

    def cb(score, epoch, steps):
        """Log the evaluator score after each evaluation and flag new bests.

        NOTE(review): assumes `fit` invokes the callback before it updates
        `model.best_score` - confirm against the installed
        sentence-transformers version.
        """
        logger.info(f"E{epoch}: score: {score}")
        if score > model.best_score:
            logger.info(f"new best model for score: {score}")

    model.fit(
        train_dataloader=trn_dataloader,
        epochs=num_epochs,
        warmup_steps=warmup_steps,
        evaluator=SequentialEvaluator([trn_evaluator, val_evaluator]),
        output_path=output_path,
        callback=cb,
        save_best_model=True,
    )

    # Release the fine-tuned model before loading the saved checkpoint so the
    # two copies are never resident on the GPU together.
    del model
    torch.cuda.empty_cache()

    # Score the checkpoint written to `output_path` on the held-out test split.
    model = CrossEncoder(output_path, max_length=max_length)
    evals[output_path] = tst_evaluator(model, output_path=output_path)

Some weights of the model checkpoint at deepset/xlm-roberta-large-squad2 were not used when initializing XLMRobertaForSequenceClassification: ['qa_outputs.weight', 'qa_outputs.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at deepset/xlm-roberta-large-squad2 and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classi

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=30.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=248.0, style=ProgressStyle(description_wi…





RuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 31.75 GiB total capacity; 29.09 GiB already allocated; 62.00 MiB free; 30.91 GiB reserved in total by PyTorch)

In [None]:
# Display how many examples are in the training split.
len(trn_examples)

In [13]:
# Scratch cell: prints a literal marker string.
# NOTE(review): looks like a leftover debugging/liveness check - consider deleting.
print("bia")

bia


In [14]:
# Display the collected test-set scores, keyed by each run's output path
# (entries are written by the training loop via `evals[output_path] = ...`).
evals

{'models_anew/deepset/xlm-roberta-large-squad2_bs8': 0.7890625,
 'models_anew/deepset/xlm-roberta-large-squad2_bs9': 0.7924107142857143,
 'models_anew/deepset/xlm-roberta-large-squad2_bs10': 0.7991071428571429,
 'models_anew/deepset/xlm-roberta-large-squad2_bs11': 0.7689732142857143}

In [57]:
# Ask PyTorch's caching allocator to release unused cached GPU memory.
# Memory held by tensors/models that are still referenced is not released.
torch.cuda.empty_cache()

In [58]:
# NOTE(review): this import belongs in the notebook's top import cell.
import gc

# Force a full garbage-collection pass; the displayed value is the number of
# unreachable objects the collector found.
gc.collect()

48

In [19]:
# Release unused cached GPU memory held by PyTorch's allocator
# (duplicate of the earlier cache-clearing cell).
torch.cuda.empty_cache()

In [None]:
# Single fine-tuning run of a multilingual BERT cross-encoder.
# Other checkpoints previously tried in this slot:
#   "deepset/xlm-roberta-large-squad2",
#   "DeepPavlov/bert-base-multilingual-cased-sentence"
bert_name = bert_name_short = "bert-base-multilingual-cased"
max_length = None
batch_size = 8
num_epochs = 100
post = ""
model_name = f"{bert_name_short}_bs{batch_size}{post}"
# `os.makedirs` / `open` do not expand "~"; without expanduser a literal
# directory named "~" would be created inside the current working directory.
output_path = pjoin(os.path.expanduser("~/nli/EXP/may21_17_raw__"), model_name)

os.makedirs(output_path, exist_ok=True)

logger.info(f"output path: {output_path}")

# Store the exact data splits next to the model; `with` makes sure every file
# is flushed and closed before training starts.
for split_name, split_examples in (
    ("trn", trn_examples),
    ("tst", tst_examples),
    ("val", val_examples),
):
    with open(pjoin(output_path, f"{split_name}_examples.p"), "wb") as split_file:
        pickle.dump(split_examples, split_file)

# Minimal run description saved alongside the model weights.
cfg = OrderedDict(
    [
        ("bert_name", bert_name),
        ("bert_name_short", bert_name_short),
        ("batch_size", batch_size),
        ("max_length", max_length),
    ]
)

with open(pjoin(output_path, "rteconfig.json"), "w") as outfile:
    outfile.write(json.dumps(cfg, indent=3))

trn_dataloader = DataLoader(trn_examples, shuffle=True, batch_size=batch_size)
# Not consumed in this cell (the evaluators below are built from the example
# lists); kept because they are notebook-level names.
val_dataloader = DataLoader(val_examples, shuffle=False, batch_size=batch_size)
tst_dataloader = DataLoader(tst_examples, shuffle=False, batch_size=batch_size)

trn_evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(
    trn_examples, name="train"
)
val_evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(
    val_examples, name="validation"
)
tst_evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(
    tst_examples, name="test"
)

# Warm up the learning rate over the first 10% of all optimisation steps
# (batches per epoch * number of epochs).
warmup_steps = math.ceil(len(trn_dataloader) * num_epochs * 0.1)
logger.info(f"warmup_steps: {warmup_steps}")

model = CrossEncoder(bert_name, num_labels=3, max_length=max_length)


def cb(score, epoch, steps):
    """Log the evaluator score after each evaluation and flag new bests.

    NOTE(review): assumes `fit` invokes the callback before it updates
    `model.best_score` - confirm against the installed sentence-transformers
    version.
    """
    logger.info(f"E{epoch}: score: {score}")
    if score > model.best_score:
        logger.info(f"new best model for score: {score}")


model.fit(
    train_dataloader=trn_dataloader,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    evaluator=SequentialEvaluator([trn_evaluator, val_evaluator]),
    output_path=output_path,
    callback=cb,
    save_best_model=True,
)

# Release the fine-tuned model before loading the saved checkpoint so the two
# copies are never resident on the GPU together.
del model
torch.cuda.empty_cache()

# Score the checkpoint written to `output_path` on the held-out test split;
# the score is the cell's displayed result.
model = CrossEncoder(output_path, max_length=max_length)
tst_evaluator(model, output_path=output_path)