# 🏋️ Training sentence_transformers using ČTK data
Shared notebook version 1.0

## 📑 Imports

In [1]:
import json, logging, math, os, pickle, gc
from collections import Counter, OrderedDict
from os.path import join as pjoin

import numpy as np
import sklearn
import torch
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CESoftmaxAccuracyEvaluator
from sentence_transformers.evaluation import (
    SequentialEvaluator,
)
from sentence_transformers.readers import InputExample
from torch.utils.data import DataLoader

import datautils
from datautils import detokenize2

logger = logging.getLogger(__name__)

## ⚓ Load a dataset (see [Dataset NB](datasets.ipynb))

In [48]:
# Load the train / test / validation example splits from the pickle directory.
trn_examples, tst_examples, val_examples = datautils.load_examples_from_pickle(
    "../data/demo_splits/pickle"
)
# Cell output: the result of datautils.counter for each split, in
# train / test / validation order.
list(map(datautils.counter, (trn_examples, tst_examples, val_examples)))

In [None]:
# Peek at the first training and the first validation example:
# (train texts, train label, validation texts, validation label).
first_trn, first_val = trn_examples[0], val_examples[0]
(first_trn.texts, first_trn.label, first_val.texts, first_val.label)

## 📂 Prepare an output directory

In [None]:
# Root directory under which every training job creates its own output folder.
outdir = "../models"

## 📅 Schedule a bunch of training jobs!
Set the parameters of each job in its own `if` branch below. Omit any parameter a job does not need, and adjust the iterated range if some jobs are to be skipped.

In [None]:
# Test-set results keyed by output path. Created only when missing, so results
# collected by earlier runs of this cell survive a re-run.
if "evals" not in globals():
    evals = {}

# Each iteration configures, trains and evaluates one cross-encoder job.
# Checkpoints used here or tried before:
#   "deepset/xlm-roberta-large-squad2"
#   "DeepPavlov/bert-base-multilingual-cased-sentence"
#   "bert-base-multilingual-cased"
for i in range(0, 3):
    if i == 0:
        bert_name = bert_name_short = "deepset/xlm-roberta-large-squad2"
        max_length = None
        batch_size = 12
        num_epochs = 30
        model_name = f"{bert_name_short}_bs{batch_size}"
    elif i == 1:
        bert_name = bert_name_short = "DeepPavlov/bert-base-multilingual-cased-sentence"
        max_length = 512
        batch_size = 7
        num_epochs = 30
        model_name = f"{bert_name_short}_bs{batch_size}"
    elif i == 2:
        bert_name = bert_name_short = "deepset/xlm-roberta-large-squad2"
        max_length = None
        batch_size = 8
        num_epochs = 30
        model_name = f"{bert_name_short}_bs{batch_size}"
    else:
        # Without this guard an unconfigured index would silently reuse the
        # previous job's settings and overwrite its output directory.
        raise ValueError(f"no training job configured for i={i}")

    # model_name may contain "/" (from the checkpoint name), so this can be a
    # nested directory; makedirs creates every missing level.
    output_path = pjoin(outdir, model_name)
    os.makedirs(output_path, exist_ok=True)
    logger.info(f"output path: {output_path}")

    # Store the exact data splits next to the model. The context manager
    # closes each file even if pickling fails.
    for file_name, examples in (
        ("trn_examples.p", trn_examples),
        ("tst_examples.p", tst_examples),
        ("val_examples.p", val_examples),
    ):
        with open(pjoin(output_path, file_name), "wb") as pickle_file:
            pickle.dump(examples, pickle_file)

    # Record the job configuration alongside the model.
    cfg = OrderedDict(
        [
            ("bert_name", bert_name),
            ("bert_name_short", bert_name_short),
            ("batch_size", batch_size),
            ("max_length", max_length),
        ]
    )

    with open(pjoin(output_path, "rteconfig.json"), "w") as outfile:
        json.dump(cfg, outfile, indent=3)

    trn_dataloader = DataLoader(trn_examples, shuffle=True, batch_size=batch_size)
    # NOTE(review): the validation and test loaders are not used below; they
    # are kept in case other cells rely on them.
    val_dataloader = DataLoader(val_examples, shuffle=False, batch_size=batch_size)
    tst_dataloader = DataLoader(tst_examples, shuffle=False, batch_size=batch_size)

    trn_evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(trn_examples, name="train")
    val_evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(val_examples, name="validation")
    tst_evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(tst_examples, name="test")

    # Warm up over 10% of all training steps (batches per epoch * epochs).
    warmup_steps = math.ceil(len(trn_dataloader) * num_epochs * 0.1)
    logger.info(f"warmup_steps: {warmup_steps}")

    model = CrossEncoder(bert_name, num_labels=3, max_length=max_length)

    def cb(score, epoch, steps):
        """Log the evaluation score and flag a new best one."""
        logger.info(f"E{epoch}: score: {score}")
        # NOTE(review): relies on `model.best_score` being maintained by
        # CrossEncoder.fit and still holding the previous best when the
        # callback runs -- confirm against the installed library version.
        if score > model.best_score:
            logger.info(f"new best model for score: {score}")

    model.fit(
        train_dataloader=trn_dataloader,
        epochs=num_epochs,
        warmup_steps=warmup_steps,
        evaluator=SequentialEvaluator([trn_evaluator, val_evaluator]),
        output_path=output_path,
        callback=cb,
        save_best_model=True,
    )

    # Reload the model saved to output_path during training and evaluate it
    # on the test split exactly once.
    model = CrossEncoder(output_path, max_length=max_length)
    evals[output_path] = tst_evaluator(model, output_path=output_path)

### 🤯 Out of memory? Free some!

In [58]:
# Collect unreachable Python objects first, so that memory of tensors freed by
# the collector can be returned by empty_cache() in the same pass.
collected = gc.collect()
torch.cuda.empty_cache()
# Cell output: the value returned by gc.collect().
collected

48

## 📜 How did your models do?

In [19]:
# Test-set evaluator result of every trained model, keyed by its output path.
evals