In [24]:
import logging
from typing import List, Union, Tuple, Any

In [25]:
import os
import zipfile

In [26]:
zip_f = 'texts.zip'
z = zipfile.ZipFile(zip_f, 'r')
z.extractall('train_texts_tiny')

In [27]:
print(len(os.listdir('train_texts_tiny')))

310


### DataLoader

In [28]:
import csv

from tqdm import tqdm

In [29]:
class DataLoader:

    def __init__(self, path_to_files):
        self._path_to_files = path_to_files
        self._max_len = 50

    def load_dataset(self):
        dataset_samples = []
        dataset_labels = []
        files = sorted(os.listdir(self._path_to_files))
        for fname in tqdm(files, desc='loading dataset'):
                if fname.startswith('.'):
                    continue
                with open(os.path.join(self._path_to_files, fname), 'r') as f:
                    tokens = []
                    labels = []
                    reader = csv.DictReader(f)
                    for row in reader:
                        tokens.append(row['token'])
                        try:
                          labels.append(row['tag'])
                        except KeyError:
                          labels.append(row['reviewed'])
                        if len(tokens) == self._max_len:
                            dataset_samples.append(tokens)
                            dataset_labels.append(labels)
                            tokens = []
                            labels = []
                    if len(tokens) > 0:
                        dataset_samples.append(tokens)
                        dataset_labels.append(labels)
        print(f'Loaded {len(dataset_samples)} samples')
        return dataset_samples, dataset_labels


### get_model

In [0]:
!pip install transformers

In [30]:
import tensorflow as tf
from tensorflow.keras.layers import TimeDistributed, Dense
from transformers import TFBertModel, BertConfig, TFBertForTokenClassification

In [0]:
#from google.colab import userdata
#userdata.get('rubert-tiny2')

In [31]:
def get_model():
    config = BertConfig.from_pretrained('cointegrated/rubert-tiny2', from_pt = True, num_labels=3)
    model = TFBertForTokenClassification.from_pretrained(
            'cointegrated/rubert-tiny2',
            config=config,
            from_pt = True
    )
    model.layers[-1].activation = tf.keras.activations.softmax
    print(model.summary())
    return model

### Vectorizer

In [32]:
from enum import Enum


class Tags(Enum):
    B_TERM = 'B-TERM'
    I_TERM = 'I-TERM'
    NOT_TERM = 'O'


TERM_SET = {Tags.B_TERM.value, Tags.I_TERM.value}

label2class = {
            Tags.NOT_TERM.value: 0,
            Tags.B_TERM.value: 1,
            Tags.I_TERM.value: 2
        }

In [33]:
from transformers import BertTokenizer

class Vectorizer:

    def __init__(self):
        self._tokenizer = BertTokenizer.from_pretrained("cointegrated/rubert-tiny2",
                                                        do_lower_case=False)

        self._label2class = label2class
        self._max_length = 128

    def vectorize(self, text: List[str], token_labels: List[str]) -> Tuple[List[str], List[int], List[int], List[int]]:
        tokenized_text, input_masks, labels = self._tokenize(text, token_labels)

        input_ids = self._tokenizer.convert_tokens_to_ids(tokenized_text)

        tags = []
        for label in labels:
            if label == '':
                label = 'O'
            tags.append(self._label2class[label])

        input_ids = self._pad(input_ids)
        input_masks = self._pad(input_masks)
        tags = self._pad(tags)

        return tokenized_text, input_ids, input_masks, tags

    def _pad(self, input: List[Any]) -> List[Any]:
        if len(input) >= self._max_length:
            return input[:self._max_length]
        while len(input) < self._max_length:
            input.append(0)
        return input

    def _tokenize(self, text: List[str], token_labels: List[str]) -> Tuple[List[str], List[int], List[str]]:
        tokenized_text = []
        labels = []

        for token, label in zip(text, token_labels):
            # Tokenize the word and count # of subwords the word is broken into
            tokenized_word = self._tokenizer.tokenize(token)
            n_subwords = len(tokenized_word)

            # Add the tokenized word to the final tokenized word list
            tokenized_text.extend(tokenized_word)

            # Add the same label to the new list of labels `n_subwords` times
            labels.extend([label] * n_subwords)

        try:

            inputs = self._tokenizer.encode_plus(
                tokenized_text,
                is_pretokenized=True,
                return_attention_mask=True,
                max_length=self._max_length,
                truncation=True
            )

        except:
            print(text)
            inputs = dict()
            inputs['attention_mask'] = np.zeros(self._max_length)

        return tokenized_text, inputs['attention_mask'], labels


### Trainer

In [34]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

In [35]:
class Trainer:

    def __init__(self, vectorizer, samples, model, experiment_name, path_to_weights_dir):
        self._vectorizer = vectorizer
        self._model = model

        self._batch_size = 24
        self._epochs = 100
        self._patience = 5
        self._path_to_weights_dir = path_to_weights_dir

        if not os.path.exists(self._path_to_weights_dir):
            os.makedirs(self._path_to_weights_dir, exist_ok=True)
        weights_fname = f'{experiment_name}_weights.h5'

        self._path_to_weights = os.path.join(self._path_to_weights_dir, weights_fname)
        
        if os.path.exists(self._path_to_weights):
            self._model.load_weights(self._path_to_weights)

        self._X_train, self._X_val, self._y_train, self._y_val = train_test_split(
            samples[0], samples[1], random_state=2020
        )
        print(f'{len(self._X_train)} train samples, {len(self._X_val)} val samples')

        self._steps_per_epoch = int(len(self._X_train) / self._batch_size)
        self._validation_steps = int(len(self._X_val) / self._batch_size)

        self._num_of_train_samples = self._steps_per_epoch * self._batch_size
        self._num_of_val_samples = self._validation_steps * self._batch_size

    def _generate_samples(self, samples, labels, num_of_samples):
        i = 0
        while True:
            texts = samples[i:i + self._batch_size]
            y_labels = labels[i:i + self._batch_size]
            X_ids = []
            X_masks = []
            y = []
            i += self._batch_size
            for text, token_labels in zip(texts, y_labels):
                _, input_ids, input_masks, tags = self._vectorizer.vectorize(text, token_labels)
                X_ids.append(np.array(input_ids))
                y.append(tags)
                X_masks.append(np.array(input_masks))
            yield [np.asarray(X_ids, dtype='int32'), np.asarray(X_masks, dtype='int32')], np.array(y)
            if i == num_of_samples:
                i = 0

    def train(self):
        saver = keras.callbacks.ModelCheckpoint(
            self._path_to_weights,
            monitor='val_loss',
            verbose=1,
            save_best_only=True,
            mode='auto',
            save_weights_only=True
        )
        stopper = keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=self._patience,
            verbose=1,
            mode='auto'
        )
        self._model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=0.00000001),
            loss=tf.keras.losses.SparseCategoricalCrossentropy(),
            metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
        )
        history = self._model.fit(
            self._generate_samples(self._X_train, self._y_train, self._num_of_train_samples),
            epochs=self._epochs,
            validation_data=self._generate_samples(self._X_val, self._y_val, self._num_of_val_samples),
            steps_per_epoch=self._steps_per_epoch,
            validation_steps=self._validation_steps,
            verbose=1,
            callbacks=[saver, stopper]
        )
        np.save(f'history_{experiment_name}.npy',history.history)
        return history


### start

In [0]:
logging.basicConfig(level=logging.ERROR)

if __name__ == '__main__':
    data_loader = DataLoader('train_texts_tiny')
    samples, labels = data_loader.load_dataset()

    vectorizer = Vectorizer()

    model = get_model()

    trainer = Trainer(
        vectorizer=vectorizer,
        samples=(samples, labels),
        model=model,
        experiment_name='bert_for_token_classification_rubert-tiny2',
        path_to_weights_dir='weights'
    )

    trainer.train()


loading dataset: 100%|██████████| 310/310 [00:02<00:00, 119.60it/s]
2024-02-28 21:02:14.362025: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64
2024-02-28 21:02:14.362057: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2024-02-28 21:02:14.362072: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (s-4a07c28a-521d-4fa1-8fd0-b5364d7b09e1): /proc/driver/nvidia/version does not exist
2024-02-28 21:02:14.362272: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable the


Epoch 00027: val_loss improved from 0.37897 to 0.37764, saving model to weights/bert_for_token_classification_rubert-tiny2_weights.h5
Epoch 28/100

Epoch 00028: val_loss improved from 0.37764 to 0.37632, saving model to weights/bert_for_token_classification_rubert-tiny2_weights.h5
Epoch 29/100

Epoch 00029: val_loss improved from 0.37632 to 0.37500, saving model to weights/bert_for_token_classification_rubert-tiny2_weights.h5
Epoch 30/100

Epoch 00030: val_loss improved from 0.37500 to 0.37368, saving model to weights/bert_for_token_classification_rubert-tiny2_weights.h5
Epoch 31/100

Epoch 00031: val_loss improved from 0.37368 to 0.37236, saving model to weights/bert_for_token_classification_rubert-tiny2_weights.h5
Epoch 32/100

Epoch 00032: val_loss improved from 0.37236 to 0.37105, saving model to weights/bert_for_token_classification_rubert-tiny2_weights.h5
Epoch 33/100

Epoch 00033: val_loss improved from 0.37105 to 0.36974, saving model to weights/bert_for_token_classification_r