# Czech ALBERT classifier

In [27]:
import os

import numpy as np
import pandas as pd

## Load dataset 
This test dataset is very small.

In [20]:
# Location of the CSV, relative to the notebook's working directory.
DATASET_PATH = "novinky_small.csv"

# Read the whole file into a DataFrame; later cells use its "text" and
# "category" columns.
dataset = pd.read_csv(DATASET_PATH)
dataset.head()

Unnamed: 0,text,category
0,Ruští diplomaté vypovězení ze Spojených států ...,2
1,"Izrael obvinil radikální hnutí Hamás z toho, ž...",2
2,Italské ministerstvo zahraničí si v sobotu pře...,2
3,Pět stovek pozvaných smutečních hostů se v sob...,2
4,"Tisíce mrtvých zvířat, poničená příroda a míst...",2


In [21]:
# Separate the frame into raw inputs (the "text" column) and raw labels
# (the "category" column).
x_raw, y_raw = dataset["text"], dataset["category"]

## Select ALBERT model

In [22]:
# Known checkpoints, keyed by short name.
# Each value is a tuple: (checkpoint directory, sequence width, batch size).
model_defs = dict(
    csbase3=("pretrained/csbase3_ckpt/", 256, 24),
    csbase4=("pretrained/csbase4_ckpt/", 256, 16),
    cslarge3=("pretrained/cslarge3_ckpt/", 512, 12),
)

# Pick the checkpoint used by the rest of the notebook and unpack its settings.
model_name = "csbase3"
selected_def = model_defs[model_name]
model_path = selected_def[0]
max_seq_len = selected_def[1]
batch_size = selected_def[2]

## Tokenize text

In [28]:
import sentencepiece as spm
import tensorflow as tf
import tensorflow.keras as keras
import re

class SPMEmbedder():
    """Convert text into SentencePiece token ids framed by ``[CLS]`` and ``[SEP]``.

    The text is first passed through an optional normalization callable and
    then tokenized with a SentencePiece model loaded from disk.
    """

    def __init__(self, path, lemmatize=None):
        """Load the SentencePiece model stored at ``path``.

        Args:
            path: Location of the SentencePiece model file.
            lemmatize: Optional callable applied to the text before it is
                tokenized. When omitted, the text is tokenized unchanged.
        """
        sp = spm.SentencePieceProcessor()
        sp.Load(path)
        self.sp = sp
        if lemmatize is None:
            lemmatize = lambda x: x
        self.lemmatize = lemmatize

    def encode(self, text, max_len=None):
        """Tokenize ``text`` and wrap the ids in ``[CLS]`` ... ``[SEP]``.

        Args:
            text: Input string; it is normalized with ``self.lemmatize`` first.
            max_len: Optional upper bound on the length of the returned list,
                including the two special tokens. When given, the text tokens
                are cut so that ``[SEP]`` is still the last id. ``None``
                (the default) disables truncation.

        Returns:
            A list of token ids starting with the ``[CLS]`` id and ending with
            the ``[SEP]`` id.
        """
        body = list(self.sp.EncodeAsIds(self.lemmatize(text)))
        if max_len is not None:
            # Reserve two positions for [CLS] and [SEP] so that truncating a
            # long text never drops the closing separator.
            body = body[:max(max_len - 2, 0)]
        return [self.sp.piece_to_id("[CLS]"),
                *body,
                self.sp.piece_to_id("[SEP]")
               ]

    def decode(self, indexes):
        """Turn a sequence of token ids back into text via ``DecodeIds``."""
        return self.sp.DecodeIds(list(map(int, indexes)))

    def decode_pieces(self, indexes):
        """Return the pieces for ``indexes`` joined by single spaces."""
        return " ".join(self.sp.id_to_piece(int(id_)) for id_ in indexes)

    def get_embed_fcn(self, width):
        """Build a function mapping one sentence to a fixed-width id array.

        Args:
            width: Target length; shorter sequences are padded at the end,
                longer ones are truncated.

        Returns:
            A callable ``embed(sentence)`` returning the padded id sequence
            for that sentence.
        """
        def embed(sentence):
            # Truncation happens inside encode() so [SEP] survives; the
            # padding call below then only has to fill up to `width`.
            return keras.preprocessing.sequence.pad_sequences(
                [self.encode(sentence, max_len=width)], width,
                padding="post", truncating="post")[0]
        return embed
    
# Ordered (pattern, replacement) rules; each rule runs on the output of the
# previous one.
_LEMMATIZE_RULES = (
    (re.compile("[0-9]"), "#"),
    (re.compile("[‒–—―]"), "-"),
    (re.compile("[“”‘’„“‚‘\"']"), "'"),
    (re.compile("[^0-9a-zóěščřžýáíďéťňůúA-ZÓĚŠČŘŽÝÁÍĎÉŤŇŮÚ\\.,\\!\\?%\\(\\)\\-'#: ]"), ""),
)


def lemmatize(text, lowercase=True):
    """Normalize ``text`` before tokenization.

    Despite the name, no lemmatization is performed. The steps are:
    optional lowercasing, every digit replaced by ``#``, dash variants
    replaced by ``-``, quote variants replaced by ``'``, and finally every
    character outside the allowed set removed.

    Args:
        text: Input string.
        lowercase: When true (the default), lowercase the text first.

    Returns:
        The normalized string.
    """
    normalized = text.lower() if lowercase else text
    for pattern, replacement in _LEMMATIZE_RULES:
        normalized = pattern.sub(replacement, normalized)
    return normalized


# Build the tokenizer from the spm.model file inside the checkpoint directory.
# os.path.join gives a clean path whether or not model_path ends with a
# separator.
embedder = SPMEmbedder(os.path.join(model_path, "spm.model"), lemmatize)

## Embed the inputs

In [30]:
# Create the sentence -> fixed-width id array function once, then apply it to
# every text.
embed_sentence = embedder.get_embed_fcn(max_seq_len)
token_rows = x_raw.map(embed_sentence)

# Stack the per-text arrays into a single array with one row per text.
x = np.stack(token_rows)
x

array([[    2,  2243,   886, ...,     0,     0,     0],
       [    2,  7181, 18226, ...,     0,     0,     0],
       [    2,  7427,  2182, ...,     0,     0,     0],
       ...,
       [    2,   561,    84, ...,     0,     0,     0],
       [    2,   144, 14978, ...,     0,     0,     0],
       [    2,   396,  4101, ...,     0,     0,     0]], dtype=int32)

## Create category map and normalize categories

In [45]:
# Distinct categories in the order produced by value_counts(); a category's
# position in this ordering becomes its class index.
categories = y_raw.value_counts().index

index2cat = dict(enumerate(categories))
cat2index = {cat: i for i, cat in index2cat.items()}

# Replace every raw category id with its dense class index.
y = y_raw.map(lambda raw_cat: cat2index[raw_cat])

cat2index

{740: 0, 986: 1, 13: 2, 12: 3, 5: 4, 4: 5, 3: 6, 2: 7}

## Create ALBERT model 

In [42]:
import tensorflow as tf
import tensorflow.keras as keras
import bert

def get_albert_layer(path, max_seq_len, name, trainable=True):
    """Create an ALBERT layer configured from ``path``.

    No weights are loaded here; only the layer object is constructed.

    Args:
        path: Passed to ``bert.albert_params`` to obtain the layer parameters.
        max_seq_len: Accepted for signature compatibility; not used here.
        name: Name given to the created layer.
        trainable: Forwarded to the layer as its ``trainable`` flag.

    Returns:
        The layer built by ``bert.BertModelLayer.from_params`` with
        ``shared_layer=True``.
    """
    return bert.BertModelLayer.from_params(
        bert.albert_params(path),
        name=name,
        shared_layer=True,
        trainable=trainable,
    )
    
def get_model(path, max_seq_len, category_map, albert_trainable=True, init=True):
    """Build and compile an ALBERT-based softmax classifier.

    The network is: int32 token ids -> ALBERT layer -> vector at sequence
    position 0 -> dense softmax over the categories.

    Args:
        path: Checkpoint directory; used for the ALBERT parameters and, when
            ``init`` is true, for the ``model.ckpt-best`` weights. A trailing
            separator is optional.
        max_seq_len: Length of the input token sequences.
        category_map: Collection whose length gives the number of output
            classes.
        albert_trainable: Whether the ALBERT layer is trainable.
        init: When true, load the pretrained ALBERT weights from ``path``.

    Returns:
        The compiled ``keras.Model``.
    """
    l_input_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="input_tokens")
    l_albert = get_albert_layer(path, max_seq_len, "albert", albert_trainable)
    l_middle = l_albert(l_input_ids)
    # Keep only the output at sequence position 0 as the sentence representation.
    cls_embed = keras.layers.Lambda(lambda seq: seq[:, 0, :], name="cls_selector")(l_middle)

    num_classes = len(category_map)
    l_output = keras.layers.Dense(num_classes, activation='softmax', name="classifier")(cls_embed)

    model = keras.Model(inputs=l_input_ids, outputs=l_output)
    opt = keras.optimizers.Adam(learning_rate=1e-5, beta_1=0.9, beta_2=0.999, amsgrad=True)
    # Sparse loss/metric: the labels are integer class indices, not one-hot vectors.
    model.compile(optimizer=opt,
                  loss="sparse_categorical_crossentropy",
                  metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")],
    )

    # Load the pretrained weights. os.path.join keeps the checkpoint prefix
    # valid whether or not `path` ends with a separator.
    if init:
        bert.load_albert_weights(l_albert, os.path.join(path, "model.ckpt-best"))

    return model

# Build the classifier for the selected checkpoint; the number of output
# classes is taken from the size of cat2index.
model = get_model(path=model_path, max_seq_len=max_seq_len, category_map=cat2index)
model.summary()

Loading google-research/ALBERT weights...
Done loading 22 BERT weights from: pretrained/csbase3_ckpt/model.ckpt-best into <bert.model.BertModelLayer object at 0x7fc98436bee0> (prefix:albert_3). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from checkpoint: 
	bert/embeddings/LayerNorm/beta/adam_m
	bert/embeddings/LayerNorm/beta/adam_v
	bert/embeddings/LayerNorm/gamma/adam_m
	bert/embeddings/LayerNorm/gamma/adam_v
	bert/embeddings/position_embeddings/adam_m
	bert/embeddings/position_embeddings/adam_v
	bert/embeddings/token_type_embeddings
	bert/embeddings/token_type_embeddings/adam_m
	bert/embeddings/token_type_embeddings/adam_v
	bert/embeddings/word_embeddings/adam_m
	bert/embeddings/word_embeddings/adam_v
	bert/encoder/embedding_hidden_mapping_in/bias/adam_m
	bert/encoder/embedding_hidden_mapping_in/bias/adam_v
	bert/encoder/embedding_hidden_mapping_in/kernel/adam_m
	bert/encoder/embedding_hidden_mapping_in/kernel/adam

##  Train model
Because the dataset is so small, the model does not have enough data to train properly.

In [49]:
# Training hyper-parameters for this run.
EPOCHS = 6
VALIDATION_SPLIT = 0.2  # fraction of the samples held out for validation

# NOTE(review): Keras takes the validation fraction from the end of x/y
# before shuffling. If the CSV is ordered by category, the held-out part may
# not be representative. TODO confirm the row order of the dataset.
hist = model.fit(
    x=x,
    y=y,
    batch_size=batch_size,
    epochs=EPOCHS,
    validation_split=VALIDATION_SPLIT,
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
