In [None]:
import re

import pandas as pd

# read data: lecture transcripts
DATA_PATH = ""
lectures = pd.read_csv(DATA_PATH + 'lectures_texts.csv')

# Pre-process the transcripts: strip HTML tags.
# Missing transcripts are turned into empty strings first; calling str() on a
# NaN would otherwise inject the literal word "nan" into the training corpus.
lectures['transcript'] = (
    lectures['transcript']
    .fillna('')
    .astype(str)
    .apply(lambda x: re.sub('<[^<]+?>', '', x))
)

# Replace line breaks with a single space instead of deleting them, so the
# last word of one line is not glued to the first word of the next and a
# sentence that ends at a line break is still followed by a space (which the
# sentence splitter below relies on).
lectures = lectures.replace(r'[\r\n]+', ' ', regex=True)

# join all text together
text = ' '.join(lectures['transcript'].tolist())

# split into sentences after terminal punctuation; drop blank fragments so
# they are not counted as training samples
subs = [s for s in re.split('(?<=[.!?]) +', text) if s.strip()]

In [None]:
# MLM pre-training code adapted from https://www.kaggle.com/riblidezso/finetune-xlm-roberta-on-jigsaw-test-data-with-mlm
import numpy as np
import tensorflow as tf
from transformers import TFAutoModelWithLMHead, AutoTokenizer
import logging

# Training configuration for the masked-language-model fine-tuning run.
MAX_LEN = 128  # passed as `maxlen` to regular_encode: every sentence is padded/truncated to this many tokens
BATCH_SIZE = 32  # per TPU core
EPOCHS = 24  # multiplier used when deriving TOTAL_STEPS from the number of sentences
LR = 1e-5  # learning rate handed to the Adam optimizer
PRETRAINED_MODEL = 'distilbert-base-uncased'  # checkpoint name used for both the tokenizer and the LM-head model


# no extensive logging
logging.basicConfig(level=logging.ERROR)
# Sentinel passed to Dataset.prefetch in create_dist_dataset.
AUTO = tf.data.experimental.AUTOTUNE


def connect_to_TPU():
    """Pick a distribution strategy for whatever hardware is available.

    A TPU cluster resolver is attempted first; if constructing it raises
    ``ValueError`` the default TensorFlow strategy is used instead.

    Returns:
        tuple: ``(tpu, strategy, global_batch_size)`` where ``tpu`` is the
        cluster resolver (or ``None`` when no TPU was found), ``strategy`` is
        the chosen distribution strategy and ``global_batch_size`` equals
        ``BATCH_SIZE`` times the number of replicas in sync.
    """
    try:
        # The resolver needs no arguments when the TPU_NAME environment
        # variable is set, which is always the case on Kaggle.
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', resolver.master())
    except ValueError:
        resolver = None

    if not resolver:
        # TensorFlow's default strategy; works on CPU and on a single GPU.
        strategy = tf.distribute.get_strategy()
    else:
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.TPUStrategy(resolver)

    return resolver, strategy, BATCH_SIZE * strategy.num_replicas_in_sync


# Resolve the hardware strategy once; everything below shares these globals.
tpu, strategy, global_batch_size = connect_to_TPU()
print("REPLICAS: ", strategy.num_replicas_in_sync)

# Derive the training budget from the number of sentences.
total_samples = len(subs)
# Both values are clamped to at least 1. When the corpus is smaller than one
# global batch the plain division truncates to 0, which would make
# `step % evaluate_every` raise ZeroDivisionError in train_mlm and would make
# its `step == total_steps` stop condition unreachable.
TOTAL_STEPS = max(1, int((total_samples / global_batch_size) * EPOCHS))
EVALUATE_EVERY = max(1, int(total_samples / global_batch_size))


def regular_encode(texts, tokenizer, maxlen=512):
    """Tokenize ``texts`` and return their input ids as a NumPy array.

    Args:
        texts: Sequence of strings to encode.
        tokenizer: Object exposing ``batch_encode_plus``.
        maxlen: Value forwarded as ``max_length``; sequences are padded to
            this length and truncated beyond it.

    Returns:
        ``np.ndarray`` built from the ``'input_ids'`` entry of the encoding.
        Attention masks and token type ids are not requested.
    """
    encode_options = dict(
        return_attention_mask=False,
        return_token_type_ids=False,
        padding='max_length',
        max_length=maxlen,
        truncation=True,
    )
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)
    input_ids = encoded['input_ids']
    return np.array(input_ids)


# Load the tokenizer that matches PRETRAINED_MODEL, then encode every sentence
# in `subs` into a row of MAX_LEN token ids (padded / truncated).
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
X_train_mlm = regular_encode(subs, tokenizer, maxlen=MAX_LEN)


def prepare_mlm_input_and_labels(X, tokenizer, mask_prob=0.15, special_token_cutoff=998):
    """Build BERT-style masked-language-model inputs and labels.

    A fraction ``mask_prob`` of the eligible positions is selected as
    prediction targets. Of the selected positions roughly 80% are replaced by
    the mask token, 10% by a random regular token, and 10% are left unchanged.

    Args:
        X: Integer ``np.ndarray`` of token ids. It is not modified.
        tokenizer: Object exposing ``mask_token_id`` and ``vocab_size``.
        mask_prob: Probability of selecting an eligible position as a target.
        special_token_cutoff: Ids ``<=`` this value are treated as
            special/unused tokens: they are never selected as targets and are
            never drawn as random replacements.

    Returns:
        tuple: ``(X_mlm, labels)``. ``X_mlm`` is a corrupted copy of ``X``;
        ``labels`` holds the original id at every selected position and ``-1``
        (meaning "ignore") everywhere else.
    """
    mask_id = tokenizer.mask_token_id
    # Random replacements are drawn from [first_regular_id, vocab_size).
    # Starting one past the cutoff keeps the replacement range consistent with
    # the "special token" test below; the upper bound is exclusive in
    # np.random.randint, so passing vocab_size covers the whole vocabulary.
    first_regular_id = special_token_cutoff + 1

    # Select target positions, never touching special/unused tokens.
    selected = np.random.rand(*X.shape) < mask_prob
    selected[X <= special_token_cutoff] = False

    # -1 marks positions the loss must ignore.
    labels = -1 * np.ones(X.shape, dtype=int)
    labels[selected] = X[selected]

    X_mlm = np.copy(X)
    # 90% of the selected positions are corrupted; the other 10% keep their
    # original token.
    to_corrupt = selected & (np.random.rand(*X.shape) < 0.90)
    X_mlm[to_corrupt] = mask_id

    # One ninth of the corrupted positions (10% of all selected ones) get a
    # random regular token instead of the mask token.
    to_random = to_corrupt & (np.random.rand(*X.shape) < 1 / 9)
    X_mlm[to_random] = np.random.randint(
        first_regular_id, tokenizer.vocab_size, to_random.sum())

    return X_mlm, labels


# Build the masked inputs and their labels once, up front. Because this runs a
# single time, the same masked positions are reused on every pass over the data.
# NOTE: X_train_mlm is rebound from the clean token ids to the masked ids, so
# re-running this cell without re-running the encoding cell masks already
# masked input.
X_train_mlm, y_train_mlm = prepare_mlm_input_and_labels(X_train_mlm, tokenizer)


def create_dist_dataset(X, y=None, training=False):
    """Wrap arrays in a batched, prefetched, strategy-distributed dataset.

    Args:
        X: Feature array sliced along its first axis.
        y: Optional label array; when given, elements are ``(x, y)`` pairs.
        training: When true, the data is shuffled with a buffer of ``len(X)``
            and repeated indefinitely.

    Returns:
        The dataset distributed through the module-level ``strategy``, batched
        with the module-level ``global_batch_size``.
    """
    features = tf.data.Dataset.from_tensor_slices(X)

    if y is None:
        pipeline = features
    else:
        targets = tf.data.Dataset.from_tensor_slices(y)
        pipeline = tf.data.Dataset.zip((features, targets))

    if training:
        pipeline = pipeline.shuffle(len(X))
        pipeline = pipeline.repeat()

    pipeline = pipeline.batch(global_batch_size)
    pipeline = pipeline.prefetch(AUTO)

    return strategy.experimental_distribute_dataset(pipeline)


# Training dataset of (masked ids, labels) pairs; training=True makes it
# shuffled and endlessly repeating, so the training loop must stop itself.
train_dist_dataset = create_dist_dataset(X_train_mlm, y_train_mlm, True)


def create_mlm_model_and_optimizer():
    """Instantiate the pretrained LM-head model and its Adam optimizer.

    Both objects are created inside ``strategy.scope()``.

    Returns:
        tuple: ``(model, optimizer)`` where the model is loaded from
        ``PRETRAINED_MODEL`` and the optimizer uses learning rate ``LR``.
    """
    with strategy.scope():
        pretrained_lm = TFAutoModelWithLMHead.from_pretrained(PRETRAINED_MODEL)
        adam = tf.keras.optimizers.Adam(learning_rate=LR)
        return pretrained_lm, adam


# Module-level model and optimizer; mlm_train_step reads both as globals.
mlm_model, optimizer = create_mlm_model_and_optimizer()
# Print the layer / parameter overview of the loaded model.
mlm_model.summary()


def define_mlm_loss_and_metrics():
    """Create the MLM loss function and the running-mean loss metric.

    Both are created inside ``strategy.scope()``.

    Returns:
        tuple: ``(compute_mlm_loss, train_mlm_loss_metric)`` - a callable
        taking ``(labels, predictions)`` and a ``tf.keras.metrics.Mean``.
    """
    with strategy.scope():

        def compute_mlm_loss(labels, predictions):
            """Masked cross-entropy, averaged with the global batch size."""
            token_losses = masked_sparse_categorical_crossentropy(
                labels, predictions)
            return tf.nn.compute_average_loss(
                token_losses, global_batch_size=global_batch_size)

        loss_tracker = tf.keras.metrics.Mean()

    return compute_mlm_loss, loss_tracker


def masked_sparse_categorical_crossentropy(y_true, y_pred):
    """Sparse categorical cross-entropy over non-ignored positions only.

    Positions whose label equals ``-1`` are dropped from both ``y_true`` and
    ``y_pred`` before the loss is computed. ``y_pred`` is treated as logits.

    Returns:
        The per-position loss tensor for the positions that were kept.
    """
    keep = tf.not_equal(y_true, -1)
    kept_labels = tf.boolean_mask(y_true, keep)
    kept_logits = tf.boolean_mask(y_pred, keep)
    return tf.keras.losses.sparse_categorical_crossentropy(
        kept_labels, kept_logits, from_logits=True)


def train_mlm(train_dist_dataset, total_steps=2000, evaluate_every=200):
    """Run the MLM training loop for a fixed number of steps.

    Args:
        train_dist_dataset: Iterable of batches; each one is passed to
            ``distributed_mlm_train_step``. It may repeat indefinitely.
        total_steps: Number of batches to train on. Values ``<= 0`` train on
            nothing and return immediately.
        evaluate_every: Report and reset the running train loss every this
            many steps. Values ``<= 0`` disable reporting.
    """
    # Guard first: with a repeating dataset a non-positive budget could never
    # be reached by the stop check below and the loop would not terminate.
    if total_steps <= 0:
        return

    step = 0
    ### Training loop ###
    for batch in train_dist_dataset:
        distributed_mlm_train_step(batch)
        step += 1

        # The `> 0` check avoids a modulo-by-zero when the caller derived the
        # interval from a corpus smaller than one batch.
        if evaluate_every > 0 and step % evaluate_every == 0:
            ### Print train metrics ###
            train_metric = train_mlm_loss_metric.result().numpy()
            print("Step %d, train loss: %.2f" % (step, train_metric))

            ### Reset  metrics ###
            train_mlm_loss_metric.reset_states()

        # `>=` rather than `==` so a non-integer budget still stops the loop.
        if step >= total_steps:
            break


@tf.function
def distributed_mlm_train_step(data):
    """Run one ``mlm_train_step`` on every replica of the global strategy.

    Args:
        data: One distributed batch, forwarded as the single positional
            argument of ``mlm_train_step``.
    """
    # `Strategy.experimental_run_v2` was renamed to `Strategy.run` and is
    # missing from newer TensorFlow releases - the same releases that provide
    # the `tf.distribute.TPUStrategy` used in connect_to_TPU. Prefer `run`
    # and fall back to the old name only where `run` does not exist.
    if hasattr(strategy, 'run'):
        strategy.run(mlm_train_step, args=(data,))
    else:
        strategy.experimental_run_v2(mlm_train_step, args=(data,))


@tf.function
def mlm_train_step(inputs):
    """Perform one optimisation step on a single replica's batch.

    Args:
        inputs: ``(features, labels)`` pair. The features are fed to the
            global ``mlm_model``; the labels go to ``compute_mlm_loss``.

    Side effects:
        Applies gradients through the global ``optimizer`` and adds the step
        loss to the global ``train_mlm_loss_metric``.
    """
    token_ids, targets = inputs

    with tf.GradientTape() as tape:
        outputs = mlm_model(token_ids, training=True)
        # The first element of the model output is used as the predictions.
        logits = outputs[0]
        step_loss = compute_mlm_loss(targets, logits)

    variables = mlm_model.trainable_variables
    grads = tape.gradient(step_loss, variables)
    optimizer.apply_gradients(zip(grads, variables))

    train_mlm_loss_metric.update_state(step_loss)


# Bind the loss function and the running-loss metric as module-level globals;
# mlm_train_step and train_mlm look both names up when they run.
compute_mlm_loss, train_mlm_loss_metric = define_mlm_loss_and_metrics()

In [None]:
# Train for TOTAL_STEPS batches, printing the running train loss every
# EVALUATE_EVERY steps.
train_mlm(train_dist_dataset,TOTAL_STEPS , EVALUATE_EVERY)
# Save the fine-tuned model with save_pretrained.
# NOTE(review): the target is a hard-coded absolute path whose name mentions
# "24_epochs"; it does not follow the EPOCHS constant if that is changed.
mlm_model.save_pretrained('/data/distilbert_uncased_24_epochs')