# About this notebook
Base notebook taken from https://www.kaggle.com/xhlulu/jigsaw-tpu-xlm-roberta

In [None]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model

from sklearn.model_selection import train_test_split

import transformers
from transformers import TFAutoModel, AutoTokenizer
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

transformers.set_seed(60)

In [None]:
def regular_encode(texts, tokenizer, maxlen=512):
    """Tokenize a batch of texts into an array of token ids.

    Each text is padded, and explicitly truncated, to ``maxlen`` tokens so
    that all rows have the same length and stack into a rectangular array.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing the Hugging Face ``batch_encode_plus``
            method.
        maxlen: Number of token ids requested per text.

    Returns:
        ``np.ndarray`` built from the ``'input_ids'`` entry of the encoding,
        one row per input text.
    """
    enc_di = tokenizer.batch_encode_plus(
        list(texts),  # accept any iterable (e.g. a pandas Series)
        return_attention_mask=False,
        return_token_type_ids=False,
        padding='max_length',  # adjust depending on the model
        truncation=True,  # without it, long texts would yield ragged rows
        max_length=maxlen,
    )

    return np.array(enc_di['input_ids'])

def build_model(transformer, max_len=512, num_classes=28):
    """Build and compile a softmax classifier on top of a transformer.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Args:
        transformer: Callable model applied to the token-id input; the first
            element of its output is used as the per-token sequence output.
        max_len: Length of the integer token-id input sequence.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A compiled Keras ``Model`` mapping token ids to class probabilities,
        trained with sparse categorical cross-entropy (integer labels).
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Keep only the representation at the first sequence position.
    cls_token = sequence_output[:, 0, :]
    out = Dense(num_classes, activation='softmax')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` replaces the deprecated `lr` alias.
    model.compile(
        Adam(learning_rate=1.5e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

# TPU config

In [None]:
# Pick the distribution strategy that matches the available hardware.
try:
    # The resolver needs no arguments when the TPU_NAME environment variable
    # is set, which is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # No TPU: use TensorFlow's default strategy (CPU or a single GPU).
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# tf.data autotune constant; passed to `prefetch()` when building the training dataset.
AUTO = tf.data.experimental.AUTOTUNE

# Data access
# Directory prefix prepended to the train/test file names.
PATH = '../input/nlpchallenge/'

# Configuration
EPOCHS = 5  # number of epochs passed to `model.fit`
BATCH_SIZE = 16 * strategy.num_replicas_in_sync  # 16 examples per replica
MAX_LEN = 192  # token-sequence length used for both encoding and the model input
MODEL = 'roberta-large'  # pretrained checkpoint name for the tokenizer and the model

## Create fast tokenizer

In [None]:
# Load the pretrained tokenizer for MODEL, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)

## Load text data into memory

In [None]:
# Training descriptions, indexed by 'Id'.
df = pd.read_json(PATH + 'train.json').set_index('Id')['description']

# Training categories converted to integer category codes.
labels = (
    pd.read_csv(PATH + 'train_label.csv', index_col=0)['Category']
    .astype('category')
    .cat.codes
)

# Test descriptions as a plain Python list.
test_texts = (
    pd.read_json(PATH + 'test.json')
    .set_index('Id')['description']
    .tolist()
)

In [None]:
# Convert the training texts and their integer labels to plain lists.
data_text = df.tolist()
data_label = labels.tolist()

In [None]:
%%time 

# Token-id matrices for the training and test texts, both using MAX_LEN.
x_train = regular_encode(data_text, tokenizer, MAX_LEN)
x_test = regular_encode(test_texts, tokenizer, MAX_LEN)

# Integer class labels for the training texts.
y_train = data_label

## Build datasets objects

In [None]:
%%time
# Training pipeline: endlessly repeated, shuffled, batched and prefetched.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.repeat()
train_dataset = train_dataset.shuffle(300000)
train_dataset = train_dataset.batch(BATCH_SIZE)
train_dataset = train_dataset.prefetch(AUTO)

# Test pipeline: inputs only, batched in their original order.
test_dataset = tf.data.Dataset.from_tensor_slices(x_test)
test_dataset = test_dataset.batch(BATCH_SIZE)

## Load model into the TPU

In [None]:
%%time
# Load the pretrained transformer and build the classifier inside the
# distribution strategy's scope.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN)
# Print the layer/parameter overview of the compiled model.
model.summary()

In [None]:
from tensorflow.keras.callbacks import Callback


class prediction_history(Callback):
    """Keras callback that records predictions at the end of every epoch.

    After each epoch the model being trained predicts on a dataset and the
    result is appended to ``predhis``, giving one entry per completed epoch.

    Args:
        dataset: Dataset to predict on. When omitted, the module-level
            ``test_dataset`` is used at prediction time.
    """

    def __init__(self, dataset=None):
        # Initialise the base Callback state before adding our own.
        super().__init__()
        self.dataset = dataset
        # One prediction result per completed epoch, in epoch order.
        self.predhis = []

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the dataset and store the result for this epoch."""
        # Resolve the fallback lazily so the global only has to exist by
        # the time training runs.
        dataset = test_dataset if self.dataset is None else self.dataset
        # `self.model` is the model Keras attached to this callback.
        self.predhis.append(self.model.predict(dataset, verbose=1))

# Train Model

In [None]:
# The callback predicts the test dataset after each epoch.
callbacks = [prediction_history()]

# The training dataset repeats indefinitely, so the epoch length is given
# explicitly as the number of full batches in the training data.
n_steps = len(x_train) // BATCH_SIZE

train_history = model.fit(
    train_dataset,
    epochs=EPOCHS,
    steps_per_epoch=n_steps,
    callbacks=callbacks,
)

In [None]:
# Add up the predictions stored by the callback across all epochs.
y_prob_somme = sum(callbacks[0].predhis) # sum of all the epoch predictions

# Performance evaluation

In [None]:
# Local save of the summed predictions in NumPy's .npy format.
np.save('roberta-large-sub-60.npy', y_prob_somme)