# About this notebook
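Fine-tunes a pretrained `roberta-large` model on TPU for a 28-class text classification task (the `nlpchallenge` dataset). After each training epoch the model predicts on the test set; the per-epoch class probabilities are summed and saved to a `.npy` file.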

In [1]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split

import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

transformers.set_seed(12)



In [2]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """
    Encode `texts` chunk by chunk with a `tokenizers` tokenizer and return the ids.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Args:
        texts: Sliceable sequence of strings (list, numpy array, pandas Series, ...).
        tokenizer: Object exposing `enable_truncation`, `enable_padding` and
            `encode_batch`. Note that truncation/padding are enabled on the
            tokenizer itself, so the object passed in is modified.
        chunk_size: Number of texts handed to `encode_batch` per call.
        maxlen: Length every sequence is truncated / padded to.

    Returns:
        np.ndarray built from the `ids` of every encoding, in input order.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    # Recent `tokenizers` releases name the fixed padding length `length`;
    # older ones used `max_length`. Try the current name first.
    try:
        tokenizer.enable_padding(length=maxlen)
    except TypeError:
        tokenizer.enable_padding(max_length=maxlen)
    all_ids = []

    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i+chunk_size]
        # Arrays / Series expose .tolist(); plain lists and tuples do not.
        if hasattr(text_chunk, 'tolist'):
            text_chunk = text_chunk.tolist()
        else:
            text_chunk = list(text_chunk)
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend(enc.ids for enc in encs)

    return np.array(all_ids)

def regular_encode(texts, tokenizer, maxlen=512):
    """
    Tokenize `texts` with a Hugging Face tokenizer and return the input ids.

    Every sequence is padded and truncated to exactly `maxlen` tokens so the
    result can be stacked into a rectangular array. Attention masks and token
    type ids are not requested.

    NOTE: uses the `padding` / `truncation` keywords, which replaced the
    deprecated `pad_to_max_length` flag (transformers >= 3.0).

    Args:
        texts: List of strings to encode.
        tokenizer: Object exposing `batch_encode_plus`.
        maxlen: Target length of every encoded sequence.

    Returns:
        np.ndarray built from the `input_ids` entry of the encoding.
    """
    enc_di = tokenizer.batch_encode_plus(
        texts,
        return_attention_mask=False,
        return_token_type_ids=False,
        padding='max_length',  # adjust depending on the model
        truncation=True,  # without this, over-long texts may yield ragged rows
        max_length=maxlen,
    )

    return np.array(enc_di['input_ids'])

def build_model(transformer, max_len=512, n_classes=28, learning_rate=1.5e-5):
    """
    Wrap a transformer in a Keras classifier and compile it.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    The model takes a single int32 input of token ids, runs it through
    `transformer`, keeps the vector at sequence position 0 of the first
    output, and feeds it to a softmax `Dense` layer.

    NOTE(review): only token ids are passed to `transformer` (no attention
    mask), so padded positions are presumably treated like real tokens —
    confirm this is intended.

    Args:
        transformer: Callable Keras-compatible model; element `[0]` of its
            output is indexed as (batch, position, features).
        max_len: Length of the input id sequences.
        n_classes: Number of units in the output softmax layer.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled `Model` (sparse categorical cross-entropy loss,
        accuracy metric).
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Use the representation of the first token as the sequence summary.
    cls_token = sequence_output[:, 0, :]
    out = Dense(n_classes, activation='softmax')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(
        Adam(learning_rate=learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [3]:
#def create_model():
#  model = TFRobertaForSequenceClassification.from_pretrained('roberta-large', num_labels=28)
#  model.compile(optimizer=tf.keras.optimizers.Adam(1.5e-5), 
#              loss=model.compute_loss, 
#              metrics=['accuracy'])
#  return model

# TPU config

In [4]:
# Pick the tf.distribute strategy that matches the attached hardware.
try:
    # The resolver needs no arguments when the TPU_NAME environment variable
    # is set, which is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # The resolver raised ValueError: carry on without a TPU.
    tpu = None

if not tpu:
    # Default TensorFlow strategy; works on CPU and on a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster, initialise it, then build the TPU strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


In [5]:
# Let tf.data choose the prefetch buffer size.
AUTO = tf.data.experimental.AUTOTUNE

# Data access
PATH = '../input/nlpchallenge/'

# Configuration
EPOCHS = 5  # number of epochs the training cell runs (it passes epochs=5)
BATCH_SIZE = 16 * strategy.num_replicas_in_sync  # 16 samples per replica
MAX_LEN = 192  # token length every text is padded / truncated to
MODEL = 'roberta-large'  # pretrained checkpoint for tokenizer and transformer

## Create fast tokenizer

In [6]:
# Load the pretrained tokenizer for the checkpoint named by MODEL.
# The download progress bars below come from this call.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=482.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




## Load text data into memory

In [7]:
# Training descriptions as a Series indexed by 'Id'.
df = pd.read_json(PATH + 'train.json').set_index('Id')['description']

# Training categories converted to integer category codes.
# NOTE(review): texts and labels are paired by position later on; this assumes
# train.json and train_label.csv list their rows in the same order — confirm.
labels = (
    pd.read_csv(PATH + 'train_label.csv', index_col=0)['Category']
    .astype('category')
    .cat.codes
)

# Test descriptions as a plain list of strings.
test_texts = (
    pd.read_json(PATH + 'test.json')
    .set_index('Id')['description']
    .tolist()
)

In [8]:
# Plain Python lists of the training texts and their integer labels.
data_text = df.tolist()
data_label = labels.tolist()

In [9]:
%%time 

x_train = regular_encode(data_text, tokenizer, maxlen=MAX_LEN)
x_test = regular_encode(test_texts, tokenizer, maxlen=MAX_LEN)

y_train = data_label

CPU times: user 3min 21s, sys: 2.98 s, total: 3min 24s
Wall time: 3min 23s


## Build datasets objects

In [10]:
%%time
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .repeat()
    .shuffle(300000, seed=5)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(x_test)
    .batch(BATCH_SIZE)
)

CPU times: user 1.3 s, sys: 997 ms, total: 2.3 s
Wall time: 2.36 s


## Load model into the TPU

In [11]:
%%time
# Instantiate the pretrained transformer and build the classifier inside the
# distribution strategy's scope, then print the layer / parameter summary.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1634375628.0, style=ProgressStyle(descr…


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 192)]             0         
_________________________________________________________________
tf_roberta_model (TFRobertaM ((None, 192, 1024), (None 355359744 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 1024)]            0         
_________________________________________________________________
dense (Dense)                (None, 28)                28700     
Total params: 355,388,444
Trainable params: 355,388,444
Non-trainable params: 0
_________________________________________________________________
CPU times: user 1min 9s, sys: 17.6 s, total: 1min 26s
Wall time: 1min 34s


In [12]:
from tensorflow.keras.callbacks import Callback
class prediction_history(Callback):
    """
    Keras callback that records the model's predictions after every epoch.

    Args:
        dataset: Data passed to `predict` at the end of each epoch. When left
            as None, the notebook-level `test_dataset` is used.

    Attributes:
        predhis: List with one `predict` result per completed epoch, in order.
    """
    def __init__(self, dataset=None):
        # Let the Keras base class set up its own state first.
        super().__init__()
        self.dataset = dataset
        self.predhis = []

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the configured dataset and append the result to `predhis`."""
        dataset = test_dataset if self.dataset is None else self.dataset
        # `self.model` is the model being fitted, attached by Keras; using it
        # avoids depending on a global variable named `model`.
        self.predhis.append(self.model.predict(dataset, verbose=1))

# Train Model

In [13]:
# One pass over the training rows per epoch (the training dataset repeats).
n_steps = len(x_train) // BATCH_SIZE

# Record test-set predictions at the end of every epoch.
callbacks = [prediction_history()]

train_history = model.fit(
    train_dataset,
    epochs=5,
    steps_per_epoch=n_steps,
    callbacks=callbacks,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [14]:
#n_steps = x_train.shape[0] // BATCH_SIZE
#train_history = model.fit(
#    train_dataset,
#    steps_per_epoch=n_steps,
#    epochs=1
#    )
#y_prob_somme = model.predict(test_dataset, verbose=1)
#for i in range(4):
#    n_steps = x_train.shape[0] // BATCH_SIZE
#    train_history = model.fit(
#        train_dataset,
#        steps_per_epoch=n_steps,
#        epochs=1
#    )
#    y_prob = model.predict(test_dataset, verbose=1)
#    y_prob_somme = y_prob_somme + y_prob

In [15]:
# Add up the per-epoch test-set predictions recorded by the callback
# ("somme" is French for "sum").
y_prob_somme = sum(callbacks[0].predhis)

# Save test-set predictions

In [16]:
# Write the summed test-set probabilities to disk in NumPy's .npy format.
np.save('roberta-large-sub-125.npy', y_prob_somme)