# Entraînement

In [1]:
from glob import glob

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import backend as K
from transformers import AutoTokenizer

from utils.metrics import Report, Save
from utils.models import (
    RobertaClassifier,
    ElectraClassifierPL,
    T5Generator,
)

## Données
Les données après pré-traitement sont disponibles [sur ce lien](https://drive.google.com/drive/folders/1QyPvtM-cVdtwztnyWsLMFQAT8Bejv0nd?usp=sharing):

- `test_data`: les données pour la soumission
- `train_base_data`: les données fournies lors de la compétition
- `dev_data`: les données de validation (Hold-out)
- `train_aug_data`: les données d'entraînement augmentées par BBT et Gender Swap
- `train_lda_data`: les données d'entraînement avec le label 29
- `train_aug_lda_data`: les données d'entraînement augmentées avec le label 29

In [2]:
# Folder holding the pre-processed JSON files, relative to this notebook.
data_path = "../data/"

# Read both frames and store the labels as 16-bit integers in one step.
# Note that the hold-out dev file is the one bound to the name `test`.
train = pd.read_json(data_path + "train_aug_data.json").astype({"Category": np.int16})
test = pd.read_json(data_path + "dev_data.json").astype({"Category": np.int16})

# Stratified 80/20 split of the training frame into train / validation parts.
train, val = train_test_split(
    train,
    stratify=train.Category,
    test_size=.2,
    random_state=42069,
)

## RoBERTa
Exemple d'entraînement pour RoBERTa  
Les poids sont disponibles [sur ce lien](https://drive.google.com/drive/folders/1-jiZxzFmozyexvm3vcRVxJTNb-qnLVM8?usp=sharing):

- `robLnli_al`: RoBERTa NLI sur les données augmentées
- `robL_al`: RoBERTa Large sur les données augmentées

Pour entraîner avec d'autres seeds (pour le seed averaging), changer la variable `SEED`.

In [3]:
# Run configuration for the RoBERTa classifier.
AUTO = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 64
MAX_LEN = 190    # every description is padded / truncated to this many tokens
N_LABELS = 28
BUFFER = 300000  # size of the tf.data shuffle buffer
SEED = 1
MODEL_N = "robLnli_al"
NTRAIN = train.shape[0]
NVAL = val.shape[0]
# Number of batches needed to cover each split once (last batch may be partial).
STEPS = int(np.ceil(NTRAIN/BATCH_SIZE))
VAL_STEPS = int(np.ceil(NVAL/BATCH_SIZE))

# Apply SEED to the NumPy and TensorFlow global generators. Without this the
# variable had no effect, although it is the knob used for seed averaging.
np.random.seed(SEED)
tf.random.set_seed(SEED)

print("Total Steps:", STEPS)
print("Total Validation Steps:", VAL_STEPS)

Total Steps: 2803
Total Validation Steps: 701


Chargement des modèles

In [4]:
# The classifier and its tokenizer are both read from the same local directory.
checkpoint_dir = f"cache/{MODEL_N}"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = RobertaClassifier.from_pretrained(checkpoint_dir)

All model checkpoint layers were used when initializing RobertaClassifier.

All the layers of RobertaClassifier were initialized from the model checkpoint at cache/robLnli_al.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaClassifier for predictions without further training.


Définition de la fonction de perte, de la métrique et de l'optimiseur.  
Compilation du modèle

In [5]:
# Sparse loss / metric: targets are integer class ids, not one-hot vectors.
# NOTE(review): from_logits=False assumes RobertaClassifier already outputs
# probabilities -- confirm against utils.models.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = False)
# `learning_rate` is the supported keyword; the `lr` alias is deprecated and
# rejected by recent Keras releases.
optimizer = tf.keras.optimizers.Adam(learning_rate = 1.5e-5)
metrics = tf.keras.metrics.SparseCategoricalAccuracy(name = 'accuracy')

model.compile(
    optimizer = optimizer,
    loss = loss,
    metrics = [metrics]
)

Mise en forme des données

In [6]:
def _encode_descriptions(frame):
    """
    Tokenize the `description` column of `frame`.

    frame: DataFrame with a `description` column of texts.
    :return: the tokenizer's 'input_ids' tensor, every row padded or
        truncated to MAX_LEN tokens (no attention mask is requested).
    """
    return tokenizer.batch_encode_plus(
        frame.description.to_list(),
        truncation=True,
        return_tensors='tf',
        max_length=MAX_LEN,
        return_attention_mask=False,
        padding="max_length")['input_ids']


# Inputs: one shared helper instead of three copies of the same call.
x_train = _encode_descriptions(train)
x_val = _encode_descriptions(val)
x_test = _encode_descriptions(test)

# Targets: integer class ids as int32 tensors.
y_train = K.constant(train.Category, dtype = tf.int32)
y_val = K.constant(val.Category, dtype = tf.int32)
y_test = K.constant(test.Category, dtype = tf.int32)

# Training pipeline. Shuffle BEFORE repeat: with repeat first, the shuffle
# buffer mixes samples from consecutive passes, so one epoch can contain
# duplicates while missing other samples. The stream is infinite, so `fit`
# must be given `steps_per_epoch`.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .shuffle(BUFFER)
    .repeat()
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Validation pipeline: single pass, original order.
val_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_val, y_val))
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Prediction pipeline: inputs only, original order.
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(x_test)
    .batch(BATCH_SIZE)
)

Définition des callbacks

In [7]:
# Project callback from utils.metrics, pointed at weights/<MODEL_N>/ and
# monitoring the validation loss (presumably writes checkpoints -- see
# utils.metrics.Save for the exact behaviour).
save = Save(monitor = "val_loss", path = f"weights/{MODEL_N}/")

# Stop once val_loss has gone 2 epochs without improving, then roll the model
# back to the weights of its best epoch.
early = tf.keras.callbacks.EarlyStopping(
    restore_best_weights = True,
    patience = 2,
    monitor = 'val_loss',
)

Entraînement avec $\alpha = 1.5e-5$

In [8]:
# First training phase. `train_dataset` repeats indefinitely, so the length of
# an epoch is fixed through `steps_per_epoch`.
# `workers` / `use_multiprocessing` were removed: Keras only uses them for
# generator or `Sequence` inputs, not for a tf.data.Dataset, and recent Keras
# releases no longer accept them in `fit`.
epochs_done = 0
history = model.fit(
  train_dataset,
  epochs = 20,
  steps_per_epoch = STEPS,
  callbacks = [save, early],
  validation_data = val_dataset,
  initial_epoch = epochs_done
)

Epoch 1/20












Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


Entraînement avec $\alpha = 1e-6$

In [9]:
# Second training phase: recompile with a smaller learning rate and continue.
# `learning_rate` replaces the deprecated `lr` alias.
optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-6)
model.compile(
    optimizer = optimizer,
    loss = loss,
    metrics = [metrics]
)

# The first phase ran 5 epochs, so numbering resumes at epoch 6.
epochs_done = 5
# `workers` / `use_multiprocessing` were removed: Keras only uses them for
# generator or `Sequence` inputs, not for a tf.data.Dataset, and recent Keras
# releases no longer accept them in `fit`.
history = model.fit(
  train_dataset,
  epochs = 20,
  steps_per_epoch = STEPS,
  callbacks = [save, early],
  validation_data = val_dataset,
  initial_epoch = epochs_done
)

Epoch 6/20












Epoch 7/20
Epoch 8/20


Tous les poids enregistrés lors de cet entraînement se trouvent [sur ce lien](https://drive.google.com/drive/folders/1-jiZxzFmozyexvm3vcRVxJTNb-qnLVM8?usp=sharing)  
Les poids pour la soumission commencent par `PROD-*`

Prédiction

In [10]:
# Ensemble over every checkpoint directory saved for this model. Sorting makes
# the load order deterministic.
checkpoint_dirs = sorted(glob("weights/robLnli_al/*/"))
if not checkpoint_dirs:
    # Without this guard `sum([])` is 0 and the argmax below fails with an
    # unrelated axis error.
    raise FileNotFoundError("No checkpoint directory found in weights/robLnli_al/")

predictions = list()
for dir_ in checkpoint_dirs:
    model = RobertaClassifier.from_pretrained(dir_)
    pred = model.predict(test_dataset)
    predictions.append(pred)

# Add up the per-class scores of all checkpoints, then pick the best class
# for each sample.
y_pred = np.argmax(sum(predictions), axis = 1)

All model checkpoint layers were used when initializing RobertaClassifier.


L'entraînement d'`ElectraClassifierPL` s'effectue de la même manière que `RobertaClassifier`.

## T5 Generator
Comme pour `RoBERTa`, les poids que nous avons entraînés pour `T5` se trouvent [sur ce lien](https://drive.google.com/drive/folders/16qs2CGR_BXv32MkVIE93lCIaiIBOkWRp?usp=sharing):

- `t5S_al`: pour 60 millions de paramètres
- `t5B_al`: pour 220 millions de paramètres

In [11]:
# Run configuration for the T5 generator (overrides the RoBERTa values).
BATCH_SIZE = 48
SEED = 1
MODEL_N = "t5B_al"
# Step counts have to be recomputed because the batch size changed.
STEPS = int(np.ceil(NTRAIN/BATCH_SIZE))
VAL_STEPS = int(np.ceil(NVAL/BATCH_SIZE))

# Apply SEED to the NumPy and TensorFlow global generators; otherwise the
# variable has no effect on this run.
np.random.seed(SEED)
tf.random.set_seed(SEED)

Chargement des modèles

In [12]:
# The generator and its tokenizer are both read from the same local directory.
checkpoint_dir = f"cache/{MODEL_N}"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = T5Generator.from_pretrained(checkpoint_dir)

All model checkpoint layers were used when initializing T5Generator.

All the layers of T5Generator were initialized from the model checkpoint at cache/t5B_al.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5Generator for predictions without further training.


Définition de la métrique et de l'optimiseur.  
Redéfinition de `save`  
Compilation du modèle

In [13]:
# Re-create the checkpoint callback so that it points at the T5 weight folder.
save = Save(path = f"weights/{MODEL_N}/", monitor = "val_loss")

# `learning_rate` is the supported keyword; the `lr` alias is deprecated and
# rejected by recent Keras releases.
optimizer = tf.keras.optimizers.Adam(learning_rate = 1.5e-5)
# NOTE(review): this is a top-k metric (Keras default k) reported under the
# name 'accuracy' -- confirm that this is the intended measure.
metrics = tf.keras.metrics.SparseTopKCategoricalAccuracy(name = 'accuracy')

# No `loss` is passed here; presumably T5Generator computes its own loss --
# verify in utils.models.
model.compile(optimizer = optimizer, metrics=[metrics])

Fonction de formatage de données pour notre modèle T5

In [14]:
def generate_dataset(df):
    """
    Format data for T5 Generator.

    df: DataFrame with a `description` column (input text) and a `Category`
        column (target label, converted to text before tokenization).
    :return: shuffled, batched and infinitely repeating tf.data.Dataset of
        dicts with the keys 'input_ids' and 'labels'. Because the dataset
        never ends, `fit` needs `steps_per_epoch` / `validation_steps`.
    """
    descriptions = df.description.to_list()
    # The tokenizer needs text targets, but `Category` is cast to int16 when
    # the data is loaded. Casting to str fixes that and leaves labels that
    # are already strings unchanged.
    targets = df.Category.astype(str).to_list()

    x = tokenizer.batch_encode_plus(
      descriptions, truncation=True,
      return_attention_mask = False,
      return_tensors='tf', max_length=MAX_LEN,
      padding = "max_length")
    # Targets are padded / truncated to 4 tokens.
    y = tokenizer.batch_encode_plus(
      targets, truncation=True,
      return_attention_mask = False,
      return_tensors='tf', max_length=4,
      padding = "max_length")
    data = {
        'input_ids': x['input_ids'],
        'labels': y['input_ids']
    }
    # Same elements and order as shuffle -> batch -> prefetch -> repeat, but
    # prefetch is kept last so it can also overlap the restart of each pass.
    dataset = (
      tf.data.Dataset
      .from_tensor_slices(data)
      .shuffle(BUFFER)
      .batch(BATCH_SIZE)
      .repeat()
      .prefetch(AUTO)
    )
    return dataset

###########################################
# Infinite, shuffled training and validation streams for the T5 model.
train_dataset, val_dataset = generate_dataset(train), generate_dataset(val)

# Inputs used for generation: fixed-length token ids, no attention mask.
x_test = tokenizer.batch_encode_plus(
    test.description.to_list(),
    padding = "max_length",
    max_length=MAX_LEN,
    truncation=True,
    return_attention_mask = False,
    return_tensors='tf',
)['input_ids']

# Reference labels of the same frame, as a plain Python list.
y_test = test.Category.to_list()

Entraînement avec $\alpha = 1.5e-5$

In [15]:
# Train the T5 generator. Both datasets repeat indefinitely, so the number of
# batches per epoch is given explicitly for training and for validation.
# `workers` / `use_multiprocessing` were removed: Keras only uses them for
# generator or `Sequence` inputs, not for a tf.data.Dataset, and recent Keras
# releases no longer accept them in `fit`.
epochs_done = 0
history = model.fit(
    train_dataset,
    epochs = 20,
    steps_per_epoch = STEPS,
    callbacks = [save, early],
    validation_data = val_dataset,
    validation_steps = VAL_STEPS,
    initial_epoch = epochs_done
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20

Tous les poids enregistrés lors de cet entraînement se trouvent [sur ce lien](https://drive.google.com/drive/folders/1-q6fTOJmEVtETv30JqbiIHTiS9UDJFWm?usp=sharing)

Prédiction (ou plutôt génération)

In [16]:
# Generate labels with every saved T5 checkpoint. The directories are sorted
# so that predictions[i] always corresponds to the same checkpoint.
predictions = list()
for dir_ in sorted(glob("weights/t5B_al/*/")):
    model = T5Generator.from_pretrained(dir_)
    # NOTE(review): the meaning of the second argument (128) is defined by
    # T5Generator.batch_generate -- presumably a batch size; confirm.
    pred = model.batch_generate(x_test, 128)
    # Convert the generated token ids back to text, one string per sample.
    # NOTE(review): special tokens are not stripped here -- confirm that the
    # downstream comparison expects that.
    pred = tokenizer.batch_decode(pred.numpy().tolist())
    predictions.append(pred)

All model checkpoint layers were used when initializing T5Generator.
