<a href="https://colab.research.google.com/github/amaelbogne/huggingface-transformers/blob/main/learning_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Hugging Face libraries used below: transformers (with the
# sentencepiece extra) and datasets.
!pip install transformers[sentencepiece]
!pip install datasets

In [6]:
import tensorflow as tf
import numpy as np
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from datasets import load_dataset
from tensorflow.keras.losses import SparseCategoricalCrossentropy

In [None]:
# Pretrained checkpoint name; the tokenizer and the classification model are
# both loaded from it so that they match.
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)

In [None]:
# Load the 'sst2' configuration of the 'glue' dataset; the result is indexed by
# split name ('train', 'validation', 'test') in the cells below.
raw_datasets = load_dataset('glue', 'sst2')

In [9]:
def tokenize_glue_dataset(dataset):
  """Tokenize the text column(s) of one GLUE split with the notebook's `tokenizer`.

  The text columns are taken to be every column other than 'label' and 'idx',
  in dataset order. This covers single-sentence tasks ('sentence') as well as
  pair tasks whatever their column names ('sentence1'/'sentence2',
  'question'/'sentence', 'premise'/'hypothesis', ...), instead of guessing
  from the number of columns.

  Args:
    dataset: one split, exposing `column_names` and column access by name
      (e.g. `raw_datasets['train']`).

  Returns:
    The `.data` mapping of the tokenizer output, with TensorFlow tensors
    (`return_tensors='tf'`), padded and truncated.

  Raises:
    ValueError: if the split does not contain exactly one or two text columns.
  """
  # NOTE(review): assumes the non-text columns of every GLUE config are named
  # 'label' and 'idx' -- confirm before using on a non-GLUE dataset.
  text_columns = [
      column for column in dataset.column_names if column not in ('label', 'idx')
  ]
  if len(text_columns) not in (1, 2):
    raise ValueError(
        'Expected one or two text columns, found: {}'.format(text_columns)
    )

  # One positional argument for single sentences, two for sentence pairs.
  encoded = tokenizer(
      *(dataset[column] for column in text_columns),
      padding = True,
      truncation = True,
      return_tensors = 'tf'
  )

  return encoded.data

In [None]:
# Tokenize every split once up front; keys are the split names of raw_datasets.
tokenized_datasets = {
    split_name: tokenize_glue_dataset(split_data)
    for split_name, split_data in raw_datasets.items()
}

### Training the model

In [None]:
# Baseline run: Adam with its default settings, cross-entropy computed on the
# raw logits, accuracy tracked during training.
model.compile(
    loss=SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'],
)

# Labels are read from the raw splits: the tokenized dicts only contain the
# tokenizer outputs.
model.fit(
    x=tokenized_datasets['train'],
    y=np.array(raw_datasets['train']['label']),
    validation_data=(
        tokenized_datasets['validation'],
        np.array(raw_datasets['validation']['label']),
    ),
    epochs=3,
    batch_size=8,
)

### Adding the F1 metric

The F1 metric is not available as a built-in Keras metric in the version used here, so we need to implement it by hand.

In [11]:
class F1_metric(tf.keras.metrics.Metric):
  """F1 score built on top of the Keras `Precision` and `Recall` metrics.

  `y_pred` is reduced to class indices with an argmax over axis 1 before it is
  passed to the two wrapped metrics; the result is their harmonic mean.
  """

  def __init__(self, name='f1_score', **kwargs):
    super().__init__(name=name, **kwargs)

    # The running state lives entirely in these two wrapped metrics.
    self.precision = tf.keras.metrics.Precision()
    self.recall = tf.keras.metrics.Recall()

  def update_state(self, y_true, y_pred, sample_weight=None):
    """Feed one batch of labels and model outputs to both wrapped metrics."""
    predicted_classes = tf.math.argmax(y_pred, axis=1)
    for tracked in (self.precision, self.recall):
      tracked.update_state(y_true, predicted_classes, sample_weight)

  def reset_state(self):
    """Clear the accumulated state of both wrapped metrics."""
    for tracked in (self.precision, self.recall):
      tracked.reset_state()

  def result(self):
    """Return the harmonic mean of the current precision and recall."""
    inverse_precision = 1 / self.precision.result()
    inverse_recall = 1 / self.recall.result()
    return 2 / (inverse_precision + inverse_recall)


### Improving performance

We achieve this by:

1. dividing the default learning rate (1e-3) by 20, which gives 5e-5;

2. decaying the learning rate down to 0 over the course of training.

Note: we also add the custom F1 metric when calling the `compile` method.

In [12]:
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers import Adam

batch_size = 8
num_epochs = 3

# Total number of optimizer steps over the whole run. Keras also trains on the
# final, smaller batch of each epoch, so the per-epoch count is rounded up
# (flooring would make the learning rate reach 0 before training finishes).
num_train_examples = len(tokenized_datasets['train']['input_ids'])
steps_per_epoch = (num_train_examples + batch_size - 1) // batch_size
num_train_steps = steps_per_epoch * num_epochs

# Decay the learning rate from 5e-5 down to 0 across all training steps.
lr_scheduler = PolynomialDecay(
    initial_learning_rate = 5e-5,
    end_learning_rate = 0,
    decay_steps = num_train_steps
)

opt = Adam(learning_rate = lr_scheduler)

In [None]:
# Second model loaded from the same checkpoint, trained with the decaying
# learning rate and the extra F1 metric.
model2 = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)
loss = SparseCategoricalCrossentropy(from_logits=True)

model2.compile(
    loss=loss,
    optimizer=opt,
    metrics=['accuracy', F1_metric()],
)

# Labels are read from the raw splits: the tokenized dicts only contain the
# tokenizer outputs.
model2.fit(
    x=tokenized_datasets['train'],
    y=np.array(raw_datasets['train']['label']),
    validation_data=(
        tokenized_datasets['validation'],
        np.array(raw_datasets['validation']['label']),
    ),
    epochs=num_epochs,
    batch_size=batch_size,
)

### Making predictions

In [None]:
# Logits predicted by model2 for the tokenized test split.
preds = model2.predict(tokenized_datasets['test'])['logits']
# Index of the highest logit along axis 1 = predicted class per example.
class_preds = np.argmax(preds, axis=1)
print(preds.shape, class_preds.shape)

In [None]:
# Show the predicted class index for each test example.
print(class_preds)

### Computing the metrics associated with our dataset

In [None]:
from datasets import load_metric

# The GLUE test split is distributed without gold labels (its 'label' column
# is a placeholder), so scoring against it is meaningless. Evaluate on the
# labelled validation split instead, with predictions computed here.
metric = load_metric('glue', 'sst2')
validation_logits = model2.predict(tokenized_datasets['validation'])['logits']
metric.compute(
    predictions=np.argmax(validation_logits, axis=1),
    references=raw_datasets['validation']['label'],
)

### Pushing to the hub

In [None]:
# Set up Git LFS: run the packagecloud repository setup script, install the
# git-lfs package with apt, then run the `git lfs install` setup step.
!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
!sudo apt-get install git-lfs
!git lfs install

In [None]:
# Log in to the Hugging Face Hub from the command line; presumably required
# for the push_to_hub calls further down to authenticate.
!huggingface-cli login

In [3]:
# Global git identity (e-mail and user name).
# NOTE(review): these are personal values -- replace them with your own before running.
!git config --global user.email "loicamael@gmail.com"
!git config --global user.name "amaelbogne"

In [None]:
# Push `model` (the first, baseline-trained model -- not `model2`) to the Hub
# under the name 'bert-base-cased-sst2-basic'.
model.push_to_hub('bert-base-cased-sst2-basic')

In [None]:
# Push the tokenizer under the same name as the model.
tokenizer.push_to_hub('bert-base-cased-sst2-basic')