# Imports

In [1]:
import tensorflow as tf
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, DataCollatorWithPadding, pipeline
from datasets import load_dataset
from huggingface_hub import notebook_login
from pprint import pprint
from sklearn.metrics import accuracy_score

# Pretrained model loading

In [3]:
model_id = "camembert-base"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = TFAutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

model.summary()

Metal device set to: Apple M1 Pro


2022-10-07 11:57:19.887935: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-10-07 11:57:19.888266: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
All model checkpoint layers were used when initializing TFCamembertForSequenceClassification.

Some layers of TFCamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_camembert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFRobertaMainLayer  multiple                 110031360 
 )                                                               
                                                                 
 classifier (TFRobertaClassi  multiple                 592130    
 ficationHead)                                                   
                                                                 
Total params: 110,623,490
Trainable params: 110,623,490
Non-trainable params: 0
_________________________________________________________________


# Data loading

In [23]:
dataset = load_dataset("allocine")

dataset

Found cached dataset allocine (/Users/sofiene.alouini/.cache/huggingface/datasets/allocine/allocine/1.0.0/ea86b1dc05eae3a45a07b6281f2d4033b5fe7927b1008d06aa457ca1eae660d0)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['review', 'label'],
        num_rows: 160000
    })
    validation: Dataset({
        features: ['review', 'label'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['review', 'label'],
        num_rows: 20000
    })
})

In [5]:
dataset["train"].features

{'review': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None)}

# Data preprocessing

In [24]:
tokenized_dataset = dataset.map(
    lambda example: tokenizer(example["review"], truncation=True), 
    batched=True
)

tokenized_dataset

  0%|          | 0/160 [00:00<?, ?ba/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['review', 'label', 'input_ids', 'attention_mask'],
        num_rows: 160000
    })
    validation: Dataset({
        features: ['review', 'label', 'input_ids', 'attention_mask'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['review', 'label', 'input_ids', 'attention_mask'],
        num_rows: 20000
    })
})

In [25]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

tf_train_dataset = tokenized_dataset["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=["label"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=32,
)

tf_validation_dataset = tokenized_dataset["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=32,
)

You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


# Model fine-tuning

In [26]:
for i, batch in enumerate(tf_train_dataset.take(2)):
    print(f"\n\n=== Batch {i+1} ===\n")
    print(batch)
    print()
    print("Input IDs shape:", batch[0]["input_ids"].shape)
    print("Attention mask shape:", batch[0]["attention_mask"].shape)
    print("Targets shape:", batch[1].shape)



=== Batch 1 ===

({'input_ids': <tf.Tensor: shape=(32, 453), dtype=int64, numpy=
array([[    5,   148,   492, ...,     1,     1,     1],
       [    5,  2812,  2812, ...,     1,     1,     1],
       [    5,   211,    16, ...,     1,     1,     1],
       ...,
       [    5,  4012,  2722, ...,     1,     1,     1],
       [    5, 13156,  1187, ...,     1,     1,     1],
       [    5,    29,    44, ...,     1,     1,     1]])>, 'attention_mask': <tf.Tensor: shape=(32, 453), dtype=int64, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>}, <tf.Tensor: shape=(32,), dtype=int64, numpy=
array([0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 0])>)

Input IDs shape: (32, 453)
Attention mask shape: (32, 453)
Targets shape: (32,)


=== Batch 2 ===

({'input_ids': <tf.Tensor: shape=

In [31]:
batch_size = 32
num_epochs = 3

num_train_steps = len(tf_train_dataset) * num_epochs

lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
)

opt = Adam(learning_rate=lr_scheduler)

In [33]:
model.roberta.trainable = False

model.compile(
    optimizer=opt,
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
)



<keras.callbacks.History at 0x7fbed0912e50>

# Model evaluation

In [34]:
tf_test_dataset = tokenized_dataset["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=32
)

In [35]:
model.evaluate(tf_test_dataset)



[0.22061826288700104, 0.9175999760627747]

# Deployment on Hugging Face hub

In [5]:
notebook_login()

Login successful
Your token has been saved to /Users/sofiene.alouini/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [6]:
tokenizer.push_to_hub("alosof/camembert-sentiment-allocine")

CommitInfo(commit_url='https://huggingface.co/alosof/camembert-sentiment-allocine/commit/48ed2b71d7b4fb4648d3fe421dee71c4faabc89f', commit_message='Upload tokenizer', commit_description='', oid='48ed2b71d7b4fb4648d3fe421dee71c4faabc89f', pr_url=None, pr_revision=None, pr_num=None)

In [40]:
model.push_to_hub("alosof/camembert-sentiment-allocine")

# Testing the deployed model

In [7]:
finetuned_tokenizer = AutoTokenizer.from_pretrained("alosof/camembert-sentiment-allocine")
finetuned_model = TFAutoModelForSequenceClassification.from_pretrained("alosof/camembert-sentiment-allocine")

Downloading:   0%|          | 0.00/522 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/811k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/354 [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFCamembertForSequenceClassification.

All the layers of TFCamembertForSequenceClassification were initialized from the model checkpoint at alosof/camembert-sentiment-allocine.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCamembertForSequenceClassification for predictions without further training.


In [44]:
y_pred = finetuned_model.predict(tf_test_dataset, verbose=True)



In [55]:
y_true = dataset["test"]["label"]

accuracy_score(
    y_true=y_true, 
    y_pred=y_pred["logits"].argmax(axis=1)
)

0.9176

In [66]:
dataset["test"].features

{'review': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None)}

In [8]:
pipe = pipeline("sentiment-analysis", model=finetuned_model, tokenizer=finetuned_tokenizer)

In [9]:
pipe("Pire navet de l'histoire du cinéma !")

[{'label': 'LABEL_0', 'score': 0.9391865134239197}]

In [10]:
pipe("C'est sans conteste le film de l'année")

[{'label': 'LABEL_1', 'score': 0.6549316048622131}]