In [1]:
import numpy as np
import pandas as pd

# Load Huggingface transformers
#from transformers import TFBertModel,  BertConfig, BertTokenizerFast
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import (BertTokenizer, BertForSequenceClassification, TFBertForSequenceClassification,
                          CamembertTokenizer, CamembertForSequenceClassification, TFCamembertForSequenceClassification)

# Then what you need from tensorflow.keras
from tensorflow.keras.layers import Input, Dropout, Dense, Activation
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

import tensorflow as tf
import matplotlib.pyplot as plt

from datetime import datetime
from tqdm import tqdm_notebook
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [2]:
# Read the workbook with no header row and no index column, so every cell is data.
data1 = pd.read_excel(r'./Matric_Couverture3_evolution.xlsx', index_col=None, header=None)

# Keep only the first two sheet rows, turn them into columns, and put the
# second sheet row (column 1 after transposing) before the first one.
data = pd.DataFrame(data1).iloc[:2].T.loc[:, [1, 0]]
data.columns = ["texte", "Evolution"]

# Discard the entry at index 0 (presumably the sheet's row-label cells -- confirm
# against the workbook).
data = data.drop(index=0)
data

Unnamed: 0,texte,Evolution
1,le système affiche les boutons des réseaux soc...,0
2,"sur le logo depuis une page intérieure, le sys...",0
3,"Sur le logo depuis la page daccueil, le systè...",2
4,Si lutilisateur clique sur le bouton daccess...,0
5,Un clic sur un bouton de réseaux sociaux perme...,0
6,Le système affiche :,3
7,"Au clic sur la langue sélectionnée, le système...",0
8,"Si la langue sélectionnée est français, le sys...",0
9,"Si la langue sélectionnée est anglais, le syst...",0
10,"Si la langue sélectionnée est espagnol, le sys...",0


In [3]:
# Class distribution of the target: number of rows per distinct `Evolution` value.
data['Evolution'].value_counts()

1    535
0    436
2    156
3     30
Name: Evolution, dtype: int64

In [4]:
# Distinct target values, in order of first appearance in the column.
possible_labels = data.Evolution.unique()

# Identity mapping: every distinct Evolution value is mapped to itself.
label_dict = {possible_label: possible_label for possible_label in possible_labels}
label_dict

{0: 0, 2: 2, 3: 3, 1: 1}

In [5]:
# Derive the `label` column by passing `Evolution` through label_dict.
# With a mapping that sends each value to itself, the values are unchanged.
data['label'] = data.Evolution.replace(label_dict)
data.head()

Unnamed: 0,texte,Evolution,label
1,le système affiche les boutons des réseaux soc...,0,0
2,"sur le logo depuis une page intérieure, le sys...",0,0
3,"Sur le logo depuis la page daccueil, le systè...",2,2
4,Si lutilisateur clique sur le bouton daccess...,0,0
5,Un clic sur un bouton de réseaux sociaux perme...,0,0


In [6]:
# Load the pretrained tokenizer published under the 'camembert-base' identifier.
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

In [7]:
# Demo: split a sample French sentence into the tokenizer's subword pieces.
tokenizer.tokenize("J'aime bien faire des achats en ligne")

['▁J', "'", 'aime', '▁bien', '▁faire', '▁des', '▁achats', '▁en', '▁ligne']

In [8]:
# Demo: encode the same sentence to vocabulary ids. The result has two more
# entries than the token list because an id is added at each end of the sequence.
tokenizer.encode("J'aime bien faire des achats en ligne")

[5, 121, 11, 660, 72, 85, 20, 5503, 22, 284, 6]

In [9]:
# Load the TensorFlow CamemBERT checkpoint with a 4-class sequence-classification head.
# The load-time message reports the 'classifier' weights as newly initialised,
# so this model needs fine-tuning before its predictions mean anything.
transformers_model = TFCamembertForSequenceClassification.from_pretrained('jplu/tf-camembert-base', num_labels=4)

Some weights of the model checkpoint at jplu/tf-camembert-base were not used when initializing TFCamembertForSequenceClassification: ['lm_head']
- This IS expected if you are initializing TFCamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFCamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFCamembertForSequenceClassification were not initialized from the model checkpoint at jplu/tf-camembert-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Print the layer list and parameter counts of the loaded model.
transformers_model.summary()

Model: "tf_camembert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
roberta (TFRobertaMainLayer) multiple                  110621952 
_________________________________________________________________
classifier (TFRobertaClassif multiple                  593668    
Total params: 111,215,620
Trainable params: 111,215,620
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Encode the demo sentence and add a leading batch axis -> shape (1, sequence_length).
input_ =  tf.expand_dims(tokenizer.encode("J'aime bien faire des achats en ligne"), 0)
input_

<tf.Tensor: shape=(1, 11), dtype=int32, numpy=array([[   5,  121,   11,  660,   72,   85,   20, 5503,   22,  284,    6]])>

In [12]:
# All-ones int32 attention mask with the same (1, sequence_length) shape as input_:
# the single demo sentence has no padding, so every position is marked 1.
att_mask = tf.expand_dims(np.ones(input_.shape[1], dtype='int32'), 0)
att_mask

<tf.Tensor: shape=(1, 11), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])>

In [13]:
# Forward pass on the demo input. As the displayed output shows, the call returns
# a 1-tuple whose only element is the (1, 4) tensor of raw class scores, so
# `logits` is that tuple rather than the tensor itself.
logits = transformers_model([input_, att_mask])
logits

(<tf.Tensor: shape=(1, 4), dtype=float32, numpy=
 array([[-0.03259619,  0.01883602, -0.0169699 ,  0.00540785]],
       dtype=float32)>,)

## Pre-processing
### Tokenize text & padding

In [14]:
# NOTE: this binds a second name to the same DataFrame object (no copy is made),
# so any column later added through `df_dataset` also appears on `data`.
df_dataset = data
df_dataset.head()

Unnamed: 0,texte,Evolution,label
1,le système affiche les boutons des réseaux soc...,0,0
2,"sur le logo depuis une page intérieure, le sys...",0,0
3,"Sur le logo depuis la page daccueil, le systè...",2,2
4,Si lutilisateur clique sur le bouton daccess...,0,0
5,Un clic sur un bouton de réseaux sociaux perme...,0,0


In [15]:
# Space-delimited word count of each text. This column must be computed here:
# the histogram below reads it, and on a fresh kernel nothing else creates it.
# The .str accessor yields NaN for missing texts instead of raising.
df_dataset['sent_len'] = df_dataset['texte'].str.split(" ").str.len()

# Fixed padding/truncation length used by the tokenisation step.
# A data-driven alternative would be mean + 2 * std of `sent_len`.
# NOTE(review): `sent_len` counts words while max_seq_len is applied to subword
# tokens, so the histogram is only a rough guide for choosing this value.
max_seq_len = 35

df_dataset['sent_len'].plot.hist()
plt.axvline(x=max_seq_len, color='k', linestyle='--', label='max len')
plt.legend();  # the 'max len' label is only visible once a legend is drawn

In [16]:
from tqdm.notebook import tqdm

# Token ids of every text, each padded/truncated to exactly max_seq_len entries.
input_sequences = []
# Matching attention masks: 1 marks a real token, 0 marks a padding position,
# so that the model does not attend to the padding.
attention_masks = []

for text in tqdm(df_dataset['texte']):
    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`;
    # together with `truncation=True` every sequence ends up with the same length.
    sequence_dict = tokenizer.encode_plus(
        text,
        max_length=max_seq_len,
        padding='max_length',
        truncation=True,
    )
    input_ids = sequence_dict['input_ids']
    att_mask = sequence_dict['attention_mask']

    input_sequences.append(input_ids)
    attention_masks.append(att_mask)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1157.0), HTML(value='')))






In [17]:
# Inspect the first encoded example: the ids followed by its attention mask.
# In the printed output the trailing ids are all 1 and the mask is 0 at exactly
# those positions, i.e. they are padding.
print(input_sequences[0])
print(attention_masks[0])

[5, 16, 439, 4772, 19, 5908, 20, 1517, 1148, 3199, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [18]:
# Target vector: the `label` column as a NumPy array, in the same row order as
# the texts it was built from.
labels = df_dataset['label'].values
print(labels[0])

0


In [19]:
# 80/20 split of ids, labels and attention masks (kept aligned row by row).
# The classes are heavily imbalanced, so the split is stratified on the labels:
# each class keeps the same proportion in the train and test parts instead of
# leaving the rarest class's share of the test set to chance.
X_train, X_test, y_train, y_test, att_masks_train, att_masks_test = train_test_split(
    input_sequences,
    labels,
    attention_masks,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [20]:
# Turn each output of the train/test split into a TensorFlow constant tensor,
# rebinding the same names (inputs, then targets, then attention masks).
X_train, X_test = tf.constant(X_train), tf.constant(X_test)
y_train, y_test = tf.constant(y_train), tf.constant(y_test)
att_masks_train, att_masks_test = tf.constant(att_masks_train), tf.constant(att_masks_test)

In [21]:
# Sanity check: inputs and masks must share one shape per split, and the number
# of targets must equal the number of rows.
print(f'Train | X shape: {X_train.shape}, att_mask shape: {att_masks_train.shape}, y shape: {y_train.shape}')
print(f'Test | X shape: {X_test.shape}, att_mask shape: {att_masks_test.shape}, y shape: {y_test.shape},')

Train | X shape: (925, 35), att_mask shape: (925, 35), y shape: (925,)
Test | X shape: (232, 35), att_mask shape: (232, 35), y shape: (232,),


## Create model

In [22]:
def create_model(num_labels=4, learning_rate=1e-5, pretrained_name='jplu/tf-camembert-base'):
    """Load a pretrained CamemBERT sequence classifier and compile it for training.

    Args:
        num_labels: Number of output classes of the classification head.
        learning_rate: Step size given to the Adam optimizer.
        pretrained_name: Identifier of the pretrained checkpoint to load.

    Returns:
        The compiled TFCamembertForSequenceClassification model. It is compiled
        with sparse categorical cross-entropy on raw logits (integer class
        targets) and reports the 'accuracy' metric.
    """
    model = TFCamembertForSequenceClassification.from_pretrained(pretrained_name, num_labels=num_labels)

    # from_logits=True: the loss applies the softmax itself to the raw scores.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    model.compile(loss=loss,
                  optimizer=opt,
                  metrics=['accuracy'])

    return model

In [23]:
# Build a fresh compiled classifier for training and print its layer summary.
model = create_model()
model.summary()

Some weights of the model checkpoint at jplu/tf-camembert-base were not used when initializing TFCamembertForSequenceClassification: ['lm_head']
- This IS expected if you are initializing TFCamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFCamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFCamembertForSequenceClassification were not initialized from the model checkpoint at jplu/tf-camembert-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_camembert_for_sequence_classification_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
roberta (TFRobertaMainLayer) multiple                  110621952 
_________________________________________________________________
classifier (TFRobertaClassif multiple                  593668    
Total params: 111,215,620
Trainable params: 111,215,620
Non-trainable params: 0
_________________________________________________________________


## Training

In [24]:
# Baseline: evaluate the not-yet-fine-tuned model on the held-out split, so the
# effect of training can be compared against this starting point.
loss, metric = model.evaluate([X_test, att_masks_test], y_test, batch_size=32, verbose=0)
print(f"Loss before training: {loss:.4f}, Accuracy before training: {metric:.2%}")

Loss before training: 1.3798, Accuracy before training: 38.36%


In [25]:
# Fine-tune for up to 30 epochs with batches of 32.
# NOTE(review): the test split is also passed as validation data, so the
# validation metrics are not an independent estimate -- consider a separate split.
# `history` is only bound when fit() returns; if the run is interrupted, the
# name stays undefined and any later cell that reads it will fail.
history = model.fit([X_train, att_masks_train], y_train, batch_size=32, epochs=30, validation_data=([X_test, att_masks_test], y_test))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30

KeyboardInterrupt: 

In [None]:
# Switch matplotlib to its 'ggplot' style sheet; this affects figures drawn afterwards.
plt.style.use('ggplot')

def plot_history(history):
    """Draw training vs. validation curves for accuracy and loss side by side.

    Args:
        history: Object whose `.history` mapping holds the per-epoch lists
            'accuracy', 'val_accuracy', 'loss' and 'val_loss'.

    Returns:
        None. The curves are drawn on a new 12x5 pyplot figure, with accuracy
        in the left panel and loss in the right panel.
    """
    metrics = history.history
    # Look up all four series before any drawing happens.
    panels = (
        (metrics['accuracy'], metrics['val_accuracy'],
         'Training acc', 'Validation acc', 'Training and validation accuracy'),
        (metrics['loss'], metrics['val_loss'],
         'Training loss', 'Validation loss', 'Training and validation loss'),
    )
    # Epochs are numbered from 1 on the x axis.
    epochs = range(1, len(panels[0][0]) + 1)

    plt.figure(figsize=(12, 5))
    for position, (train_values, val_values, train_label, val_label, heading) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, train_values, 'b', label=train_label)
        plt.plot(epochs, val_values, 'r', label=val_label)
        plt.title(heading)
        plt.legend()

In [None]:
# Plot the recorded learning curves. This needs `history` to be bound, which
# only happens when the model.fit() call has run to completion.
plot_history(history)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y_pred = model.predict([X_test, att_masks_test])

# Depending on the transformers version, predict() returns a 1-tuple holding the
# logits, a dict-like output with a 'logits' entry, or the logits array itself.
# Normalise to the (n_samples, n_classes) score array before taking the argmax.
if isinstance(y_pred, dict):
    logits = y_pred['logits']
elif isinstance(y_pred, (tuple, list)):
    logits = y_pred[0]
else:
    logits = getattr(y_pred, 'logits', y_pred)

# Compute the predicted classes once, as a 1-D vector aligned with y_test.
y_true = np.asarray(y_test)
y_pred_labels = np.argmax(logits, axis=-1)

print(confusion_matrix(y_true, y_pred_labels))
print(classification_report(y_true, y_pred_labels))
print(accuracy_score(y_true, y_pred_labels))

# Hugging Face TF models are subclassed Keras models, which Keras cannot save
# as a single HDF5 file with model.save('*.h5'). save_pretrained() writes the
# weights and config into a directory that from_pretrained() can reload.
model.save_pretrained('nom_model')