In [1]:
#! pip install transformers tensorflow
# ! pip install tf-keras
# ! pip install sacremoses 
# ! pip install sentencepiece

from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
import tokenizers as tk
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm





# BERT multilangual

In [2]:
# Charger le tokenizer 
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Charger le modèle
model = TFBertModel.from_pretrained('bert-base-multilingual-cased')





Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [3]:
# test sur 10% des données
# récupérer les données sample_pair_indices_train.pkl
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm

train = pd.read_parquet(r"C:\Users\guigu\Desktop\Telecom\Projet Fil Rouge\data\df_train.parquet")

train = train.sample(frac=0.1, random_state=1)

#  Prétraiter le texte
train['text'] = train['title'] + ' ' + train['description'] + ' ' + train['brand'] + ' ' + train['category']
train = train.dropna(subset=['text'])

# Initialiser une liste pour stocker les embeddings
all_embeddings = []

# Diviser les données en lots et les traiter un par un
batch_size = 32  # Définir une taille de lot qui convient à votre machine
for i in tqdm(range(0, len(train), batch_size)):
    batch = train.iloc[i:i+batch_size]
    input_tokens = tokenizer(batch['text'].tolist(), padding=True, truncation=True, return_tensors='tf', max_length=256)  # Réduire la longueur max si nécessaire
    outputs = model(input_tokens)
    embeddings = outputs.last_hidden_state[:, 0, :].numpy()  # Utiliser l'embedding CLS
    all_embeddings.append(embeddings)

# Concaténer tous les embeddings de lots en une seule matrice
embeddings = np.vstack(all_embeddings)

100%|██████████| 220/220 [1:28:17<00:00, 24.08s/it] 


In [5]:
np.savetxt('./embeddings/BERT/embeddings_multilangual.txt', embeddings)

In [7]:
# régression logistique 

y = train['label']

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(embeddings, y, test_size=0.2, random_state=1)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pre = clf.predict(X_test)

accuracy_score(y_test, y_pre)

print('Accuracy:', accuracy_score(y_test, y_pre))

Accuracy: 0.22451888809693513


# BERT camembert

In [2]:
from transformers import CamembertTokenizer, TFCamembertModel

# Charger le tokenizer et le modèle CamemBERT "base"
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
model = TFCamembertModel.from_pretrained('camembert-base')


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.





Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFCamembertModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing TFCamembertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFCamembertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFCamembertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFCamembertModel for predictions without further training.


In [5]:
from tqdm import tqdm
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm

train = pd.read_parquet(r"C:\Users\guigu\Desktop\Telecom\Projet Fil Rouge\data\df_train.parquet")

train = train.sample(frac=0.1, random_state=1)

#  Prétraiter le texte
train['text'] = train['title'] + ' ' + train['description'] + ' ' + train['brand'] + ' ' + train['category']
train = train.dropna(subset=['text'])
# Initialiser une liste pour stocker les embeddings
all_embeddings = []

# Diviser les données en lots et les traiter un par un
batch_size = 32  # Définir une taille de lot qui convient à votre machine
for i in tqdm(range(0, len(train), batch_size)):
    batch = train.iloc[i:i+batch_size]
    input_tokens = tokenizer(batch['text'].tolist(), padding=True, truncation=True, return_tensors='tf', max_length=256)  # Réduire la longueur max si nécessaire
    outputs = model(input_tokens)
    embeddings = outputs.last_hidden_state[:, 0, :].numpy()  # Utiliser l'embedding CLS
    all_embeddings.append(embeddings)

# Concaténer tous les embeddings de lots en une seule matrice
embeddings = np.vstack(all_embeddings)

100%|██████████| 220/220 [31:04<00:00,  8.47s/it] 


In [6]:
np.savetxt('./embeddings/BERT/embeddings_camembert.txt', embeddings)

In [7]:
# régression logistique 

y = train['label']

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(embeddings, y, test_size=0.2, random_state=1)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pre = clf.predict(X_test)

accuracy_score(y_test, y_pre)

print('Accuracy:', accuracy_score(y_test, y_pre))

Accuracy: 0.14611546685673557


# BERT XML-RoBERTa

In [8]:
from transformers import XLMRobertaTokenizer, TFXLMRobertaModel

# Charger le tokenizer et le modèle XLM-RoBERTa "base"
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
model = TFXLMRobertaModel.from_pretrained('xlm-roberta-base')


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFXLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing TFXLMRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFXLMRobertaModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.


In [9]:
# Initialiser une liste pour stocker les embeddings
all_embeddings = []

# Diviser les données en lots et les traiter un par un
batch_size = 32  # Définir une taille de lot qui convient à votre machine
for i in tqdm(range(0, len(train), batch_size)):
    batch = train.iloc[i:i+batch_size]
    input_tokens = tokenizer(batch['text'].tolist(), padding=True, truncation=True, return_tensors='tf', max_length=256)  # Réduire la longueur max si nécessaire
    outputs = model(input_tokens)
    embeddings = outputs.last_hidden_state[:, 0, :].numpy()  # Utiliser l'embedding CLS
    all_embeddings.append(embeddings)

# Concaténer tous les embeddings de lots en une seule matrice
embeddings = np.vstack(all_embeddings)

100%|██████████| 220/220 [29:16<00:00,  7.98s/it]


In [10]:
np.savetxt('./embeddings/BERT/embeddings_roberta.txt', embeddings)

In [11]:
# régression logistique 

y = train['label']

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(embeddings, y, test_size=0.2, random_state=1)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pre = clf.predict(X_test)

accuracy_score(y_test, y_pre)

print('Accuracy:', accuracy_score(y_test, y_pre))

Accuracy: 0.10192444761225944


# FLAUBERT large

In [15]:
from transformers import FlaubertTokenizer, TFFlaubertModel

# Charger le tokenizer FlauBERT
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_large_cased')

# Charger le modèle FlauBERT en convertissant les poids de PyTorch vers TensorFlow
model = TFFlaubertModel.from_pretrained('flaubert/flaubert_large_cased', from_pt=True)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFFlaubertModel: ['pred_layer.proj.weight', 'pred_layer.proj.bias']
- This IS expected if you are initializing TFFlaubertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFFlaubertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFFlaubertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFFlaubertModel for predictions without further training.


In [16]:
# Initialiser une liste pour stocker les embeddings
all_embeddings = []

# Diviser les données en lots et les traiter un par un
batch_size = 32  # Définir une taille de lot qui convient à votre machine
for i in tqdm(range(0, len(train), batch_size)):
    batch = train.iloc[i:i+batch_size]
    input_tokens = tokenizer(batch['text'].tolist(), padding=True, truncation=True, return_tensors='tf', max_length=256)  # Réduire la longueur max si nécessaire
    outputs = model(input_tokens)
    embeddings = outputs.last_hidden_state[:, 0, :].numpy()  # Utiliser l'embedding CLS
    all_embeddings.append(embeddings)

# Concaténer tous les embeddings de lots en une seule matrice
embeddings = np.vstack(all_embeddings)

100%|██████████| 220/220 [2:04:52<00:00, 34.05s/it]   


In [17]:
np.savetxt('./embeddings/BERT/embeddings_flaubert.txt', embeddings)

In [18]:
# régression logistique 

y = train['label']

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(embeddings, y, test_size=0.2, random_state=1)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pre = clf.predict(X_test)

accuracy_score(y_test, y_pre)

print('Accuracy:', accuracy_score(y_test, y_pre))

Accuracy: 0.22380612972202424


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
