# Author
    - Selim Lakhdar
        - selim.lakhdar.etu@univ-lille.fr
        - selim.lakhdar@gmail.com
-------------------------------------------------

# Imports

In [1]:
import numpy as np
import pandas as pd
import os

from transformers import BertTokenizer, BertModel

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression

import torch

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

# Data

In [3]:
sentences = []
sources = []
sentiments = []

for root, dirs, files in os.walk('./data/'):
    for f in files:
        if f != 'readme.txt':
            tmp = pd.read_table(root + f, header=None)
            sentences += tmp[0].values.tolist()
            sentiments += tmp[1].values.tolist()
            sources += [f[:-4]] * len(sentences)

data = []
for source, sentence, sentiment,  in zip(sources, sentences, sentiments):
    data += [[source, sentence, sentiment]]

df = pd.DataFrame(data, columns=['source', 'sentence', 'sentiment'])
df

Unnamed: 0,source,sentence,sentiment
0,amazon_cells_labelled,So there is no way for me to plug it in here i...,0
1,amazon_cells_labelled,"Good case, Excellent value.",1
2,amazon_cells_labelled,Great for the jawbone.,1
3,amazon_cells_labelled,Tied to charger for conversations lasting more...,0
4,amazon_cells_labelled,The mic is great.,1
...,...,...,...
2743,yelp_labelled,I just got bored watching Jessice Lange take h...,0
2744,yelp_labelled,"Unfortunately, any virtue in this film's produ...",0
2745,yelp_labelled,"In a word, it is embarrassing.",0
2746,yelp_labelled,Exceptionally bad!,0


## Truncate

In [4]:
max_sentence_len = 400
df['len'] = df['sentence'].apply(lambda x: len(x.split(' ')))
mask = df['len'] < max_sentence_len
df_clean = df[mask]
df_clean

Unnamed: 0,source,sentence,sentiment,len
0,amazon_cells_labelled,So there is no way for me to plug it in here i...,0,21
1,amazon_cells_labelled,"Good case, Excellent value.",1,4
2,amazon_cells_labelled,Great for the jawbone.,1,4
3,amazon_cells_labelled,Tied to charger for conversations lasting more...,0,11
4,amazon_cells_labelled,The mic is great.,1,4
...,...,...,...,...
2743,yelp_labelled,I just got bored watching Jessice Lange take h...,0,13
2744,yelp_labelled,"Unfortunately, any virtue in this film's produ...",0,16
2745,yelp_labelled,"In a word, it is embarrassing.",0,8
2746,yelp_labelled,Exceptionally bad!,0,4


In [5]:
df_clean['sentiment'].value_counts()

1    1385
0    1360
Name: sentiment, dtype: int64

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df_clean['sentence'], df_clean['sentiment'], test_size=0.3)

In [7]:
print('X_train.shape:', X_train.shape, 'y_train.shape:', y_train.shape)
print('X_test.shape:', X_test.shape, 'y_test.shape:', y_test.shape)

X_train.shape: (1921,) y_train.shape: (1921,)
X_test.shape: (824,) y_test.shape: (824,)


# Bert

In [8]:
model = BertModel.from_pretrained("bert-base-uncased")
tokenizer =  BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
def transform(sentences):
    # tokenize and add special tokens
    tokenized = sentences.apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
    
    # max_len for padding
    max_len = 0
    for i in tokenized.values:
        if len(i) > max_len:
            max_len = len(i)
    print("max_len:", max_len)
    
    # add padding 
    tokenized_pad = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
    input_ids = torch.tensor(tokenized_pad)
    
    # transform
    with torch.no_grad(): 
        embeddings  = model(input_ids)
        
    return embeddings[0][:, 0, :].numpy()

In [10]:
train_tokens = transform(X_train)
test_tokens = transform(X_test)

max_len: 249
max_len: 379


In [12]:
clf = LogisticRegression(max_iter=1000)
clf.fit(train_tokens, y_train)
pred = clf.predict(test_tokens)
print("score:", clf.score(test_tokens,y_test))
print(classification_report(y_test,pred))

score: 0.5230582524271845
              precision    recall  f1-score   support

           0       0.50      1.00      0.67       394
           1       0.97      0.09      0.16       430

    accuracy                           0.52       824
   macro avg       0.74      0.54      0.41       824
weighted avg       0.75      0.52      0.40       824



# Fine Tune Bert
- https://www.analyticsvidhya.com/blog/2021/12/fine-tune-bert-model-for-sentiment-analysis-in-google-colab/

In [20]:
import tensorflow_datasets as tfds
from transformers import TFBertForSequenceClassification
import tensorflow as tf

In [14]:
(ds_train, ds_test), ds_info = tfds.load('imdb_reviews',
          split = (tfds.Split.TRAIN, tfds.Split.TEST),
          as_supervised=True,
          with_info=True)

INFO:absl:No config specified, defaulting to first: imdb_reviews/plain_text
INFO:absl:Generating dataset imdb_reviews (/home/alpha/tensorflow_datasets/imdb_reviews/plain_text/1.0.0)


[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /home/alpha/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

INFO:absl:Downloading http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz into /home/alpha/tensorflow_datasets/downloads/ai.stanfor.edu_amaas_sentime_aclImdb_v1PaujRp-TxjBWz59jHXsMDm5WiexbxzaFQkEnXc3Tvo8.tar.gz.tmp.1015eae394f941f385b9c6e40394b1f5...






Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling imdb_reviews-train.tfrecord...:   0%|          | 0/25000 [00:00<?, ? examples/s]

INFO:absl:Done writing imdb_reviews-train.tfrecord. Number of examples: 25000 (shards: [25000])


Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling imdb_reviews-test.tfrecord...:   0%|          | 0/25000 [00:00<?, ? examples/s]

INFO:absl:Done writing imdb_reviews-test.tfrecord. Number of examples: 25000 (shards: [25000])


Generating unsupervised examples...: 0 examples [00:00, ? examples/s]

Shuffling imdb_reviews-unsupervised.tfrecord...:   0%|          | 0/50000 [00:00<?, ? examples/s]

INFO:absl:Done writing imdb_reviews-unsupervised.tfrecord. Number of examples: 50000 (shards: [50000])
INFO:absl:Constructing tf.data.Dataset for split (Split('train'), Split('test')), from /home/alpha/tensorflow_datasets/imdb_reviews/plain_text/1.0.0


[1mDataset imdb_reviews downloaded and prepared to /home/alpha/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [15]:
def convert_example_to_feature(review):
    return tokenizer.encode_plus(review,
                add_special_tokens = True, # add [CLS], [SEP]
                max_length = max_length, # max length of the text that can go to BERT
                pad_to_max_length = True, # add [PAD] tokens
                return_attention_mask = True, # add attention mask to not focus on pad tokens
              )

In [16]:
# can be up to 512 for BERT
max_length = 512
batch_size = 6

In [17]:
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    return {
      "input_ids": input_ids,
      "token_type_ids": token_type_ids,
      "attention_mask": attention_masks,
  }, label

In [18]:
def encode_examples(ds, limit=-1):
    # prepare list, so that we can build up final TensorFlow dataset from slices.
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []
    if (limit > 0):
        ds = ds.take(limit)
    for review, label in tfds.as_numpy(ds):
        bert_input = convert_example_to_feature(review.decode())
        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append([label])
    return tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)

In [21]:
# train dataset
ds_train_encoded = encode_examples(ds_train).shuffle(10000).batch(batch_size)
# test dataset
ds_test_encoded = encode_examples(ds_test).batch(batch_size)

In [23]:
# recommended learning rate for Adam 5e-5, 3e-5, 2e-5
learning_rate = 2e-5

# we will do just 1 epoch, though multiple epochs might be better as long as we will not overfit the model
number_of_epochs = 1

# model initialization
model_tfbt = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

# choosing Adam optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)

# we do not have one-hot vectors, we can use sparce categorical cross entropy and accuracy
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model_tfbt.compile(optimizer=optimizer, loss=loss, metrics=[metric])

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Erreur trop de temps .....

In [25]:
bert_history = model_tfbt.fit(ds_train_encoded, epochs=number_of_epochs, validation_data=ds_test_encoded)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
 126/4167 [..............................] - ETA: 16:15:14 - loss: 0.6588 - accuracy: 0.5594

KeyboardInterrupt: 

In [None]:
test_sentence = "This is a really good movie. I loved it and will watch again"

predict_input = tokenizer.encode(test_sentence, truncation=True, padding=True, return_tensors="tf")

tf_output = model_tfbt.predict(predict_input)[0]
tf_prediction = tf.nn.softmax(tf_output, axis=1)
labels = ['Negative','Positive'] #(0:negative, 1:positive)
label = tf.argmax(tf_prediction, axis=1)
label = label.numpy()
print(labels[label[0]])