# Team Members:
- Amir Mobayen
- Leelav Kareem
- Nikita Chistyakov

## Topic:
fine-tunedTransformer Architecture from a pretrained model that can be found on sites like HuggingFace

## Libraries

In [1]:
# Basic packages
import numpy as np
import pandas as pd

# Plots
import matplotlib.pyplot as plt

# Tensorflow
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from transformers import BertTokenizer, TFAutoModelForSequenceClassification

# Sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

tf.get_logger().setLevel('ERROR')


2023-07-23 13:28:04.304772: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-23 13:28:04.306259: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-23 13:28:04.334373: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-23 13:28:04.334863: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# About the dataset:
List of tweet texts with emotion labels like joy, sadness, fear, anger…
Dataset is split into train, test and validation sets for building the machine learning model. At first, you are
given only train and test sets. The validation one will be given in the end of the project for you to check
the final performance of your algorithm (to make sure there is no overfitting over the test data).
You can work on this project on group of one, two or three students. This exercise is mandatory, not
giving it back is equivalent to getting to lowest grade

# Goal:
- Train different kind of models able to classify each text according to the sentiment mainly present
in it
- Compare the results of your different models and try to analyze and explain the differences

# Parameters

In [2]:
data_file = './NLP_exam_emotions_dataset/train.txt'
test_flie = './NLP_exam_emotions_dataset/test.txt'

max_len = 128  # Maximum sequence length for BERT

# Preparing Dataset

In [3]:
train_data = pd.read_csv(data_file, delimiter=';', header=0)
test_data = pd.read_csv(test_flie, delimiter=';', header=0)

train_data.head(), test_data.head()

(                             i didnt feel humiliated  sadness
 0  i can go from feeling so hopeless to so damned...  sadness
 1   im grabbing a minute to post i feel greedy wrong    anger
 2  i am ever feeling nostalgic about the fireplac...     love
 3                               i am feeling grouchy    anger
 4  ive been feeling a little burdened lately wasn...  sadness,
   im feeling rather rotten so im not very ambitious right now  sadness
 0          im updating my blog because i feel shitty           sadness
 1  i never make her separate from me because i do...           sadness
 2  i left with my bouquet of red and yellow tulip...               joy
 3    i was feeling a little vain when i did this one           sadness
 4  i cant walk into a shop anywhere where i do no...              fear)

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
# Preprocess the data
def preprocess_text(text, tokenizer, max_len):
    input_ids = tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=max_len, padding='max_length')
    return input_ids

In [6]:
train_input_ids_list = [preprocess_text(text, tokenizer, max_len) for text in train_data.iloc[:, 0]]
train_input_ids_batch = tf.convert_to_tensor(train_input_ids_list)
train_labels = tf.convert_to_tensor(train_data.iloc[:, 1])
len_label = len(np.unique(train_labels))

test_input_ids_list = [preprocess_text(text, tokenizer, max_len) for text in test_data.iloc[:, 0]]
test_input_ids_batch = tf.convert_to_tensor(test_input_ids_list)
test_labels = tf.convert_to_tensor(test_data.iloc[:, 1])

2023-07-23 13:28:11.155955: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-07-23 13:28:11.156354: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [7]:
# Label Encoder
label_encoder = LabelEncoder()
train_y = label_encoder.fit_transform(train_data.iloc[:, 1])
test_y = label_encoder.transform(test_data.iloc[:, 1])

# Model

In [8]:
# Load pre-trained BERT model
model = TFAutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len_label)

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

model.summary

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<bound method Model.summary of <transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification object at 0x7f2958fbc150>>

In [None]:
# Train the model
history = model.fit(train_input_ids_batch, train_y, epochs=5, batch_size=32)

Epoch 1/5

# Evaluation

In [None]:
# Evaluate the model
predicted_class_indices = np.argmax(model.predict(test_input_ids_batch)[0], axis=1)
predicted_labels = label_encoder.inverse_transform(predicted_class_indices)
accuracy = accuracy_score(test_data.iloc[:, 1], predicted_labels)
print(f'Test Accuracy: {accuracy:.4f}')

print('Classification Report:')
print(classification_report(test_data.iloc[:, 1], predicted_labels))

In [None]:
def plot_results(history):
    hist_df = pd.DataFrame(history.history)
    hist_df.columns = ["loss", "accuracy", "val_loss", "val_accuracy"]
    hist_df.index = np.arange(1, len(hist_df) + 1)

    fig, axs = plt.subplots(nrows=2, sharex=True, figsize=(16, 10))
    axs[0].plot(hist_df.val_accuracy, lw=3, label='Validation Accuracy')
    axs[0].plot(hist_df.accuracy, lw=3, label='Training Accuracy')
    axs[0].set_ylabel('Accuracy')
    axs[0].set_xlabel('Epoch')
    axs[0].grid()
    axs[0].legend(loc=0)
    axs[1].plot(hist_df.val_loss, lw=3, label='Validation Loss')
    axs[1].plot(hist_df.loss, lw=3, label='Training Loss')
    axs[1].set_ylabel('Loss')
    axs[1].set_xlabel('Epoch')
    axs[1].grid()
    axs[1].legend(loc=0)

    plt.show();


plot_results(history)

# Sample Test

In [None]:
count = 0
sample_range = 10
for i in range(sample_range):
    sample = test_data.iloc[i, 0]
    sample_test, _, _ = tokenizer([sample])  # Tokenize the sample as a list

    sample_test = pad_sequences(sample_test, maxlen=max_len)  # Pad the sequence to match max_len
    test_predict = model.predict(sample_test)
    prediction = np.argmax(test_predict)

    # After obtaining the predicted class index
    predicted_label = label_encoder.inverse_transform([prediction])[0]
    true_label = test_data.iloc[i, 1]
    if predicted_label == true_label:
        count += 1
    print(f'Predicted class vs True Class: [{predicted_label}, {true_label}]')

print(f'\nsample accuracy = {count / sample_range}')