In [None]:
# Attach Google Drive to the Colab VM; the dataset and the saved model used
# by this notebook both live under /content/drive.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# import libraries

import pandas as pd
import numpy as np
import re
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertForSequenceClassification, AlbertConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from nltk.corpus import stopwords
import nltk

In [None]:
# Fetch the NLTK stopword corpus, then keep the English entries in a set so
# that membership checks during preprocessing are cheap.
nltk.download('stopwords')
english_stopword_list = stopwords.words('english')
stop_word = set(english_stopword_list)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Read the IMDB reviews CSV from the mounted Drive into a DataFrame.
IMDB_CSV_PATH = '/content/drive/MyDrive/Datasets/IMDB Dataset.csv'
IMDB_dataset = pd.read_csv(filepath_or_buffer=IMDB_CSV_PATH)

In [None]:
# Preview the top five rows (review text plus its sentiment label).
IMDB_dataset.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# (number of rows, number of columns) of the loaded frame
dataset_shape = IMDB_dataset.shape
print(dataset_shape)

(50000, 2)


In [None]:
# Encode sentiment labels as integers (positive -> 1, negative -> 0).
# Series.map turns every value that is missing from the dict into NaN, so
# running this cell a second time would silently wipe the already-encoded
# labels. Only map while the column is still non-numeric, and fail loudly if
# a label outside the mapping shows up.
SENTIMENT_TO_ID = {'positive': 1, 'negative': 0}
if not pd.api.types.is_numeric_dtype(IMDB_dataset['sentiment']):
    encoded_sentiment = IMDB_dataset['sentiment'].map(SENTIMENT_TO_ID)
    unmapped_rows = encoded_sentiment.isna()
    if unmapped_rows.any():
        unknown_labels = IMDB_dataset.loc[unmapped_rows, 'sentiment'].unique()
        raise ValueError(f"Unexpected sentiment labels: {list(unknown_labels)}")
    IMDB_dataset['sentiment'] = encoded_sentiment.astype('int64')


In [None]:
# Hold out 20% of the reviews for testing, then carve 20% of the remaining
# training portion off as the validation set. Both splits use the same seed.
HOLDOUT_FRACTION = 0.2
SPLIT_SEED = 42

all_reviews = IMDB_dataset['review']
all_sentiments = IMDB_dataset['sentiment']

train_text, test_text, train_label, test_label = train_test_split(
    all_reviews, all_sentiments, test_size=HOLDOUT_FRACTION, random_state=SPLIT_SEED
)
train_text, val_text, train_label, val_label = train_test_split(
    train_text, train_label, test_size=HOLDOUT_FRACTION, random_state=SPLIT_SEED
)


In [None]:
# preprocessing data
def preprocess_text(text):
    """Clean one raw review string before tokenization.

    Steps, in order: replace HTML tags with a space, drop every character
    that is not an ASCII letter or whitespace, lowercase the text, and remove
    words found in the module-level ``stop_word`` set.

    Args:
        text: Raw review text (str).

    Returns:
        The cleaned review as a single space-joined string.
    """
    # Strip HTML markup first. The reviews contain tags such as "<br />";
    # without this step only "<", "/" and ">" are removed below and the tag
    # name survives as a bogus "br" token. Requiring a letter right after
    # "<" (or "</") keeps plain comparisons like "x < 5" untouched.
    text = re.sub(r'</?[a-zA-Z][^>]*>', ' ', text)

    # remove special characters and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # lowercase
    text = text.lower()

    # remove stopwords
    text = ' '.join(word for word in text.split() if word not in stop_word)
    return text

In [None]:
# Run the cleaning function over every review in each of the three splits
# (train, then validation, then test).
train_text, val_text, test_text = (
    split_reviews.apply(preprocess_text)
    for split_reviews in (train_text, val_text, test_text)
)

  text = BeautifulSoup(text, "html.parser").get_text()


In [None]:
# Load the pretrained tokenizer that matches the ALBERT base v2 checkpoint.
ALBERT_CHECKPOINT = 'albert-base-v2'
tokenizer = AlbertTokenizer.from_pretrained(ALBERT_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

In [None]:
# Tokenize each split with identical settings: every review is truncated or
# padded to exactly 128 tokens.
TOKENIZER_KWARGS = dict(truncation=True, padding="max_length", max_length=128)

train_encodings = tokenizer(list(train_text), **TOKENIZER_KWARGS)
val_encodings = tokenizer(list(val_text), **TOKENIZER_KWARGS)
test_encodings = tokenizer(list(test_text), **TOKENIZER_KWARGS)

In [None]:
# convert to TensorFlow datasets
BATCH_SIZE = 16
SHUFFLE_SEED = 42


def make_tf_dataset(encodings, labels, shuffle=False):
    """Build a batched tf.data.Dataset of (tokenizer features, label) pairs.

    Args:
        encodings: Tokenizer output for one split; converted with ``dict()``.
        labels: Labels for the same split, in the same order as ``encodings``.
        shuffle: When True, examples are reshuffled on every pass over the
            dataset. Leave False for validation/test data so that prediction
            order keeps matching the label order.

    Returns:
        A tf.data.Dataset yielding batches of ``BATCH_SIZE`` examples.
    """
    dataset = tf.data.Dataset.from_tensor_slices((dict(encodings), labels))
    if shuffle:
        # Shuffle before batching so that batch composition changes between
        # epochs; a buffer as large as the split gives a full shuffle.
        dataset = dataset.shuffle(
            buffer_size=len(labels),
            seed=SHUFFLE_SEED,
            reshuffle_each_iteration=True,
        )
    return dataset.batch(BATCH_SIZE)


# Only the training split is shuffled; without it every epoch would feed the
# model the exact same batches in the exact same order.
train_dataset = make_tf_dataset(train_encodings, train_label, shuffle=True)
val_dataset = make_tf_dataset(val_encodings, val_label)
test_dataset = make_tf_dataset(test_encodings, test_label)

In [None]:
# Build a two-class ALBERT classifier: start from the pretrained base v2
# configuration, set both dropout probabilities to 0.1, then load the
# pretrained weights into that configuration.
dropout_settings = {
    'hidden_dropout_prob': 0.1,
    'attention_probs_dropout_prob': 0.1,
}
config = AlbertConfig.from_pretrained('albert-base-v2', num_labels=2, **dropout_settings)
model = TFAlbertForSequenceClassification.from_pretrained('albert-base-v2', config=config)

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFAlbertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFAlbertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Print the model's layers together with their parameter counts.
model.summary()

Model: "tf_albert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 albert (TFAlbertMainLayer)  multiple                  11683584  
                                                                 
 dropout_4 (Dropout)         multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 11685122 (44.58 MB)
Trainable params: 11685122 (44.58 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Training components used when compiling the model: an Adam optimizer, a
# sparse categorical cross-entropy loss computed from raw logits, and a
# sparse categorical accuracy metric reported under the name 'accuracy'.
LEARNING_RATE = 3e-5

optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

In [None]:
# Attach the optimizer, loss and metric to the model.
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
)

In [None]:
# Stop training once validation accuracy has gone two epochs without
# improving, and restore the weights from the best epoch.
early_stopping_options = {
    'monitor': 'val_accuracy',
    'patience': 2,
    'restore_best_weights': True,
}
early_stopping = tf.keras.callbacks.EarlyStopping(**early_stopping_options)

In [None]:
# Train the model and record how long training takes.
import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock, which can be adjusted
# in the middle of a long run and distort the measurement.
start_time = time.perf_counter()

# model training
# Keep the History object returned by fit() so the per-epoch loss/accuracy
# values stay available after training instead of being discarded.
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=6,
    callbacks=[early_stopping]
)

end_time = time.perf_counter()
training_time = end_time - start_time
print(f"Training time of the model: {training_time:.2f} seconds")

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Training time of the model: 4528.62 seconds


In [None]:
# Run the fine-tuned model over the test batches and pick, for each review,
# the class whose logit is largest.
predictions = model.predict(test_dataset)
test_logits = predictions.logits
predicted_labels = np.argmax(test_logits, axis=1)



In [None]:
# Score the test-set predictions against the true labels.
accuracy = accuracy_score(test_label, predicted_labels)
precision = precision_score(test_label, predicted_labels)
recall = recall_score(test_label, predicted_labels)

# Report each metric on its own line, unrounded.
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
):
    print(f"{metric_name}: {metric_value}")

Accuracy: 0.8761
Precision: 0.8564727954971857
Recall: 0.9059337170073427


In [None]:
# Per-class precision, recall, F1 and support for the test-set predictions.

from sklearn.metrics import classification_report
report_text = classification_report(y_true=test_label, y_pred=predicted_labels)
print(report_text)

              precision    recall  f1-score   support

           0       0.90      0.85      0.87      4961
           1       0.86      0.91      0.88      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



In [None]:
# Write the fine-tuned model and the tokenizer files into the same Drive
# directory so they can be reloaded together.
SAVED_MODEL_DIR = '/content/drive/MyDrive/FYP/ALBERT/ALBERT_model_1'
model.save_pretrained(SAVED_MODEL_DIR)
tokenizer.save_pretrained(SAVED_MODEL_DIR)

('/content/drive/MyDrive/FYP/ALBERT/ALBERT_model_1/tokenizer_config.json',
 '/content/drive/MyDrive/FYP/ALBERT/ALBERT_model_1/special_tokens_map.json',
 '/content/drive/MyDrive/FYP/ALBERT/ALBERT_model_1/spiece.model',
 '/content/drive/MyDrive/FYP/ALBERT/ALBERT_model_1/added_tokens.json')

In [None]:
from transformers import TFAlbertForSequenceClassification, AlbertTokenizer

# Reload the model and tokenizer from the directory they were saved to.
SAVED_MODEL_DIR = '/content/drive/MyDrive/FYP/ALBERT/ALBERT_model_1'
model_t = TFAlbertForSequenceClassification.from_pretrained(SAVED_MODEL_DIR)
tokenizer_t = AlbertTokenizer.from_pretrained(SAVED_MODEL_DIR)

# Print the layer summary as a quick check that loading worked.
model_t.summary()


Some layers from the model checkpoint at /content/drive/MyDrive/FYP/ALBERT/ALBERT_model_1 were not used when initializing TFAlbertForSequenceClassification: ['dropout_4']
- This IS expected if you are initializing TFAlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFAlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFAlbertForSequenceClassification were initialized from the model checkpoint at /content/drive/MyDrive/FYP/ALBERT/ALBERT_model_1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFAlbertForSequenceClassification for predictions without further training.


Model: "tf_albert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 albert (TFAlbertMainLayer)  multiple                  11683584  
                                                                 
 dropout_47 (Dropout)        multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 11685122 (44.58 MB)
Trainable params: 11685122 (44.58 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# get the accuracy, precision and recall for the trained model

from sklearn.metrics import accuracy_score, precision_score, recall_score

# Predict on the test set with the reloaded model and take the class with
# the largest logit for each review.
predictions = model_t.predict(test_dataset)
reloaded_logits = predictions.logits
predicted_labels = np.argmax(reloaded_logits, axis=1)

# Score the predictions against the true test labels.
accuracy = accuracy_score(test_label, predicted_labels)
precision = precision_score(test_label, predicted_labels)
recall = recall_score(test_label, predicted_labels)

# Report each metric rounded to four decimal places.
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
):
    print(f"{metric_name}: {metric_value:.4f}")

Accuracy: 0.8761
Precision: 0.8565
Recall: 0.9059
