In [2]:
# Install libraries
#!pip install transformers datasets evaluate

In [12]:
# Data processing
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, TextClassificationPipeline
from datasets import Dataset
import evaluate

In [13]:
# Read the tab-separated review file; column names are supplied explicitly via `names`
amz_review = pd.read_csv('data/amazon_cells_labelled.txt', sep='\t', names=['review', 'label'])

# Preview the first rows to confirm the text/label columns parsed as expected
amz_review.head()

Unnamed: 0,review,label
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [4]:
# Summarize the DataFrame: row count, columns, non-null counts, dtypes and memory usage
amz_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  1000 non-null   object
 1   label   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


The label value of 0 represents negative reviews and the label value of 1 represents positive reviews. The dataset has 500 positive reviews and 500 negative reviews. It is well-balanced, so we can use accuracy as the metric to evaluate the model performance.

In [5]:
# Count the reviews per label (0 = negative, 1 = positive) to check class balance
amz_review['label'].value_counts()

label
0    500
1    500
Name: count, dtype: int64

# Step 3: Train Test Split

In [14]:
# Training dataset: a random 80% sample of the rows, with a fixed seed so the split is reproducible
train_data = amz_review.sample(frac=0.8, random_state=42)

# Testing dataset: every row whose index was not drawn into the training sample
test_data = amz_review.drop(train_data.index)

# Check the number of records in training and testing dataset.
print(f'The training dataset has {len(train_data)} records.')
print(f'The testing dataset has {len(test_data)} records.')

The training dataset has 800 records.
The testing dataset has 200 records.


After the train test split, there are 800 reviews in the training dataset and 200 reviews in the testing dataset.

# Step 4: Convert Pandas Dataframe to Hugging Face Dataset

In [15]:
# Convert the pandas DataFrames to Hugging Face Arrow datasets.
# The original DataFrame index is carried along as the `__index_level_0__` column.
hg_train_data = Dataset.from_pandas(train_data)
hg_test_data = Dataset.from_pandas(test_data)

  if _pandas_api.is_sparse(col):


In [16]:
# Number of records in the Hugging Face training dataset
print(f'The length of hg_train_data is {len(hg_train_data)}.\n')

# Look at the first record (a dict with one entry per column)
hg_train_data[0]

The length of hg_train_data is 800.



{'review': 'Thanks again to Amazon for having the things I need for a good price!',
 'label': 1,
 '__index_level_0__': 521}

In [9]:
# Cross-check the first training record against the source DataFrame:
# its `__index_level_0__` value is 521, so fetch that row from the original data
amz_review.iloc[[521]]

Unnamed: 0,review,label
521,Thanks again to Amazon for having the things I...,1


# Step 5: Tokenize Text

In [17]:
# Load the tokenizer that matches the pretrained `bert-base-cased` checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
# Display the tokenizer configuration (vocabulary size, max length, special tokens)
tokenizer

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [12]:
# Print each special token of the tokenizer together with its vocabulary ID
print(f'The unknown token is {tokenizer.unk_token} and the ID for the unkown token is {tokenizer.unk_token_id}.')
print(f'The seperator token is {tokenizer.sep_token} and the ID for the seperator token is {tokenizer.sep_token_id}.')
print(f'The pad token is {tokenizer.pad_token} and the ID for the pad token is {tokenizer.pad_token_id}.')
print(f'The sentence level classification token is {tokenizer.cls_token} and the ID for the classification token is {tokenizer.cls_token_id}.')
print(f'The mask token is {tokenizer.mask_token} and the ID for the mask token is {tokenizer.mask_token_id}.')

The unknown token is [UNK] and the ID for the unkown token is 100.
The seperator token is [SEP] and the ID for the seperator token is 102.
The pad token is [PAD] and the ID for the pad token is 0.
The sentence level classification token is [CLS] and the ID for the classification token is 101.
The mask token is [MASK] and the ID for the mask token is 103.


In [18]:
# Function to tokenize data
def tokenize_dataset(data):
    """Tokenize the ``review`` field of ``data``.

    Every sequence is truncated and padded to exactly 32 tokens so that all
    examples end up with the same length.
    """
    encode_options = dict(max_length=32, truncation=True, padding="max_length")
    return tokenizer(data["review"], **encode_options)

# Tokenize both splits; `map` applies `tokenize_dataset` to the records and
# adds the tokenizer outputs as new columns
dataset_train = hg_train_data.map(tokenize_dataset)
dataset_test = hg_test_data.map(tokenize_dataset)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [14]:
# Inspect the tokenized training dataset (feature names and row count)
dataset_train

Dataset({
    features: ['review', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 800
})

After tokenization, we can see that both the training and the testing Dataset have 6 features, `'review'`, `'label'`, `'__index_level_0__'`, `'input_ids'`, `'token_type_ids'`, and `'attention_mask'`. The number of rows is stored with `num_rows`.

In [15]:
# Print the feature names and row counts of both tokenized splits
print(dataset_train)
print(dataset_test)

Dataset({
    features: ['review', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 800
})
Dataset({
    features: ['review', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 200
})


In step 6, we will load the pretrained model for sentiment analysis.

* `AutoModelForSequenceClassification` loads the pretrained BERT body and adds a new sequence classification head on top of it.
* The method `from_pretrained()` loads the weights from the pretrained model into the new model, so the weights in the new model are not randomly initialized. Note that the new weights for the new sequence classification head are going to be randomly initialized.
* `bert-base-cased` is the name of the pretrained model. We can change it to a different model based on the nature of the project.
* `num_labels` indicates the number of classes. Our dataset has two classes, positive and negative, so `num_labels=2`.

In [19]:
from torch import nn
from transformers import AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput

class CustomModel(nn.Module):
  """Pretrained transformer body followed by a dropout + linear classification head.

  The representation of the first token position (the [CLS] token for BERT
  tokenizers) in the last hidden state is passed through dropout and a linear
  layer to produce one logit per label.
  """

  def __init__(self,checkpoint,num_labels):
    """Build the model.

    Args:
      checkpoint: name or path of the pretrained checkpoint to load the body from.
      num_labels: number of target classes, i.e. the width of the logits.
    """
    super().__init__()
    self.num_labels = num_labels

    # Load the body of the given checkpoint; the config asks it to also
    # return attentions and all hidden states.
    config = AutoConfig.from_pretrained(checkpoint, output_attentions=True, output_hidden_states=True)
    self.model = AutoModel.from_pretrained(checkpoint, config=config)
    self.dropouts = nn.Dropout(0.1)
    # Size the new, randomly initialised head from the checkpoint's hidden
    # size instead of hard-coding 768, so other checkpoints work as well.
    self.classifier = nn.Linear(self.model.config.hidden_size, num_labels)

  def forward(self, input_ids=None, attention_mask=None,labels=None):
    """Run the body and the classification head.

    Args:
      input_ids: token ids, forwarded to the body.
      attention_mask: attention mask, forwarded to the body.
      labels: optional target class ids; when given, a cross-entropy loss is computed.

    Returns:
      A TokenClassifierOutput carrying ``loss`` (None when ``labels`` is not
      given), ``logits``, ``hidden_states`` and ``attentions``.
    """
    # Extract outputs from the body
    outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

    # outputs[0] is the last hidden state; apply dropout before the head
    sequence_output = self.dropouts(outputs[0])

    # Classify from the first token position of every sequence
    logits = self.classifier(sequence_output[:, 0, :])

    loss = None
    if labels is not None:
      loss_fct = nn.CrossEntropyLoss()
      loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

    # Always return an output object: previously the return sat inside the
    # `if labels is not None` branch, so calls without labels returned None.
    return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states,attentions=outputs.attentions)

In [20]:
# Load model: the custom wrapper (pretrained body + new 2-class head) is used here.

# Alternative kept for reference: the stock sequence classification model
#model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
model=CustomModel("bert-base-cased", num_labels=2)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
# Set up training arguments.
# Logging, checkpointing and evaluation are all scheduled once per epoch.
# NOTE(review): `logging_steps`, `save_steps` and `eval_steps` are presumably only
# honoured when the matching strategy is 'steps', so they look redundant here - confirm
# against the TrainingArguments documentation.
training_args = TrainingArguments(
    output_dir="./sentiment_transformer/",
    logging_dir='./sentiment_transformer/logs',
    logging_strategy='epoch',
    logging_steps=10,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-6,
    seed=42,
    save_strategy='epoch',
    save_steps=100,
    evaluation_strategy='epoch',
    eval_steps=100,
    # NOTE(review): no `metric_for_best_model` is set, so "best" is presumably
    # judged by evaluation loss - confirm.
    load_best_model_at_end=True
)

Since our dataset is highly balanced, we will use accuracy as the evaluation metric. It can be loaded using `evaluate.load("accuracy")`. After getting predictions from the model, the metric is computed using `metric.compute`.

In [22]:
"""# Function to compute the metric
def compute_metrics(eval_pred):
    metric = evaluate.load("accuracy")
    logits, labels = eval_pred
    # probabilities = tf.nn.softmax(logits)
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)"""


# Load the accuracy metric once; `compute_metrics` previously referenced an
# undefined `metric` name.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for the Trainer from a (predictions, labels) pair.

    Args:
        eval_pred: pair of model predictions and reference label ids.

    Returns:
        The dict produced by the accuracy metric's ``compute`` method.
    """
    logits, labels = eval_pred
    # The custom model also returns hidden states and attentions, so the
    # predictions arrive as a tuple whose first element holds the logits.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    # Pick the highest-scoring class per example (last axis = classes);
    # axis=0 would instead pick the example index per class.
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


In [34]:
# Exploratory check: slicing the dataset yields a plain dict of columns
type(dataset_test[:50])

dict

In [23]:
# Train the model.
# The testing split doubles as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    # Metric computation is disabled, so only the losses are reported per epoch
    #compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

trainer.train()



Epoch,Training Loss,Validation Loss
1,0.6121,0.504322


TrainOutput(global_step=200, training_loss=0.6120648574829102, metrics={'train_runtime': 1194.8298, 'train_samples_per_second': 0.67, 'train_steps_per_second': 0.167, 'total_flos': 0.0, 'train_loss': 0.6120648574829102, 'epoch': 1.0})

In [14]:
# Inspect the tokenized testing dataset (feature names and row count)
dataset_test

Dataset({
    features: ['review', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 200
})

After 1 epoch, the training loss is 0.6121 and the validation loss is 0.5043. Accuracy is not reported here because `compute_metrics` is not passed to the `Trainer`.

In [24]:
# Run the trained model on the testing dataset
y_test_predict = trainer.predict(dataset_test)

# Take a look at the predictions (a PredictionOutput whose `predictions` field is a tuple)
y_test_predict

PredictionOutput(predictions=(array([[-1.23971152e+00,  1.09413576e+00],
       [-1.14685953e+00,  7.22907186e-01],
       [-9.67836738e-01,  8.65628779e-01],
       [-9.52799544e-02,  3.10175195e-02],
       [ 1.90178663e-01, -2.37767547e-01],
       [-7.82594323e-01,  4.85976219e-01],
       [ 1.83918908e-01, -2.08478034e-01],
       [-1.54660925e-01,  6.59735575e-02],
       [-8.48139286e-01,  6.46892607e-01],
       [-6.51897267e-02, -9.55244228e-02],
       [-1.15794194e+00,  7.07303166e-01],
       [ 9.18782875e-02, -2.99674660e-01],
       [ 4.73238863e-02, -3.69770005e-02],
       [-3.76289070e-01,  9.27918777e-02],
       [-6.69242740e-01,  3.85999650e-01],
       [ 2.28240728e-01, -2.68849403e-01],
       [ 2.04837084e-01, -2.29355276e-01],
       [-1.07213616e+00,  4.66843605e-01],
       [-1.08672118e+00,  5.66961169e-01],
       [ 9.46494788e-02,  7.54997805e-02],
       [-1.13390768e+00,  7.54434466e-01],
       [ 2.67926931e-01, -4.69919652e-01],
       [-6.73249483e-01,

The predicted logits for the transfer learning text classification model can be extracted using `.predictions`.

We can see that the prediction has two columns. The first column is the predicted logit for label 0 and the second column is the predicted logit for label 1. Logit values do not sum up to 1.

In [30]:
# Predicted logits: `predictions` is a tuple here, and its first element holds the logits
y_test_logits = y_test_predict.predictions[0]

# First 5 predicted logits
y_test_logits[:5]

array([[-1.2397115 ,  1.0941358 ],
       [-1.1468595 ,  0.7229072 ],
       [-0.96783674,  0.8656288 ],
       [-0.09527995,  0.03101752],
       [ 0.19017866, -0.23776755]], dtype=float32)

To get the predicted probabilities, we need to apply softmax on the predicted logit values.

After applying softmax, we can see that the predicted probability for each review sums up to 1.

In [31]:
# Predicted probabilities: softmax turns each row of logits into values that sum to 1
y_test_probabilities = tf.nn.softmax(y_test_logits)

# First 5 predicted probabilities
y_test_probabilities[:5]

<tf.Tensor: shape=(5, 2), dtype=float32, numpy=
array([[0.08835827, 0.9116417 ],
       [0.1335687 , 0.86643124],
       [0.13782595, 0.8621741 ],
       [0.4684675 , 0.53153247],
       [0.6053831 , 0.39461687]], dtype=float32)>

To get the predicted labels, `argmax` is used to return the index of the maximum probability for each review, which corresponds to the labels of zeros and ones.

In [32]:
# Predicted labels: index of the highest probability in each row
y_test_pred_labels = np.argmax(y_test_probabilities, axis=1)

# First 5 predicted labels
y_test_pred_labels[:5]

array([1, 1, 1, 1, 0], dtype=int64)

The actual labels can be extracted using `y_test_predict.label_ids`.

In [33]:
# Actual labels, as returned alongside the predictions
y_test_actual_labels = y_test_predict.label_ids

# First 5 actual labels
y_test_actual_labels[:5]

array([1, 1, 1, 0, 0], dtype=int64)

In step 11, we will make the transfer learning text classification model performance evaluation.

`trainer.evaluate` is a quick way to get the loss and the accuracy of the testing dataset.

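We can see that the model has a loss of about 0.50 on the testing dataset. Accuracy is not part of the output because the `Trainer` was created without `compute_metrics`.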

In [34]:
# Evaluate the trained model on the testing dataset (returns the loss and runtime statistics)
trainer.evaluate(dataset_test)

{'eval_loss': 0.504322350025177,
 'eval_runtime': 25.1108,
 'eval_samples_per_second': 7.965,
 'eval_steps_per_second': 1.991,
 'epoch': 1.0}

To calculate more model performance metrics, we can use `evaluate.load` to load the metrics of interest.

The results show that the testing dataset has a `f1` value of 0.80 and a `recall` value of 0.90.

In [35]:
# Load the F1 metric from the `evaluate` library
metric_f1 = evaluate.load("f1")

# Compute F1 from the predicted labels against the actual labels
metric_f1.compute(predictions=y_test_pred_labels, references=y_test_actual_labels)

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

{'f1': 0.8}

In [36]:
# Load the recall metric from the `evaluate` library
metric_recall = evaluate.load("recall")

# Compute recall from the predicted labels against the actual labels
metric_recall.compute(predictions=y_test_pred_labels, references=y_test_actual_labels)

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

{'recall': 0.8979591836734694}