<a href="https://colab.research.google.com/github/amaithi-sam/NLP_Learning/blob/main/Pretrain_Bert_Sentiment_classification_twitter_smile.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bert Sentiment Analysis classification using Twitter smile dataset

Dataset: `https://figshare.com/articles/dataset/smile_annotations_final_csv/3187909/2`

EDA

In [1]:
import torch
import pandas as pd
from tqdm import tqdm


In [4]:
df = pd.read_csv('smile-annotations-final.csv', names=['id', 'text', 'category'])
df.set_index('id', inplace=True)
df.head()

Unnamed: 0_level_0,text,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1
611857364396965889,@aandraous @britishmuseum @AndrewsAntonio Merc...,nocode
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy
614877582664835073,@Sofabsports thank you for following me back. ...,happy
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy


In [5]:
df.category.value_counts()

category
nocode               1572
happy                1137
not-relevant          214
angry                  57
surprise               35
sad                    32
happy|surprise         11
happy|sad               9
disgust|angry           7
disgust                 6
sad|disgust             2
sad|angry               2
sad|disgust|angry       1
Name: count, dtype: int64

In [6]:
# filter category

df = df[~df.category.str.contains(r'\|')]
df.category.value_counts()

category
nocode          1572
happy           1137
not-relevant     214
angry             57
surprise          35
sad               32
disgust            6
Name: count, dtype: int64

In [7]:
df = df[df.category != 'nocode']
df.category.value_counts()

category
happy           1137
not-relevant     214
angry             57
surprise          35
sad               32
disgust            6
Name: count, dtype: int64

In [8]:
possible_labels = df.category.unique()
possible_labels

array(['happy', 'not-relevant', 'angry', 'disgust', 'sad', 'surprise'],
      dtype=object)

In [9]:
# Label encoding

label_dict = {}

for idx, possible_label in enumerate(possible_labels):
  label_dict[possible_label] = idx

label_dict

{'happy': 0,
 'not-relevant': 1,
 'angry': 2,
 'disgust': 3,
 'sad': 4,
 'surprise': 5}

In [10]:
df['label'] = df['category'].replace(label_dict)
df.head()

Unnamed: 0_level_0,text,category,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy,0
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy,0
614877582664835073,@Sofabsports thank you for following me back. ...,happy,0
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy,0
611570404268883969,@NationalGallery @ThePoldarkian I have always ...,happy,0


# Training and Testing Split

In [11]:
from sklearn.model_selection import train_test_split


In [12]:
df.index.values

array([614484565059596288, 614746522043973632, 614877582664835073, ...,
       613678555935973376, 615246897670922240, 613016084371914753])

In [13]:
df.label.values

array([0, 0, 0, ..., 0, 0, 1])

In [14]:
X_train, X_test, y_train, y_test = train_test_split(df.index.values,
                                                    df.label.values,
                                                    test_size = 0.2,
                                                    random_state = 42)

In [16]:
X_train, y_train

(array([611922717428813825, 612664104013176832, 615466078441996288, ...,
        610382402976919552, 613772240786718720, 608587511925993472]),
 array([0, 0, 0, ..., 1, 1, 5]))

In [17]:
df['data_type'] = ['not_set'] * df.shape[0]
df.head()

Unnamed: 0_level_0,text,category,label,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy,0,not_set
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy,0,not_set
614877582664835073,@Sofabsports thank you for following me back. ...,happy,0,not_set
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy,0,not_set
611570404268883969,@NationalGallery @ThePoldarkian I have always ...,happy,0,not_set


In [21]:
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_test, 'data_type'] = 'test'
df.data_type.value_counts()

data_type
train    1184
test      297
Name: count, dtype: int64

# Loading Tokenizer and Encoding data

In [22]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset


In [23]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [24]:
# Encode the training data

encoded_train_data = tokenizer.batch_encode_plus(
    df[df.data_type == 'train'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length = True,
    max_length = 256,
    return_tensors = 'pt'
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [27]:
encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type == 'test'].text.values,
    add_special_tokens = True,
    return_attention_mask = True,
    pad_to_max_length = True,
    max_length = 256,
    return_tensors = 'pt'
)



In [29]:
encoded_train_data.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [31]:
input_ids_train = encoded_train_data['input_ids']
attention_masks_train = encoded_train_data['attention_mask']
labels_train = torch.tensor(df[df.data_type == 'train'].label.values)

In [32]:
input_ids_test = encoded_data_val['input_ids']
attention_masks_test = encoded_data_val['attention_mask']
labels_test = torch.tensor(df[df.data_type == 'test'].label.values)

In [33]:
# Creating TensorDataset

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

In [37]:
dataset_train.tensors

(tensor([[  101, 16092,  3897,  ...,     0,     0,     0],
         [  101,  1030, 27034,  ...,     0,     0,     0],
         [  101,  1030, 10682,  ...,     0,     0,     0],
         ...,
         [  101, 11047,  1030,  ...,     0,     0,     0],
         [  101,  1030,  3680,  ...,     0,     0,     0],
         [  101,  1030,  2120,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([0, 0, 0,  ..., 0, 0, 1]))

In [38]:
len(dataset_train)

1184

# Setting BERT Pretrained Model

In [39]:
from transformers import BertForSequenceClassification

In [41]:
len(label_dict)

6

In [43]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels = len(label_dict),
                                                      output_attentions = False,
                                                      output_hidden_states = False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# create Dataloader

In [44]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [45]:
batch_size = 32

dataloader_train = DataLoader(dataset_train,
                              sampler = RandomSampler(dataset_train),
                              batch_size = batch_size)

dataloader_validation = DataLoader(dataset_test,
                                   sampler = SequentialSampler(dataset_test),
                                   batch_size = batch_size)



In [47]:
dataloader_train

<torch.utils.data.dataloader.DataLoader at 0x7ff368fe7f40>

# Optimizer and Scheduler

In [48]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [49]:
optimizer = AdamW(model.parameters(),
                  lr= 1e-5,
                  eps = 1e-8)



In [64]:
epochs = 5
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 5,
                                            num_training_steps = len(dataloader_train) * epochs)

# Performance Metrics

In [51]:
import numpy as np
from sklearn.metrics import f1_score



In [62]:
def f1_score_func(preds, labels):
  preds_flat = np.argmax(preds, axis = 1).flatten()
  labels_flat = labels.flatten()
  return f1_score(labels_flat, preds_flat, average = 'weighted')

In [63]:
def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

# Training the BERT

In [54]:
import random

seed_val = 45
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [55]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

cuda


In [60]:
def evaluate(dataloader_val):

  model.eval()

  loss_val_total = 0
  predictions, true_vals = [], []

  for batch in dataloader_validation:
    batch = tuple(b.to(device) for b in batch)

    inputs = {'input_ids' :       batch[0],
              'attention_mask':   batch[1],
              'labels':           batch[2],
              }

    with torch.no_grad():
      outputs = model(**inputs)

      loss= outputs[0]
      logits = outputs[1]
      loss_val_total += loss.item()

      logits = logits.detach().cpu().numpy()
      label_ids = inputs['labels'].cpu().numpy()
      predictions.append(logits)
      true_vals.append(label_ids)

    loss_val_avg = loss_val_total/len(dataloader_validation)

    predictions = np.concatenate(predictions, axis = 0)
    true_vals = np.concatenate(true_vals, axis = 0)

    return loss_val_avg, predictions, true_vals


In [65]:
for epoch in tqdm(range(1, epochs + 1)):
  model.train()

  loss_train_total = 0

  progress_bar = tqdm(dataloader_train,
                      desc='Epoch {:1d}'.format(epoch),
                      leave = False,
                      disable = False)

  for batch in progress_bar:

    model.zero_grad()

    batch = tuple(b.to(device) for b in batch)

    inputs = {'input_ids':      batch[0],
              'attention_mask': batch[1],
              'labels':         batch[2]
              }

    outputs = model(**inputs)

    loss = outputs[0]
    loss_train_total += loss.item()
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    optimizer.step()
    scheduler.step()

    progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})

torch.save(model.state_dict(), f"Finetuned_Bert_model_epo_{epoch}.model")

tqdm.write(f'\nEpoch {epoch}')

loss_train_avg = loss_train_total/len(dataloader_train)
tqdm.write(f"Validation loss: {loss_train_avg}")

val_loss, predictions, true_vals = evaluate(dataloader_validation)
val_f1 = f1_score_func(predictions, true_vals)
tqdm.write(f"Validation loss: {val_loss}")
tqdm.write(f"Validation F1 Score: {val_f1}")

  0%|          | 0/5 [00:00<?, ?it/s]
Epoch 1:   0%|          | 0/37 [00:00<?, ?it/s][A
Epoch 1:   0%|          | 0/37 [00:01<?, ?it/s, training_loss=0.248][A
Epoch 1:   3%|▎         | 1/37 [00:01<00:44,  1.24s/it, training_loss=0.248][A
Epoch 1:   3%|▎         | 1/37 [00:02<00:44,  1.24s/it, training_loss=0.261][A
Epoch 1:   5%|▌         | 2/37 [00:02<00:42,  1.21s/it, training_loss=0.261][A
Epoch 1:   5%|▌         | 2/37 [00:03<00:42,  1.21s/it, training_loss=0.196][A
Epoch 1:   8%|▊         | 3/37 [00:03<00:40,  1.21s/it, training_loss=0.196][A
Epoch 1:   8%|▊         | 3/37 [00:04<00:40,  1.21s/it, training_loss=0.222][A
Epoch 1:  11%|█         | 4/37 [00:04<00:39,  1.19s/it, training_loss=0.222][A
Epoch 1:  11%|█         | 4/37 [00:05<00:39,  1.19s/it, training_loss=0.214][A
Epoch 1:  14%|█▎        | 5/37 [00:06<00:38,  1.20s/it, training_loss=0.214][A
Epoch 1:  14%|█▎        | 5/37 [00:07<00:38,  1.20s/it, training_loss=0.165][A
Epoch 1:  16%|█▌        | 6/37 [00:07<0


Epoch 5
Validation loss: 0.305333253902358
Validation loss: 0.03765441775321961
Validation F1 Score: 0.8345352564102564


# Inference

In [75]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [76]:
model.load_state_dict(torch.load('/content/Finetuned_Bert_model_epo_5.model', map_location=torch.device('cpu')))

<All keys matched successfully>

In [77]:
_, predictions, true_vals = evaluate(dataloader_validation)

In [78]:
accuracy_per_class(predictions, true_vals)

Class: happy
Accuracy: 25/25

Class: not-relevant
Accuracy: 3/4

Class: angry
Accuracy: 0/3

