<a href="https://colab.research.google.com/github/alhafizfadhil06/Sentiment-Analysis---IndoNLU-Dataset/blob/main/IndoBERT_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Preparation**
Install the required packages and download the IndoNLU dataset by cloning the IndoNLU GitHub repository.

In [None]:
!pip install torch torchvision

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!git clone https://github.com/indobenchmark/indonlu

fatal: destination path 'indonlu' already exists and is not an empty directory.


# **Libraries/Package Importing and Function Definition**


In [None]:
import random
import numpy as np
import pandas as pd

import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from nltk.tokenize import TweetTokenizer

from indonlu.utils.forward_fn import forward_sequence_classification
from indonlu.utils.metrics import document_sentiment_metrics_fn
from indonlu.utils.data_utils import DocumentSentimentDataset, DocumentSentimentDataLoader

In [None]:
def set_seed(seed):
  '''Seed every random number generator used in this notebook with one value.

  The seed is applied, in order, to Python's ``random`` module, NumPy's
  global generator, ``torch.manual_seed`` and ``torch.cuda.manual_seed``.
  '''
  seeders = (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
  )
  for seeder in seeders:
    seeder(seed)

def count_param(module, trainable=False):
  '''Count the parameters of a model.

  Args:
    module: object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).
    trainable: when True, count only parameters whose ``requires_grad`` is
      set; when False (default), count every parameter.

  Returns:
    The sum of ``numel()`` over the selected parameters.
  '''
  # The trainable branch previously called the misspelled
  # `module.paramaters()` and raised AttributeError; both cases now share
  # the correctly spelled call.
  return sum(
    p.numel() for p in module.parameters()
    if p.requires_grad or not trainable
  )

def get_lr(optimizer):
  '''Return the learning rate of the optimizer's first parameter group.

  Only the first entry of ``optimizer.param_groups`` is consulted; ``None``
  is returned when there are no parameter groups.
  '''
  missing = object()
  first_group = next(iter(optimizer.param_groups), missing)
  if first_group is missing:
    return None
  return first_group['lr']

def metrics_to_string(metric_dict):
  '''Render a metrics mapping as space-separated ``name:value`` pairs.

  Each value is formatted with two decimals; pairs appear in the mapping's
  iteration order.
  '''
  return ' '.join(
    f'{name}:{score:.2f}' for name, score in metric_dict.items()
  )

In [None]:
# One named seed for the whole notebook, applied to every RNG up front to
# make runs as repeatable as possible.
SEED = 27052023
set_seed(SEED)

# **Model and Data Pipeline Initiation**

In [None]:
# Checkpoint name, declared once so the tokenizer, config and model weights
# are always loaded from the same place.
PRETRAINED_MODEL = 'indobenchmark/indobert-base-p1'

# Load Tokenizer and Config.
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)
config = BertConfig.from_pretrained(PRETRAINED_MODEL)
# Size the classification head to the number of labels the dataset declares.
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# Model initiation.
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL, config=config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Display the model's module tree (the notebook shows the repr of the cell's
# last expression).
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Total parameter count; with the default trainable=False every parameter is
# counted, whether or not it requires gradients.
count_param(model)

124443651

In [None]:
# The three splits sit side by side in one dataset folder of the cloned repo;
# only the file-name prefix differs.
_SMSA_DIR = "/content/indonlu/dataset/smsa_doc-sentiment-prosa"
train_dataset_path, val_dataset_path, test_dataset_path = (
    f"{_SMSA_DIR}/{split}_preprocess.tsv" for split in ("train", "valid", "test")
)

In [None]:
# Training split: dataset plus a loader that shuffles the examples.
train_dataset = DocumentSentimentDataset(train_dataset_path, tokenizer, lowercase=True)
train_loader = DocumentSentimentDataLoader(
    dataset=train_dataset,
    max_seq_len=512,
    batch_size=32,
    num_workers=16,
    shuffle=True,
)

# Validation split: same settings, no shuffling.
val_dataset = DocumentSentimentDataset(val_dataset_path, tokenizer, lowercase=True)
val_loader = DocumentSentimentDataLoader(
    dataset=val_dataset,
    max_seq_len=512,
    batch_size=32,
    num_workers=16,
    shuffle=False,
)

# Test split: same settings, no shuffling.
test_dataset = DocumentSentimentDataset(test_dataset_path, tokenizer, lowercase=True)
test_loader = DocumentSentimentDataLoader(
    dataset=test_dataset,
    max_seq_len=512,
    batch_size=32,
    num_workers=16,
    shuffle=False,
)



In [None]:
# Peek at the first training example. In the recorded output it is a tuple of
# (array of subword ids, label index, lower-cased text).
print(train_dataset[0])

(array([    2,  6540,    92,  2970,   213,  4259,  3553,   899,    34,
         259,  5590,   262,  2558,   386,   899,  1687,    26,  1574,
       30470,   899,  3310, 30468, 22130, 30360,  6123,  6368, 30468,
       22130, 30360,  2652,  1746, 30468,  8869,  6540,    34,  6315,
        1622,  1256,  8949,   899, 30468,  4222,  1622,   752,   245,
         295,  2083, 30470,  2346,  7107,   300, 30470,   405,   724,
        5189, 30470,   843, 17464,   899,   540, 10989,  3331,  1107,
       30468,   119,  3221,    79,    34,  2170,    98,  9167, 30457,
           3]), array(0), 'warung ini dimiliki oleh pengusaha pabrik tahu yang sudah puluhan tahun terkenal membuat tahu putih di bandung . tahu berkualitas , dipadu keahlian memasak , dipadu kretivitas , jadilah warung yang menyajikan menu utama berbahan tahu , ditambah menu umum lain seperti ayam . semuanya selera indonesia . harga cukup terjangkau . jangan lewatkan tahu bletoka nya , tidak kalah dengan yang asli dari tegal !')


In [None]:
# Label <-> index lookup tables published by the dataset class.
l2i = DocumentSentimentDataset.LABEL2INDEX
i2l = DocumentSentimentDataset.INDEX2LABEL
print(l2i, i2l, sep='\n')

{'positive': 0, 'neutral': 1, 'negative': 2}
{0: 'positive', 1: 'neutral', 2: 'negative'}


## **Testing the pretrained model before training**
This gives a baseline for comparing the model's performance before and after the training process.

In [None]:
# Classify one sentence with the not-yet-fine-tuned model and report the
# predicted label together with its softmax probability.
text = "Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita"
subwords = tokenizer.encode(text)
# Reshape the ids into a single-row batch on the device the model lives on.
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
nl = '\n'  # kept as a module-level name in case other cells reference it

# Inference only: no autograd graph is needed for this forward pass.
with torch.no_grad():
    logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f"Text: {text}{nl}Label: {i2l[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)")

Text: Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita
Label: positive (49.320%)


# **Model Training**

In [None]:
# Move the model to the GPU first and only then build the optimizer from its
# parameters; this is the order recommended by the torch.optim documentation.
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=3e-6)

In [None]:
# Train.
# Each epoch makes one full pass over train_loader with parameter updates,
# then one pass over val_loader without updates. Loss and metrics are printed
# after each phase.
n_epochs = 5
for epoch in range(n_epochs):
    # The validation phase at the end of the previous epoch switched the model
    # to eval mode and disabled gradient tracking; switch both back on.
    model.train()
    torch.set_grad_enabled(True)
 
    total_train_loss = 0
    # Predictions and gold labels accumulated over the whole epoch.
    list_hyp, list_label = [], []
 
    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model.
        # The last element of each batch is dropped before calling the helper,
        # which returns the loss plus the batch's predictions and labels.
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2l, device='cuda')
 
        # Update model.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
 
        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss
 
        # Calculate metrics.
        list_hyp += batch_hyp
        list_label += batch_label
 
        # Progress bar shows the running mean of the per-batch losses so far.
        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))
 
    # Calculate train metric.
    # `i` still holds the index of the last batch, so i+1 is the batch count.
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))
 
    # Evaluate on validation.
    # NOTE: gradient tracking is disabled globally here and is still disabled
    # once the final epoch has finished.
    model.eval()
    torch.set_grad_enabled(False)
    
    # total_correct and total_labels are initialised but not used in this cell.
    total_loss, total_correct, total_labels = 0, 0, 0
    list_hyp, list_label = [], []
 
    pbar = tqdm(val_loader, leave=True, total=len(val_loader))
    for i, batch_data in enumerate(pbar):
        # Last element of the batch; not used further in this loop.
        batch_seq = batch_data[-1]        
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2l, device='cuda')
        
        # Calculate total loss.
        valid_loss = loss.item()
        total_loss = total_loss + valid_loss
 
        # Calculate evaluation metrics.
        # Metrics are recomputed over everything accumulated so far, so the
        # progress bar shows running validation metrics.
        list_hyp += batch_hyp
        list_label += batch_label
        metrics = document_sentiment_metrics_fn(list_hyp, list_label)
 
        pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))
        
    # Final validation metrics for the epoch, over the full validation set.
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/(i+1), metrics_to_string(metrics)))

(Epoch 1) TRAIN LOSS:0.3324 LR:0.00000300: 100%|██████████| 344/344 [02:27<00:00,  2.34it/s]


(Epoch 1) TRAIN LOSS:0.3324 ACC:0.88 F1:0.83 REC:0.81 PRE:0.87 LR:0.00000300


VALID LOSS:0.1996 ACC:0.93 F1:0.90 REC:0.89 PRE:0.91: 100%|██████████| 40/40 [00:07<00:00,  5.02it/s]


(Epoch 1) VALID LOSS:0.1996 ACC:0.93 F1:0.90 REC:0.89 PRE:0.91


(Epoch 2) TRAIN LOSS:0.1602 LR:0.00000300: 100%|██████████| 344/344 [02:33<00:00,  2.24it/s]


(Epoch 2) TRAIN LOSS:0.1602 ACC:0.95 F1:0.93 REC:0.92 PRE:0.93 LR:0.00000300


VALID LOSS:0.1787 ACC:0.94 F1:0.91 REC:0.90 PRE:0.91: 100%|██████████| 40/40 [00:08<00:00,  4.59it/s]


(Epoch 2) VALID LOSS:0.1787 ACC:0.94 F1:0.91 REC:0.90 PRE:0.91


(Epoch 3) TRAIN LOSS:0.1170 LR:0.00000300: 100%|██████████| 344/344 [02:34<00:00,  2.23it/s]


(Epoch 3) TRAIN LOSS:0.1170 ACC:0.96 F1:0.95 REC:0.95 PRE:0.96 LR:0.00000300


VALID LOSS:0.1798 ACC:0.94 F1:0.91 REC:0.91 PRE:0.91: 100%|██████████| 40/40 [00:08<00:00,  4.88it/s]


(Epoch 3) VALID LOSS:0.1798 ACC:0.94 F1:0.91 REC:0.91 PRE:0.91


(Epoch 4) TRAIN LOSS:0.0909 LR:0.00000300: 100%|██████████| 344/344 [02:34<00:00,  2.23it/s]


(Epoch 4) TRAIN LOSS:0.0909 ACC:0.97 F1:0.96 REC:0.96 PRE:0.97 LR:0.00000300


VALID LOSS:0.1837 ACC:0.93 F1:0.91 REC:0.90 PRE:0.91: 100%|██████████| 40/40 [00:08<00:00,  4.53it/s]


(Epoch 4) VALID LOSS:0.1837 ACC:0.93 F1:0.91 REC:0.90 PRE:0.91


(Epoch 5) TRAIN LOSS:0.0672 LR:0.00000300: 100%|██████████| 344/344 [02:34<00:00,  2.23it/s]


(Epoch 5) TRAIN LOSS:0.0672 ACC:0.98 F1:0.97 REC:0.97 PRE:0.98 LR:0.00000300


VALID LOSS:0.1986 ACC:0.94 F1:0.91 REC:0.90 PRE:0.92: 100%|██████████| 40/40 [00:07<00:00,  5.25it/s]

(Epoch 5) VALID LOSS:0.1986 ACC:0.94 F1:0.91 REC:0.90 PRE:0.92





# **Model Evaluation and Testing**

In [None]:
# Evaluate on test.
# Inference setup: eval mode on the model, gradient tracking off globally.
model.eval()
torch.set_grad_enabled(False)

total_loss = total_correct = total_labels = 0
list_hyp = []
list_label = []

pbar = tqdm(test_loader, leave=True, total=len(test_loader))
for i, batch_data in enumerate(pbar):
    # Only the predictions are kept; the other two values returned by the
    # helper are discarded.
    _, batch_hyp, _ = forward_sequence_classification(
        model, batch_data[:-1], i2w=i2l, device='cuda'
    )
    list_hyp.extend(batch_hyp)

# Save prediction.
# reset_index() turns the row number into an explicit `index` column, which
# is why the file is then written with index=False.
df = pd.DataFrame({'label': list_hyp})
df = df.reset_index()
df.to_csv('pred.txt', index=False)

print(df)

100%|██████████| 16/16 [00:02<00:00,  5.62it/s]

     index     label
0        0  negative
1        1  negative
2        2  negative
3        3  negative
4        4  negative
..     ...       ...
495    495   neutral
496    496   neutral
497    497  positive
498    498  positive
499    499  positive

[500 rows x 2 columns]





From the following cell, we can conclude that the performance of the model increases significantly after the training process. This can be seen from the difference between the class probability of the prediction for the same input text **before training (49.32%)** and **after training (99.715%)**.

In [None]:
# Same sentence as in the pre-training check, now classified by the
# fine-tuned model.
text = 'Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita'
subwords = tokenizer.encode(text)
# Reshape the ids into a single-row batch on the device the model lives on.
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
nl = '\n'  # kept as a module-level name in case other cells reference it

# Explicit no_grad so this cell does not depend on the global gradient switch
# having been turned off by an earlier cell.
with torch.no_grad():
    logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text}{nl}Label : {i2l[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita
Label : positive (99.715%)


In [None]:
# Classify a sentence with the fine-tuned model and report the predicted
# label together with its softmax probability.
text = 'Sayang, aku marah'
subwords = tokenizer.encode(text)
# Reshape the ids into a single-row batch on the device the model lives on.
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: no autograd graph is needed for this forward pass.
with torch.no_grad():
    logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

# The newline is written directly in the f-string so this cell no longer
# depends on a helper variable defined in a different cell.
print(f'Text: {text}\nLabel : {i2l[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Sayang, aku marah
Label : negative (99.580%)


In [None]:
# Classify a sentence with the fine-tuned model and report the predicted
# label together with its softmax probability.
text = 'Merasa kagum dengan toko ini tapi berubah menjadi kecewa setelah transaksi'
subwords = tokenizer.encode(text)
# Reshape the ids into a single-row batch on the device the model lives on.
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: no autograd graph is needed for this forward pass.
with torch.no_grad():
    logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

# The newline is written directly in the f-string so this cell no longer
# depends on a helper variable defined in a different cell.
print(f'Text: {text}\nLabel : {i2l[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Merasa kagum dengan toko ini tapi berubah menjadi kecewa setelah transaksi
Label : negative (99.642%)


In [None]:
# Classify a sentence with the fine-tuned model and report the predicted
# label together with its softmax probability.
text = 'Ronaldo pergi ke Mall Grand Indonesia membeli cilok'
subwords = tokenizer.encode(text)
# Reshape the ids into a single-row batch on the device the model lives on.
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: no autograd graph is needed for this forward pass.
with torch.no_grad():
    logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

# The newline is written directly in the f-string so this cell no longer
# depends on a helper variable defined in a different cell.
print(f'Text: {text}\nLabel : {i2l[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Ronaldo pergi ke Mall Grand Indonesia membeli cilok
Label : neutral (99.427%)
