<a href="https://colab.research.google.com/github/afifrizkyandika11551100310/IndoBERT_FineTUNING_SENTIMENT_ANALYSIS/blob/main/23521034_M_Afif_Rizky_A_BitHealth_Sentimen_Analysis_Model_IndoNLU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m70.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m105.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1


In [None]:
! git clone https://github.com/IndoNLP/indonlu

Cloning into 'indonlu'...
remote: Enumerating objects: 500, done.[K
remote: Counting objects: 100% (184/184), done.[K
remote: Compressing objects: 100% (73/73), done.[K
remote: Total 500 (delta 115), reused 142 (delta 111), pack-reused 316[K
Receiving objects: 100% (500/500), 9.45 MiB | 24.63 MiB/s, done.
Resolving deltas: 100% (235/235), done.


In [None]:
import pandas as pd
import numpy as np
import torch
from torch import optim
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from nltk.tokenize import TweetTokenizer, word_tokenize

from indonlu.utils.forward_fn import forward_sequence_classification
from indonlu.utils.metrics import document_sentiment_metrics_fn
from indonlu.utils.data_utils import DocumentSentimentDataLoader, DocumentSentimentDataset

In [None]:
# helper function 

def set_seed(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch
    (CPU generator plus all CUDA devices).

    Args:
        seed: Integer seed shared by all generators.
    """
    # Imported here because the notebook's import cell does not import
    # `random`; without this the first line below raises NameError.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    
def count_param(module, trainable=False):
    """Count the scalar parameters of a module.

    Args:
        module: Object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).
        trainable: When true, only parameters with ``requires_grad`` set
            are counted; otherwise every parameter is counted.

    Returns:
        Total number of elements across the selected parameters.
    """
    selected = module.parameters()
    if trainable:
        selected = (param for param in selected if param.requires_grad)
    return sum(param.numel() for param in selected)
    
def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Args:
        optimizer: Object exposing ``param_groups``, an iterable of dicts
            that each carry an ``'lr'`` entry.

    Returns:
        The ``'lr'`` value of the first group, or ``None`` when there are
        no parameter groups.
    """
    first_group = next(iter(optimizer.param_groups), None)
    if first_group is None:
        return None
    return first_group['lr']

def metric_to_string(metric_dict):
    """Render a metrics dict as a single ``NAME:value`` line.

    Args:
        metric_dict: Mapping of metric name to a numeric value.

    Returns:
        Space-separated ``name:value`` pairs in the dict's iteration order,
        each value formatted with two decimals; an empty string for an
        empty dict.
    """
    return ' '.join(
        '{}:{:.2f}'.format(name, score) for name, score in metric_dict.items()
    )

## **Load Model: IndoBERT**

In [None]:
# Tokenizer, config and classifier are all loaded from the same pretrained checkpoint.
pretrained_name = 'indobenchmark/indobert-base-p1'

tokenizer = BertTokenizer.from_pretrained(pretrained_name)

# Size the classification head to the number of sentiment labels in the dataset class.
config = BertConfig.from_pretrained(pretrained_name)
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# The classifier layer is not part of the checkpoint (see the warning printed
# by this cell), so the model must be fine-tuned before it is used for prediction.
model = BertForSequenceClassification.from_pretrained(pretrained_name, config = config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Sanity-check the tokenizer on a single Indonesian sentence.
text = 'saya tidak suka makanan ini'

encoding = tokenizer.encode(text)       # token ids, including the [CLS]/[SEP] markers
decoding = tokenizer.decode(encoding)   # ids converted back to text
encoding_input = tokenizer(text)        # input_ids, token_type_ids and attention_mask

for shown in (encoding, decoding, encoding_input):
    print(shown)

[2, 209, 119, 1506, 955, 92, 3]
[CLS] saya tidak suka makanan ini [SEP]
{'input_ids': [2, 209, 119, 1506, 955, 92, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


## **Prepare Dataset**

In [None]:
# The three SmSA splits live in the cloned IndoNLU repository.
smsa_dir = '/content/indonlu/dataset/smsa_doc-sentiment-prosa'
train_dataset_path = f'{smsa_dir}/train_preprocess.tsv'
valid_dataset_path = f'{smsa_dir}/valid_preprocess.tsv'
test_dataset_path = f'{smsa_dir}/test_preprocess.tsv'

# Load every split with the same tokenizer and options.
train_dataset, valid_dataset, test_dataset = (
    DocumentSentimentDataset(split_path, tokenizer, lowercase = True)
    for split_path in (train_dataset_path, valid_dataset_path, test_dataset_path)
)

# Report (rows, columns) for each split.
for split_dataset in (train_dataset, valid_dataset, test_dataset):
    print(split_dataset.data.shape)

(11000, 2)
(1260, 2)
(500, 2)


In [None]:
# Load the training data that was provided for this task from Google Drive.
# NOTE(review): needs Drive mounted at /content/drive; no mount cell is visible in this notebook.
df_training_diberikan_path = '/content/drive/MyDrive/BitHealth/data_training.tsv.txt'
# Keyword spelled `lowercase`, exactly as in the other DocumentSentimentDataset calls.
train_dataset_baru = DocumentSentimentDataset(df_training_diberikan_path, tokenizer, lowercase = True)
# NOTE(review): train_dataset_baru is not referenced by the loaders or the training
# loop visible here — confirm whether it was meant to replace train_dataset.
print(train_dataset_baru.data.shape)

(11000, 2)


In [None]:
# Preview the first five rows of the training frame (text plus integer sentiment id).
train_dataset.data.head()

Unnamed: 0,text,sentiment
0,warung ini dimiliki oleh pengusaha pabrik tahu...,0
1,mohon ulama lurus dan k212 mmbri hujjah partai...,1
2,lokasi strategis di jalan sumatera bandung . t...,0
3,betapa bahagia nya diri ini saat unboxing pake...,0
4,duh . jadi mahasiswa jangan sombong dong . kas...,2


In [None]:
# Show the label-name -> integer-id mapping used by the dataset class.
train_dataset.LABEL2INDEX

{'positive': 0, 'neutral': 1, 'negative': 2}

## **Data Loader**

In [None]:
# All three loaders share the same batching settings; only the training loader shuffles.
loader_kwargs = dict(max_seq_len=512, batch_size=16, num_workers=16)

train_loader = DocumentSentimentDataLoader(dataset=train_dataset, shuffle=True, **loader_kwargs)
valid_loader = DocumentSentimentDataLoader(dataset=valid_dataset, shuffle=False, **loader_kwargs)
test_loader = DocumentSentimentDataLoader(dataset=test_dataset, shuffle=False, **loader_kwargs)



In [None]:
# Number of batches per split, in the order (train, valid, test).
len(train_loader), len(valid_loader), len(test_loader)

(688, 79, 32)

In [None]:
# Label lookups taken from the dataset class: w2i maps name -> id, i2w maps id -> name.
w2i = DocumentSentimentDataset.LABEL2INDEX
i2w = DocumentSentimentDataset.INDEX2LABEL
for mapping in (w2i, i2w):
    print(mapping)

{'positive': 0, 'neutral': 1, 'negative': 2}
{0: 'positive', 1: 'neutral', 2: 'negative'}


## Fine-Tuning Parameters

In [None]:
# Move the model to the GPU before building the optimizer: the torch.optim
# documentation asks for this order so that the optimizer is constructed over
# the parameters in their final (on-device) location.
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr = 5e-6)

In [None]:
# Fine-tune the classifier: each epoch runs one optimisation pass over
# train_loader followed by a gradient-free pass over valid_loader.
# NOTE(review): no RNG seed is set in the cells visible here, so the shuffled
# batch order differs between runs — consider seeding before this cell.
n_epochs = 5
for epoch in range(n_epochs):
    # ---- Training pass ----
    model.train()
    # Switch autograd on explicitly: the prediction cells further down disable
    # it globally, so a re-run of this cell would otherwise start with it off.
    torch.set_grad_enabled(True)

    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for step, batch_data in enumerate(train_pbar, start=1):
        # Forward pass; the last element of the batch is not passed to the helper.
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

        # Keep predictions and gold labels for the epoch-level metrics.
        list_hyp += batch_hyp
        list_label += batch_label

        # Running mean of the loss over the batches seen so far.
        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/step, get_lr(optimizer)))

    # Epoch-level training metrics. The mean uses the loader length instead of
    # a loop index leaked from the for-loop; max(..., 1) guards an empty loader.
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/max(len(train_loader), 1), metric_to_string(metrics), get_lr(optimizer)))

    # ---- Validation pass ----
    model.eval()

    total_loss = 0
    list_hyp, list_label = [], []

    # Gradients are disabled only for the duration of the validation pass.
    with torch.no_grad():
        pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
        for step, batch_data in enumerate(pbar, start=1):
            loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

            total_loss += loss.item()

            # Metrics are recomputed over everything seen so far for the progress bar.
            list_hyp += batch_hyp
            list_label += batch_label
            metrics = document_sentiment_metrics_fn(list_hyp, list_label)

            pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/step, metric_to_string(metrics)))

    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/max(len(valid_loader), 1), metric_to_string(metrics)))

(Epoch 1) TRAIN LOSS:0.2615 LR:0.00000500: 100%|██████████| 688/688 [02:49<00:00,  4.06it/s]


(Epoch 1) TRAIN LOSS:0.2615 ACC:0.90 F1:0.86 REC:0.85 PRE:0.88 LR:0.00000500


VALID LOSS:0.1773 ACC:0.94 F1:0.91 REC:0.91 PRE:0.92: 100%|██████████| 79/79 [00:09<00:00,  8.63it/s]


(Epoch 1) VALID LOSS:0.1773 ACC:0.94 F1:0.91 REC:0.91 PRE:0.92


(Epoch 2) TRAIN LOSS:0.1233 LR:0.00000500: 100%|██████████| 688/688 [02:54<00:00,  3.95it/s]


(Epoch 2) TRAIN LOSS:0.1233 ACC:0.96 F1:0.95 REC:0.94 PRE:0.95 LR:0.00000500


VALID LOSS:0.1711 ACC:0.94 F1:0.91 REC:0.91 PRE:0.92: 100%|██████████| 79/79 [00:08<00:00,  9.29it/s]


(Epoch 2) VALID LOSS:0.1711 ACC:0.94 F1:0.91 REC:0.91 PRE:0.92


(Epoch 3) TRAIN LOSS:0.0790 LR:0.00000500: 100%|██████████| 688/688 [02:50<00:00,  4.03it/s]


(Epoch 3) TRAIN LOSS:0.0790 ACC:0.97 F1:0.97 REC:0.96 PRE:0.97 LR:0.00000500


VALID LOSS:0.1849 ACC:0.94 F1:0.92 REC:0.91 PRE:0.92: 100%|██████████| 79/79 [00:08<00:00,  9.42it/s]


(Epoch 3) VALID LOSS:0.1849 ACC:0.94 F1:0.92 REC:0.91 PRE:0.92


(Epoch 4) TRAIN LOSS:0.0504 LR:0.00000500: 100%|██████████| 688/688 [02:50<00:00,  4.04it/s]


(Epoch 4) TRAIN LOSS:0.0504 ACC:0.99 F1:0.98 REC:0.98 PRE:0.98 LR:0.00000500


VALID LOSS:0.1898 ACC:0.94 F1:0.92 REC:0.91 PRE:0.92: 100%|██████████| 79/79 [00:08<00:00,  9.36it/s]


(Epoch 4) VALID LOSS:0.1898 ACC:0.94 F1:0.92 REC:0.91 PRE:0.92


(Epoch 5) TRAIN LOSS:0.0300 LR:0.00000500: 100%|██████████| 688/688 [02:49<00:00,  4.05it/s]


(Epoch 5) TRAIN LOSS:0.0300 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000500


VALID LOSS:0.2327 ACC:0.94 F1:0.91 REC:0.91 PRE:0.91: 100%|██████████| 79/79 [00:08<00:00,  9.39it/s]

(Epoch 5) VALID LOSS:0.2327 ACC:0.94 F1:0.91 REC:0.91 PRE:0.91





In [None]:
# Run the fine-tuned model over the SmSA test split and collect its predicted labels.
model.eval()
torch.set_grad_enabled(False)  # inference only; stays disabled after this cell

list_hyp, list_label = [], []

pbar = tqdm(test_loader, leave = True, total = len(test_loader))
for i, batch_data in enumerate(pbar):
    # Only the predictions (second return value) are kept; loss and labels are discarded.
    _, batch_hyp, _ = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
    list_hyp.extend(batch_hyp)

# Tabulate the predictions; reset_index() adds the row position as an 'index' column.
df = pd.DataFrame({'label' : list_hyp}).reset_index()

100%|██████████| 32/32 [00:03<00:00, 10.45it/s]


In [None]:
# Show the last five rows of the prediction table built in the cell above.
df.tail()

Unnamed: 0,index,label
495,495,neutral
496,496,neutral
497,497,neutral
498,498,neutral
499,499,positive


## Prediction on new dataset

In [None]:
# Read the new, unlabeled test data from Google Drive.
from datasets import load_dataset  # NOTE(review): not used in the cells visible here — confirm before removing

test_df_path = '/content/drive/MyDrive/BitHealth/data_testing_preprocess.csv'
# Every row receives the same 'negative' label; presumably a placeholder so the
# saved file has the text/label layout the dataset class reads — TODO confirm.
test_df = pd.read_csv(test_df_path).assign(label = 'negative')

In [None]:
# True when at least one cell of test_df is missing.
test_df.isnull().values.any()

True

In [None]:
# Count of missing values in each column of test_df.
test_df.isnull().sum()

text     69
label     0
dtype: int64

In [None]:
# Keep only the rows without any missing value ("bersih" = clean).
df_test_bersih = test_df.dropna()

# Save the cleaned rows as a tab-separated file without header or index column.
df_test_bersih.to_csv(
    '/content/drive/MyDrive/BitHealth/data_testing_new_preprocessing.tsv',
    sep = '\t',
    header = False,
    index = False,
)

In [None]:
# Per-column count of missing values in the cleaned frame (expected to be all zeros after dropna()).
df_test_bersih.isnull().sum()

text     0
label    0
dtype: int64

In [None]:
# Path of the cleaned TSV written by the cleaning cell above.
new_test_df_path = '/content/drive/MyDrive/BitHealth/data_testing_new_preprocessing.tsv'

In [None]:
# Wrap the cleaned TSV in the same dataset class and tokenizer used for the SmSA splits.
new_test = DocumentSentimentDataset(new_test_df_path, tokenizer, lowercase = True)

In [None]:
# (rows, columns) of the frame loaded from the cleaned TSV.
print(new_test.data.shape)

(12980, 2)


In [None]:
# Same batching settings as the SmSA loaders; shuffle=False keeps the batches in dataset order.
new_test_loader = DocumentSentimentDataLoader(dataset = new_test, max_seq_len=512, batch_size = 16, num_workers = 16, shuffle = False)



In [None]:
# Number of batches the new test loader will yield.
len(new_test_loader)

812

In [None]:
# Predict sentiment labels for the new dataset with the fine-tuned model.
model.eval()
torch.set_grad_enabled(False)  # inference only; stays disabled after this cell

list_hyp, list_label = [], []

pbar = tqdm(new_test_loader, leave = True, total = len(new_test_loader))
for i, batch_data in enumerate(pbar):
    # Only the predictions (second return value) are kept; loss and labels are discarded.
    _, batch_hyp, _ = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
    list_hyp.extend(batch_hyp)

# Tabulate the predictions; reset_index() adds the row position as an 'index' column.
df = pd.DataFrame({'label' : list_hyp}).reset_index()

100%|██████████| 812/812 [00:48<00:00, 16.84it/s]


In [None]:
# Show the last five rows of the prediction table for the new dataset.
df.tail()

Unnamed: 0,index,label
12975,12975,negative
12976,12976,positive
12977,12977,negative
12978,12978,neutral
12979,12979,neutral


In [None]:
# Save only the fine-tuned weights (the state dict), not the model object itself.
# NOTE(review): the target folder on Drive is assumed to exist already — confirm.
PATH = '/content/drive/MyDrive/BitHealth/Model/sentiment_analysis_indoNLU.pt'
weights = model.state_dict()
torch.save(weights, PATH)