# Import Datasets & Libraries

In [1]:
# Fetch the project repository (presumably the source of the `utils` helpers imported below).
# NOTE(review): the clone lands in 'Emotion-Predictor-App', while the Colab imports use the
# package name `emotion_predictor_app` — confirm the directory resolves as expected.
!git clone https://github.com/azizp128/Emotion-Predictor-App.git

Cloning into 'Emotion-Predictor-App'...


In [1]:
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable


In [None]:
# COLAB
import os, sys

# Add the parent directory to the import path and make it the working directory.
# NOTE(review): both calls use a relative '../', so re-running this cell moves one
# more level up each time — run it once per kernel session.
sys.path.append('../')
os.chdir('../')

import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from emotion_predictor_app.utils.forward_fn import forward_sequence_classification
from emotion_predictor_app.utils.metrics import document_sentiment_metrics_fn
from emotion_predictor_app.utils.data_utils import EmotionDetectionDataset, EmotionDetectionDataLoader

# LOCAL 
# from pathlib import Path
# sys.path.append(str(Path.cwd().parent))  # Move one level up
# from utils.forward_fn import forward_sequence_classification
# from utils.metrics import document_sentiment_metrics_fn
# from utils.data_utils import EmotionDetectionDataset, EmotionDetectionDataLoader

In [8]:
###
# common functions
###
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (CPU and CUDA) with `seed`."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    
def count_param(module, trainable=False):
    """Return the number of scalar parameters in `module`.

    With `trainable` set, only parameters whose `requires_grad` is true are
    counted; otherwise every parameter is counted.
    """
    return sum(
        param.numel()
        for param in module.parameters()
        if param.requires_grad or not trainable
    )
    
def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Returns None when the optimizer has no parameter groups.
    """
    first_group = next(iter(optimizer.param_groups), None)
    if first_group is None:
        return None
    return first_group['lr']

def metrics_to_string(metric_dict):
    """Render a metric mapping as space-separated `name:value` pairs.

    Values are formatted with two decimal places, in the mapping's
    iteration order.
    """
    return ' '.join(
        '{}:{:.2f}'.format(name, score) for name, score in metric_dict.items()
    )

In [9]:
# Set random seed
# Seeds the Python, NumPy and PyTorch generators through set_seed before the
# model and the data loaders are created.
set_seed(26092020)

# Load Model

In [None]:
# Tokenizer, config and weights are all loaded from the same checkpoint name.
checkpoint_name = 'indobenchmark/indobert-base-p1'

tokenizer = BertTokenizer.from_pretrained(checkpoint_name)

# Set the number of output labels to the dataset's label count.
config = BertConfig.from_pretrained(checkpoint_name)
config.num_labels = EmotionDetectionDataset.NUM_LABELS

# Instantiate the sequence-classification model with that config.
model = BertForSequenceClassification.from_pretrained(checkpoint_name, config=config)

In [None]:
# Display the model's module tree.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
# Total number of parameters (trainable=False counts every parameter).
count_param(model)

124445958

# Prepare Dataset

In [None]:
# Paths of the train / validation / test CSV files.
# Replace every '...' placeholder before running the cells below; the same paths
# are read with pd.read_csv and passed to EmotionDetectionDataset.
train_dataset_path = '...' # fill with your dataset path
valid_dataset_path = '...'
test_dataset_path = '...'

## Check dataset info

In [None]:
# Read the three splits as plain DataFrames for inspection.
train_dataset, valid_dataset, test_dataset = (
    pd.read_csv(train_dataset_path),
    pd.read_csv(valid_dataset_path),
    pd.read_csv(test_dataset_path),
)

In [None]:
# Preview the first rows of the training split.
train_dataset.head()

Unnamed: 0,label,tweet
0,fear,mestinya hrs diselepetan spt itu karena sdh g...
1,happy,Antara rahsia untuk senang berjaya dan tenan...
2,sadness,Aku pamit duluan ya Denger kata ini tuh kadan...
3,fear,askrl Udah jauh banget ngobrol sama kating di...
4,happy,Senang mendengar nya kawan


In [None]:
# Column dtypes and non-null counts of the training split.
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21000 entries, 0 to 20999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   21000 non-null  object
 1   tweet   21000 non-null  object
dtypes: object(2)
memory usage: 328.2+ KB


In [None]:
# Preview the first rows of the validation split.
valid_dataset.head()

Unnamed: 0,label,tweet
0,anger,Kalo marah marah pelariannya ke twitter kalo s...
1,fear,Halo hooman Aku mama masih mencari keluarga ...
2,love,Cinta itu lebih mementingkan perasaan pasangan...
3,happy,Bismillah hari pertama kerja salah satu peru...
4,love,It s Okay That s Love Cinta bukan tentang memb...


In [None]:
# Column dtypes and non-null counts of the validation split.
valid_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   9000 non-null   object
 1   tweet   9000 non-null   object
dtypes: object(2)
memory usage: 140.8+ KB


In [None]:
# Preview the first rows of the test split.
# NOTE(review): in the recorded output these tweets are the same as the first
# training rows, all labelled 'happy' — confirm the test file is a distinct split
# and whether its labels are placeholders.
test_dataset.head()

Unnamed: 0,label,tweet
0,happy,mestinya hrs diselepetan spt itu karena sdh g...
1,happy,Antara rahsia untuk senang berjaya dan tenan...
2,happy,Aku pamit duluan ya Denger kata ini tuh kadan...
3,happy,askrl Udah jauh banget ngobrol sama kating di...
4,happy,Senang mendengar nya kawan


In [None]:
# Column dtypes and non-null counts of the test split.
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5000 non-null   object
 1   tweet   5000 non-null   object
dtypes: object(2)
memory usage: 78.2+ KB


## Load dataset

In [None]:
# Build one dataset object per split; all share the tokenizer and lowercase=True.
# Note: these names previously held the raw DataFrames and are rebound here.
train_dataset, valid_dataset, test_dataset = (
    EmotionDetectionDataset(split_path, tokenizer, lowercase=True)
    for split_path in (train_dataset_path, valid_dataset_path, test_dataset_path)
)

# Settings common to the three loaders; only the training loader shuffles.
loader_options = dict(max_seq_len=512, batch_size=8, num_workers=16)
train_loader = EmotionDetectionDataLoader(dataset=train_dataset, shuffle=True, **loader_options)
valid_loader = EmotionDetectionDataLoader(dataset=valid_dataset, shuffle=False, **loader_options)
test_loader = EmotionDetectionDataLoader(dataset=test_dataset, shuffle=False, **loader_options)



In [None]:
# Label name -> index and index -> label name lookups taken from the dataset class.
w2i = EmotionDetectionDataset.LABEL2INDEX
i2w = EmotionDetectionDataset.INDEX2LABEL
for label_map in (w2i, i2w):
    print(label_map)

{'sadness': 0, 'anger': 1, 'love': 2, 'fear': 3, 'happy': 4, 'disgust': 5}
{0: 'sadness', 1: 'anger', 2: 'love', 3: 'fear', 4: 'happy', 5: 'disgust'}


# Test model on sample sentences

In [None]:
text = 'Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita'

# Encode the sentence and shape the ids as a batch of one on the model's device.
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: put the model in eval mode (dropout off) and skip autograd tracking.
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita | Label : disgust (32.396%)


In [None]:
text = 'Budi pergi ke pondok indah mall membeli cakwe'

# Encode the sentence and shape the ids as a batch of one on the model's device.
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: put the model in eval mode (dropout off) and skip autograd tracking.
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Budi pergi ke pondok indah mall membeli cakwe | Label : disgust (31.867%)


In [None]:
text = 'Dasar anak sialan!! Kurang ajar!!'

# Encode the sentence and shape the ids as a batch of one on the model's device.
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: put the model in eval mode (dropout off) and skip autograd tracking.
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Dasar anak sialan!! Kurang ajar!! | Label : disgust (28.371%)


# Fine Tuning & Evaluation

In [None]:
# Move the model to the GPU first, then build the optimizer over its parameters.
# The torch.optim documentation asks for this order when a model is moved with .cuda().
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=5e-6)

In [None]:
# Train
# Fine-tune for a fixed number of epochs, running a full validation pass after each one.
# NOTE(review): in the recorded run the validation loss is lowest at epoch 2 and rises
# afterwards while validation accuracy stays flat — consider keeping the best checkpoint.
n_epochs = 10
for epoch in range(n_epochs):
    # ---- Training pass ----
    model.train()

    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    # Scoped instead of a global torch.set_grad_enabled(True): gradients are on for
    # these batches even if an earlier cell disabled them, and no state leaks out.
    with torch.enable_grad():
        for i, batch_data in enumerate(train_pbar):
            # Forward model (the last element of the batch is not passed to the helper)
            loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

            # Update model
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

            # Collect hypotheses and labels for the epoch-level metrics
            list_hyp += batch_hyp
            list_label += batch_label

            # Running mean of the loss over the batches seen so far
            train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
                total_train_loss/(i+1), get_lr(optimizer)))

    # Calculate train metric
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

    # ---- Validation pass ----
    model.eval()

    total_loss = 0
    list_hyp, list_label = [], []

    pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
    # No gradients are needed for evaluation; the context restores the previous mode on exit.
    with torch.no_grad():
        for i, batch_data in enumerate(pbar):
            loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

            # Calculate total loss
            total_loss += loss.item()

            # Calculate evaluation metrics over everything seen so far (shown on the progress bar)
            list_hyp += batch_hyp
            list_label += batch_label
            metrics = document_sentiment_metrics_fn(list_hyp, list_label)

            pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))

    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/(i+1), metrics_to_string(metrics)))

(Epoch 1) TRAIN LOSS:0.1798 LR:0.00000500: 100%|██████████| 2625/2625 [04:45<00:00,  9.19it/s]


(Epoch 1) TRAIN LOSS:0.1798 ACC:0.95 F1:0.95 REC:0.95 PRE:0.95 LR:0.00000500


VALID LOSS:0.0784 ACC:0.97 F1:0.97 REC:0.97 PRE:0.97: 100%|██████████| 1125/1125 [01:38<00:00, 11.45it/s]


(Epoch 1) VALID LOSS:0.0784 ACC:0.97 F1:0.97 REC:0.97 PRE:0.97


(Epoch 2) TRAIN LOSS:0.0495 LR:0.00000500: 100%|██████████| 2625/2625 [04:41<00:00,  9.32it/s]


(Epoch 2) TRAIN LOSS:0.0495 ACC:0.98 F1:0.98 REC:0.98 PRE:0.98 LR:0.00000500


VALID LOSS:0.0678 ACC:0.98 F1:0.98 REC:0.98 PRE:0.98: 100%|██████████| 1125/1125 [01:37<00:00, 11.58it/s]


(Epoch 2) VALID LOSS:0.0678 ACC:0.98 F1:0.98 REC:0.98 PRE:0.98


(Epoch 3) TRAIN LOSS:0.0295 LR:0.00000500: 100%|██████████| 2625/2625 [04:41<00:00,  9.33it/s]


(Epoch 3) TRAIN LOSS:0.0295 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000500


VALID LOSS:0.0685 ACC:0.98 F1:0.98 REC:0.98 PRE:0.98: 100%|██████████| 1125/1125 [01:36<00:00, 11.63it/s]


(Epoch 3) VALID LOSS:0.0685 ACC:0.98 F1:0.98 REC:0.98 PRE:0.98


(Epoch 4) TRAIN LOSS:0.0218 LR:0.00000500: 100%|██████████| 2625/2625 [04:42<00:00,  9.30it/s]


(Epoch 4) TRAIN LOSS:0.0218 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000500


VALID LOSS:0.0822 ACC:0.98 F1:0.98 REC:0.98 PRE:0.98: 100%|██████████| 1125/1125 [01:34<00:00, 11.89it/s]


(Epoch 4) VALID LOSS:0.0822 ACC:0.98 F1:0.98 REC:0.98 PRE:0.98


(Epoch 5) TRAIN LOSS:0.0169 LR:0.00000500: 100%|██████████| 2625/2625 [04:43<00:00,  9.27it/s]


(Epoch 5) TRAIN LOSS:0.0169 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000500


VALID LOSS:0.0751 ACC:0.98 F1:0.98 REC:0.98 PRE:0.98: 100%|██████████| 1125/1125 [01:35<00:00, 11.82it/s]


(Epoch 5) VALID LOSS:0.0751 ACC:0.98 F1:0.98 REC:0.98 PRE:0.98


(Epoch 6) TRAIN LOSS:0.0135 LR:0.00000500: 100%|██████████| 2625/2625 [04:42<00:00,  9.28it/s]


(Epoch 6) TRAIN LOSS:0.0135 ACC:0.99 F1:0.99 REC:0.99 PRE:0.99 LR:0.00000500


VALID LOSS:0.0817 ACC:0.98 F1:0.98 REC:0.98 PRE:0.98: 100%|██████████| 1125/1125 [01:34<00:00, 11.84it/s]


(Epoch 6) VALID LOSS:0.0817 ACC:0.98 F1:0.98 REC:0.98 PRE:0.98


(Epoch 7) TRAIN LOSS:0.0108 LR:0.00000500: 100%|██████████| 2625/2625 [04:42<00:00,  9.29it/s]


(Epoch 7) TRAIN LOSS:0.0108 ACC:1.00 F1:1.00 REC:1.00 PRE:1.00 LR:0.00000500


VALID LOSS:0.0908 ACC:0.98 F1:0.98 REC:0.98 PRE:0.98: 100%|██████████| 1125/1125 [01:37<00:00, 11.59it/s]


(Epoch 7) VALID LOSS:0.0908 ACC:0.98 F1:0.98 REC:0.98 PRE:0.98


(Epoch 8) TRAIN LOSS:0.0104 LR:0.00000500: 100%|██████████| 2625/2625 [04:42<00:00,  9.28it/s]


(Epoch 8) TRAIN LOSS:0.0104 ACC:1.00 F1:1.00 REC:1.00 PRE:1.00 LR:0.00000500


VALID LOSS:0.0986 ACC:0.98 F1:0.98 REC:0.98 PRE:0.98: 100%|██████████| 1125/1125 [01:37<00:00, 11.54it/s]


(Epoch 8) VALID LOSS:0.0986 ACC:0.98 F1:0.98 REC:0.98 PRE:0.98


(Epoch 9) TRAIN LOSS:0.0097 LR:0.00000500: 100%|██████████| 2625/2625 [04:41<00:00,  9.31it/s]


(Epoch 9) TRAIN LOSS:0.0097 ACC:1.00 F1:1.00 REC:1.00 PRE:1.00 LR:0.00000500


VALID LOSS:0.0945 ACC:0.98 F1:0.98 REC:0.98 PRE:0.98: 100%|██████████| 1125/1125 [01:36<00:00, 11.62it/s]


(Epoch 9) VALID LOSS:0.0945 ACC:0.98 F1:0.98 REC:0.98 PRE:0.98


(Epoch 10) TRAIN LOSS:0.0088 LR:0.00000500: 100%|██████████| 2625/2625 [04:42<00:00,  9.29it/s]


(Epoch 10) TRAIN LOSS:0.0088 ACC:1.00 F1:1.00 REC:1.00 PRE:1.00 LR:0.00000500


VALID LOSS:0.0962 ACC:0.98 F1:0.98 REC:0.98 PRE:0.98: 100%|██████████| 1125/1125 [01:36<00:00, 11.63it/s]

(Epoch 10) VALID LOSS:0.0962 ACC:0.98 F1:0.98 REC:0.98 PRE:0.98





In [None]:
# Evaluate on test
model.eval()

list_hyp = []

pbar = tqdm(test_loader, leave=True, total=len(test_loader))
# Inference only: gradient tracking is disabled for the duration of the loop
# rather than switched off globally.
with torch.no_grad():
    for batch_data in pbar:
        # Only the hypotheses are kept; the other two return values are discarded.
        _, batch_hyp, _ = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
        list_hyp += batch_hyp

# Save prediction
# reset_index() adds an 'index' column, so the CSV has the columns index,label.
df = pd.DataFrame({'label':list_hyp}).reset_index()
df.to_csv('pred.txt', index=False)

print(df)

100%|██████████| 625/625 [00:15<00:00, 40.48it/s]

      index    label
0         0     fear
1         1    happy
2         2  sadness
3         3     fear
4         4    happy
...     ...      ...
4995   4995  disgust
4996   4996  disgust
4997   4997  disgust
4998   4998  sadness
4999   4999    anger

[5000 rows x 2 columns]





# Test fine-tuned model on sample sentences

In [None]:
text = 'Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita'

# Encode the sentence and shape the ids as a batch of one on the model's device.
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Set eval mode and disable autograd here instead of relying on state left by earlier cells.
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita | Label : love (99.960%)


In [None]:
text = 'Jorok sekali kamu, makan tidak cuci tangan dahulu'

# Encode the sentence and shape the ids as a batch of one on the model's device.
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Set eval mode and disable autograd here instead of relying on state left by earlier cells.
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Jorok sekali kamu, makan tidak cuci tangan dahulu | Label : disgust (99.931%)


In [None]:
text = 'Dasar anak sialan!! Kurang ajar!!'

# Encode the sentence and shape the ids as a batch of one on the model's device.
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Set eval mode and disable autograd here instead of relying on state left by earlier cells.
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Dasar anak sialan!! Kurang ajar!! | Label : anger (98.820%)


# Try your own sentence (Coba kalimat)

In [None]:
# Read a sentence typed by the user and classify it with the fine-tuned model.
User_input = input('Masukan kalimat :')
text = User_input

# Encode the sentence and shape the ids as a batch of one on the model's device.
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Set eval mode and disable autograd here instead of relying on state left by earlier cells.
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class.
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Masukan kalimat :saya habis jatuh dari tangga
Text: saya habis jatuh dari tangga | Label : sadness (99.912%)


# Import Pre-trained model using pipeline

In [None]:
# Build a high-level inference pipeline around the hosted model.
from transformers import pipeline

# Device index 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

pipe = pipeline(
    "sentiment-analysis",
    model="azizp128/prediksi-emosi-indobert",
    device=device,
)

In [18]:
# Run the pipeline on one sentence and report the first returned prediction.
result = pipe("menjijikkan sekali!")
top_prediction = result[0]
print(f"Sentiment: {top_prediction['label']}\nScore: {top_prediction['score']}")

Sentiment: JIJIK
Score: 0.9954845905303955
