In [None]:
# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
# !nvidia-smi

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

# Finetuning SmSA
SmSA is a sentiment analysis dataset with three possible labels: `positive`, `negative`, and `neutral`.

In [None]:
# pip install transformers

In [None]:
import os, sys
# Add the parent directory to the import path and make it the working directory.
# Presumably this is where the local `utils` package imported below lives — TODO confirm.
# NOTE: both paths are relative, so re-running this cell moves the working
# directory up one more level each time; run it once per kernel session.
sys.path.append('../')
os.chdir('../')

import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from nltk.tokenize import TweetTokenizer

from utils.forward_fn import forward_sequence_classification
from utils.metrics import document_sentiment_metrics_fn
from utils.data_utils import DocumentSentimentDataset, DocumentSentimentDataLoader

In [None]:
###
# common functions
###
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Args:
        seed: Integer seed handed to PyTorch (CPU and CUDA), NumPy, and
            Python's built-in ``random`` module.
    """
    # The four generators are independent, so the order of these calls is irrelevant.
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

def count_param(module, trainable=False):
    """Return the total number of scalar parameters in ``module``.

    Args:
        module: Object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).
        trainable: When true, only parameters with ``requires_grad`` set are counted.

    Returns:
        The summed ``numel()`` of the selected parameters (0 if there are none).
    """
    total = 0
    for param in module.parameters():
        # Skip frozen parameters only when the caller asked for trainable ones.
        if trainable and not param.requires_grad:
            continue
        total += param.numel()
    return total

def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Returns ``None`` when the optimizer has no parameter groups.
    """
    first_group = next(iter(optimizer.param_groups), None)
    if first_group is None:
        return None
    return first_group['lr']

def metrics_to_string(metric_dict):
    """Render a metric mapping as space-separated ``name:value`` pairs.

    Each value is formatted with two decimal places; pairs follow the
    mapping's iteration order. An empty mapping yields an empty string.
    """
    return ' '.join(
        '{}:{:.2f}'.format(name, score) for name, score in metric_dict.items()
    )

In [None]:
# Fix the seed so repeated runs draw the same random numbers
SEED = 26092020
set_seed(SEED)

# Load Model

In [None]:
# Load the tokenizer and config of the pretrained IndoBERT checkpoint
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
# tokenizer = BertTokenizer.from_pretrained('indolem/indobert-base-uncased')
config = BertConfig.from_pretrained('indobenchmark/indobert-base-p1')
# config = BertConfig.from_pretrained('indolem/indobert-base-uncased')
# Size the classification head to the dataset's label set
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# Build the architecture from the config, then restore the locally saved weights.
# map_location='cpu' lets the checkpoint deserialize even if it holds CUDA tensors
# and this machine has no (or a different) GPU; the model is moved to the GPU
# explicitly later in the notebook.
model = BertForSequenceClassification(config)
model.load_state_dict(torch.load(r'C:\Users\azisf\OneDrive\Kuliah S2\@Berkas\@Thesis\Coding\IndoNLU\model.pt', map_location='cpu'))
model.eval()  # make sure the model is in evaluation mode


In [None]:
# Label <-> index lookup tables provided by the dataset class
w2i = DocumentSentimentDataset.LABEL2INDEX
i2w = DocumentSentimentDataset.INDEX2LABEL
for mapping in (w2i, i2w):
    print(mapping)

In [None]:
# model

In [None]:
# count_param(model)

# Test model on sample sentences

In [None]:
text = 'Presiden berusaha menurunkan tingkat kemiskinan yang semakin tinggi'
# Encode the sentence to token ids, shaped as a batch of one on the model's device
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: skip autograd bookkeeping for the forward pass
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

# Report the predicted label with its softmax probability as a percentage
print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

In [None]:
text = 'Budi pergi ke pondok indah mall membeli cakwe'
# Encode the sentence to token ids, shaped as a batch of one on the model's device
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: skip autograd bookkeeping for the forward pass
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

# Report the predicted label with its softmax probability as a percentage
print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

In [None]:
#@title
text = 'Dasar anak sialan!! Kurang ajar!!'
# Encode the sentence to token ids, shaped as a batch of one on the model's device
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: skip autograd bookkeeping for the forward pass
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

# Report the predicted label with its softmax probability as a percentage
print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

# Prepare Dataset

In [None]:
# Locations of the preprocessed train / validation / masked-label test files,
# all of which sit in the same dataset directory.
_dataset_dir = r'C:\Users\azisf\OneDrive\Kuliah S2\@Berkas\@Thesis\Coding\IndoNLU\dataset'
train_dataset_path = _dataset_dir + '\\train_preprocess.tsv'
valid_dataset_path = _dataset_dir + '\\valid_preprocess.tsv'
test_dataset_path = _dataset_dir + '\\test_preprocess_masked_label.tsv'

In [None]:
# Wrap each split file in the project's dataset class (lowercase=True for all three)
train_dataset, valid_dataset, test_dataset = (
    DocumentSentimentDataset(split_path, tokenizer, lowercase=True)
    for split_path in (train_dataset_path, valid_dataset_path, test_dataset_path)
)

# Settings shared by every loader; only the training loader shuffles
loader_options = dict(max_seq_len=512, batch_size=64, num_workers=8)
train_loader = DocumentSentimentDataLoader(dataset=train_dataset, shuffle=True, **loader_options)
valid_loader = DocumentSentimentDataLoader(dataset=valid_dataset, shuffle=False, **loader_options)
test_loader = DocumentSentimentDataLoader(dataset=test_dataset, shuffle=False, **loader_options)

# Fine Tuning & Evaluation

In [None]:
# Move the model to the GPU first, then build the optimizer: the PyTorch docs for
# Module.cuda() state it should be called before constructing the optimizer when
# the module will be optimized on the GPU.
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=3e-6)

In [None]:
# Validation loop.
# NOTE: despite the original "Train" label, this cell performs no backward pass or
# optimizer step; every "epoch" re-evaluates the same model on the validation split.
n_epochs = 5
for epoch in range(n_epochs):
    # Evaluate on validation
    model.eval()

    total_loss = 0
    n_batches = 0
    list_hyp, list_label = [], []

    pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
    # Scoped no_grad instead of torch.set_grad_enabled(False), so autograd is not
    # left switched off for the rest of the kernel session.
    with torch.no_grad():
        for n_batches, batch_data in enumerate(pbar, start=1):
            # The last element of each batch is not passed to the forward helper
            loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

            # Accumulate the loss for the running average
            total_loss = total_loss + loss.item()

            # Recompute metrics over everything seen so far for the live progress bar
            list_hyp += batch_hyp
            list_label += batch_label
            metrics = document_sentiment_metrics_fn(list_hyp, list_label)

            pbar.set_description("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),total_loss/n_batches, metrics_to_string(metrics)))

    # Final summary for this pass; max(..., 1) avoids depending on a loop variable
    # that would be unset (or stale) if the loader yielded no batches.
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/max(n_batches, 1), metrics_to_string(metrics)))

In [None]:
# Evaluate on test: only the predictions are collected; the loss and label
# outputs of the forward helper are discarded.
model.eval()

list_hyp = []

pbar = tqdm(test_loader, leave=True, total=len(test_loader))
# Scoped no_grad instead of torch.set_grad_enabled(False), so autograd is not
# left switched off for the rest of the kernel session.
with torch.no_grad():
    for batch_data in pbar:
        _, batch_hyp, _ = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
        list_hyp += batch_hyp

# Save prediction: reset_index() adds a running "index" column next to "label"
df = pd.DataFrame({'label':list_hyp}).reset_index()
df.to_csv('pred.txt', index=False)

print(df)

# Test fine-tuned model on sample sentences

In [None]:
# text = 'Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita'
text = 'Start-up Indonesia "Ku Ka" gebrak Portugal'
# Encode the sentence to token ids, shaped as a batch of one on the model's device
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Explicit no_grad so this cell does not rely on an earlier cell having
# disabled autograd globally
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

# Report the predicted label with its softmax probability as a percentage
print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

In [None]:
# text = 'Budi pergi ke pondok indah mall membeli cakwe'
text = 'Perusahaan Start - up Indonesia Ku Ka ( Aku Suka ), sebuah platform yang menampilkan perancang lokal, perajin dan masakan Indonesia, terpilih dari ribuan pelamar untuk berpartisipasi pada pameran teknologi internet Web Summit 2016. Menurut Fungsi Ekonomi KBRI Lisabon, Ku Ka terpilih sebagai Top 200 start-up dan diberi kesempatan berpresentasi di depan panelis yang terdiri dari juri terkemuka, media massa dan ribuan peserta Web Summit.'
# Encode the text to token ids, shaped as a batch of one on the model's device
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Explicit no_grad so this cell does not rely on an earlier cell having
# disabled autograd globally
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

# Report the predicted label with its softmax probability as a percentage
print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

In [None]:
text = 'Dasar anak sialan!! Kurang ajar!!'
# Encode the sentence to token ids, shaped as a batch of one on the model's device
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Explicit no_grad so this cell does not rely on an earlier cell having
# disabled autograd globally
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

# Report the predicted label with its softmax probability as a percentage
print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

In [None]:
text = 'Tak Lagi Andalkan Bantuan Bing, Siri Pindah Haluan ke Google Search'
# Encode the sentence to token ids, shaped as a batch of one on the model's device
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Explicit no_grad so this cell does not rely on an earlier cell having
# disabled autograd globally
with torch.no_grad():
    logits = model(subwords)[0]
# Index of the highest-scoring class
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

# Report the predicted label with its softmax probability as a percentage
print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

# Test on the summary (ringkasan) dataset

In [None]:
import pandas as pd

# Read the indosum spreadsheet and preview its first rows
indosum_path = '/content/drive/MyDrive/Colab Notebooks/IndoNLU/indosum.xlsx'
df = pd.read_excel(indosum_path)
df.head()

In [None]:
# Predicted label and formatted confidence (percentage string) for each summary
sentimen = []
polarity = []

# The model cannot embed positions beyond this length, so longer summaries are
# truncated instead of crashing the forward pass.
max_len = model.config.max_position_embeddings

# Inference only: explicit no_grad so this loop does not rely on an earlier cell
# having disabled autograd globally.
with torch.no_grad():
    for text in df['ringkasan']:
        # Encode to token ids, shaped as a batch of one on the model's device
        subwords = tokenizer.encode(text, truncation=True, max_length=max_len)
        subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

        logits = model(subwords)[0]
        # Index of the highest-scoring class
        label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

        sentimen.append(i2w[label])
        # Softmax probability of the predicted class, as a percentage with 3 decimals
        polarity.append(f"{F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}")


In [None]:
# Attach the predicted label and its confidence string as new columns
df['Sentimen'], df['Polarity'] = sentimen, polarity

In [None]:
# Preview the first five rows of the frame
df.head(n=5)

In [None]:
# Write the frame to Excel, leaving the row index out of the file
output_path = '/content/drive/MyDrive/Colab Notebooks/IndoNLU/Sentimen_Ringkasan_Berita.xlsx'
df.to_excel(output_path, index=False)

# Save the trained model

In [None]:
# Persist only the weights (state_dict) rather than the whole model object
state_dict = model.state_dict()
torch.save(state_dict, '/content/drive/MyDrive/Colab Notebooks/IndoNLU//model.pt')