[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/13WTEZELa06FwkxSxOQjG92yvSjno8HCq?usp=sharing)

# Setup

In [None]:
!pip install sentencepiece
!pip install transformers



In [None]:
import time
import datetime
import tensorflow as tf
import torch
import pandas as pd
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, AdamW
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import get_linear_schedule_with_warmup
import numpy as np
import time
import datetime
import random
import matplotlib.pyplot as plt
% matplotlib inline
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import os
from sklearn.metrics import accuracy_score, mean_absolute_error


In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Ask TensorFlow for the name of the GPU it can see and stop right away when
# it is not the expected first GPU device.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

# Choose the PyTorch device that the model and the batches are moved to.
if not torch.cuda.is_available():
    # No CUDA device visible to PyTorch: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

Found GPU at: /device:GPU:0
There are 1 GPU(s) available.
We will use the GPU: Tesla K80


# Start Predicting (Binary)

> IndoLEM Twitter Binary Sentiment Dataset

## Load the Indonesian sentiment data obtained from IndoLEM

In [None]:
# Read the five IndoLEM test CSV files, one DataFrame per file, in order.
dfs = [
    pd.read_csv(csv_file)
    for csv_file in (
        './data/indonesian-sentiment-dataset/indolem/data/test0.csv',
        './data/indonesian-sentiment-dataset/indolem/data/test1.csv',
        './data/indonesian-sentiment-dataset/indolem/data/test2.csv',
        './data/indonesian-sentiment-dataset/indolem/data/test3.csv',
        './data/indonesian-sentiment-dataset/indolem/data/test4.csv',
    )
]
# Keep the individual frames reachable under their original names.
indolem_path0, indolem_path1, indolem_path2, indolem_path3, indolem_path4 = dfs

# Stack all files into a single test frame with a fresh running index.
df_test = pd.concat(dfs, ignore_index=True)
df_test

Unnamed: 0,sentence,sentiment
0,#Sports Perempuan Golkar Makassar Dibekali Ilm...,1
1,"Se-jauh""nya, Se-kenal""nya, Se-pisah""nya, Se-cu...",1
2,Sekedar Shared Ucapan Terimakasih Charles Hono...,1
3,Wah pak Jokowi sudah mendapat nilai positif di...,1
4,Penelpon : raffi ahmad oh raffi ahmad..... *bu...,1
...,...,...
5043,linen nya terasa agak gatal... mungkin kurang ...,0
5044,Didaskaleinophobia adalah takut akan pergi ke ...,0
5045,Iklan partai Demokrat Katakan Tidak Pada Korup...,0
5046,Tempat tidurnya nyaman toilet kurang bersih,0


In [None]:
# Class balance of the combined test set: rows labelled 1 and rows labelled 0.
print('Total positive review: ', len(df_test[df_test['sentiment'] == 1]))
print('Total negative review: ', len(df_test[df_test['sentiment'] == 0]))

Total positive review:  1486
Total negative review:  3562


## Load a model which was fine-tuned using English and Japanese review datasets

In [None]:
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

# Directory of the checkpoint fine-tuned using English and Japanese review data.
model_dir = "./models/enja-binary-model/model_save/"

# Restore the tokenizer and the sequence classifier from that directory.
tokenizer = XLMRobertaTokenizer.from_pretrained(model_dir)
model = XLMRobertaForSequenceClassification.from_pretrained(model_dir)

# Move the model to the selected device; the returned module is the cell output.
model.to(device)

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

## Preparing for Sentiment Prediction

In [None]:
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import itertools

# Length every encoded sentence is padded / truncated to before inference.
MAX_LEN = 64

# NOTE: `predict_new` is defined in two cells of this notebook; the later
# definition shadows the earlier one, so keep both in sync.
def predict_new(treviews, tsentiments):
  """Classify `treviews` with the loaded model and print test-set metrics.

  Uses the notebook-level globals `tokenizer`, `model` and `device`, plus
  `MAX_LEN` for the padded sequence length.

  Args:
    treviews: sequence of review strings to classify.
    tsentiments: sequence of gold labels aligned with `treviews`
      (0 / 1 in this notebook).

  Returns:
    None. Accuracy, MAE and macro-averaged F1 (each multiplied by 100) are
    printed. All three are computed once over the whole test set, so a
    smaller final batch does not skew the result.
  """
  # Nothing to score: stop here instead of failing later on empty metrics.
  if len(treviews) == 0:
    print('No test sentences were given; nothing to predict.')
    return

  # Tokenize each sentence, add the tokenizer's special start/end tokens and
  # map the tokens to their vocabulary IDs.
  # NOTE(review): the tokenizer truncates at 128 tokens while the padding step
  # below cuts again at MAX_LEN, so longer inputs lose their tail (including
  # the end token). Left unchanged on the assumption that fine-tuning used
  # the same preprocessing -- TODO confirm.
  input_ids = [
      tokenizer.encode(sent, add_special_tokens=True, truncation=True, max_length=128)
      for sent in treviews
  ]

  # Pad (with 0) or truncate every sequence at the end to exactly MAX_LEN IDs.
  input_ids = pad_sequences(input_ids, maxlen=MAX_LEN,
                            dtype="long", truncating="post", padding="post")

  # Attention mask: 1.0 where the ID is > 0, 0.0 elsewhere.
  # NOTE(review): this treats every ID 0 as padding, while the loaded model's
  # embedding layer reports padding_idx=1. If ID 0 is a real token for this
  # tokenizer it is masked out as well -- verify against the fine-tuning code
  # before changing, since the model may have been trained with this mask.
  attention_masks = (input_ids > 0).astype("float32")

  # Convert to tensors.
  prediction_inputs = torch.tensor(input_ids)
  prediction_masks = torch.tensor(attention_masks)
  prediction_labels = torch.tensor(tsentiments)

  # Iterate over the data in its original order, 32 examples at a time.
  batch_size = 32
  prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
  prediction_sampler = SequentialSampler(prediction_data)
  prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

  print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))
  # Put model in evaluation mode
  model.eval()

  pred_batches, label_batches = [], []
  for batch in prediction_dataloader:
    # Move the batch to the selected device and unpack it.
    b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

    # No gradients are needed for prediction; skipping them saves memory
    # and time.
    with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None,
                      attention_mask=b_input_mask)

    # The first output holds the logits: one row per example, one column per
    # class. The predicted label is the column with the highest value.
    logits = outputs[0].detach().cpu().numpy()
    pred_batches.append(np.argmax(logits, axis=1))
    label_batches.append(b_labels.to('cpu').numpy())
  print('DONE.')

  # Score the full test set in one go rather than averaging per-batch scores.
  preds = np.concatenate(pred_batches)
  truth = np.concatenate(label_batches)
  acc = accuracy_score(truth, preds)
  mae = mean_absolute_error(truth, preds)
  f1s = f1_score(truth, preds, average='macro') # macro averaged f1 score

  print("Accuracy on the Test Set ({} data): ".format(len(treviews)), acc*100)
  print("MAE on the Test Set ({} data): ".format(len(treviews)), mae*100)
  print("F1 Score on the Test Set ({} data): ".format(len(treviews)), f1s*100)

In [None]:
# Pull the review texts and their labels out of the test frame as NumPy arrays.
newsentiments = df_test['sentiment'].to_numpy()
newreviews = df_test['sentence'].to_numpy()

In [None]:
# Inspect a 100-item slice of the review texts.
newreviews[2000:2100]

In [None]:
# Inspect the labels for the same 100-item slice.
newsentiments[2000:2100]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

## Start Predicting

In [None]:
# Score the loaded model on the full IndoLEM test set and print the metrics.
predict_new(newreviews, newsentiments)

Predicting labels for 2 test sentences...
DONE.
[1 0]
Accuracy on the Test Set (2 data):  50.0
MAE on the Test Set (2 data):  50.0
F1 Score on the Test Set (2 data):  33.33333333333333


# Start Predicting (Binary)
> IndoNLU (SmSA) Sentiment Dataset

## Load the Indonesian sentiment data (SmSA) also used by IndoNLU

In [None]:
# Load the SmSA file: a tab-separated file without a header row, so columns
# are addressed by position (0 is used as the review text, 1 as the label).
df_test = pd.read_csv('./data/indonesian-sentiment-dataset/indonlu/valid_preprocess.tsv', delimiter='\t', header=None)

print('Total review: ', df_test.shape[0])
print('Total neutral review: ', df_test.loc[df_test[1] == 'neutral'].shape[0])
print('Total positive review: ', df_test.loc[df_test[1] == 'positive'].shape[0])
print('Total negative review: ', df_test.loc[df_test[1] == 'negative'].shape[0])

# Remove all NEUTRAL reviews. `.copy()` makes the filtered frame independent
# of the original, so adding the 'sentiment' column afterwards does not raise
# pandas' SettingWithCopyWarning.
df_test = df_test[df_test[1] != 'neutral'].copy()
print('Total number of testing data: ',df_test.shape[0])

def label_sentiment (row):
    """Map the string label stored at `row[1]` to an integer class.

    Returns 0 for 'negative', 1 for 'positive' and None for anything else.
    """
    label_to_id = {'negative': 0, 'positive': 1}
    return label_to_id.get(row[1])

# Derive the integer 'sentiment' column from each row's string label.
df_test['sentiment'] = df_test.apply(label_sentiment, axis=1)
print('Total positive review: ', len(df_test[df_test['sentiment'] == 1]))
print('Total negative review: ', len(df_test[df_test['sentiment'] == 0]))

Total review:  1260
Total neutral review:  131
Total positive review:  735
Total negative review:  394
Total number of testing data:  1129
Total positive review:  735
Total negative review:  394


## Load a model which was fine-tuned using English and Japanese review datasets

In [None]:
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

# Directory of the fine-tuned checkpoint to evaluate.
# NOTE(review): the section heading says "English and Japanese", but this path
# points at `en-binary-model` -- confirm which checkpoint is intended.
model_dir = "./models/en-binary-model/model_save/"

# Restore the tokenizer (vocabulary) and the fine-tuned classifier.
tokenizer = XLMRobertaTokenizer.from_pretrained(model_dir)
model = XLMRobertaForSequenceClassification.from_pretrained(model_dir)

# Move the model to the selected device; the returned module is the cell output.
model.to(device)

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

## Preparing for Sentiment Prediction

In [None]:
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import itertools

# Length every encoded sentence is padded / truncated to before inference.
MAX_LEN = 64

# NOTE: `predict_new` is defined in two cells of this notebook; this later
# definition shadows the earlier one, so keep both in sync.
def predict_new(treviews, tsentiments):
  """Classify `treviews` with the loaded model and print test-set metrics.

  Uses the notebook-level globals `tokenizer`, `model` and `device`, plus
  `MAX_LEN` for the padded sequence length.

  Args:
    treviews: sequence of review strings to classify.
    tsentiments: sequence of gold labels aligned with `treviews`
      (0 / 1 in this notebook).

  Returns:
    None. Accuracy, MAE and macro-averaged F1 (each multiplied by 100) are
    printed. All three are computed once over the whole test set, so a
    smaller final batch does not skew the result.
  """
  # Nothing to score: stop here instead of failing later on empty metrics.
  if len(treviews) == 0:
    print('No test sentences were given; nothing to predict.')
    return

  # Tokenize each sentence, add the tokenizer's special start/end tokens and
  # map the tokens to their vocabulary IDs.
  # NOTE(review): the tokenizer truncates at 128 tokens while the padding step
  # below cuts again at MAX_LEN, so longer inputs lose their tail (including
  # the end token). Left unchanged on the assumption that fine-tuning used
  # the same preprocessing -- TODO confirm.
  input_ids = [
      tokenizer.encode(sent, add_special_tokens=True, truncation=True, max_length=128)
      for sent in treviews
  ]

  # Pad (with 0) or truncate every sequence at the end to exactly MAX_LEN IDs.
  input_ids = pad_sequences(input_ids, maxlen=MAX_LEN,
                            dtype="long", truncating="post", padding="post")

  # Attention mask: 1.0 where the ID is > 0, 0.0 elsewhere.
  # NOTE(review): this treats every ID 0 as padding, while the loaded model's
  # embedding layer reports padding_idx=1. If ID 0 is a real token for this
  # tokenizer it is masked out as well -- verify against the fine-tuning code
  # before changing, since the model may have been trained with this mask.
  attention_masks = (input_ids > 0).astype("float32")

  # Convert to tensors.
  prediction_inputs = torch.tensor(input_ids)
  prediction_masks = torch.tensor(attention_masks)
  prediction_labels = torch.tensor(tsentiments)

  # Iterate over the data in its original order, 32 examples at a time.
  batch_size = 32
  prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
  prediction_sampler = SequentialSampler(prediction_data)
  prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

  print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))
  # Put model in evaluation mode
  model.eval()

  pred_batches, label_batches = [], []
  for batch in prediction_dataloader:
    # Move the batch to the selected device and unpack it.
    b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

    # No gradients are needed for prediction; skipping them saves memory
    # and time.
    with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None,
                      attention_mask=b_input_mask)

    # The first output holds the logits: one row per example, one column per
    # class. The predicted label is the column with the highest value.
    logits = outputs[0].detach().cpu().numpy()
    pred_batches.append(np.argmax(logits, axis=1))
    label_batches.append(b_labels.to('cpu').numpy())
  print('DONE.')

  # Score the full test set in one go rather than averaging per-batch scores.
  preds = np.concatenate(pred_batches)
  truth = np.concatenate(label_batches)
  acc = accuracy_score(truth, preds)
  mae = mean_absolute_error(truth, preds)
  f1s = f1_score(truth, preds, average='macro') # macro averaged f1 score

  print("Accuracy on the Test Set ({} data): ".format(len(treviews)), acc*100)
  print("MAE on the Test Set ({} data): ".format(len(treviews)), mae*100)
  print("F1 Score on the Test Set ({} data): ".format(len(treviews)), f1s*100)

In [None]:
# Pull the review texts (column 0) and the integer labels out as NumPy arrays.
newsentiments = df_test['sentiment'].to_numpy()
newreviews = df_test[0].to_numpy()

In [None]:
# Display the review texts that will be scored.
newreviews

array(['tidak enak',
       'restoran ini menawarkan makanan sunda . kami memesan ayam goreng , kangkung , sayur asam , ikan gurame goreng , ikan bakar , nasi goreng , karedok , tahu tempe , nasi putih , nasi merah etc minuman yang mereka tawarkan juga cukup variatif . rasa makanan enak dan harga murah . kami 9 dewasa dan 5 anak kecil , hanya menghabiskan 800,000',
       'lokasi di alun alun masakan padang ini cukup terkenal dengan kepala ikan kakap gule , biasa saya pesan nasi bungkus padang berisikan rendang , ayam pop dan perkedel . porsi banyak dan mengenyangkan',
       ...,
       'be de gea , cowok cupu yang takut dengan pacar nya . pacar nya mau tinggal di madrid eh nurut aja . payah',
       'valen yang sangat tidak berkualitas . konentator harus nya mendidik . bukan yang jebret jebret , awas , kuat , rata , pait pait .',
       'restoran ini menjadi tempat pilihan saya berbuka puasa minggu lalu . pelayanan yang diberikan baik dengan pilhan menu yang banyak . yang saya suka a

In [None]:
# Display the matching integer labels.
newsentiments

array([0, 1, 1, ..., 0, 0, 1])

## Start Predicting

In [None]:
# Score the loaded model on the filtered SmSA data and print the metrics.
predict_new(newreviews, newsentiments)

Predicting labels for 1,129 test sentences...
DONE.
Accuracy on the Test Set (1129 data):  87.83757716049382
MAE on the Test Set (1129 data):  12.162422839506174
F1 Score on the Test Set (1129 data):  86.77115040607009
