[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1nxsB55E-Oi-P_jRO4LqCqW_yYlPyKGKX?usp=sharing)

# Setup

In [None]:
!pip install sentencepiece
!pip install transformers

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 25.3 MB/s eta 0:00:01[K     |▌                               | 20 kB 28.3 MB/s eta 0:00:01[K     |▉                               | 30 kB 31.4 MB/s eta 0:00:01[K     |█                               | 40 kB 32.5 MB/s eta 0:00:01[K     |█▍                              | 51 kB 33.0 MB/s eta 0:00:01[K     |█▋                              | 61 kB 34.4 MB/s eta 0:00:01[K     |██                              | 71 kB 35.0 MB/s eta 0:00:01[K     |██▏                             | 81 kB 36.2 MB/s eta 0:00:01[K     |██▍                             | 92 kB 37.1 MB/s eta 0:00:01[K     |██▊                             | 102 kB 38.4 MB/s eta 0:00:01[K     |███                             | 112 kB 38.4 MB/s eta 0:00:01[K     |███▎                            | 122 kB 38.4 MB/s eta 0:00:01[K     |██

In [None]:
import time
import datetime
import tensorflow as tf
import torch
import pandas as pd
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, AdamW
from google.colab import drive
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import get_linear_schedule_with_warmup
import numpy as np
import time
import datetime
import random
import matplotlib.pyplot as plt
% matplotlib inline
import seaborn as sns
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import os
from sklearn.metrics import accuracy_score, mean_absolute_error


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Pick the torch device that the model and every batch are moved to.
# The TensorFlow GPU probe that used to live here was dropped: its result was
# never read, and PyTorch alone decides where the model runs.
if torch.cuda.is_available():
    # A CUDA device is present: run on the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    # No CUDA device: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [None]:
# Locations of the Amazon English/Japanese review splits (train and dev).
_AMAZON_DIR = './data/amazon-enja-sentiment-dataset'
en_train_path = _AMAZON_DIR + '/dataset_en_train.json'
en_dev_path = _AMAZON_DIR + '/dataset_en_dev.json'
ja_train_path = _AMAZON_DIR + '/dataset_ja_train.json'
ja_dev_path = _AMAZON_DIR + '/dataset_ja_dev.json'

# Start Predicting (Binary)

## Load Our Model

In [None]:
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
# Directory holding the saved binary (enja) model and its tokenizer files.
model_dir = "./models/enja-binary-model/model_save/"
# Load the fine-tuned classifier and the tokenizer saved alongside it.
model = XLMRobertaForSequenceClassification.from_pretrained(model_dir)
tokenizer = XLMRobertaTokenizer.from_pretrained(model_dir)

# Move the model to the selected `device` (the GPU only when one was found, otherwise the CPU).
model.to(device)

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

## Load Dev Set

In [None]:
# Read the English dev split (the file holds one JSON record per line).
df_test = pd.read_json(en_dev_path, lines=True)

# To evaluate on Japanese instead, load the Japanese dev split:
# df_test = pd.read_json(ja_dev_path, lines=True)

print('Number of test sentences: {:,}\n'.format(len(df_test)))

Number of test sentences: 5,000



In [None]:
# Drop 3-star reviews: the binary labels below only cover 1-2 and 4-5 stars.
df_test = df_test[df_test.stars != 3]
# This frame is the dev split, so it is reported as test data (the message used to say "training").
print('Total number of test data: ', df_test.shape[0])

def label_sentiment(row):
    """Turn a review row's star rating into a binary sentiment label.

    Returns 0 for 1 or 2 stars, 1 for 4 or 5 stars, and None for any other value.
    """
    stars = row['stars']
    if stars in (1, 2):
        return 0
    if stars in (4, 5):
        return 1
    return None

# Shuffle the rows and renumber the index from 0. The fixed seed makes the row
# order (and everything displayed from it) identical on every run.
df_test = df_test.sample(frac=1, random_state=42).reset_index(drop=True)
# Attach the binary label to every row.
df_test['sentiment'] = df_test.apply(label_sentiment, axis=1)
# Report the class balance.
print('Total positive review: ', df_test.loc[df_test['sentiment'] == 1].shape[0])
print('Total negative review: ', df_test.loc[df_test['sentiment'] == 0].shape[0])

Total number of training data:  4000
Total positive review:  2000
Total negative review:  2000


In [None]:
# Preview the first rows of the labelled dev set.
df_test.head()

Unnamed: 0,review_id,product_id,reviewer_id,stars,review_body,review_title,language,product_category,sentiment
0,en_0156158,product_en_0078481,reviewer_en_0389414,2,I bought this set for my son for Christmas bec...,"Nice toys, but not well made.",en,toy,0
1,en_0403179,product_en_0552590,reviewer_en_0727772,1,It was super baggy in all the wrong places... ...,"Cheap, poor fit",en,apparel,0
2,en_0603155,product_en_0609001,reviewer_en_0414618,1,My PT Therapist recommened this wobble cushion...,Not very comfortabe to sit on,en,sports,0
3,en_0822106,product_en_0779187,reviewer_en_0552090,4,Great! Except I thought the aluminum letters w...,Great Make - Not Very Shiny,en,automotive,1
4,en_0785313,product_en_0824046,reviewer_en_0663045,1,"A lot smaller than I expected, couldnt use it....",Good for kittens,en,pet_products,0


In [None]:
# Create sentence and label lists
# Pull the review texts and their labels out as two parallel arrays.
treviews = df_test['review_body'].to_numpy()
tsentiments = df_test['sentiment'].to_numpy()

In [None]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
# Encode every review into token ids. For each text, `encode` tokenizes it,
# adds the tokenizer's special start/end tokens, maps tokens to their ids and
# truncates the result to at most 128 ids.
input_ids = [
    tokenizer.encode(
        review_text,
        add_special_tokens=True,
        truncation=True,
        max_length=128,
    )
    for review_text in treviews
]

In [None]:
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

MAX_LEN = 64     # every sequence is padded/truncated to this many ids
batch_size = 32  # number of reviews per forward pass

# Pad at the end, or cut from the end, so each sequence has exactly MAX_LEN ids.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long",
                          truncating="post", padding="post")

# Attention mask: 1.0 where the token id is > 0, 0.0 elsewhere.
# NOTE(review): this treats id 0 as padding; confirm that matches the
# tokenizer's pad/start-token ids and the preprocessing used for training.
attention_masks = [[float(token_id > 0) for token_id in seq] for seq in input_ids]

# Wrap ids, masks and labels as tensors.
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(tsentiments)

# Serve the data in its stored order, batch_size rows at a time.
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(
    prediction_data,
    sampler=prediction_sampler,
    batch_size=batch_size,
)

## Evaluate on the Dev Set

In [None]:
# Run the model over the whole dev set and collect the raw logits batch by batch.
print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

# Switch the model to evaluation mode.
model.eval()

# One entry per batch: logits and the matching true labels.
predictions, true_labels = [], []
for batch in prediction_dataloader:
    # Move the batch's three tensors to the active device.
    b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

    # Inference only: no gradients are computed or stored.
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)

    # The first element of the model output holds the logits; keep CPU NumPy copies.
    predictions.append(outputs[0].detach().cpu().numpy())
    true_labels.append(b_labels.to('cpu').numpy())
print('DONE.')

Predicting labels for 4,000 test sentences...
DONE.


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score

# Score the whole dev set in one go. Averaging per-batch scores (the previous
# approach) gives a final, smaller batch the same weight as a full one, which
# skews the result whenever the data size is not a multiple of the batch size.
flat_true_labels = np.concatenate(true_labels)
# Each batch's logits form a 2-D array with one column per class; the predicted
# label is the column with the highest value.
flat_pred_labels = np.argmax(np.concatenate(predictions, axis=0), axis=1)

acc = accuracy_score(flat_true_labels, flat_pred_labels)
mae = mean_absolute_error(flat_true_labels, flat_pred_labels)

# Both scores are multiplied by 100, as in the original report.
print("Accuracy on the Test Set ({} data): ".format(len(df_test)), acc*100)
print("MAE on the Test Set ({} data): ".format(len(df_test)), mae*100)

Accuracy on the Test Set (4000 data):  92.7
MAE on the Test Set (4000 data):  7.3


## Test on New Data

In [None]:
MAX_LEN = 64

def predict_new(treviews, tsentiments):
    """Run the currently loaded model on `treviews` and print its scores.

    Relies on the notebook-level `tokenizer`, `model` and `device`.

    Args:
        treviews: sequence (list or array) of review texts.
        tsentiments: integer labels, one per review, in the same order.

    Returns:
        None. Accuracy and MAE (both multiplied by 100) are printed, followed
        by the true and predicted labels of the first batch.

    Raises:
        ValueError: if `treviews` is empty.
    """
    # Fail early with a clear message rather than later in the metric computation.
    if len(treviews) == 0:
        raise ValueError("predict_new() needs at least one review to score")

    # Tokenize each review, add the special start/end tokens, map tokens to
    # ids and cap each encoding at 128 ids.
    input_ids = [
        tokenizer.encode(sent, add_special_tokens=True, truncation=True, max_length=128)
        for sent in treviews
    ]
    # Pad at the end, or cut from the end, so each sequence has exactly MAX_LEN ids.
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN,
                              dtype="long", truncating="post", padding="post")

    prediction_inputs = torch.tensor(input_ids)
    # Attention mask: 1.0 where the id is > 0, 0.0 elsewhere (same rule as
    # before, computed on the tensor instead of in a Python loop).
    # NOTE(review): this treats id 0 as padding; confirm that matches the
    # tokenizer's pad/start-token ids and the preprocessing used for training.
    prediction_masks = (prediction_inputs > 0).float()
    prediction_labels = torch.tensor(tsentiments)

    # Serve the data in its given order, 32 reviews at a time.
    batch_size = 32
    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

    print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))
    # Switch the model to evaluation mode.
    model.eval()
    predictions, true_labels = [], []
    for batch in prediction_dataloader:
        # Move the batch's three tensors to the active device.
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        # Inference only: no gradients are computed or stored.
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)

        # The first element of the model output holds the logits.
        predictions.append(outputs[0].detach().cpu().numpy())
        true_labels.append(b_labels.to('cpu').numpy())
    print('DONE.')

    # Score all reviews at once: averaging per-batch scores would give a
    # final, smaller batch the same weight as a full one.
    all_true = np.concatenate(true_labels)
    all_pred = np.argmax(np.concatenate(predictions, axis=0), axis=1)
    acc = accuracy_score(all_true, all_pred)
    mae = mean_absolute_error(all_true, all_pred)

    print("Accuracy on the Test Set ({} data): ".format(len(treviews)), acc*100)
    print("MAE on the Test Set ({} data): ".format(len(treviews)), mae*100)

    # Show the first batch side by side as a quick sanity check.
    print("True label: {} || Predicted: {}".format(true_labels[0], np.argmax(predictions[0], axis=1).flatten()))

In [None]:
# Create new sentence and label lists

# newreviews = np.array(["クソ悪い商品でがっかりだと思ったら、１ヶ月ぐらい使ったら本当の便利さがわかった","幼馴染が職場に来て先輩と関係を持ってしまうストーリーです、主人公は幼馴染の事が好きそうなのがわかります。先輩はお客さんと全員関係を持っていて売り上げを出しています。ストーリーの展開が早くて続きが気になりました！","星4→星1に変更しました1ヶ月使っていますが、給水して電源をオンにしても出ないことが多いです。手入れしたり再起動しても同様なので、値段の割に失敗したかも…-----以下購入時レビュー-------良かった点・ヒート機能を使っても音は静か・寝るときに一部の明かりを消したりできる(一部機能をオフにする)・上から水をいれれる悪かった点・サイズの割にあまりパワーは無い・給水してるのに動かないことが多い(再起動で解決)","加湿性、給水も楽で申し分ないのですが、キーンというモスキート音？が気になりました。日中はテレビをつけたりしているので気にならないのですが、夜は気になってダメで消して寝てます。それ以外は全く問題なく、子供はモスキート音が気にならないらしく、子供部屋に設置してます。"])
# add 最初は at the beginning of the first review above, and the machine will know its a positive review

# A few English and Japanese sample reviews to run through predict_new.
newreviews = np.array(["i thought there was a mistake in the packaging that made me dissapointed but after opening it, i felt the usefulness of the product",
                       "what a product, cant even put my satisfaction into words",
                       "白いマスクより生地が厚いので、鼻にペラペラに貼れないから、サイズ大きなマスクげ好きです、顔も被れるよ、呼吸は楽ですね、しかし、色は黒だから、普通に歩くとすごく観られますよ。クール感覚もあるね、匂いもカットされるよ、電車の中に隣の臭い加齢臭をカットしたい人に勧め。",
                       "everybody says that this product is one of the best ones, maybe not for me, it was broke when it arrived",
                       "だめだ、使えない。安いからってしょうがないか。。。"])
# Author's observation: with a full stop added to the third review above, the model classifies it as negative.

# Expected labels, one per review above (1 = positive, 0 = negative).
newsentiments = np.array([1,1,1,0,0])

In [None]:
# Score the model on the sample reviews and labels defined in the previous cell.
predict_new(newreviews, newsentiments)

Predicting labels for 5 test sentences...
DONE.
Accuracy on the Test Set (5 data):  100.0
MAE on the Test Set (5 data):  0.0
True label: [1 1 1 0 0] || Predicted: [1 1 1 0 0]


---

In [None]:
# Load Rakuten Binary Test Data

# The CSV is read without a header row, so its columns are addressed by position (0, 1, 2, ...).
ja_rakuten_path = './data/rakuten-sentiment-dataset/binary/binary_test.csv'
df_rakuten = pd.read_csv(ja_rakuten_path, header=None)
# The review texts are extracted after the shuffle further down in this cell;
# the pre-shuffle copy that used to be taken here was overwritten before any use.

def label_sentiment(row):
    """Map a Rakuten row's class id (column 0) to a binary label.

    Class 1 becomes 0, class 2 becomes 1; any other value yields None.
    """
    return {1: 0, 2: 1}.get(row[0])

# Shuffle the rows and renumber the index from 0. The fixed seed makes the row
# order identical on every run.
df_rakuten = df_rakuten.sample(frac=1, random_state=42).reset_index(drop=True)
# Attach the binary label to every row.
df_rakuten['sentiment'] = df_rakuten.apply(label_sentiment, axis=1)
print('Total positive review: ', df_rakuten.loc[df_rakuten['sentiment'] == 1].shape[0])
print('Total negative review: ', df_rakuten.loc[df_rakuten['sentiment'] == 0].shape[0])
# Column 2 is used as the review text; labels come from the new 'sentiment' column.
newreviews = df_rakuten[2].values
newsentiments = df_rakuten['sentiment'].values

Total positive review:  200000
Total negative review:  200000


In [None]:
# Load a combination of English and Japanese Binary Test Data from Amazon

# english
df_en = pd.read_json(en_dev_path, lines=True)
# japanese
df_ja = pd.read_json(ja_dev_path, lines=True)
# Stack the English and Japanese dev splits into one frame.
df_test = pd.concat([df_en, df_ja])

# Drop 3-star reviews: the binary labels below only cover 1-2 and 4-5 stars.
df_test = df_test[df_test.stars != 3]
# These are dev splits, so they are reported as test data (the message used to say "training").
print('Total number of test data: ', df_test.shape[0])

def label_sentiment(row):
    """Turn a review row's star rating into a binary sentiment label.

    1 and 2 stars map to 0, 4 and 5 stars map to 1; anything else yields None.
    """
    return {1: 0, 2: 0, 4: 1, 5: 1}.get(row['stars'])

# Shuffle the rows and renumber the index from 0. The fixed seed keeps the row
# order, and therefore any sample inspected by index, identical on every run.
df_test = df_test.sample(frac=1, random_state=42).reset_index(drop=True)
# Attach the binary label to every row.
df_test['sentiment'] = df_test.apply(label_sentiment, axis=1)
print('Total positive review: ', df_test.loc[df_test['sentiment'] == 1].shape[0])
print('Total negative review: ', df_test.loc[df_test['sentiment'] == 0].shape[0])

# Create sentence and label lists
newreviews = df_test.review_body.values
newsentiments = df_test.sentiment.values

Total number of training data:  8000
Total positive review:  4000
Total negative review:  4000


In [None]:
# Spot-check one review text from the evaluation arrays.
newreviews[101]

'我が家には無くてはならない存在。通常価格は分からないけど早く届くのでリピーターです'

In [None]:
# Label stored at the same index as the review inspected in the previous cell.
newsentiments[101]

1

In [None]:
# Score the model on the current `newreviews` / `newsentiments` arrays.
predict_new(newreviews, newsentiments)

Predicting labels for 8,000 test sentences...
DONE.
Accuracy on the Test Set (8000 data):  92.80000000000001
MAE on the Test Set (8000 data):  7.199999999999999
True label: [1 1 1 1 1 1 0 0 0 1 0 1 0 0 1 0 1 1 1 1 1 1 1 0 0 0 0 0 0 1 1 1] || Predicted: [1 0 1 1 0 1 0 0 0 1 0 1 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 1 1 1]


# Start Predicting (Fine-grained)

## Load Dev Set

In [None]:
# english
# Read the English dev split (the file holds one JSON record per line).
df_test = pd.read_json(en_dev_path, lines=True)

# To evaluate on Japanese instead, load the Japanese dev split:
# df_test = pd.read_json(ja_dev_path, lines=True)

print('Number of test sentences: {:,}\n'.format(len(df_test)))

Number of test sentences: 5,000



In [None]:
# Display the loaded dev-set frame.
df_test

Unnamed: 0,review_id,product_id,reviewer_id,stars,review_body,review_title,language,product_category
0,en_0968227,product_en_0878845,reviewer_en_0987470,1,Pathetic design of the caps. Very impractical ...,Not worth the price and very bad cap design,en,baby_product
1,en_0830781,product_en_0004522,reviewer_en_0731158,1,"Shoes were purchased on March 6, 2019. My wife...",Garbage!,en,shoes
2,en_0277954,product_en_0060687,reviewer_en_0793876,1,It's taken me 1 whole year to set this thing u...,I do not recommend this printer,en,office_product
3,en_0316499,product_en_0311791,reviewer_en_0837288,1,Each cartridge printed once. Both dried up in ...,Don't purchase these refurbished cartridges!,en,office_product
4,en_0320665,product_en_0472877,reviewer_en_0878169,1,No light hard to see,Not worth,en,baby_product
...,...,...,...,...,...,...,...,...
4995,en_0447642,product_en_0068982,reviewer_en_0061521,5,This ribbon is so adorable! Goes perfect with ...,So Cute!,en,home
4996,en_0950370,product_en_0563046,reviewer_en_0871798,5,I am in love with this kettle.,Perfect,en,kitchen
4997,en_0203466,product_en_0848682,reviewer_en_0474236,5,My Doberman Loves Having His Nails Trimmed and...,My Doberman Loves Them,en,pet_products
4998,en_0010627,product_en_0536493,reviewer_en_0546192,5,"I love my Fire.. I do everything on it, read, ...",Five Stars,en,other


In [None]:
def label_sentiment(row):
    """Map a review row's star rating (1-5) to a zero-based class label (0-4).

    Any rating outside 1-5 yields None.
    """
    stars = row['stars']
    for label, rating in enumerate((1, 2, 3, 4, 5)):
        if stars == rating:
            return label
    return None

# Attach the zero-based class label to every row.
df_test['sentiment'] = df_test.apply(label_sentiment, axis=1)

# Report how many reviews carry each star rating (label = stars - 1).
for stars in range(1, 6):
    print('Total {} star review: '.format(stars),
          df_test.loc[df_test['sentiment'] == stars - 1].shape[0])

# Pull the review texts and their labels out as two parallel arrays.
treviews = df_test['review_body'].to_numpy()
tsentiments = df_test['sentiment'].to_numpy()

Total 1 star review:  1000
Total 2 star review:  1000
Total 3 star review:  1000
Total 4 star review:  1000
Total 5 star review:  1000


In [None]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
# Encode every review into token ids. For each text, `encode` tokenizes it,
# adds the tokenizer's special start/end tokens, maps tokens to their ids and
# truncates the result to at most 128 ids.
input_ids = [
    tokenizer.encode(
        review_text,
        add_special_tokens=True,
        truncation=True,
        max_length=128,
    )
    for review_text in treviews
]

In [None]:
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

MAX_LEN = 64     # every sequence is padded/truncated to this many ids
batch_size = 32  # number of reviews per forward pass

# Pad at the end, or cut from the end, so each sequence has exactly MAX_LEN ids.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long",
                          truncating="post", padding="post")

# Attention mask: 1.0 where the token id is > 0, 0.0 elsewhere.
# NOTE(review): this treats id 0 as padding; confirm that matches the
# tokenizer's pad/start-token ids and the preprocessing used for training.
attention_masks = [[float(token_id > 0) for token_id in seq] for seq in input_ids]

# Wrap ids, masks and labels as tensors.
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(tsentiments)

# Serve the data in its stored order, batch_size rows at a time.
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(
    prediction_data,
    sampler=prediction_sampler,
    batch_size=batch_size,
)

## Loading Our Model

In [None]:
#...
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
# Directory holding the saved fine-grained model and its tokenizer files.
model_dir = "./models/en-finegrained-model/model_save/"
# Load the fine-tuned classifier and the tokenizer saved alongside it.
# This rebinds the global `model` and `tokenizer`, replacing the binary model loaded earlier.
model = XLMRobertaForSequenceClassification.from_pretrained(model_dir)
tokenizer = XLMRobertaTokenizer.from_pretrained(model_dir)

# Move the model to the selected `device` (the GPU only when one was found, otherwise the CPU).
model.to(device)

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

## Evaluate on the Dev Set

In [None]:
# Prediction on test set
# Run the model over the whole dev set and collect the raw logits batch by batch.
print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

# Switch the model to evaluation mode.
model.eval()

# One entry per batch: logits and the matching true labels.
predictions, true_labels = [], []
for batch in prediction_dataloader:
    # Move the batch's three tensors to the active device.
    b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

    # Inference only: no gradients are computed or stored.
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)

    # The first element of the model output holds the logits; keep CPU NumPy copies.
    predictions.append(outputs[0].detach().cpu().numpy())
    true_labels.append(b_labels.to('cpu').numpy())
print('DONE.')

Predicting labels for 5,000 test sentences...
DONE.


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score

# Score the whole dev set in one go. Averaging per-batch scores (the previous
# approach) gives the final, smaller batch the same weight as a full one, which
# skews the result when the data size is not a multiple of the batch size
# (here: 5,000 reviews in batches of 32).
flat_true_labels = np.concatenate(true_labels)
# Each batch's logits form a 2-D array with one column per class; the predicted
# label is the column with the highest value.
flat_pred_labels = np.argmax(np.concatenate(predictions, axis=0), axis=1)

acc = accuracy_score(flat_true_labels, flat_pred_labels)
mae = mean_absolute_error(flat_true_labels, flat_pred_labels)

# Both scores are multiplied by 100, as in the original report.
print("Accuracy on the Test Set ({} data): ".format(len(df_test)), acc*100)
print("MAE on the Test Set ({} data): ".format(len(df_test)), mae*100)

Accuracy on the Test Set (5000 data):  57.74283439490446
MAE on the Test Set (5000 data):  50.53742038216561


## Test on New Data

In [None]:
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np
from sklearn.metrics import accuracy_score
MAX_LEN = 64

def predict_new(treviews, tsentiments):
    """Run the currently loaded model on `treviews` and print its scores.

    Relies on the notebook-level `tokenizer`, `model`, `device` and `MAX_LEN`.
    NOTE: this redefines the `predict_new` declared earlier in the notebook
    (binary section); from here on this definition is the one in effect.

    Args:
        treviews: sequence (list or array) of review texts.
        tsentiments: integer labels, one per review, in the same order.

    Returns:
        None. Accuracy and MAE (both multiplied by 100) are printed, followed
        by the true and predicted labels of the first batch.

    Raises:
        ValueError: if `treviews` is empty.
    """
    # Fail early with a clear message rather than later in the metric computation.
    if len(treviews) == 0:
        raise ValueError("predict_new() needs at least one review to score")

    # Tokenize each review, add the special start/end tokens, map tokens to
    # ids and cap each encoding at 128 ids.
    input_ids = [
        tokenizer.encode(sent, add_special_tokens=True, truncation=True, max_length=128)
        for sent in treviews
    ]
    # Pad at the end, or cut from the end, so each sequence has exactly MAX_LEN ids.
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN,
                              dtype="long", truncating="post", padding="post")

    prediction_inputs = torch.tensor(input_ids)
    # Attention mask: 1.0 where the id is > 0, 0.0 elsewhere (same rule as
    # before, computed on the tensor instead of in a Python loop).
    # NOTE(review): this treats id 0 as padding; confirm that matches the
    # tokenizer's pad/start-token ids and the preprocessing used for training.
    prediction_masks = (prediction_inputs > 0).float()
    prediction_labels = torch.tensor(tsentiments)

    # Serve the data in its given order, 32 reviews at a time.
    batch_size = 32
    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

    print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))
    # Switch the model to evaluation mode.
    model.eval()
    predictions, true_labels = [], []
    for batch in prediction_dataloader:
        # Move the batch's three tensors to the active device.
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        # Inference only: no gradients are computed or stored.
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)

        # The first element of the model output holds the logits.
        predictions.append(outputs[0].detach().cpu().numpy())
        true_labels.append(b_labels.to('cpu').numpy())
    print('DONE.')

    # Score all reviews at once: averaging per-batch scores would give a
    # final, smaller batch the same weight as a full one.
    all_true = np.concatenate(true_labels)
    all_pred = np.argmax(np.concatenate(predictions, axis=0), axis=1)
    acc = accuracy_score(all_true, all_pred)
    mae = mean_absolute_error(all_true, all_pred)

    print("Accuracy on the Test Set ({} data): ".format(len(treviews)), acc*100)
    print("MAE on the Test Set ({} data): ".format(len(treviews)), mae*100)

    # Show the first batch side by side as a quick sanity check.
    print("True label: {} || Predicted: {}".format(true_labels[0], np.argmax(predictions[0], axis=1).flatten()))

In [None]:
# Create new sentence and label lists

# newreviews = np.array(["クソ悪い商品でがっかりだと思ったら、１ヶ月ぐらい使ったら本当の便利さがわかった","幼馴染が職場に来て先輩と関係を持ってしまうストーリーです、主人公は幼馴染の事が好きそうなのがわかります。先輩はお客さんと全員関係を持っていて売り上げを出しています。ストーリーの展開が早くて続きが気になりました！","星4→星1に変更しました1ヶ月使っていますが、給水して電源をオンにしても出ないことが多いです。手入れしたり再起動しても同様なので、値段の割に失敗したかも…-----以下購入時レビュー-------良かった点・ヒート機能を使っても音は静か・寝るときに一部の明かりを消したりできる(一部機能をオフにする)・上から水をいれれる悪かった点・サイズの割にあまりパワーは無い・給水してるのに動かないことが多い(再起動で解決)","加湿性、給水も楽で申し分ないのですが、キーンというモスキート音？が気になりました。日中はテレビをつけたりしているので気にならないのですが、夜は気になってダメで消して寝てます。それ以外は全く問題なく、子供はモスキート音が気にならないらしく、子供部屋に設置してます。"])
# add 最初は at the beginning of the first review above, and the machine will know its a positive review

# A few English sample reviews to run through predict_new.
newreviews = np.array(["i thought there was a mistake in the packaging that made me dissapointed but after opening it, i felt the usefulness of the product","what a product, cant even put my satisfaction into words","everybody says that this product is one of the best ones, maybe not for me","damn how can you build a product with this quality, salute!"])
# Author's observation: with a full stop added to the third review above, the model classifies it as negative.

# Labels, one per review above. NOTE(review): these look like binary 0/1 labels although this section loads the fine-grained model; confirm which label scheme is intended.
newsentiments = np.array([1,1,0,1])

In [None]:
# Load Rakuten Binary Test Data

# The CSV is read without a header row, so its columns are addressed by position (0, 1, 2, ...).
ja_rakuten_path = './data/rakuten-sentiment-dataset/binary/binary_test.csv'
df_rakuten = pd.read_csv(ja_rakuten_path, header=None)
# The review texts are extracted after the shuffle further down in this cell;
# the pre-shuffle copy that used to be taken here was overwritten before any use.

def label_sentiment(row):
    """Map a Rakuten row's class id (column 0) to a binary label.

    Class 1 becomes 0, class 2 becomes 1; any other value yields None.
    """
    class_id = row[0]
    if class_id == 1:
        return 0
    return 1 if class_id == 2 else None

# Shuffle the rows and renumber the index from 0. The fixed seed makes the row
# order identical on every run.
df_rakuten = df_rakuten.sample(frac=1, random_state=42).reset_index(drop=True)
# Attach the binary label to every row.
df_rakuten['sentiment'] = df_rakuten.apply(label_sentiment, axis=1)
print('Total positive review: ', df_rakuten.loc[df_rakuten['sentiment'] == 1].shape[0])
print('Total negative review: ', df_rakuten.loc[df_rakuten['sentiment'] == 0].shape[0])
# Column 2 is used as the review text; labels come from the new 'sentiment' column.
newreviews = df_rakuten[2].values
newsentiments = df_rakuten['sentiment'].values

In [None]:
# Load a combination of English and Japanese Binary Test Data from Amazon

# english
df_en = pd.read_json(en_dev_path, lines=True)
# japanese
df_ja = pd.read_json(ja_dev_path, lines=True)
# Stack the English and Japanese dev splits into one frame.
df_test = pd.concat([df_en, df_ja])

# Drop 3-star reviews: the binary labels below only cover 1-2 and 4-5 stars.
df_test = df_test[df_test.stars != 3]
# These are dev splits, so they are reported as test data (the message used to say "training").
print('Total number of test data: ', df_test.shape[0])

def label_sentiment(row):
    """Turn a review row's star rating into a binary sentiment label.

    1 and 2 stars give 0, 4 and 5 stars give 1; any other rating gives None.
    """
    stars = row['stars']
    is_negative = stars == 1 or stars == 2
    is_positive = stars == 4 or stars == 5
    if is_negative:
        return 0
    return 1 if is_positive else None

# Shuffle the rows and renumber the index from 0. The fixed seed keeps the row
# order, and therefore any sample inspected by index, identical on every run.
df_test = df_test.sample(frac=1, random_state=42).reset_index(drop=True)
# Attach the binary label to every row.
df_test['sentiment'] = df_test.apply(label_sentiment, axis=1)
print('Total positive review: ', df_test.loc[df_test['sentiment'] == 1].shape[0])
print('Total negative review: ', df_test.loc[df_test['sentiment'] == 0].shape[0])

# Create sentence and label lists
newreviews = df_test.review_body.values
newsentiments = df_test.sentiment.values

Total number of training data:  8000
Total positive review:  4000
Total negative review:  4000


In [None]:
# Spot-check one review text from the evaluation arrays.
newreviews[1]

"Very durable. Key and transponder fit perfectly. Has a nice dent near the top where your thumb comfortably fits when you turn the key. Only thing that prevents a 5-star review is the area around the panic button. On the factory key shell, the panic button sits in the plastic so you don't accidentally press it. On this shell, the panic button sits out further than any other button. I've twice pressed the panic button while the keys were in my pocket. Not the end of the world, but something to be aware of."

In [None]:
# Label stored at the same index as the review inspected in the previous cell.
newsentiments[1]

1

In [None]:
# Score the model on the current `newreviews` / `newsentiments` arrays.
# NOTE(review): `model` was last rebound to the fine-grained checkpoint while these
# labels are binary (0/1) — confirm this pairing is intended before trusting the scores.
predict_new(newreviews, newsentiments)

Predicting labels for 8,000 test sentences...
DONE.
Accuracy on the Test Set (8000 data):  92.80000000000001
MAE on the Test Set (8000 data):  7.199999999999999
True label: [1 1 0 0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 0 0 1 1 1 1 1 0 0 1 1 0 0 1] || Predicted: [1 1 0 0 0 1 1 0 0 0 0 1 1 1 1 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 0 1]
