<a href="https://colab.research.google.com/github/aruaru0/bert-regression-test/blob/main/transformers_japanese_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformersのテスト：日本語BERTによるAmazonレビュー評価（星の数）の回帰

## install packages

In [None]:
# The runtime must be restarted after these installs finish!
# %pip (instead of !pip) runs pip for the environment of the running kernel.
%pip install transformers
%pip install datasets
%pip install evaluate
# Installing accelerate with e.g. `pip install accelerate -U` requires a runtime
# restart (presumably why the GitHub build is installed here instead).
%pip install git+https://github.com/huggingface/accelerate

In [None]:
# Japanese-language tokenization dependencies.
# %pip (instead of !pip) runs pip for the environment of the running kernel.
%pip install fugashi
%pip install ipadic
# unidic-lite is needed for bert-large-japanese
%pip install unidic-lite

## データをロード（amazonのレビューデータ）

In [None]:
from datasets import load_dataset

In [None]:
# Dataset card: https://huggingface.co/datasets/amazon_reviews_multi
# Load the "ja" configuration of the Amazon reviews dataset.
# NOTE(review): availability of this dataset on the Hub may have changed — confirm it still downloads.
dataset = load_dataset("amazon_reviews_multi", "ja")

In [None]:
# Show the loaded dataset object (its repr is the cell output).
dataset

### pandasに変換する場合

In [None]:
# Convert only the train split to a pandas DataFrame for inspection.
# Dataset.to_pandas() leaves the output format of `dataset` untouched, so the
# tokenization cells further down keep working even if cells are run out of order.
train_df = dataset["train"].to_pandas()
train_df.head(5)

In [None]:
dataset.reset_format() # restore the dataset's default output format

## トークン化

In [None]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer here and the model loaded later.
model_ckpt = "cl-tohoku/bert-large-japanese"
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# Take the review text of the first training example.
# Indexing the row first avoids materialising the whole `review_body` column
# just to read a single string.
sample_text = dataset['train'][0]['review_body']
sample_text

In [None]:
# Encode the sample review with the tokenizer and print the resulting encoding.
sample_text_encoded = tokenizer(sample_text)
print(sample_text_encoded)

In [None]:
# Map the encoded ids back to their token strings to see how the text was split.
tokens = tokenizer.convert_ids_to_tokens(sample_text_encoded.input_ids)
print(tokens)

In [None]:
# Join the tokens back into a single string and print it.
decode_text = tokenizer.convert_tokens_to_string(tokens)
print(decode_text)

In [None]:
# データが多いので部分データに変換しておく
SEED = 42
TRAIN_SIZE = 2000
TEST_SIZE = 1000

dataset["train"] = dataset["train"].shuffle(seed=SEED).select(range(TRAIN_SIZE))
dataset["validation"] = dataset["validation"].shuffle(seed=SEED).select(range(TEST_SIZE))
dataset["test"] = dataset["test"].shuffle(seed=SEED).select(range(TEST_SIZE))

In [None]:
import torch

MAX = 512  # maximum token length passed to the tokenizer; longer reviews are truncated


def tokenize(batch):
    """Tokenize review text and attach a float regression target.

    Works both for a single example (``dataset.map(tokenize)``) and for a
    batch of examples (``dataset.map(tokenize, batched=True)``).

    Args:
        batch: Mapping with a ``"review_body"`` entry (a string, or a list of
            strings when batched) and a ``"stars"`` entry (a number, or a list
            of numbers when batched).

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding
        ``stars - 1`` as float (a list of floats when batched).
    """
    enc = tokenizer(batch["review_body"], padding=True, truncation=True, max_length=MAX)
    stars = batch["stars"]
    if isinstance(stars, (list, tuple)):
        # Batched call: one target per review.
        targets = [float(star - 1) for star in stars]
    else:
        targets = float(stars - 1)
    enc.update({'labels': targets})
    return enc

In [None]:
# Demo: tokenize two sentences of different length in one call, with padding and
# truncation enabled and a small max_length, to inspect the resulting encoding.
tokenizer(["これはテストの文字列です", "庭には二羽、裏庭には二羽鶏がいる"], padding=True, truncation=True, max_length=16)

In [None]:
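# NOTE: everything below in this cell is commented out and unused — an alternative
# tokenization function that builds integer labels (0-4) instead of float targets.
# max_len = 512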
# pad_to_max = False
# def tokenize_data(example):
#     # Tokenize the review body
#     text_ = example['review_body'] + " " + example['review_title'] + " " + example['product_category']
#     encodings = tokenizer.encode_plus(text_, pad_to_max_length=pad_to_max, max_length=max_len,
#                                            add_special_tokens=True,
#                                             return_token_type_ids=False,
#                                             return_attention_mask=True,
#                                             return_overflowing_tokens=False,
#                                             return_special_tokens_mask=False,
#                                            )

#     # Subtract 1 from labels to have them in range 0-4
#     targets = torch.tensor(example['stars']-1,dtype=torch.long)


#     encodings.update({'labels': targets})
#     return encodings

In [None]:
# Sanity check: list the keys produced by tokenize() for the first training example.
tokenize(dataset['train'][0]).keys()

In [None]:
# Apply tokenize() to every split; called without batched=True, so one example at a time.
dataset_encoded = dataset.map(tokenize)# disabled alternative: batched=True, batch_size=None)

In [None]:
# Show the first encoded training example.
dataset_encoded['train'][0]

In [None]:
# Show the raw review text of the first encoded training example.
dataset_encoded["train"][0]['review_body']

In [None]:
import pandas as pd
# Lay out ids, attention mask and token strings of the first encoded training
# example side by side (one column each) for visual inspection.
sample_encoded = dataset_encoded["train"][0]
token_ids = sample_encoded["input_ids"]
preview_rows = [
    token_ids,
    sample_encoded["attention_mask"],
    tokenizer.convert_ids_to_tokens(token_ids),
]
preview_row_names = ['input_ids', 'attention_mask', "tokens"]
# Built row-wise and then transposed so that each name becomes a column.
pd.DataFrame(preview_rows, preview_row_names).T

In [None]:
# Short aliases for the three encoded splits used for training and evaluation.
small_train_dataset, small_valid_dataset, small_test_dataset = (
    dataset_encoded[split] for split in ("train", "validation", "test")
)

## 学習

In [None]:
import torch
from transformers import AutoModelForSequenceClassification

# Use the GPU when one is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# A single output value: the model predicts one number per review.
num_labels = 1

# Load the pretrained checkpoint with a sequence-classification head and move it to the device.
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels)
model = model.to(device)

In [None]:
from sklearn.metrics import accuracy_score, f1_score
import evaluate
import numpy as np
metric = evaluate.load("mse")

def compute_metrics(eval_pred):
    """Compute the mean squared error between predictions and labels.

    Args:
        eval_pred: Pair ``(predictions, labels)``.

    Returns:
        The dictionary returned by ``metric.compute``.
    """
    predictions, labels = eval_pred
    # The model is created with num_labels=1, so predictions are expected to
    # arrive with a trailing singleton axis while labels are 1-D. Flatten both
    # so the metric receives plain scalars instead of relying on implicit
    # size-1-array-to-scalar conversion.
    predictions = np.asarray(predictions).reshape(-1)
    labels = np.asarray(labels).reshape(-1)
    return metric.compute(predictions=predictions, references=labels)

def compute_metrics_for_regression(eval_pred):
    """Compute a set of regression metrics from predictions and labels.

    Implemented with NumPy only, so it does not depend on scikit-learn names
    that this notebook never imports, nor on the ``squared=False`` argument.

    Args:
        eval_pred: Pair ``(logits, labels)``. Both are flattened to 1-D, so
            logits of shape (N, 1) and labels of shape (N,) are accepted.

    Returns:
        Dict with float entries ``"mse"``, ``"rmse"``, ``"mae"``, ``"r2"`` and
        ``"smape"`` (symmetric MAPE, in percent).

    Raises:
        ValueError: If the inputs are empty or differ in number of elements.
    """
    logits, labels = eval_pred
    preds = np.asarray(logits, dtype=float).reshape(-1)
    targets = np.asarray(labels, dtype=float).reshape(-1)
    if preds.size == 0 or preds.size != targets.size:
        raise ValueError(
            f"expected non-empty inputs of equal size, got {preds.size} predictions and {targets.size} labels"
        )

    errors = preds - targets
    abs_errors = np.abs(errors)
    squared_error_sum = float(np.sum(errors ** 2))

    mse = squared_error_sum / preds.size
    rmse = float(np.sqrt(mse))
    mae = float(np.mean(abs_errors))

    # Coefficient of determination; for constant labels fall back to
    # 1.0 (perfect fit) or 0.0 (any error) instead of dividing by zero.
    total_sum_of_squares = float(np.sum((targets - targets.mean()) ** 2))
    if total_sum_of_squares > 0:
        r2 = 1.0 - squared_error_sum / total_sum_of_squares
    else:
        r2 = 1.0 if squared_error_sum == 0 else 0.0

    # Labels can be 0, so prediction == label == 0 would give 0/0; count such
    # terms as zero error instead of letting NaN poison the whole average.
    denominators = np.abs(targets) + np.abs(preds)
    ratios = np.divide(
        2.0 * abs_errors,
        denominators,
        out=np.zeros_like(denominators),
        where=denominators > 0,
    )
    smape = float(np.mean(ratios) * 100)

    return {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "smape": smape}

In [None]:
from transformers import TrainingArguments

import inspect

batch_size = 4
# Log roughly once per epoch; never 0, even for a dataset smaller than one batch.
logging_steps = max(1, len(small_train_dataset) // batch_size)
# Used as the output directory for checkpoints.
model_name = "amazon-review-classification-bert"

# Newer transformers releases name this argument `eval_strategy`, older ones
# `evaluation_strategy`. The notebook installs an unpinned transformers, so
# pick whichever keyword the installed version accepts.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
    log_level="error",
    **{eval_strategy_key: "epoch"},  # evaluate at the end of every epoch
)

In [None]:
from transformers import Trainer

# Collect the Trainer inputs first, then build the trainer and start fine-tuning.
# NOTE(review): newer transformers releases appear to deprecate `tokenizer=` in
# favour of `processing_class=` — confirm against the installed version.
trainer_options = {
    "model": model,
    "args": training_args,
    "compute_metrics": compute_metrics,
    "train_dataset": small_train_dataset,
    "eval_dataset": small_valid_dataset,
    "tokenizer": tokenizer,
}
trainer = Trainer(**trainer_options)
trainer.train()

In [None]:
# Evaluate the trained model on the eval dataset given to the Trainer (the validation subset).
trainer.evaluate()

## 結果をチェック

In [None]:
# Run the model on the test subset; later cells read `.predictions` and `.label_ids` from the result.
preds_output = trainer.predict(small_test_dataset)

In [None]:
N_STARS = 5      # star ratings 1..5 are stored as labels 0..4
N_SAMPLES = 100  # number of test predictions summarised below

# x[k] collects the predicted star values (prediction + 1) of the reviews
# whose label is k.
x = [[] for _ in range(N_STARS)]
# Slicing takes exactly N_SAMPLES items (the previous counter-based loop
# processed one extra sample).
for pred, label in zip(preds_output.predictions[:N_SAMPLES], preds_output.label_ids[:N_SAMPLES]):
    x[int(label)].append(pred[0] + 1)

for i, bucket in enumerate(x):
    if not bucket:
        # With a small sample a rating may be absent; mean/std of an empty
        # array would be NaN plus a RuntimeWarning.
        print(f"{i+1}: no samples")
        continue
    v = np.array(bucket)
    print(f"{i+1}: mean={v.mean()}, std = {v.std()}")

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Predicted vs. actual star rating for the test predictions. Both arrays are
# shifted by +1 to convert 0-based labels back to stars and flattened so that
# they have matching 1-D shapes.
fig, ax = plt.subplots()
ax.scatter(
    preds_output.predictions.reshape(-1) + 1,
    preds_output.label_ids.reshape(-1) + 1,
    alpha=0.05,
)
ax.set(xlabel="predicted stars", ylabel="actual stars", title="Predicted vs. actual star rating (test)")
plt.show()

In [None]:
# Distribution of predicted stars per actual star rating: box k (tick label k)
# shows the predictions collected in x[k-1].
fig, ax = plt.subplots()
ax.boxplot(x)
ax.set(xlabel="actual stars", ylabel="predicted stars", title="Predicted stars by actual rating")
plt.show()