<a href="https://colab.research.google.com/github/aruaru0/bert-regression-test/blob/main/HuggingFace_BERT_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# transformersなどのインストール

In [None]:
# Install the Hugging Face libraries used by the cells below.
!pip install transformers
!pip install datasets
!pip install evaluate
# accelerate is installed from the GitHub repository rather than from a PyPI release.
!pip install git+https://github.com/huggingface/accelerate

In [None]:
# Extra packages, presumably needed by the Japanese BERT tokenizer loaded
# later in this notebook (tokenization backend and dictionaries) - TODO confirm.
!pip install fugashi
!pip install ipadic
!pip install unidic-lite

# データセットを準備

In [None]:
from datasets import load_dataset

# Load the "japanese" configuration of the tyqiangz/multilingual-sentiments dataset.
dataset = load_dataset("tyqiangz/multilingual-sentiments", "japanese")

In [None]:
# Show the loaded dataset object as the cell output.
dataset

# Tokenizerの取得

In [None]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer here and the model loaded later.
model_ckpt = "cl-tohoku/bert-large-japanese"
# Load the tokenizer that belongs to the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# データサイズを減らす

In [None]:
# Shuffle seed and the number of rows kept per split.
SEED = 42
TRAIN_SIZE = 2000
TEST_SIZE = 1000

# Shuffle each split with the same seed and keep only its first rows;
# the validation and test splits share TEST_SIZE.
for _split, _size in (
    ("train", TRAIN_SIZE),
    ("validation", TEST_SIZE),
    ("test", TEST_SIZE),
):
    _shuffled = dataset[_split].shuffle(seed=SEED)
    dataset[_split] = _shuffled.select(range(_size))

# データセットの加工

In [None]:
import torch

# Maximum number of tokens kept per text.
MAX = 512


def tokenize(batch):
    """Encode the "text" field of ``batch`` with the notebook's tokenizer.

    Padding is enabled and inputs are truncated to at most ``MAX`` tokens.
    Returns the tokenizer's encoding unchanged.
    """
    return tokenizer(
        batch["text"],
        padding=True,
        truncation=True,
        max_length=MAX,
    )

In [None]:
# Apply `tokenize` across the dataset and keep the encoded result.
# NOTE(review): `map` is called without `batched=True`, so `tokenize` presumably
# receives one example at a time - confirm against the datasets documentation.
dataset_encoded = dataset.map(tokenize)

In [None]:
# Bind each encoded split to its own name for the training and evaluation cells.
small_train_dataset, small_valid_dataset, small_test_dataset = (
    dataset_encoded[split_name]
    for split_name in ('train', 'validation', 'test')
)

In [None]:
from datasets import Value

# Copy the training split's feature schema and switch 'label' to float64,
# then cast all three splits to that schema so labels become floats.
new_features = small_train_dataset.features.copy()
new_features['label'] = Value("float64")
small_train_dataset, small_valid_dataset, small_test_dataset = (
    split.cast(new_features)
    for split in (small_train_dataset, small_valid_dataset, small_test_dataset)
)


# 学習

In [None]:
import torch
from transformers import AutoModelForSequenceClassification

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# A single output value, as the model is trained on float labels.
num_labels = 1

# Load the pretrained checkpoint with a one-output head, then move it to `device`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
)
model = model.to(device)

In [None]:
from sklearn.metrics import accuracy_score, f1_score
import evaluate
import numpy as np
# Mean-squared-error metric used to score predictions during evaluation.
metric = evaluate.load("mse")


def compute_metrics(eval_pred):
    """Compute the MSE between model predictions and reference labels.

    Args:
        eval_pred: A pair that unpacks to ``(predictions, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the flattened inputs.
    """
    predictions, labels = eval_pred
    # With num_labels=1 the predictions carry a trailing axis of size 1 while
    # the labels are one-dimensional; flatten both so the metric receives
    # matching 1-D sequences of floats.
    predictions = np.asarray(predictions).reshape(-1)
    labels = np.asarray(labels).reshape(-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
import inspect

from transformers import TrainingArguments

batch_size = 4
# Log once per len(dataset) // batch_size steps; clamp to 1 so a dataset
# smaller than the batch size cannot produce a logging interval of 0.
logging_steps = max(1, len(small_train_dataset) // batch_size)
model_name = "multilingual-sentiments-regression-bert"

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases. The notebook installs transformers unpinned, so pick whichever
# keyword the installed version actually accepts.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
    log_level="error",
    # Evaluate at the end of every epoch.
    **{_eval_strategy_kwarg: "epoch"},
)

In [None]:
import inspect

from transformers import Trainer

# Newer transformers releases deprecate Trainer's `tokenizer` argument in
# favour of `processing_class`. Pick whichever keyword the installed version
# accepts so the cell runs on both old and new releases.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# Train on the reduced training split and evaluate on the validation split,
# scoring with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=small_train_dataset,
    eval_dataset=small_valid_dataset,
    **{_tokenizer_kwarg: tokenizer},
)
trainer.train()

# テストデータに対する結果を評価


In [None]:
# Run the trained model on the test split; the result's `.predictions` and
# `.label_ids` are used by the analysis and plotting cells below.
preds_output = trainer.predict(small_test_dataset)

In [None]:
from itertools import islice

# Number of test samples that feed the per-label statistics. The previous
# counter-based loop checked its limit after appending and therefore
# processed 101 samples instead of 100.
MAX_SAMPLES = 100

# x[k] collects the predictions (shifted by +1) of samples whose label is k.
x = [[] for _ in range(3)]
samples = zip(preds_output.predictions, preds_output.label_ids)
for p, l in islice(samples, MAX_SAMPLES):
    # Each prediction row is indexed at 0 to get its single value.
    x[int(l)].append(p[0] + 1)

# Print the mean and standard deviation of the predictions for each label,
# numbering the labels from 1.
for i in range(3):
    v = np.array(x[i])
    print(f"{i+1}: mean={v.mean()}, std = {v.std()}")

In [None]:
import matplotlib.pyplot as plt

# Scatter predictions (x axis) against reference labels (y axis), both shifted
# by +1; the low alpha makes overlapping points show up as density.
plt.scatter(preds_output.predictions+1, preds_output.label_ids+1, alpha=0.05)

In [None]:
# Box plot of the per-label prediction lists collected in `x` above.
plt.boxplot(x)

# モデルの保存、読み込み

In [None]:
# Save the trained model under ./<model_name>-test.
trainer.save_model(f"./{model_name}-test")

In [None]:
# Reload the tokenizer and the model from the directory written by the save
# cell, then move the model to `device`.
# NOTE(review): this assumes the saved directory also contains the tokenizer
# files - confirm.
tokenizer = AutoTokenizer.from_pretrained(f"./{model_name}-test")

model = AutoModelForSequenceClassification.from_pretrained(f"./{model_name}-test")
model = model.to(device)