<a href="https://colab.research.google.com/github/aruaru0/bert-classification-test/blob/main/HuggingFace_BERT_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# transformersなどのインストール

In [None]:
# Install the Hugging Face libraries used by this notebook.
# NOTE(review): `evaluate` is installed but not imported in any visible cell.
!pip install transformers
!pip install datasets
!pip install evaluate
# accelerate is installed from the GitHub repository rather than from PyPI.
!pip install git+https://github.com/huggingface/accelerate

In [None]:
# NOTE(review): presumably the MeCab wrapper and dictionary required by the
# Japanese BERT tokenizer loaded further down — confirm.
!pip install fugashi
!pip install ipadic

# データセットを準備

In [None]:
from datasets import load_dataset

# Load the "japanese" configuration of the tyqiangz/multilingual-sentiments dataset.
dataset = load_dataset("tyqiangz/multilingual-sentiments", "japanese")

In [None]:
# Show the loaded dataset object as the cell output.
dataset

# Tokenizerの取得

In [None]:
from transformers import AutoTokenizer

# Checkpoint identifier; reused later when the classification model is loaded.
model_ckpt = "cl-tohoku/bert-base-japanese"
# Tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# データサイズを減らす

In [None]:
SEED = 42
TRAIN_SIZE = 10000
TEST_SIZE = 1000

# Number of examples to keep per split; validation and test share TEST_SIZE.
split_sizes = {"train": TRAIN_SIZE, "validation": TEST_SIZE, "test": TEST_SIZE}

# Shuffle each split with the same seed, then keep only its first `size` rows.
for split_name, size in split_sizes.items():
    shuffled = dataset[split_name].shuffle(seed=SEED)
    dataset[split_name] = shuffled.select(range(size))

# データセットの加工

In [None]:
import torch

def tokenize(batch):
    """Tokenize the ``text`` field and carry the ``label`` field along.

    Args:
        batch: Mapping with a ``"text"`` entry to tokenize and a ``"label"``
            entry to copy into the result.

    Returns:
        The tokenizer's output with an extra ``"label"`` key.
    """
    encoded = tokenizer(batch["text"], truncation=True, padding=True)
    encoded["label"] = batch["label"]
    return encoded

In [None]:
# Apply `tokenize` to every split of the dataset.
# NOTE(review): `batched` is not passed, so — assuming the `datasets` default —
# `tokenize` receives one example at a time and `padding=True` has nothing to
# pad against; confirm this is intended.
dataset_encoded = dataset.map(tokenize)

In [None]:
# Pull the three encoded splits out into individually named variables.
small_train_dataset, small_valid_dataset, small_test_dataset = (
    dataset_encoded[split] for split in ("train", "validation", "test")
)

# 学習

In [None]:
import torch
from transformers import AutoModelForSequenceClassification

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Size the classification head from the dataset's label feature instead of a
# hard-coded 5. The evaluation cell of this notebook works with three classes,
# and a 5-way head can emit class ids that have no matching display label.
# Assumes the "label" column is a `datasets.ClassLabel` feature — TODO confirm.
num_labels = dataset["train"].features["label"].num_classes

# Load the pretrained encoder with a fresh classification head and move it to
# the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt, num_labels=num_labels
).to(device)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for an evaluation pass.

    Args:
        pred: Two-element iterable that unpacks into ``(predictions, labels)``.
            The class is taken as the argmax of ``predictions`` over its last
            axis.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, labels = pred
    predicted = scores.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, predicted),
        "f1": f1_score(labels, predicted, average="weighted"),
    }

In [None]:
from transformers import TrainingArguments

import inspect

batch_size = 16
# Log roughly once per epoch; clamp to 1 so a training set smaller than one
# batch does not produce a logging interval of 0.
logging_steps = max(1, len(small_train_dataset) // batch_size)
model_name = "multilingual-sentiments-classification-bert"

# The `evaluation_strategy` keyword was renamed to `eval_strategy` in newer
# transformers releases, and the old name is no longer accepted by recent
# ones. Pick whichever name the installed version exposes so the cell works
# with both.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
    log_level="error",
    # Evaluate on the validation set at the end of every epoch.
    **{_eval_strategy_key: "epoch"},
)

In [None]:
from transformers import Trainer

import inspect

# Newer transformers releases accept the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword. Pick whichever name the installed
# version exposes so the tokenizer is still attached to the trainer (and
# saved alongside the model) on both old and new versions.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=small_train_dataset,
    eval_dataset=small_valid_dataset,
    **{_tokenizer_key: tokenizer},
)
# Fine-tune the model; metrics are reported according to `training_args`.
trainer.train()

# テストデータに対する結果を評価


In [None]:
# Run the fine-tuned model over the held-out test split; the result's
# `.predictions` attribute is consumed by the confusion-matrix cell below.
preds_output = trainer.predict(small_test_dataset)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix


# Predicted class is the index of the largest score along axis 1; the ground
# truth is the "label" column of the test split.
y_preds = preds_output.predictions.argmax(axis=1)
y_valid = np.array(small_test_dataset["label"])
# Display names for the matrix axes.
# Alternative: dataset_encoded["train"].features["label"].names
labels = ["3star", "2star", "1star"]


def plot_confusion_matrix(y_preds, y_true, labels):
    """Draw a confusion matrix normalized over the true labels.

    Args:
        y_preds: Predicted class ids.
        y_true: Ground-truth class ids.
        labels: Display names used for the axis tick labels.
    """
    normalized_cm = confusion_matrix(y_true, y_preds, normalize="true")
    _, ax = plt.subplots(figsize=(6, 6))
    ConfusionMatrixDisplay(
        confusion_matrix=normalized_cm,
        display_labels=labels,
    ).plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
    ax.set_title("Normalized confusion matrix")
    plt.show()


plot_confusion_matrix(y_preds, y_valid, labels)

# モデルの保存、読み込み

In [None]:
# Write the trained model to "./<model_name>-test"; the following cells load
# it back from the same directory.
trainer.save_model(f"./{model_name}-test")

In [None]:
# Directory written by the save step above.
saved_model_dir = f"./{model_name}-test"

# Reload both the tokenizer and the classifier from disk, then move the model
# to the device chosen earlier.
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)
model = AutoModelForSequenceClassification.from_pretrained(saved_model_dir).to(device)

# Pipelineを使って推論

In [None]:
from transformers import pipeline

# Build a text-classification pipeline from the saved model directory and
# classify one sample review; the result is shown as the cell output.
pipe = pipeline(task="text-classification", model=f"./{model_name}-test")
pipe("同価格帯のガン型電ドラより力が入りにくいが、手回しより楽です。締めすぎないので良いです。")