<a href="https://colab.research.google.com/github/aruaru0/bert-classification-test/blob/main/transformers_japanese_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformersのテスト

## install packages

In [None]:
# インストール後、ランタイムを再起動する必要あり！！
!pip install transformers
!pip install datasets
!pip install evaluate
# pip install accelerate -U などでインストールすると再起動が必要となる
!pip install git+https://github.com/huggingface/accelerate

In [None]:
# 日本語関係
!pip install fugashi
!pip install ipadic

## データをロード（amazonのレビューデータ）

In [None]:
from datasets import load_dataset

In [None]:
# Dataset card: https://huggingface.co/datasets/amazon_reviews_multi
# Load the "ja" configuration of the dataset.
# NOTE(review): availability of this dataset on the Hub should be confirmed
# before running.
dataset = load_dataset("amazon_reviews_multi", "ja")

In [None]:
# Show the loaded dataset object as the cell output.
dataset

### pandasに変換する場合

In [None]:
# Switch the output format so that indexing returns pandas objects.
dataset.set_format(type="pandas")
# Slice the full train split and preview its first five rows.
train_df = dataset["train"][:]
train_df.head(5)

In [None]:
dataset.reset_format() # restore the dataset's original output format

## Tokenizerを取得し、トークナイザーのテストをしてみる

In [None]:
from transformers import AutoTokenizer

# Checkpoint name; it is reused further down when the classification model
# is loaded.
model_ckpt = "cl-tohoku/bert-base-japanese"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# Take the first review of the train split as a sample.
# Index the row first: `dataset['train']['review_body'][0]` would load the
# whole `review_body` column just to read a single element.
sample_text = dataset['train'][0]['review_body']
sample_text

In [None]:
# Run the tokenizer on the sample review and print the resulting encoding.
sample_text_encoded = tokenizer(sample_text)
print(sample_text_encoded)

In [None]:
# Map the encoded ids back to their token strings for inspection.
sample_input_ids = sample_text_encoded["input_ids"]
tokens = tokenizer.convert_ids_to_tokens(sample_input_ids)
print(tokens)

In [None]:
# Join the token strings back into a single text and print it.
decode_text = tokenizer.convert_tokens_to_string(tokens)
print(decode_text)

## データサイズを削減

In [None]:
# The full dataset is large, so keep only a shuffled subset of every split.
SEED = 42
TRAIN_SIZE = 10000
TEST_SIZE = 1000

# The validation and test splits both use TEST_SIZE rows.
for split_name, n_rows in (
    ("train", TRAIN_SIZE),
    ("validation", TEST_SIZE),
    ("test", TEST_SIZE),
):
    shuffled = dataset[split_name].shuffle(seed=SEED)
    dataset[split_name] = shuffled.select(range(n_rows))

## データセットをトークン化する

In [None]:
import torch

def tokenize(batch):
    """Tokenize review text and attach zero-based class labels.

    Works both for a single example (``batch["stars"]`` is a scalar) and for
    a batched call such as ``dataset.map(tokenize, batched=True)`` where
    ``batch["stars"]`` is a list; the original ``batch['stars'] - 1`` raised
    ``TypeError`` for the list case.

    Args:
        batch: Mapping with a ``"review_body"`` entry (text or list of texts)
            and a ``"stars"`` entry (rating or list of ratings).

    Returns:
        The tokenizer output with an extra ``"labels"`` entry holding
        ``stars - 1`` for each example.
    """
    enc = tokenizer(batch["review_body"], padding=True, truncation=True)
    stars = batch["stars"]
    if isinstance(stars, (list, tuple)):
        # Batched input: shift every rating individually.
        targets = [star - 1 for star in stars]
    else:
        # Single example (or an array-like that supports arithmetic).
        targets = stars - 1
    enc["labels"] = targets
    return enc

In [None]:
# max_len = 512
# pad_to_max = False
# def tokenize_data(example):
#     # Tokenize the review body
#     text_ = example['review_body'] + " " + example['review_title'] + " " + example['product_category']
#     encodings = tokenizer.encode_plus(text_, pad_to_max_length=pad_to_max, max_length=max_len,
#                                            add_special_tokens=True,
#                                             return_token_type_ids=False,
#                                             return_attention_mask=True,
#                                             return_overflowing_tokens=False,
#                                             return_special_tokens_mask=False,
#                                            )

#     # Subtract 1 from labels to have them in range 0-4
#     targets = torch.tensor(example['stars']-1,dtype=torch.long)


#     encodings.update({'labels': targets})
#     return encodings

In [None]:
# Try `tokenize` on one training example and show the keys it produces.
tokenize(dataset['train'][0]).keys()

In [None]:
# Apply `tokenize` to the dataset. `map` is called without `batched=True`,
# so (per the `datasets` default) `tokenize` receives one example per call.
dataset_encoded = dataset.map(tokenize)

In [None]:
# Show the review text of the first encoded training example.
dataset_encoded["train"][0]['review_body']

In [None]:
import pandas as pd
# Lay out ids, attention mask and token strings of one example side by side.
sample_encoded = dataset_encoded["train"][0]
preview_index = ['input_ids', 'attention_mask', "tokens"]
preview_rows = [
    sample_encoded["input_ids"],
    sample_encoded["attention_mask"],
    tokenizer.convert_ids_to_tokens(sample_encoded["input_ids"]),
]
# Built row-wise, then transposed so that each entry becomes a column.
pd.DataFrame(preview_rows, preview_index).transpose()

In [None]:
# Short aliases for the three encoded splits.
small_train_dataset, small_valid_dataset, small_test_dataset = (
    dataset_encoded['train'],
    dataset_encoded['validation'],
    dataset_encoded['test'],
)

## 学習

In [None]:
import torch
from transformers import AutoModelForSequenceClassification

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# One output class per star rating handled by this notebook.
num_labels = 5

# Load the checkpoint with a classification head and move it to `device`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt, num_labels=num_labels
)
model = model.to(device)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Return accuracy and weighted F1 for an evaluation result.

    Args:
        pred: Object that unpacks into ``(predictions, labels)``; the
            predicted class is taken as the argmax over the last axis of
            ``predictions``.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, y_true = pred
    y_hat = scores.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1": f1_score(y_true, y_hat, average="weighted"),
    }

In [None]:
from transformers import TrainingArguments

batch_size = 16
# Log roughly once per epoch. Clamp to at least 1: with a training set smaller
# than `batch_size` the integer division yields 0, which TrainingArguments
# rejects for step-based logging.
logging_steps = max(1, len(small_train_dataset) // batch_size)
model_name = "amazon-review-classification-bert"

# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
    log_level="error"
)

In [None]:
from transformers import Trainer

# Fine-tune `model` on the train subset, evaluating on the validation subset
# with `compute_metrics`.
# NOTE(review): newer transformers releases expect `processing_class=` in
# place of `tokenizer=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=small_train_dataset,
    eval_dataset=small_valid_dataset,
    tokenizer=tokenizer
)
trainer.train()

## 混同行列を作成

In [None]:
# Run the trained model over the test subset; the result's `predictions`
# attribute is consumed by the confusion-matrix cell.
preds_output = trainer.predict(small_test_dataset)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Predicted class index per test example (argmax over the class axis).
y_preds = preds_output.predictions.argmax(axis=1)
# True labels; despite the name, these come from the test subset.
y_valid = np.array(small_test_dataset["labels"])
# Display names for the five classes, in class-index order.
labels = [f"{stars}star" for stars in range(1, 6)]
# Alternative considered: dataset_encoded["train"].features["label"].names

def plot_confusion_matrix(y_preds, y_true, labels):
    """Draw a confusion matrix normalized over the true labels.

    Args:
        y_preds: Predicted class for each sample.
        y_true: Ground-truth class for each sample.
        labels: Display names used for the matrix axes.
    """
    normalized_cm = confusion_matrix(y_true, y_preds, normalize="true")
    _, axis = plt.subplots(figsize=(6, 6))
    ConfusionMatrixDisplay(
        confusion_matrix=normalized_cm,
        display_labels=labels,
    ).plot(cmap="Blues", values_format=".2f", ax=axis, colorbar=False)
    # No colorbar axes is created, so `axis` is the axes that gets the title.
    axis.set_title("Normalized confusion matrix")
    plt.show()

# Plot test-subset predictions against the true labels.
plot_confusion_matrix(y_preds, y_valid, labels)

In [None]:
# Save the trained model to a local directory; the next section reloads the
# model and tokenizer from this same path.
trainer.save_model(f"./{model_name}-test")

## 保存したモデルを使って推論してみる

In [None]:
# Reload tokenizer and model from the directory written by `save_model`.
saved_model_dir = f"./{model_name}-test"

new_tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)

new_model = AutoModelForSequenceClassification.from_pretrained(saved_model_dir)
new_model = new_model.to(device)

In [None]:
# Truncate exactly as during training, so that an over-long review cannot
# exceed the model's maximum sequence length, then move the tensors to the
# model's device.
inputs = new_tokenizer(sample_text, return_tensors="pt", truncation=True).to(device)

new_model.eval()

with torch.no_grad():
    # Pass the tensors by keyword so the call does not depend on the
    # positional order of the model's forward() parameters.
    outputs = new_model(**inputs)

print(sample_text)
outputs.logits

In [None]:
# Pick the highest-scoring class per row; +1 turns the zero-based class index
# back into a star rating (labels were built as `stars - 1`).
predicted_class = outputs.logits.argmax(dim=1)
y_preds = predicted_class.cpu().numpy() + 1
y_preds