In [1]:
!pip install transformers datasets evaluate fugashi ipadic sentencepiece accelerate -qU

In [2]:
import random
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from sklearn.metrics import accuracy_score, f1_score
import torch

# Load dataset
dataset = load_dataset("tyqiangz/multilingual-sentiments", "japanese")

# Downsize every split so the experiment runs quickly.
# Set N_SAMPLES = None to train and evaluate on the full dataset instead.
N_SAMPLES = 1000
SEED = 42
if N_SAMPLES is not None:
    random.seed(SEED)
    # Splits are drawn in train -> validation -> test order after a single
    # seeding, so the selected rows are reproducible from run to run.
    # min() keeps random.sample from raising ValueError on a split that has
    # fewer than N_SAMPLES rows.
    dataset = DatasetDict({
        split: dataset[split].select(
            random.sample(
                range(dataset[split].num_rows),
                k=min(N_SAMPLES, dataset[split].num_rows),
            )
        )
        for split in ("train", "validation", "test")
    })

# Load tokenizer
model_ckpt = "cl-tohoku/bert-base-japanese-whole-word-masking"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Tokenization
def tokenize(batch):
    """Tokenize the ``text`` column of ``batch`` with the module-level tokenizer.

    Args:
        batch: Mapping that holds the raw texts under the key ``"text"``.

    Returns:
        Whatever the tokenizer returns when called with padding and
        truncation enabled.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded
# Run tokenize over the dataset in batched mode
dataset_encoded = dataset.map(
    tokenize,
    batched=True,
    batch_size=None,
)

# Load pre-trained model
num_labels = 3
# Prefer the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
)
model = model.to(device)

# Define metrics
def compute_metrics(pred):
    """Score a set of predictions with accuracy and weighted F1.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (an array whose arg-max over the last axis is
            taken as the predicted label).

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

# Set training parameters
batch_size = 16
# Log about once per epoch. max(1, ...) keeps the value positive when the
# training set is smaller than a single batch (the integer division would
# otherwise yield 0).
logging_steps = max(1, len(dataset_encoded["train"]) // batch_size)
model_name = "sample-text-classification-bert"

training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # `evaluation_strategy` was renamed to `eval_strategy` in newer
    # transformers releases; the install cell pulls the latest version (-U),
    # so the current keyword is used here.
    eval_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
    log_level="error",
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    # `tokenizer=` is deprecated in newer transformers releases in favour of
    # `processing_class=`; the install cell pulls the latest version (-U).
    processing_class=tokenizer,
)

# Start training (the returned TrainOutput is displayed as the cell result)
trainer.train()




Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.9066,0.641909,0.738,0.731594
2,0.5594,0.609132,0.733,0.720505


TrainOutput(global_step=126, training_loss=0.7293949127197266, metrics={'train_runtime': 249.3063, 'train_samples_per_second': 8.022, 'train_steps_per_second': 0.505, 'total_flos': 526226835456000.0, 'train_loss': 0.7293949127197266, 'epoch': 2.0})

## Reference: https://dev.classmethod.jp/articles/huggingface-usage-custom-loss-func/