In [2]:
#意味的類似度計算（回帰）

from datasets import load_dataset
from pprint import pprint
from tqdm import tqdm

train_dataset = load_dataset(
    "llm-book/JGLUE", name="JSTS", split="train"
)

valid_dataset = load_dataset(
    "llm-book/JGLUE", name="JSTS", split="validation"
)



Downloading builder script:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/3.08k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/9.03k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/653k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/84.2k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [4]:
train_dataset[0]

{'sentence_pair_id': '0',
 'yjcaptions_id': '10005_480798-10996-92616',
 'sentence1': '川べりでサーフボードを持った人たちがいます。',
 'sentence2': 'トイレの壁に黒いタオルがかけられています。',
 'label': 0.0}



In [24]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import DataCollatorWithPadding
from transformers import Trainer
from scipy.stats import pearsonr, spearmanr


model_name = "Haneken-417/bert-base-japanese-v3-jsts"

tokenizer = AutoTokenizer.from_pretrained(model_name)

def pair_text_tokenize(example):
    encoded_example = tokenizer(example["sentence1"], example["sentence2"], max_length=512)
    encoded_example["labels"] = example["label"]

    return encoded_example



encoded_train_dataset = train_dataset.map(
    pair_text_tokenize,
    remove_columns=train_dataset.column_names
)

encoded_valid_dataset = valid_dataset.map(
    pair_text_tokenize,
    remove_columns=valid_dataset.column_names
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=1)

training_arg = TrainingArguments(
    output_dir="./output/",
    per_device_train_batch_size=128,
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    logging_strategy="epoch",
    save_strategy="epoch",
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True
)

def calc_correlation_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.squeeze(1)

    return {
        "eval_accuracy":pearsonr(predictions, labels).statistic,
    }

trainer = Trainer(
    model=model,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_valid_dataset,
    data_collator=data_collator,
    args=training_arg,
    compute_metrics=calc_correlation_metrics
)



Map:   0%|          | 0/1457 [00:00<?, ? examples/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [25]:
trainer.train()

  0%|          | 0/490 [00:00<?, ?it/s]

{'loss': 0.222, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.0}


  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_accuracy': 0.9125659571203973, 'eval_loss': 0.35875576734542847, 'eval_runtime': 5.8066, 'eval_samples_per_second': 250.924, 'eval_steps_per_second': 3.961, 'epoch': 1.0}
{'loss': 0.1747, 'learning_rate': 1.2e-05, 'epoch': 2.0}


  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_accuracy': 0.9162997198147378, 'eval_loss': 0.3368103504180908, 'eval_runtime': 5.3575, 'eval_samples_per_second': 271.954, 'eval_steps_per_second': 4.293, 'epoch': 2.0}
{'loss': 0.1471, 'learning_rate': 8.000000000000001e-06, 'epoch': 3.0}


  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_accuracy': 0.9149654164328043, 'eval_loss': 0.337334007024765, 'eval_runtime': 5.2144, 'eval_samples_per_second': 279.419, 'eval_steps_per_second': 4.411, 'epoch': 3.0}
{'loss': 0.1298, 'learning_rate': 4.000000000000001e-06, 'epoch': 4.0}


  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_accuracy': 0.9136139732039803, 'eval_loss': 0.343456506729126, 'eval_runtime': 5.3181, 'eval_samples_per_second': 273.969, 'eval_steps_per_second': 4.325, 'epoch': 4.0}
{'loss': 0.1187, 'learning_rate': 0.0, 'epoch': 5.0}


  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_accuracy': 0.9148036978536016, 'eval_loss': 0.3371449112892151, 'eval_runtime': 5.2901, 'eval_samples_per_second': 275.422, 'eval_steps_per_second': 4.348, 'epoch': 5.0}
{'train_runtime': 1317.318, 'train_samples_per_second': 47.259, 'train_steps_per_second': 0.372, 'train_loss': 0.1584542449639768, 'epoch': 5.0}


TrainOutput(global_step=490, training_loss=0.1584542449639768, metrics={'train_runtime': 1317.318, 'train_samples_per_second': 47.259, 'train_steps_per_second': 0.372, 'train_loss': 0.1584542449639768, 'epoch': 5.0})

In [26]:
trainer.evaluate()

  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_accuracy': 0.9162997198147378,
 'eval_loss': 0.3368103504180908,
 'eval_runtime': 5.4896,
 'eval_samples_per_second': 265.411,
 'eval_steps_per_second': 4.19,
 'epoch': 5.0}