# Baseline-HF-BERT

In [1]:
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification, logging
import numpy as np
import re
import string
from nltk.corpus import wordnet
from tqdm import tqdm
from sklearn import metrics
import csv
import datasets
import torch

In [2]:
np.random.seed(0)
logging.set_verbosity_error()
logging.set_verbosity_warning()
HF_HUB_DISABLE_SYMLINKS_WARNING = True

import datetime
t_delta = datetime.timedelta(hours=9)
JST = datetime.timezone(t_delta, 'JST')
now = datetime.datetime.now(JST)

In [3]:
# parameters
MODEL = "bert-base-uncased"
# X_TRAIN = '../dataset/HF-BERT_x_train.npy'
# Y_TRAIN = '../dataset/HF-BERT_y_train.npy'
# X_TEST = '../dataset/HF-BERT_x_test.npy'
# Y_TEST = '../dataset/HF-BERT_y_test.npy'
SAVED_MODEL = "../model/Baseline-HF-BERT_"+str(now.strftime('%Y%m%d%H%M%S'))
EPOCH = 2
BATCH_SIZE = 8

In [4]:
print(SAVED_MODEL)

../model/Baseline-HF-BERT_20221122110116


# Load Datasets

In [5]:
# 前処理
def preprocessing(text):
    # 括弧内文章の削除
    text = re.sub(r'\(.*\)',' ',text)
    text = re.sub(r'\[.*\]',' ',text)
    text = re.sub(r'\<.*\>',' ',text)
    text = re.sub(r'\{.*\}',' ',text)
    # 記号文字の削除
    text = text.translate(str.maketrans('','',string.punctuation))
    # スペースの調整
    text = re.sub(r'\s+',' ',text)
    return text

In [6]:
# preprocessing train data -----------------------------------------------------------------------
# load topic class labels
print("making train dataset...")
with open('../data/topic/classes.txt','r',encoding='utf-8') as f:
    labels = f.read().splitlines()
topic_class_hypothesis = dict()
for i,label in enumerate(labels):
    topic_class_hypothesis[i] = 'this text is about ' + ' or '.join([wordnet.synsets(word)[0].definition() for word in label.split(' & ')])

# load train data
with open('../data/topic/train_pu_half_v0.txt','r',encoding='utf-8') as f:
    texts_v0 = f.read()
with open('../data/topic/train_pu_half_v1.txt','r',encoding='utf-8') as f:
    texts_v1 = f.read()
texts = texts_v0 + texts_v1

# ## example -------------------------------------
import random
texts = texts.splitlines()
texts = random.sample(texts,1000)
texts = "\n".join(texts)
# ## ---------------------------------------------

x_train, y_train = [],[]
train_first, train_second = [],[]
for label_text in tqdm(texts.splitlines()):
    label,text = label_text.split('\t')
    rand_base = [0,1,2,3,4,5,6,7,8,9]
    rand_base.remove(int(label))
    label_rand = np.random.choice(rand_base)
    train_first.append(preprocessing(text))
    train_second.append(topic_class_hypothesis[int(label)])
    y_train.append(float(1))
    train_first.append(preprocessing(text))
    train_second.append(topic_class_hypothesis[int(label_rand)])
    y_train.append(float(0))

making train dataset...


100%|██████████| 1000/1000 [00:00<00:00, 17091.70it/s]


In [7]:
# dbpedia class ------------------------------------------------------------------------------------------------------
with open('../data/dbpedia_csv/classes.txt','r',encoding='utf-8') as f:
    classes = f.read().splitlines()
    dbpedia_class = ['this text is about '+text for text in classes]

with open('../data/dbpedia_csv/test.csv','r',encoding='utf-8') as f:
    reader = [r for r in csv.reader(f)]
    
# # example -------------------
import random
reader = random.sample(reader,1000)
# #----------------------------

x_test, y_test = [],[]
test_first, test_second = [],[]
for cls_num,auth,readtext in tqdm(reader,total=len(reader)):
    for db_class in dbpedia_class:
        text = readtext.replace(auth, "")
        test_first.append(preprocessing(text))
        test_second.append(db_class)
    y_test.append(int(cls_num)-1)             

100%|██████████| 1000/1000 [00:00<00:00, 5376.15it/s]


In [8]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)

train_dataset = datasets.Dataset.from_dict({"first":train_first, "second":train_second, "label":y_train})
test_dataset = datasets.Dataset.from_dict({"first":test_first, "second":test_second})
dataset = datasets.DatasetDict({"train":train_dataset, "test":test_dataset})

def tokenize_function(examples):
    return tokenizer(examples["first"], examples["second"], truncation=True, return_tensors="pt", padding="max_length", max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns('first').remove_columns("second")
print(tokenized_datasets)

small_train_dataset = tokenized_datasets["train"].shuffle(seed=42) #.select(range(5000))
small_eval_dataset = tokenized_datasets["test"] #.select(range(1000))

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 14000
    })
})


# Training

In [10]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=1)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [11]:
import evaluate
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.where(0.5<=logits.squeeze(), 1, 0)
    return evaluate.load("accuracy").compute(predictions=predictions, references=labels)

In [12]:
training_args = TrainingArguments(
  output_dir=SAVED_MODEL,
  num_train_epochs=EPOCH,
  per_device_train_batch_size=BATCH_SIZE,
  per_device_eval_batch_size=BATCH_SIZE,
  evaluation_strategy="epoch",
  optim="adamw_torch",
  report_to="none"
  )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_train_dataset,
    compute_metrics=compute_metrics,
)

In [13]:
trainer.train()

***** Running training *****
  Num examples = 2000
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 500


  0%|          | 0/500 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.07591088116168976, 'eval_accuracy': 0.9015, 'eval_runtime': 44.6923, 'eval_samples_per_second': 44.75, 'eval_steps_per_second': 5.594, 'epoch': 1.0}


Saving model checkpoint to ../model/Baseline-HF-BERT_20221122110116\checkpoint-500
Configuration saved in ../model/Baseline-HF-BERT_20221122110116\checkpoint-500\config.json


{'loss': 0.1433, 'learning_rate': 0.0, 'epoch': 2.0}


Model weights saved in ../model/Baseline-HF-BERT_20221122110116\checkpoint-500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8


  0%|          | 0/250 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 0.04599885642528534, 'eval_accuracy': 0.9415, 'eval_runtime': 49.2684, 'eval_samples_per_second': 40.594, 'eval_steps_per_second': 5.074, 'epoch': 2.0}
{'train_runtime': 391.1323, 'train_samples_per_second': 10.227, 'train_steps_per_second': 1.278, 'train_loss': 0.14325254821777345, 'epoch': 2.0}


TrainOutput(global_step=500, training_loss=0.14325254821777345, metrics={'train_runtime': 391.1323, 'train_samples_per_second': 10.227, 'train_steps_per_second': 1.278, 'train_loss': 0.14325254821777345, 'epoch': 2.0})

In [14]:
model.save_pretrained(SAVED_MODEL)

Configuration saved in ../model/Baseline-HF-BERT_20221122110116\config.json
Model weights saved in ../model/Baseline-HF-BERT_20221122110116\pytorch_model.bin


# Test

In [15]:
model = AutoModelForSequenceClassification.from_pretrained(SAVED_MODEL)

training_args = TrainingArguments(output_dir=SAVED_MODEL,report_to="none")
trainer = Trainer(model=model)

loading configuration file ../model/Baseline-HF-BERT_20221122110116\config.json
Model config BertConfig {
  "_name_or_path": "../model/Baseline-HF-BERT_20221122110116",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "regression",
  "torch_dtype": "float32",
  "transformers_version": "4.23.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file ../model/Baseline-HF-BERT_20221122110116\pytorch_model.bin
All 

In [16]:
pred = trainer.predict(small_eval_dataset)

***** Running Prediction *****
  Num examples = 14000
  Batch size = 8


  0%|          | 0/1750 [00:00<?, ?it/s]

In [19]:
# from sklearn.metrics import classification_report
# y_pred = [np.argmax(i) for i in pred.predictions]

# rep = classification_report(y_test, y_pred, target_names=target_names, digits=3)
# print(rep)

split_pred = np.array_split(pred.predictions,len(y_test))
y_pred = [np.argmax(p) for p in split_pred]

target_names = [c[:3]+"." for c in classes]
rep = metrics.classification_report(y_test,y_pred,target_names=target_names,digits=3)
print(rep)

              precision    recall  f1-score   support

        Com.      0.631     0.679     0.654        78
        Edu.      0.624     0.986     0.764        74
        Art.      0.238     0.338     0.279        71
        Ath.      0.931     0.971     0.950        69
        Off.      0.902     0.613     0.730        75
        Mea.      0.759     0.344     0.473        64
        Bui.      0.767     0.667     0.713        69
        Nat.      0.877     0.842     0.859        76
        Vil.      0.812     0.986     0.890        70
        Ani.      0.929     0.574     0.709        68
        Pla.      0.615     1.000     0.762        72
        Alb.      0.600     0.167     0.261        72
        Fil.      0.823     0.942     0.878        69
        Wri.      0.571     0.548     0.559        73

    accuracy                          0.692      1000
   macro avg      0.720     0.690     0.677      1000
weighted avg      0.718     0.692     0.678      1000

