# Baseline-HF-BERT

In [1]:
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification, logging
import numpy as np
import re
import string
from nltk.corpus import wordnet
from tqdm import tqdm
from sklearn import metrics
import csv
import datasets

In [2]:
np.random.seed(0)
logging.set_verbosity_error()
logging.set_verbosity_warning()
HF_HUB_DISABLE_SYMLINKS_WARNING = True

import datetime
t_delta = datetime.timedelta(hours=9)
JST = datetime.timezone(t_delta, 'JST')
now = datetime.datetime.now(JST)

In [3]:
# parameters
MODEL = "bert-base-uncased"
SAVED_MODEL = "../model/Baseline-HF-BERT_"+str(now.strftime('%Y%m%d%H%M%S'))
EPOCH = 1
BATCH_SIZE = 32

In [4]:
print(SAVED_MODEL)

../model/Baseline-HF-BERT_20221122175047


# Load Datasets

In [5]:
# 前処理
def preprocessing(text):
    # 括弧内文章の削除
    text = re.sub(r'\(.*\)',' ',text)
    text = re.sub(r'\[.*\]',' ',text)
    text = re.sub(r'\<.*\>',' ',text)
    text = re.sub(r'\{.*\}',' ',text)
    # 記号文字の削除
    text = text.translate(str.maketrans('','',string.punctuation))
    # スペースの調整
    text = re.sub(r'\s+',' ',text)
    return text

In [6]:
# preprocessing train data -----------------------------------------------------------------------
# load topic class labels
print("making train dataset...")
with open('../data/topic/classes.txt','r',encoding='utf-8') as f:
    labels = f.read().splitlines()
topic_class_hypothesis = dict()
for i,label in enumerate(labels):
    topic_class_hypothesis[i] = 'this text is about ' + ' or '.join([wordnet.synsets(word)[0].definition() for word in label.split(' & ')])

# load train data
with open('../data/topic/train_pu_half_v0.txt','r',encoding='utf-8') as f:
    texts_v0 = f.read()
with open('../data/topic/train_pu_half_v1.txt','r',encoding='utf-8') as f:
    texts_v1 = f.read()
texts = texts_v0 + texts_v1

# # ## example -------------------------------------
# import random
# texts = texts.splitlines()
# texts = random.sample(texts,100)
# texts = "\n".join(texts)
# # ## ---------------------------------------------

x_train, y_train = [],[]
train_first, train_second = [],[]
for label_text in tqdm(texts.splitlines()):
    label,text = label_text.split('\t')
    rand_base = [0,1,2,3,4,5,6,7,8,9]
    rand_base.remove(int(label))
    label_rand = np.random.choice(rand_base)
    train_first.append(preprocessing(text))
    train_second.append(topic_class_hypothesis[int(label)])
    y_train.append(float(1))
    train_first.append(preprocessing(text))
    train_second.append(topic_class_hypothesis[int(label_rand)])
    y_train.append(float(0))

making train dataset...


100%|██████████| 1300000/1300000 [01:09<00:00, 18718.81it/s]


In [7]:
# dbpedia class ------------------------------------------------------------------------------------------------------
with open('../data/dbpedia_csv/classes.txt','r',encoding='utf-8') as f:
    classes = f.read().splitlines()
    dbpedia_class = ['this text is about '+text for text in classes]

with open('../data/dbpedia_csv/test.csv','r',encoding='utf-8') as f:
    reader = [r for r in csv.reader(f)]
    
# # # example -------------------
# import random
# reader = random.sample(reader,1000)
# # #----------------------------

x_test, y_test = [],[]
test_first, test_second = [],[]
for cls_num,auth,readtext in tqdm(reader,total=len(reader)):
    for db_class in dbpedia_class:
        text = readtext.replace(auth, "")
        test_first.append(preprocessing(text))
        test_second.append(db_class)
    y_test.append(int(cls_num)-1)             

100%|██████████| 70000/70000 [00:13<00:00, 5040.03it/s]


In [8]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)

train_dataset = datasets.Dataset.from_dict({"first":train_first, "second":train_second, "label":y_train})
test_dataset = datasets.Dataset.from_dict({"first":test_first, "second":test_second})
dataset = datasets.DatasetDict({"train":train_dataset, "test":test_dataset})

def tokenize_function(examples):
    return tokenizer(examples["first"], examples["second"], truncation=True, return_tensors="pt", padding="max_length", max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns('first').remove_columns("second")
print(tokenized_datasets)

small_train_dataset = tokenized_datasets["train"].shuffle(seed=42) #.select(range(5000))
small_eval_dataset = tokenized_datasets["test"] #.select(range(1000))

  0%|          | 0/2600 [00:00<?, ?ba/s]

  0%|          | 0/980 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2600000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 980000
    })
})


# Training

In [9]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=1)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [10]:
import evaluate
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.where(0.5<=logits.squeeze(), 1, 0)
    return evaluate.load("accuracy").compute(predictions=predictions, references=labels)

In [11]:
training_args = TrainingArguments(
  output_dir=SAVED_MODEL,
  num_train_epochs=EPOCH,
  per_device_train_batch_size=BATCH_SIZE,
  per_device_eval_batch_size=BATCH_SIZE,
  evaluation_strategy="epoch",
  save_strategy="no",
  optim="adamw_torch",
  report_to="none",
  )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_train_dataset,
    compute_metrics=compute_metrics,
)

In [12]:
trainer.train()

***** Running training *****
  Num examples = 2600000
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 81250
  Number of trainable parameters = 109483009


  0%|          | 0/81250 [00:00<?, ?it/s]

{'loss': 0.1245, 'learning_rate': 4.969230769230769e-05, 'epoch': 0.01}
{'loss': 0.0995, 'learning_rate': 4.9384615384615384e-05, 'epoch': 0.01}
{'loss': 0.0961, 'learning_rate': 4.907692307692308e-05, 'epoch': 0.02}
{'loss': 0.0909, 'learning_rate': 4.876923076923077e-05, 'epoch': 0.02}
{'loss': 0.0879, 'learning_rate': 4.846153846153846e-05, 'epoch': 0.03}
{'loss': 0.0908, 'learning_rate': 4.815384615384615e-05, 'epoch': 0.04}
{'loss': 0.0879, 'learning_rate': 4.784615384615384e-05, 'epoch': 0.04}
{'loss': 0.0859, 'learning_rate': 4.753846153846154e-05, 'epoch': 0.05}
{'loss': 0.0867, 'learning_rate': 4.723076923076923e-05, 'epoch': 0.06}
{'loss': 0.0845, 'learning_rate': 4.692307692307693e-05, 'epoch': 0.06}
{'loss': 0.0869, 'learning_rate': 4.661538461538462e-05, 'epoch': 0.07}
{'loss': 0.0854, 'learning_rate': 4.630769230769231e-05, 'epoch': 0.07}
{'loss': 0.0854, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.08}
{'loss': 0.0828, 'learning_rate': 4.56923076923077e-05, 'epoch'

***** Running Evaluation *****
  Num examples = 2600000
  Batch size = 32


  0%|          | 0/81250 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 0.058641135692596436, 'eval_accuracy': 0.9226965384615384, 'eval_runtime': 20115.8635, 'eval_samples_per_second': 129.251, 'eval_steps_per_second': 4.039, 'epoch': 1.0}
{'train_runtime': 76048.3767, 'train_samples_per_second': 34.189, 'train_steps_per_second': 1.068, 'train_loss': 0.07540145897498497, 'epoch': 1.0}


TrainOutput(global_step=81250, training_loss=0.07540145897498497, metrics={'train_runtime': 76048.3767, 'train_samples_per_second': 34.189, 'train_steps_per_second': 1.068, 'train_loss': 0.07540145897498497, 'epoch': 1.0})

In [13]:
model.save_pretrained(SAVED_MODEL)

Configuration saved in ../model/Baseline-HF-BERT_20221122175047/config.json
Model weights saved in ../model/Baseline-HF-BERT_20221122175047/pytorch_model.bin


# Test

In [14]:
model = AutoModelForSequenceClassification.from_pretrained(SAVED_MODEL)

test_args = TrainingArguments(output_dir=SAVED_MODEL,report_to="none")
trainer = Trainer(model=model, args=test_args)

loading configuration file ../model/Baseline-HF-BERT_20221122175047/config.json
Model config BertConfig {
  "_name_or_path": "../model/Baseline-HF-BERT_20221122175047",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "regression",
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file ../model/Baseline-HF-BERT_20221122175047/pytorch_model.bin
All 

In [15]:
pred = trainer.predict(small_eval_dataset)

***** Running Prediction *****
  Num examples = 980000
  Batch size = 8


  0%|          | 0/122500 [00:00<?, ?it/s]

In [16]:
split_pred = np.array_split(pred.predictions,len(y_test))
y_pred = [np.argmax(p) for p in split_pred]

target_names = [c[:3]+"." for c in classes]
rep = metrics.classification_report(y_test,y_pred,target_names=target_names,digits=3)
print(rep)

              precision    recall  f1-score   support

        Com.      0.733     0.675     0.703      5000
        Edu.      0.215     0.978     0.353      5000
        Art.      0.300     0.127     0.178      5000
        Ath.      0.791     0.912     0.847      5000
        Off.      0.638     0.774     0.700      5000
        Mea.      0.781     0.010     0.020      5000
        Bui.      0.951     0.136     0.237      5000
        Nat.      0.494     0.070     0.123      5000
        Vil.      0.967     0.146     0.254      5000
        Ani.      0.060     0.008     0.014      5000
        Pla.      0.396     0.982     0.564      5000
        Alb.      0.617     0.715     0.662      5000
        Fil.      0.667     0.930     0.777      5000
        Wri.      0.393     0.050     0.088      5000

    accuracy                          0.465     70000
   macro avg      0.572     0.465     0.394     70000
weighted avg      0.572     0.465     0.394     70000

