In [1]:
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification, logging
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader
from tqdm import tqdm
import numpy as np
import datasets
import string
import evaluate
import csv
import re

In [2]:
np.random.seed(0)
logging.set_verbosity_error()
logging.set_verbosity_warning()
HF_HUB_DISABLE_SYMLINKS_WARNING = True

import datetime
t_delta = datetime.timedelta(hours=9)
JST = datetime.timezone(t_delta, 'JST')
now = datetime.datetime.now(JST)

In [3]:
# parameters
MODEL = "bert-base-uncased"
X_TRAIN = '../dataset/Proposed-Word2Vec-BERT_x_train.npy'
Y_TRAIN = '../dataset/Proposed-Word2Vec-BERT_y_train.npy'
X_TEST = '../dataset/Proposed-Word2Vec-BERT_x_test.npy'
Y_TEST = '../dataset/Proposed-Word2Vec-BERT_y_test.npy'
SAVED_MODEL = "../model/Proposed-Word2Vec-BERT_"+str(now.strftime('%Y%m%d%H%M%S'))
THRESHOLD = 0.05
MAXLEN_GET_PSEUDO = 3000
EPOCH = 3
BATCH_SIZE = 8

In [4]:
print(SAVED_MODEL)

../model/Proposed-Word2Vec-BERT_20221120182924


In [5]:
# 前処理
def preprocessing(text):
    # 括弧内文章の削除
    text = re.sub(r'\(.*\)',' ',text)
    text = re.sub(r'\[.*\]',' ',text)
    text = re.sub(r'\<.*\>',' ',text)
    text = re.sub(r'\{.*\}',' ',text)
    # 記号文字の削除
    text = text.translate(str.maketrans('','',string.punctuation))
    # スペースの調整
    text = re.sub(r'\s+',' ',text)
    return text

In [6]:
# 20 newsgroups datasets
from sklearn.datasets import fetch_20newsgroups
newsgroups = fetch_20newsgroups(subset="all")
newsgroups_datasets = list()

# example ------------------------------------------------
for texts in tqdm(newsgroups.data[:10000]):
  texts = texts.split("\n\n")
  texts = " ".join(texts[1:])
  newsgroups_datasets.append(preprocessing(texts))
# --------------------------------------------------------

# for texts in tqdm(newsgroups.data):
#   texts = texts.split("\n\n")
#   texts = " ".join(texts[1:])
#   newsgroups_datasets.append(preprocessing(texts))

100%|██████████| 10000/10000 [00:00<00:00, 17850.81it/s]


In [7]:
# yahoo topic datasets
with open('../data/topic/train_pu_half_v0.txt','r',encoding='utf-8') as f:
    texts_v0 = f.read()
with open('../data/topic/train_pu_half_v1.txt','r',encoding='utf-8') as f:
    texts_v1 = f.read()
texts = texts_v0 + texts_v1
topic_datasets = list()

# example ----------------------------------------------
for label_text in tqdm(texts.splitlines()[:10000]):
  _, text = label_text.split("\t")
  topic_datasets.append(preprocessing(text))
# -------------------------------------------------------

# for label_text in tqdm(texts.splitlines()):
#   _, text = label_text.split("\t")
#   topic_datasets.append(preprocessing(text))

100%|██████████| 10000/10000 [00:00<00:00, 55512.74it/s]


In [8]:
# reuters datasets
with open("../data/reuter/sourceall.txt", "r", encoding="utf-8") as f:
  reuter = f.read().split("\n")[:-1]

# example -----------------------------------
reuter = reuter[:10000]
# -------------------------------------------

reuters_datasets = list()
for label_text in tqdm(reuter):
  _, text = label_text.split("\t")
  reuters_datasets.append(preprocessing(text))

100%|██████████| 10000/10000 [00:00<00:00, 24713.72it/s]


In [9]:
# dbpedia datasets train
with open('../data/dbpedia_csv/train.csv','r',encoding='utf-8') as f:
    reader = [r for r in csv.reader(f)]
    
# example -------------------
reader = reader[:10000]
#----------------------------

dbpedia_train_datasets = list()
for _, auth, text in tqdm(reader):
    text = text.replace(auth,'')
    dbpedia_train_datasets.append(preprocessing(text))

100%|██████████| 10000/10000 [00:00<00:00, 58622.24it/s]


In [10]:
# dbpedia classes
with open("../data/dbpedia_csv/classes.txt", "r", encoding="utf-8") as f:
  classes = f.read().splitlines()

In [11]:
datasets_texts = newsgroups_datasets + topic_datasets + reuters_datasets + dbpedia_train_datasets

In [12]:
word2vec = gensim.downloader.load('word2vec-google-news-300')

def w2v_avg_vector(sentence):
  vector = np.zeros((300,), dtype="float32")
  count = 0
  for word in sentence.split():
    try:
      vector = np.add(vector, word2vec[word])
      count += 1
    except:
      pass
  if count > 0:
    vector = np.divide(vector, len(word))
  return vector

In [13]:
classes_vector = list()
for cls in classes:
  classes_vector.append(w2v_avg_vector(cls))

In [14]:
diff_datasets = {i:[] for i in range(len(classes))}
for texts in tqdm(datasets_texts):
  texts_vector = w2v_avg_vector(texts)
  similarity = cosine_similarity([texts_vector], classes_vector)[0]
  sim_argsorted = np.argsort(similarity)
  diff = similarity[sim_argsorted[-1]] - similarity[sim_argsorted[-2]]
  if diff > THRESHOLD:
    diff_datasets[sim_argsorted[-1]].append((similarity[sim_argsorted[-1]], texts))

pseudo_texts = list()
pseudo_labels = list()
for i in range(len(classes)):
  sorted_diff_data = sorted(diff_datasets[i], reverse=True)[:MAXLEN_GET_PSEUDO]
  pseudo_texts.extend([i[1] for i in sorted_diff_data])
  pseudo_labels.extend([i]*len(sorted_diff_data[:MAXLEN_GET_PSEUDO]))

100%|██████████| 40000/40000 [00:11<00:00, 3348.39it/s]


In [15]:
print("Number of all selected data")
for i in diff_datasets:
  print(classes[i][:3]+". : "+str(len(diff_datasets[i])))

Number of all selected data
Com. : 2534
Edu. : 314
Art. : 20
Ath. : 29
Off. : 1641
Mea. : 2290
Bui. : 65
Nat. : 578
Vil. : 28
Ani. : 75
Pla. : 55
Alb. : 226
Fil. : 255
Wri. : 161


In [16]:
# load test data
# dbpedia datasets train
with open('../data/dbpedia_csv/test.csv','r',encoding='utf-8') as f:
    reader = [r for r in csv.reader(f)]
    
# example -------------------
import random
reader = random.sample(reader, 10000)
#----------------------------

test_texts = list()
test_labels = list()
for labels, auth, text in tqdm(reader):
    text = text.replace(auth,'')
    test_texts.append(preprocessing(text))
    test_labels.append(int(labels)-1)

100%|██████████| 10000/10000 [00:00<00:00, 65006.90it/s]


In [17]:
import datasets

tokenizer = AutoTokenizer.from_pretrained(MODEL)

train_dataset = datasets.Dataset.from_dict({"text":pseudo_texts, "label":pseudo_labels})
test_dataset = datasets.Dataset.from_dict({"text":test_texts, "label":test_labels})
dataset = datasets.DatasetDict({"train":train_dataset, "test":test_dataset})

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, return_tensors="pt", padding="max_length", max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns('text')
print(tokenized_datasets)

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8271
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10000
    })
})


In [18]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(5000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

# Fine Tuning

In [19]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=14)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [20]:
import evaluate
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return evaluate.load("accuracy").compute(predictions=predictions, references=labels)

In [21]:
training_args = TrainingArguments(
  output_dir=SAVED_MODEL,
  num_train_epochs=EPOCH,
  per_device_train_batch_size=BATCH_SIZE,
  evaluation_strategy="epoch",
  optim="adamw_torch",
  report_to="none"
  )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [22]:
trainer.train()

***** Running training *****
  Num examples = 5000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1875


  0%|          | 0/1875 [00:00<?, ?it/s]

Saving model checkpoint to ../model/Proposed-Word2Vec-BERT_20221120182924\checkpoint-500
Configuration saved in ../model/Proposed-Word2Vec-BERT_20221120182924\checkpoint-500\config.json


{'loss': 0.7991, 'learning_rate': 3.6666666666666666e-05, 'epoch': 0.8}


Model weights saved in ../model/Proposed-Word2Vec-BERT_20221120182924\checkpoint-500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 1.9223523139953613, 'eval_accuracy': 0.564, 'eval_runtime': 34.3255, 'eval_samples_per_second': 29.133, 'eval_steps_per_second': 3.642, 'epoch': 1.0}


Saving model checkpoint to ../model/Proposed-Word2Vec-BERT_20221120182924\checkpoint-1000
Configuration saved in ../model/Proposed-Word2Vec-BERT_20221120182924\checkpoint-1000\config.json


{'loss': 0.3259, 'learning_rate': 2.3333333333333336e-05, 'epoch': 1.6}


Model weights saved in ../model/Proposed-Word2Vec-BERT_20221120182924\checkpoint-1000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 1.8804781436920166, 'eval_accuracy': 0.573, 'eval_runtime': 37.9493, 'eval_samples_per_second': 26.351, 'eval_steps_per_second': 3.294, 'epoch': 2.0}


Saving model checkpoint to ../model/Proposed-Word2Vec-BERT_20221120182924\checkpoint-1500
Configuration saved in ../model/Proposed-Word2Vec-BERT_20221120182924\checkpoint-1500\config.json


{'loss': 0.1645, 'learning_rate': 1e-05, 'epoch': 2.4}


Model weights saved in ../model/Proposed-Word2Vec-BERT_20221120182924\checkpoint-1500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


  0%|          | 0/125 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 1.9711796045303345, 'eval_accuracy': 0.556, 'eval_runtime': 37.9618, 'eval_samples_per_second': 26.342, 'eval_steps_per_second': 3.293, 'epoch': 3.0}
{'train_runtime': 1682.1624, 'train_samples_per_second': 8.917, 'train_steps_per_second': 1.115, 'train_loss': 0.36384196370442706, 'epoch': 3.0}


TrainOutput(global_step=1875, training_loss=0.36384196370442706, metrics={'train_runtime': 1682.1624, 'train_samples_per_second': 8.917, 'train_steps_per_second': 1.115, 'train_loss': 0.36384196370442706, 'epoch': 3.0})

In [23]:
model.save_pretrained(SAVED_MODEL)

Configuration saved in ../model/Proposed-Word2Vec-BERT_20221120182924\config.json
Model weights saved in ../model/Proposed-Word2Vec-BERT_20221120182924\pytorch_model.bin


# Test

In [24]:
model = AutoModelForSequenceClassification.from_pretrained(SAVED_MODEL)

training_args = TrainingArguments(output_dir=SAVED_MODEL,report_to="none")
trainer = Trainer(model=model)

loading configuration file ../model/Proposed-Word2Vec-BERT_20221120182924\config.json
Model config BertConfig {
  "_name_or_path": "../model/Proposed-Word2Vec-BERT_20221120182924",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LA

In [25]:
pred = trainer.predict(small_eval_dataset)

***** Running Prediction *****
  Num examples = 1000
  Batch size = 8


  0%|          | 0/125 [00:00<?, ?it/s]

In [26]:
from sklearn.metrics import classification_report
y_pred = [np.argmax(i) for i in pred.predictions]

rep = classification_report(pred.label_ids, y_pred, digits=3)
print(rep)

              precision    recall  f1-score   support

           0      0.800     0.478     0.598        67
           1      0.578     0.797     0.670        79
           2      0.833     0.211     0.337        71
           3      0.433     0.361     0.394        72
           4      0.523     0.672     0.588        67
           5      0.667     0.462     0.545        65
           6      0.798     0.835     0.816        85
           7      0.147     0.500     0.227        52
           8      0.000     0.000     0.000        67
           9      0.696     0.686     0.691        70
          10      0.000     0.000     0.000        73
          11      0.708     0.958     0.814        71
          12      0.655     0.961     0.779        77
          13      0.699     0.690     0.695        84

    accuracy                          0.556      1000
   macro avg      0.538     0.544     0.511      1000
weighted avg      0.552     0.556     0.526      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
