# Proposed Word2Vec-RoBERTa

In [1]:
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification, logging
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader
from tqdm.auto import tqdm
import numpy as np
import datasets
import string
import evaluate
import csv 
import re

In [2]:
np.random.seed(0)
logging.set_verbosity_error()
logging.set_verbosity_warning()
HF_HUB_DISABLE_SYMLINKS_WARNING = True

import datetime
t_delta = datetime.timedelta(hours=9)
JST = datetime.timezone(t_delta, 'JST')
now = datetime.datetime.now(JST)

In [3]:
# parameters
MODEL = "roberta-base"
SAVED_MODEL = "../model/Proposed-Word2Vec-RoBERTa_"+str(now.strftime('%Y%m%d%H%M%S'))
THRESHOLD = 0.05
MAXLEN_GET_PSEUDO = 3000
MAX_LEN = 128
EPOCH = 5
BATCH_SIZE = 8

In [4]:
print(SAVED_MODEL)

../model/Proposed-Word2Vec-RoBERTa_20221202133223


# Load Dataset

In [5]:
# 前処理
def preprocessing(text):
    # 括弧内文章の削除
    text = re.sub(r'\(.*\)',' ',text)
    text = re.sub(r'\[.*\]',' ',text)
    text = re.sub(r'\<.*\>',' ',text)
    text = re.sub(r'\{.*\}',' ',text)
    # 記号文字の削除
    text = text.translate(str.maketrans('','',string.punctuation))
    # スペースの調整
    text = re.sub(r'\s+',' ',text)
    return text

In [6]:
# 20 newsgroups datasets
from sklearn.datasets import fetch_20newsgroups
newsgroups = fetch_20newsgroups(subset="all")
newsgroups_datasets = list()

# example ------------------------------------------------
for texts in tqdm(newsgroups.data[:1000]):
  texts = texts.split("\n\n")
  texts = " ".join(texts[1:])
  newsgroups_datasets.append(preprocessing(texts))
# --------------------------------------------------------

# for texts in tqdm(newsgroups.data):
#   texts = texts.split("\n\n")
#   texts = " ".join(texts[1:])
#   newsgroups_datasets.append(preprocessing(texts))

  0%|          | 0/1000 [00:00<?, ?it/s]

In [7]:
# yahoo topic datasets
with open('../data/topic/train_pu_half_v0.txt','r',encoding='utf-8') as f:
    texts_v0 = f.read()
with open('../data/topic/train_pu_half_v1.txt','r',encoding='utf-8') as f:
    texts_v1 = f.read()
texts = texts_v0 + texts_v1
topic_datasets = list()

# example ----------------------------------------------
for label_text in tqdm(texts.splitlines()[:1000]):
  _, text = label_text.split("\t")
  topic_datasets.append(preprocessing(text))
# -------------------------------------------------------

# for label_text in tqdm(texts.splitlines()):
#   _, text = label_text.split("\t")
#   topic_datasets.append(preprocessing(text))

  0%|          | 0/1000 [00:00<?, ?it/s]

In [8]:
# reuters datasets
with open("../data/reuter/sourceall.txt", "r", encoding="utf-8") as f:
  reuter = f.read().split("\n")[:-1]

# example -----------------------------------
reuter = reuter[:1000]
# -------------------------------------------

reuters_datasets = list()
for label_text in tqdm(reuter):
  _, text = label_text.split("\t")
  reuters_datasets.append(preprocessing(text))

  0%|          | 0/1000 [00:00<?, ?it/s]

In [9]:
# dbpedia datasets train
with open('../data/dbpedia_csv/train.csv','r',encoding='utf-8') as f:
    reader = [r for r in csv.reader(f)]
    
# example -------------------
reader = reader[:1000]
#----------------------------

dbpedia_train_datasets = list()
for _, auth, text in tqdm(reader):
    text = text.replace(auth,'')
    dbpedia_train_datasets.append(preprocessing(text))

  0%|          | 0/1000 [00:00<?, ?it/s]

In [10]:
# dbpedia classes
with open("../data/dbpedia_csv/classes.txt", "r", encoding="utf-8") as f:
  classes = f.read().splitlines()

In [11]:
datasets_texts = newsgroups_datasets + topic_datasets + reuters_datasets + dbpedia_train_datasets

# Choice method

In [12]:
word2vec = gensim.downloader.load('word2vec-google-news-300')

In [13]:
# 文章をベクトルに変換
# 文章内に複数同じ単語が出現する場合、1度だけ使用する
def w2v_avg_vector(sentence):
  vector = np.zeros((300,), dtype="float32")
  count = 0
  used_word = list()
  for word in sentence.split():
    if word not in used_word:
      used_word.append(word)
      try:
        vector = np.add(vector, word2vec[word])
        count += 1
      except:
        pass
    if count >= MAX_LEN:
      break
  if count > 0:
    vector = np.divide(vector, count)
  return vector

In [14]:
classes_vector = [w2v_avg_vector(cls) for cls in classes]

In [15]:
# 情報源領域の文章と各クラスの類似度を計算し、上位2クラスの差が閾値を超えた場合、1位のクラスの学習データとする
diff_datasets = {i:[] for i in range(len(classes))}
for texts in tqdm(datasets_texts):
  texts_vector = w2v_avg_vector(texts)
  similarity = cosine_similarity([texts_vector], classes_vector)[0]
  sim_argsorted = np.argsort(similarity)
  diff = similarity[sim_argsorted[-1]] - similarity[sim_argsorted[-2]]
  if diff >= THRESHOLD:
    diff_datasets[sim_argsorted[-1]].append((diff, texts))

# 上位2クラスの差が閾値を超えた文章の中で、その差が大きいものから順に各クラスの疑似ラベル付きデータの数が同じになるように選択
pseudo_texts = list()
pseudo_labels = list()
for i in range(len(classes)):
  sorted_diff_data = sorted(diff_datasets[i], reverse=True)[:MAXLEN_GET_PSEUDO]
  pseudo_texts.extend([i[1] for i in sorted_diff_data])
  pseudo_labels.extend([i]*len(sorted_diff_data))

  0%|          | 0/4000 [00:00<?, ?it/s]

In [16]:
print("Number of all selected data")
for i in diff_datasets:
  print(classes[i][:3]+". : "+str(len(diff_datasets[i])))

Number of all selected data
Com. : 272
Edu. : 37
Art. : 0
Ath. : 2
Off. : 117
Mea. : 320
Bui. : 6
Nat. : 55
Vil. : 1
Ani. : 4
Pla. : 3
Alb. : 21
Fil. : 28
Wri. : 21


In [17]:
# load test data
# dbpedia datasets train
with open('../data/dbpedia_csv/test.csv','r',encoding='utf-8') as f:
    reader = [r for r in csv.reader(f)]
    
# example -------------------
import random
reader = random.sample(reader, 1000)
#----------------------------

test_texts = list()
test_labels = list()
for labels, auth, text in tqdm(reader):
    text = text.replace(auth,'')
    test_texts.append(preprocessing(text))
    test_labels.append(int(labels)-1)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [18]:
import datasets

tokenizer = AutoTokenizer.from_pretrained(MODEL)

train_dataset = datasets.Dataset.from_dict({"text":pseudo_texts, "label":pseudo_labels})
test_dataset = datasets.Dataset.from_dict({"text":test_texts, "label":test_labels})
dataset = datasets.DatasetDict({"train":train_dataset, "test":test_dataset})

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, return_tensors="pt", padding="max_length", max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns('text')
print(tokenized_datasets)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 887
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})


In [19]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42) #.select(range(5000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42) #.select(range(1000))

# Fine Tuning

In [20]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=14)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

In [21]:
import evaluate
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return evaluate.load("accuracy").compute(predictions=predictions, references=labels)

In [22]:
training_args = TrainingArguments(
  output_dir=SAVED_MODEL,
  num_train_epochs=EPOCH,
  per_device_train_batch_size=BATCH_SIZE,
  per_device_eval_batch_size=BATCH_SIZE,
  evaluation_strategy="epoch",
  logging_strategy='epoch',
  save_strategy="no",
  optim="adamw_torch",
  report_to="none"
  )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [23]:
trainer.train()

***** Running training *****
  Num examples = 887
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 555


  0%|          | 0/555 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


{'loss': 1.1712, 'learning_rate': 4e-05, 'epoch': 1.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 2.48378849029541, 'eval_accuracy': 0.34, 'eval_runtime': 42.0691, 'eval_samples_per_second': 23.77, 'eval_steps_per_second': 2.971, 'epoch': 1.0}


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


{'loss': 0.5266, 'learning_rate': 3e-05, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 2.612954616546631, 'eval_accuracy': 0.394, 'eval_runtime': 40.2216, 'eval_samples_per_second': 24.862, 'eval_steps_per_second': 3.108, 'epoch': 2.0}


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


{'loss': 0.3277, 'learning_rate': 2e-05, 'epoch': 3.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 2.7031121253967285, 'eval_accuracy': 0.402, 'eval_runtime': 38.5613, 'eval_samples_per_second': 25.933, 'eval_steps_per_second': 3.242, 'epoch': 3.0}


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


{'loss': 0.1852, 'learning_rate': 1e-05, 'epoch': 4.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 2.458378791809082, 'eval_accuracy': 0.422, 'eval_runtime': 36.0382, 'eval_samples_per_second': 27.748, 'eval_steps_per_second': 3.469, 'epoch': 4.0}


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


{'loss': 0.1022, 'learning_rate': 0.0, 'epoch': 5.0}


  0%|          | 0/125 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 2.3361575603485107, 'eval_accuracy': 0.435, 'eval_runtime': 37.0955, 'eval_samples_per_second': 26.957, 'eval_steps_per_second': 3.37, 'epoch': 5.0}
{'train_runtime': 701.478, 'train_samples_per_second': 6.322, 'train_steps_per_second': 0.791, 'train_loss': 0.46260015470487575, 'epoch': 5.0}


TrainOutput(global_step=555, training_loss=0.46260015470487575, metrics={'train_runtime': 701.478, 'train_samples_per_second': 6.322, 'train_steps_per_second': 0.791, 'train_loss': 0.46260015470487575, 'epoch': 5.0})

In [24]:
model.save_pretrained(SAVED_MODEL)

Configuration saved in ../model/Proposed-Word2Vec-RoBERTa_20221202133223\config.json
Model weights saved in ../model/Proposed-Word2Vec-RoBERTa_20221202133223\pytorch_model.bin


# Test

In [25]:
model = AutoModelForSequenceClassification.from_pretrained(SAVED_MODEL)

test_args = TrainingArguments(output_dir=SAVED_MODEL,report_to="none")
trainer = Trainer(model=model, args=test_args)

loading configuration file ../model/Proposed-Word2Vec-RoBERTa_20221202133223\config.json
Model config RobertaConfig {
  "_name_or_path": "../model/Proposed-Word2Vec-RoBERTa_20221202133223",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "L

In [26]:
pred = trainer.predict(small_eval_dataset)

***** Running Prediction *****
  Num examples = 1000
  Batch size = 8


  0%|          | 0/125 [00:00<?, ?it/s]

In [27]:
from sklearn.metrics import classification_report
y_pred = [np.argmax(i) for i in pred.predictions]
target_names = [c[:3]+"." for c in classes]
rep = classification_report(pred.label_ids, y_pred, target_names=target_names, digits=3)
print(rep)

              precision    recall  f1-score   support

        Com.      0.536     0.592     0.562        76
        Edu.      0.483     0.853     0.617        68
        Art.      0.000     0.000     0.000        60
        Ath.      0.000     0.000     0.000        65
        Off.      0.842     0.427     0.566        75
        Mea.      0.537     0.522     0.529        69
        Bui.      1.000     0.058     0.110        69
        Nat.      0.188     0.931     0.312        87
        Vil.      0.000     0.000     0.000        72
        Ani.      0.000     0.000     0.000        75
        Pla.      0.000     0.000     0.000        75
        Alb.      0.726     0.966     0.829        88
        Fil.      0.864     0.919     0.891        62
        Wri.      0.514     0.627     0.565        59

    accuracy                          0.435      1000
   macro avg      0.406     0.421     0.356      1000
weighted avg      0.407     0.435     0.360      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
