# Proposed Word2Vec-GPT2

In [1]:
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification, logging
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader
from tqdm import tqdm
import numpy as np
import datasets
import string
import evaluate
import csv 
import re

In [2]:
np.random.seed(0)
logging.set_verbosity_error()
logging.set_verbosity_warning()
HF_HUB_DISABLE_SYMLINKS_WARNING = True

import datetime
t_delta = datetime.timedelta(hours=9)
JST = datetime.timezone(t_delta, 'JST')
now = datetime.datetime.now(JST)

In [3]:
# parameters
MODEL = "gpt2"
SAVED_MODEL = "../model/Proposed-Word2Vec-GPT2_"+str(now.strftime('%Y%m%d%H%M%S'))
THRESHOLD = 0.05
MAXLEN_GET_PSEUDO = 3000
EPOCH = 20
BATCH_SIZE = 8

In [4]:
print(SAVED_MODEL)

../model/Proposed-Word2Vec-GPT2_20221125153534


# Load Dataset

In [5]:
# 前処理
def preprocessing(text):
    # 括弧内文章の削除
    text = re.sub(r'\(.*\)',' ',text)
    text = re.sub(r'\[.*\]',' ',text)
    text = re.sub(r'\<.*\>',' ',text)
    text = re.sub(r'\{.*\}',' ',text)
    # 記号文字の削除
    text = text.translate(str.maketrans('','',string.punctuation))
    # スペースの調整
    text = re.sub(r'\s+',' ',text)
    return text

In [6]:
# 20 newsgroups datasets
from sklearn.datasets import fetch_20newsgroups
newsgroups = fetch_20newsgroups(subset="all")
newsgroups_datasets = list()

# # example ------------------------------------------------
# for texts in tqdm(newsgroups.data[:1000]):
#   texts = texts.split("\n\n")
#   texts = " ".join(texts[1:])
#   newsgroups_datasets.append(preprocessing(texts))
# # --------------------------------------------------------

for texts in tqdm(newsgroups.data):
  texts = texts.split("\n\n")
  texts = " ".join(texts[1:])
  newsgroups_datasets.append(preprocessing(texts))

100%|██████████| 18846/18846 [00:01<00:00, 17220.70it/s]


In [7]:
# yahoo topic datasets
with open('../data/topic/train_pu_half_v0.txt','r',encoding='utf-8') as f:
    texts_v0 = f.read()
with open('../data/topic/train_pu_half_v1.txt','r',encoding='utf-8') as f:
    texts_v1 = f.read()
texts = texts_v0 + texts_v1
topic_datasets = list()

# # example ----------------------------------------------
# for label_text in tqdm(texts.splitlines()[:1000]):
#   _, text = label_text.split("\t")
#   topic_datasets.append(preprocessing(text))
# # -------------------------------------------------------

for label_text in tqdm(texts.splitlines()):
  _, text = label_text.split("\t")
  topic_datasets.append(preprocessing(text))

100%|██████████| 1300000/1300000 [00:28<00:00, 45063.33it/s]


In [8]:
# reuters datasets
with open("../data/reuter/sourceall.txt", "r", encoding="utf-8") as f:
  reuter = f.read().split("\n")[:-1]

# # example -----------------------------------
# reuter = reuter[:100]
# # -------------------------------------------

reuters_datasets = list()
for label_text in tqdm(reuter):
  _, text = label_text.split("\t")
  reuters_datasets.append(preprocessing(text))

100%|██████████| 762027/762027 [00:30<00:00, 24703.01it/s]


In [9]:
# dbpedia datasets train
with open('../data/dbpedia_csv/train.csv','r',encoding='utf-8') as f:
    reader = [r for r in csv.reader(f)]
    
# # example -------------------
# reader = reader[:100]
# #----------------------------

dbpedia_train_datasets = list()
for _, auth, text in tqdm(reader):
    text = text.replace(auth,'')
    dbpedia_train_datasets.append(preprocessing(text))

100%|██████████| 560000/560000 [00:08<00:00, 64779.26it/s]


In [10]:
# dbpedia classes
with open("../data/dbpedia_csv/classes.txt", "r", encoding="utf-8") as f:
  classes = f.read().splitlines()

In [11]:
datasets_texts = newsgroups_datasets + topic_datasets + reuters_datasets + dbpedia_train_datasets

# Choice method

In [12]:
word2vec = gensim.downloader.load('word2vec-google-news-300')

def w2v_avg_vector(sentence):
  vector = np.zeros((300,), dtype="float32")
  count = 0
  for word in sentence.split():
    try:
      vector = np.add(vector, word2vec[word])
      count += 1
    except:
      pass
  if count > 0:
    vector = np.divide(vector, len(word))
  return vector

In [13]:
classes_vector = list()
for cls in classes:
  classes_vector.append(w2v_avg_vector(cls))

In [14]:
diff_datasets = {i:[] for i in range(len(classes))}
for texts in tqdm(datasets_texts):
  texts_vector = w2v_avg_vector(texts)
  similarity = cosine_similarity([texts_vector], classes_vector)[0]
  sim_argsorted = np.argsort(similarity)
  diff = similarity[sim_argsorted[-1]] - similarity[sim_argsorted[-2]]
  if diff > THRESHOLD:
    diff_datasets[sim_argsorted[-1]].append((similarity[sim_argsorted[-1]], texts))

pseudo_texts = list()
pseudo_labels = list()
for i in range(len(classes)):
  sorted_diff_data = sorted(diff_datasets[i], reverse=True)[:MAXLEN_GET_PSEUDO]
  pseudo_texts.extend([i[1] for i in sorted_diff_data])
  pseudo_labels.extend([i]*len(sorted_diff_data[:MAXLEN_GET_PSEUDO]))

100%|██████████| 2640873/2640873 [12:32<00:00, 3511.06it/s]


In [15]:
print("Number of all selected data")
for i in diff_datasets:
  print(classes[i][:3]+". : "+str(len(diff_datasets[i])))

Number of all selected data
Com. : 61181
Edu. : 34672
Art. : 6818
Ath. : 16123
Off. : 126858
Mea. : 144368
Bui. : 10120
Nat. : 38562
Vil. : 35431
Ani. : 4742
Pla. : 13933
Alb. : 42379
Fil. : 26335
Wri. : 23263


In [16]:
# load test data
# dbpedia datasets train
with open('../data/dbpedia_csv/test.csv','r',encoding='utf-8') as f:
    reader = [r for r in csv.reader(f)]
    
# # example -------------------
# import random
# reader = random.sample(reader, 100)
# #----------------------------

test_texts = list()
test_labels = list()
for labels, auth, text in tqdm(reader):
    text = text.replace(auth,'')
    test_texts.append(preprocessing(text))
    test_labels.append(int(labels)-1)

100%|██████████| 70000/70000 [00:01<00:00, 64545.32it/s]


In [17]:
import datasets

tokenizer = AutoTokenizer.from_pretrained(MODEL)
tokenizer.pad_token = tokenizer.eos_token

train_dataset = datasets.Dataset.from_dict({"text":pseudo_texts, "label":pseudo_labels})
test_dataset = datasets.Dataset.from_dict({"text":test_texts, "label":test_labels})
dataset = datasets.DatasetDict({"train":train_dataset, "test":test_dataset})

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, return_tensors="pt", padding="max_length", max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns('text')
print(tokenized_datasets)

  0%|          | 0/42 [00:00<?, ?ba/s]

  0%|          | 0/70 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 42000
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 70000
    })
})


In [18]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42) #.select(range(5000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42) #.select(range(1000))

# Fine Tuning

In [19]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=14)
model.config.pad_token_id = model.config.eos_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
import evaluate
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return evaluate.load("accuracy").compute(predictions=predictions, references=labels)

In [21]:
training_args = TrainingArguments(
  output_dir=SAVED_MODEL,
  num_train_epochs=EPOCH,
  per_device_train_batch_size=BATCH_SIZE,
  per_device_eval_batch_size=BATCH_SIZE,
  evaluation_strategy="epoch",
  save_strategy="no",
  optim="adamw_torch",
  report_to="none"
  )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [22]:
trainer.train()

***** Running training *****
  Num examples = 42000
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 105000
  Number of trainable parameters = 124450560


  0%|          | 0/105000 [00:00<?, ?it/s]

{'loss': 0.7792, 'learning_rate': 4.976190476190477e-05, 'epoch': 0.1}
{'loss': 0.257, 'learning_rate': 4.9523809523809525e-05, 'epoch': 0.19}
{'loss': 0.1895, 'learning_rate': 4.928571428571429e-05, 'epoch': 0.29}
{'loss': 0.1263, 'learning_rate': 4.904761904761905e-05, 'epoch': 0.38}
{'loss': 0.1448, 'learning_rate': 4.880952380952381e-05, 'epoch': 0.48}
{'loss': 0.1316, 'learning_rate': 4.8571428571428576e-05, 'epoch': 0.57}
{'loss': 0.1237, 'learning_rate': 4.8333333333333334e-05, 'epoch': 0.67}
{'loss': 0.1299, 'learning_rate': 4.80952380952381e-05, 'epoch': 0.76}
{'loss': 0.1249, 'learning_rate': 4.785714285714286e-05, 'epoch': 0.86}
{'loss': 0.1259, 'learning_rate': 4.761904761904762e-05, 'epoch': 0.95}


***** Running Evaluation *****
  Num examples = 70000
  Batch size = 8


  0%|          | 0/8750 [00:00<?, ?it/s]

{'eval_loss': 3.114703893661499, 'eval_accuracy': 0.5835428571428571, 'eval_runtime': 628.2189, 'eval_samples_per_second': 111.426, 'eval_steps_per_second': 13.928, 'epoch': 1.0}
{'loss': 0.0825, 'learning_rate': 4.738095238095238e-05, 'epoch': 1.05}
{'loss': 0.0664, 'learning_rate': 4.714285714285714e-05, 'epoch': 1.14}
{'loss': 0.0769, 'learning_rate': 4.690476190476191e-05, 'epoch': 1.24}
{'loss': 0.0788, 'learning_rate': 4.666666666666667e-05, 'epoch': 1.33}
{'loss': 0.0587, 'learning_rate': 4.642857142857143e-05, 'epoch': 1.43}
{'loss': 0.0821, 'learning_rate': 4.6190476190476194e-05, 'epoch': 1.52}
{'loss': 0.0738, 'learning_rate': 4.595238095238095e-05, 'epoch': 1.62}
{'loss': 0.0759, 'learning_rate': 4.5714285714285716e-05, 'epoch': 1.71}
{'loss': 0.0938, 'learning_rate': 4.547619047619048e-05, 'epoch': 1.81}
{'loss': 0.0595, 'learning_rate': 4.523809523809524e-05, 'epoch': 1.9}


***** Running Evaluation *****
  Num examples = 70000
  Batch size = 8


{'loss': 0.0524, 'learning_rate': 4.5e-05, 'epoch': 2.0}


  0%|          | 0/8750 [00:00<?, ?it/s]

{'eval_loss': 2.8586108684539795, 'eval_accuracy': 0.6509714285714285, 'eval_runtime': 643.638, 'eval_samples_per_second': 108.757, 'eval_steps_per_second': 13.595, 'epoch': 2.0}
{'loss': 0.0447, 'learning_rate': 4.476190476190477e-05, 'epoch': 2.1}
{'loss': 0.0369, 'learning_rate': 4.4523809523809525e-05, 'epoch': 2.19}
{'loss': 0.0358, 'learning_rate': 4.428571428571428e-05, 'epoch': 2.29}
{'loss': 0.0345, 'learning_rate': 4.404761904761905e-05, 'epoch': 2.38}
{'loss': 0.045, 'learning_rate': 4.380952380952381e-05, 'epoch': 2.48}
{'loss': 0.0396, 'learning_rate': 4.3571428571428576e-05, 'epoch': 2.57}
{'loss': 0.0437, 'learning_rate': 4.3333333333333334e-05, 'epoch': 2.67}
{'loss': 0.0393, 'learning_rate': 4.30952380952381e-05, 'epoch': 2.76}
{'loss': 0.0368, 'learning_rate': 4.2857142857142856e-05, 'epoch': 2.86}
{'loss': 0.0339, 'learning_rate': 4.261904761904762e-05, 'epoch': 2.95}


***** Running Evaluation *****
  Num examples = 70000
  Batch size = 8


  0%|          | 0/8750 [00:00<?, ?it/s]

{'eval_loss': 3.3832809925079346, 'eval_accuracy': 0.6400285714285714, 'eval_runtime': 644.7318, 'eval_samples_per_second': 108.572, 'eval_steps_per_second': 13.572, 'epoch': 3.0}
{'loss': 0.0363, 'learning_rate': 4.2380952380952385e-05, 'epoch': 3.05}
{'loss': 0.0123, 'learning_rate': 4.214285714285714e-05, 'epoch': 3.14}
{'loss': 0.0226, 'learning_rate': 4.190476190476191e-05, 'epoch': 3.24}
{'loss': 0.0306, 'learning_rate': 4.166666666666667e-05, 'epoch': 3.33}
{'loss': 0.0213, 'learning_rate': 4.1428571428571437e-05, 'epoch': 3.43}
{'loss': 0.0342, 'learning_rate': 4.119047619047619e-05, 'epoch': 3.52}
{'loss': 0.0101, 'learning_rate': 4.095238095238095e-05, 'epoch': 3.62}
{'loss': 0.0436, 'learning_rate': 4.0714285714285717e-05, 'epoch': 3.71}
{'loss': 0.0253, 'learning_rate': 4.047619047619048e-05, 'epoch': 3.81}
{'loss': 0.017, 'learning_rate': 4.023809523809524e-05, 'epoch': 3.9}


***** Running Evaluation *****
  Num examples = 70000
  Batch size = 8


{'loss': 0.0215, 'learning_rate': 4e-05, 'epoch': 4.0}


  0%|          | 0/8750 [00:00<?, ?it/s]

{'eval_loss': 3.731532096862793, 'eval_accuracy': 0.6701571428571429, 'eval_runtime': 645.6696, 'eval_samples_per_second': 108.415, 'eval_steps_per_second': 13.552, 'epoch': 4.0}
{'loss': 0.0274, 'learning_rate': 3.976190476190476e-05, 'epoch': 4.1}
{'loss': 0.0115, 'learning_rate': 3.9523809523809526e-05, 'epoch': 4.19}
{'loss': 0.0215, 'learning_rate': 3.928571428571429e-05, 'epoch': 4.29}
{'loss': 0.013, 'learning_rate': 3.904761904761905e-05, 'epoch': 4.38}
{'loss': 0.0296, 'learning_rate': 3.880952380952381e-05, 'epoch': 4.48}
{'loss': 0.0147, 'learning_rate': 3.857142857142858e-05, 'epoch': 4.57}
{'loss': 0.0153, 'learning_rate': 3.8333333333333334e-05, 'epoch': 4.67}
{'loss': 0.0282, 'learning_rate': 3.809523809523809e-05, 'epoch': 4.76}
{'loss': 0.0133, 'learning_rate': 3.785714285714286e-05, 'epoch': 4.86}
{'loss': 0.0259, 'learning_rate': 3.761904761904762e-05, 'epoch': 4.95}


***** Running Evaluation *****
  Num examples = 70000
  Batch size = 8


  0%|          | 0/8750 [00:00<?, ?it/s]

{'eval_loss': 4.514944076538086, 'eval_accuracy': 0.6274571428571428, 'eval_runtime': 644.2606, 'eval_samples_per_second': 108.652, 'eval_steps_per_second': 13.581, 'epoch': 5.0}
{'loss': 0.0215, 'learning_rate': 3.7380952380952386e-05, 'epoch': 5.05}
{'loss': 0.0176, 'learning_rate': 3.7142857142857143e-05, 'epoch': 5.14}
{'loss': 0.0138, 'learning_rate': 3.690476190476191e-05, 'epoch': 5.24}
{'loss': 0.0203, 'learning_rate': 3.6666666666666666e-05, 'epoch': 5.33}
{'loss': 0.0089, 'learning_rate': 3.642857142857143e-05, 'epoch': 5.43}
{'loss': 0.0221, 'learning_rate': 3.619047619047619e-05, 'epoch': 5.52}
{'loss': 0.0131, 'learning_rate': 3.595238095238095e-05, 'epoch': 5.62}
{'loss': 0.0127, 'learning_rate': 3.571428571428572e-05, 'epoch': 5.71}
{'loss': 0.0128, 'learning_rate': 3.547619047619048e-05, 'epoch': 5.81}
{'loss': 0.0186, 'learning_rate': 3.523809523809524e-05, 'epoch': 5.9}


***** Running Evaluation *****
  Num examples = 70000
  Batch size = 8


{'loss': 0.0166, 'learning_rate': 3.5e-05, 'epoch': 6.0}


  0%|          | 0/8750 [00:00<?, ?it/s]

{'eval_loss': 3.721174478530884, 'eval_accuracy': 0.6677714285714286, 'eval_runtime': 644.3999, 'eval_samples_per_second': 108.628, 'eval_steps_per_second': 13.579, 'epoch': 6.0}
{'loss': 0.0069, 'learning_rate': 3.476190476190476e-05, 'epoch': 6.1}
{'loss': 0.0031, 'learning_rate': 3.4523809523809526e-05, 'epoch': 6.19}
{'loss': 0.0034, 'learning_rate': 3.428571428571429e-05, 'epoch': 6.29}
{'loss': 0.0118, 'learning_rate': 3.404761904761905e-05, 'epoch': 6.38}
{'loss': 0.0168, 'learning_rate': 3.380952380952381e-05, 'epoch': 6.48}
{'loss': 0.0112, 'learning_rate': 3.357142857142857e-05, 'epoch': 6.57}
{'loss': 0.0091, 'learning_rate': 3.3333333333333335e-05, 'epoch': 6.67}
{'loss': 0.01, 'learning_rate': 3.309523809523809e-05, 'epoch': 6.76}
{'loss': 0.0123, 'learning_rate': 3.285714285714286e-05, 'epoch': 6.86}
{'loss': 0.0195, 'learning_rate': 3.261904761904762e-05, 'epoch': 6.95}


***** Running Evaluation *****
  Num examples = 70000
  Batch size = 8


  0%|          | 0/8750 [00:00<?, ?it/s]

{'eval_loss': 3.7025766372680664, 'eval_accuracy': 0.6515714285714286, 'eval_runtime': 642.8738, 'eval_samples_per_second': 108.886, 'eval_steps_per_second': 13.611, 'epoch': 7.0}
{'loss': 0.0143, 'learning_rate': 3.2380952380952386e-05, 'epoch': 7.05}
{'loss': 0.0087, 'learning_rate': 3.2142857142857144e-05, 'epoch': 7.14}
{'loss': 0.0063, 'learning_rate': 3.19047619047619e-05, 'epoch': 7.24}
{'loss': 0.0078, 'learning_rate': 3.1666666666666666e-05, 'epoch': 7.33}
{'loss': 0.0083, 'learning_rate': 3.142857142857143e-05, 'epoch': 7.43}
{'loss': 0.0086, 'learning_rate': 3.1190476190476195e-05, 'epoch': 7.52}
{'loss': 0.005, 'learning_rate': 3.095238095238095e-05, 'epoch': 7.62}
{'loss': 0.0076, 'learning_rate': 3.071428571428572e-05, 'epoch': 7.71}
{'loss': 0.0222, 'learning_rate': 3.0476190476190482e-05, 'epoch': 7.81}
{'loss': 0.0116, 'learning_rate': 3.0238095238095236e-05, 'epoch': 7.9}


***** Running Evaluation *****
  Num examples = 70000
  Batch size = 8


{'loss': 0.0065, 'learning_rate': 3e-05, 'epoch': 8.0}


  0%|          | 0/8750 [00:00<?, ?it/s]

{'eval_loss': 4.049188613891602, 'eval_accuracy': 0.6430428571428571, 'eval_runtime': 643.2759, 'eval_samples_per_second': 108.818, 'eval_steps_per_second': 13.602, 'epoch': 8.0}
{'loss': 0.0096, 'learning_rate': 2.9761904761904762e-05, 'epoch': 8.1}
{'loss': 0.0006, 'learning_rate': 2.9523809523809526e-05, 'epoch': 8.19}
{'loss': 0.0023, 'learning_rate': 2.9285714285714288e-05, 'epoch': 8.29}
{'loss': 0.0056, 'learning_rate': 2.9047619047619052e-05, 'epoch': 8.38}
{'loss': 0.0063, 'learning_rate': 2.880952380952381e-05, 'epoch': 8.48}
{'loss': 0.012, 'learning_rate': 2.857142857142857e-05, 'epoch': 8.57}
{'loss': 0.0044, 'learning_rate': 2.8333333333333335e-05, 'epoch': 8.67}
{'loss': 0.0018, 'learning_rate': 2.8095238095238096e-05, 'epoch': 8.76}
{'loss': 0.0075, 'learning_rate': 2.785714285714286e-05, 'epoch': 8.86}
{'loss': 0.0033, 'learning_rate': 2.7619047619047622e-05, 'epoch': 8.95}


***** Running Evaluation *****
  Num examples = 70000
  Batch size = 8


  0%|          | 0/8750 [00:00<?, ?it/s]

{'eval_loss': 4.9771809577941895, 'eval_accuracy': 0.6092571428571428, 'eval_runtime': 643.9593, 'eval_samples_per_second': 108.703, 'eval_steps_per_second': 13.588, 'epoch': 9.0}
{'loss': 0.0124, 'learning_rate': 2.7380952380952383e-05, 'epoch': 9.05}
{'loss': 0.0041, 'learning_rate': 2.714285714285714e-05, 'epoch': 9.14}
{'loss': 0.0018, 'learning_rate': 2.6904761904761905e-05, 'epoch': 9.24}
{'loss': 0.0033, 'learning_rate': 2.6666666666666667e-05, 'epoch': 9.33}
{'loss': 0.0005, 'learning_rate': 2.642857142857143e-05, 'epoch': 9.43}
{'loss': 0.0106, 'learning_rate': 2.6190476190476192e-05, 'epoch': 9.52}
{'loss': 0.0115, 'learning_rate': 2.5952380952380957e-05, 'epoch': 9.62}
{'loss': 0.0019, 'learning_rate': 2.5714285714285714e-05, 'epoch': 9.71}
{'loss': 0.0025, 'learning_rate': 2.5476190476190476e-05, 'epoch': 9.81}
{'loss': 0.01, 'learning_rate': 2.523809523809524e-05, 'epoch': 9.9}


***** Running Evaluation *****
  Num examples = 70000
  Batch size = 8


{'loss': 0.009, 'learning_rate': 2.5e-05, 'epoch': 10.0}


  0%|          | 0/8750 [00:00<?, ?it/s]

{'eval_loss': 4.685549736022949, 'eval_accuracy': 0.6191714285714286, 'eval_runtime': 644.0462, 'eval_samples_per_second': 108.688, 'eval_steps_per_second': 13.586, 'epoch': 10.0}
{'loss': 0.0015, 'learning_rate': 2.4761904761904762e-05, 'epoch': 10.1}
{'loss': 0.003, 'learning_rate': 2.4523809523809523e-05, 'epoch': 10.19}
{'loss': 0.0016, 'learning_rate': 2.4285714285714288e-05, 'epoch': 10.29}
{'loss': 0.0091, 'learning_rate': 2.404761904761905e-05, 'epoch': 10.38}
{'loss': 0.0054, 'learning_rate': 2.380952380952381e-05, 'epoch': 10.48}
{'loss': 0.0042, 'learning_rate': 2.357142857142857e-05, 'epoch': 10.57}
{'loss': 0.0023, 'learning_rate': 2.3333333333333336e-05, 'epoch': 10.67}
{'loss': 0.0059, 'learning_rate': 2.3095238095238097e-05, 'epoch': 10.76}
{'loss': 0.0019, 'learning_rate': 2.2857142857142858e-05, 'epoch': 10.86}
{'loss': 0.0056, 'learning_rate': 2.261904761904762e-05, 'epoch': 10.95}


***** Running Evaluation *****
  Num examples = 70000
  Batch size = 8


  0%|          | 0/8750 [00:00<?, ?it/s]

{'eval_loss': 4.395516872406006, 'eval_accuracy': 0.6322285714285715, 'eval_runtime': 644.5517, 'eval_samples_per_second': 108.603, 'eval_steps_per_second': 13.575, 'epoch': 11.0}
{'loss': 0.0027, 'learning_rate': 2.2380952380952384e-05, 'epoch': 11.05}
{'loss': 0.0011, 'learning_rate': 2.214285714285714e-05, 'epoch': 11.14}
{'loss': 0.0078, 'learning_rate': 2.1904761904761906e-05, 'epoch': 11.24}
{'loss': 0.0028, 'learning_rate': 2.1666666666666667e-05, 'epoch': 11.33}
{'loss': 0.0023, 'learning_rate': 2.1428571428571428e-05, 'epoch': 11.43}
{'loss': 0.0, 'learning_rate': 2.1190476190476193e-05, 'epoch': 11.52}
{'loss': 0.0024, 'learning_rate': 2.0952380952380954e-05, 'epoch': 11.62}
{'loss': 0.003, 'learning_rate': 2.0714285714285718e-05, 'epoch': 11.71}
{'loss': 0.0021, 'learning_rate': 2.0476190476190476e-05, 'epoch': 11.81}
{'loss': 0.0032, 'learning_rate': 2.023809523809524e-05, 'epoch': 11.9}


***** Running Evaluation *****
  Num examples = 70000
  Batch size = 8


{'loss': 0.0002, 'learning_rate': 2e-05, 'epoch': 12.0}


  0%|          | 0/8750 [00:00<?, ?it/s]

{'eval_loss': 4.5103068351745605, 'eval_accuracy': 0.6432857142857142, 'eval_runtime': 645.1996, 'eval_samples_per_second': 108.494, 'eval_steps_per_second': 13.562, 'epoch': 12.0}
{'loss': 0.0, 'learning_rate': 1.9761904761904763e-05, 'epoch': 12.1}
{'loss': 0.0031, 'learning_rate': 1.9523809523809524e-05, 'epoch': 12.19}
{'loss': 0.0039, 'learning_rate': 1.928571428571429e-05, 'epoch': 12.29}
{'loss': 0.0016, 'learning_rate': 1.9047619047619046e-05, 'epoch': 12.38}
{'loss': 0.0, 'learning_rate': 1.880952380952381e-05, 'epoch': 12.48}
{'loss': 0.01, 'learning_rate': 1.8571428571428572e-05, 'epoch': 12.57}
{'loss': 0.0024, 'learning_rate': 1.8333333333333333e-05, 'epoch': 12.67}
{'loss': 0.0025, 'learning_rate': 1.8095238095238094e-05, 'epoch': 12.76}
{'loss': 0.0, 'learning_rate': 1.785714285714286e-05, 'epoch': 12.86}
{'loss': 0.0021, 'learning_rate': 1.761904761904762e-05, 'epoch': 12.95}


***** Running Evaluation *****
  Num examples = 70000
  Batch size = 8


  0%|          | 0/8750 [00:00<?, ?it/s]

{'eval_loss': 4.674953937530518, 'eval_accuracy': 0.6395857142857143, 'eval_runtime': 645.3298, 'eval_samples_per_second': 108.472, 'eval_steps_per_second': 13.559, 'epoch': 13.0}
{'loss': 0.0015, 'learning_rate': 1.738095238095238e-05, 'epoch': 13.05}
{'loss': 0.0, 'learning_rate': 1.7142857142857145e-05, 'epoch': 13.14}
{'loss': 0.0, 'learning_rate': 1.6904761904761906e-05, 'epoch': 13.24}
{'loss': 0.0, 'learning_rate': 1.6666666666666667e-05, 'epoch': 13.33}
{'loss': 0.0001, 'learning_rate': 1.642857142857143e-05, 'epoch': 13.43}
{'loss': 0.0057, 'learning_rate': 1.6190476190476193e-05, 'epoch': 13.52}
{'loss': 0.0001, 'learning_rate': 1.595238095238095e-05, 'epoch': 13.62}
{'loss': 0.0006, 'learning_rate': 1.5714285714285715e-05, 'epoch': 13.71}
{'loss': 0.0, 'learning_rate': 1.5476190476190476e-05, 'epoch': 13.81}
{'loss': 0.0, 'learning_rate': 1.5238095238095241e-05, 'epoch': 13.9}


***** Running Evaluation *****
  Num examples = 70000
  Batch size = 8


{'loss': 0.0037, 'learning_rate': 1.5e-05, 'epoch': 14.0}


  0%|          | 0/8750 [00:00<?, ?it/s]

{'eval_loss': 5.061285972595215, 'eval_accuracy': 0.6324428571428572, 'eval_runtime': 644.0548, 'eval_samples_per_second': 108.686, 'eval_steps_per_second': 13.586, 'epoch': 14.0}
{'loss': 0.0, 'learning_rate': 1.4761904761904763e-05, 'epoch': 14.1}
{'loss': 0.0044, 'learning_rate': 1.4523809523809526e-05, 'epoch': 14.19}
{'loss': 0.0002, 'learning_rate': 1.4285714285714285e-05, 'epoch': 14.29}
{'loss': 0.0023, 'learning_rate': 1.4047619047619048e-05, 'epoch': 14.38}
{'loss': 0.0, 'learning_rate': 1.3809523809523811e-05, 'epoch': 14.48}
{'loss': 0.0001, 'learning_rate': 1.357142857142857e-05, 'epoch': 14.57}
{'loss': 0.0, 'learning_rate': 1.3333333333333333e-05, 'epoch': 14.67}
{'loss': 0.0024, 'learning_rate': 1.3095238095238096e-05, 'epoch': 14.76}
{'loss': 0.0, 'learning_rate': 1.2857142857142857e-05, 'epoch': 14.86}
{'loss': 0.0, 'learning_rate': 1.261904761904762e-05, 'epoch': 14.95}


***** Running Evaluation *****
  Num examples = 70000
  Batch size = 8


  0%|          | 0/8750 [00:00<?, ?it/s]

{'eval_loss': 5.06746768951416, 'eval_accuracy': 0.6483285714285715, 'eval_runtime': 643.7986, 'eval_samples_per_second': 108.73, 'eval_steps_per_second': 13.591, 'epoch': 15.0}
{'loss': 0.0, 'learning_rate': 1.2380952380952381e-05, 'epoch': 15.05}
{'loss': 0.0, 'learning_rate': 1.2142857142857144e-05, 'epoch': 15.14}
{'loss': 0.0052, 'learning_rate': 1.1904761904761905e-05, 'epoch': 15.24}
{'loss': 0.0016, 'learning_rate': 1.1666666666666668e-05, 'epoch': 15.33}
{'loss': 0.0, 'learning_rate': 1.1428571428571429e-05, 'epoch': 15.43}
{'loss': 0.0, 'learning_rate': 1.1190476190476192e-05, 'epoch': 15.52}
{'loss': 0.0, 'learning_rate': 1.0952380952380953e-05, 'epoch': 15.62}
{'loss': 0.0, 'learning_rate': 1.0714285714285714e-05, 'epoch': 15.71}
{'loss': 0.0, 'learning_rate': 1.0476190476190477e-05, 'epoch': 15.81}
{'loss': 0.0, 'learning_rate': 1.0238095238095238e-05, 'epoch': 15.9}


***** Running Evaluation *****
  Num examples = 70000
  Batch size = 8


{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 16.0}


  0%|          | 0/8750 [00:00<?, ?it/s]

{'eval_loss': 5.60060977935791, 'eval_accuracy': 0.6299571428571429, 'eval_runtime': 644.4035, 'eval_samples_per_second': 108.628, 'eval_steps_per_second': 13.578, 'epoch': 16.0}
{'loss': 0.0027, 'learning_rate': 9.761904761904762e-06, 'epoch': 16.1}
{'loss': 0.0, 'learning_rate': 9.523809523809523e-06, 'epoch': 16.19}
{'loss': 0.0, 'learning_rate': 9.285714285714286e-06, 'epoch': 16.29}
{'loss': 0.0, 'learning_rate': 9.047619047619047e-06, 'epoch': 16.38}
{'loss': 0.0, 'learning_rate': 8.80952380952381e-06, 'epoch': 16.48}
{'loss': 0.0, 'learning_rate': 8.571428571428573e-06, 'epoch': 16.57}
{'loss': 0.0, 'learning_rate': 8.333333333333334e-06, 'epoch': 16.67}
{'loss': 0.0, 'learning_rate': 8.095238095238097e-06, 'epoch': 16.76}
{'loss': 0.0, 'learning_rate': 7.857142857142858e-06, 'epoch': 16.86}
{'loss': 0.0, 'learning_rate': 7.6190476190476205e-06, 'epoch': 16.95}


***** Running Evaluation *****
  Num examples = 70000
  Batch size = 8


  0%|          | 0/8750 [00:00<?, ?it/s]

{'eval_loss': 5.661205768585205, 'eval_accuracy': 0.6334714285714286, 'eval_runtime': 648.8878, 'eval_samples_per_second': 107.877, 'eval_steps_per_second': 13.485, 'epoch': 17.0}
{'loss': 0.0004, 'learning_rate': 7.380952380952382e-06, 'epoch': 17.05}
{'loss': 0.0, 'learning_rate': 7.142857142857143e-06, 'epoch': 17.14}
{'loss': 0.0, 'learning_rate': 6.9047619047619055e-06, 'epoch': 17.24}
{'loss': 0.0, 'learning_rate': 6.666666666666667e-06, 'epoch': 17.33}
{'loss': 0.0027, 'learning_rate': 6.428571428571429e-06, 'epoch': 17.43}
{'loss': 0.0, 'learning_rate': 6.190476190476191e-06, 'epoch': 17.52}
{'loss': 0.0, 'learning_rate': 5.9523809523809525e-06, 'epoch': 17.62}
{'loss': 0.0004, 'learning_rate': 5.7142857142857145e-06, 'epoch': 17.71}
{'loss': 0.0, 'learning_rate': 5.4761904761904765e-06, 'epoch': 17.81}
{'loss': 0.0, 'learning_rate': 5.2380952380952384e-06, 'epoch': 17.9}


***** Running Evaluation *****
  Num examples = 70000
  Batch size = 8


{'loss': 0.0, 'learning_rate': 5e-06, 'epoch': 18.0}


  0%|          | 0/8750 [00:00<?, ?it/s]

{'eval_loss': 5.4780659675598145, 'eval_accuracy': 0.6364857142857143, 'eval_runtime': 644.0097, 'eval_samples_per_second': 108.694, 'eval_steps_per_second': 13.587, 'epoch': 18.0}
{'loss': 0.0, 'learning_rate': 4.7619047619047615e-06, 'epoch': 18.1}
{'loss': 0.0, 'learning_rate': 4.5238095238095235e-06, 'epoch': 18.19}
{'loss': 0.0, 'learning_rate': 4.285714285714286e-06, 'epoch': 18.29}
{'loss': 0.0, 'learning_rate': 4.047619047619048e-06, 'epoch': 18.38}
{'loss': 0.0, 'learning_rate': 3.8095238095238102e-06, 'epoch': 18.48}
{'loss': 0.0, 'learning_rate': 3.5714285714285714e-06, 'epoch': 18.57}
{'loss': 0.0, 'learning_rate': 3.3333333333333333e-06, 'epoch': 18.67}
{'loss': 0.0, 'learning_rate': 3.0952380952380953e-06, 'epoch': 18.76}
{'loss': 0.0, 'learning_rate': 2.8571428571428573e-06, 'epoch': 18.86}
{'loss': 0.0023, 'learning_rate': 2.6190476190476192e-06, 'epoch': 18.95}


***** Running Evaluation *****
  Num examples = 70000
  Batch size = 8


  0%|          | 0/8750 [00:00<?, ?it/s]

{'eval_loss': 5.475131511688232, 'eval_accuracy': 0.6393285714285715, 'eval_runtime': 643.3178, 'eval_samples_per_second': 108.811, 'eval_steps_per_second': 13.601, 'epoch': 19.0}
{'loss': 0.0, 'learning_rate': 2.3809523809523808e-06, 'epoch': 19.05}
{'loss': 0.0, 'learning_rate': 2.142857142857143e-06, 'epoch': 19.14}
{'loss': 0.0, 'learning_rate': 1.9047619047619051e-06, 'epoch': 19.24}
{'loss': 0.0, 'learning_rate': 1.6666666666666667e-06, 'epoch': 19.33}
{'loss': 0.0, 'learning_rate': 1.4285714285714286e-06, 'epoch': 19.43}
{'loss': 0.0, 'learning_rate': 1.1904761904761904e-06, 'epoch': 19.52}
{'loss': 0.0, 'learning_rate': 9.523809523809526e-07, 'epoch': 19.62}
{'loss': 0.0, 'learning_rate': 7.142857142857143e-07, 'epoch': 19.71}
{'loss': 0.0001, 'learning_rate': 4.761904761904763e-07, 'epoch': 19.81}
{'loss': 0.0, 'learning_rate': 2.3809523809523814e-07, 'epoch': 19.9}


***** Running Evaluation *****
  Num examples = 70000
  Batch size = 8


{'loss': 0.0, 'learning_rate': 0.0, 'epoch': 20.0}


  0%|          | 0/8750 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 5.533255577087402, 'eval_accuracy': 0.6378142857142857, 'eval_runtime': 645.005, 'eval_samples_per_second': 108.526, 'eval_steps_per_second': 13.566, 'epoch': 20.0}
{'train_runtime': 37266.5365, 'train_samples_per_second': 22.54, 'train_steps_per_second': 2.818, 'train_loss': 0.021139337502380545, 'epoch': 20.0}


TrainOutput(global_step=105000, training_loss=0.021139337502380545, metrics={'train_runtime': 37266.5365, 'train_samples_per_second': 22.54, 'train_steps_per_second': 2.818, 'train_loss': 0.021139337502380545, 'epoch': 20.0})

In [23]:
model.save_pretrained(SAVED_MODEL)

Configuration saved in ../model/Proposed-Word2Vec-GPT2_20221125153534/config.json
Model weights saved in ../model/Proposed-Word2Vec-GPT2_20221125153534/pytorch_model.bin


# Test

In [24]:
model = AutoModelForSequenceClassification.from_pretrained(SAVED_MODEL)

training_args = TrainingArguments(output_dir=SAVED_MODEL,report_to="none")
trainer = Trainer(model=model, args=training_args)

loading configuration file ../model/Proposed-Word2Vec-GPT2_20221125153534/config.json
Model config GPT2Config {
  "_name_or_path": "../model/Proposed-Word2Vec-GPT2_20221125153534",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2ForSequenceClassification"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 9
  },
  "layer_no

In [25]:
pred = trainer.predict(small_eval_dataset)

***** Running Prediction *****
  Num examples = 70000
  Batch size = 8


  0%|          | 0/8750 [00:00<?, ?it/s]

In [26]:
from sklearn.metrics import classification_report
y_pred = [np.argmax(i) for i in pred.predictions]

target_names = [c[:3]+"." for c in classes]

rep = classification_report(pred.label_ids, y_pred, target_names=target_names, digits=3)
print(rep)

              precision    recall  f1-score   support

        Com.      0.812     0.441     0.572      5000
        Edu.      0.678     0.525     0.592      5000
        Art.      0.689     0.593     0.638      5000
        Ath.      0.762     0.960     0.850      5000
        Off.      0.660     0.677     0.668      5000
        Mea.      0.717     0.451     0.554      5000
        Bui.      0.744     0.408     0.527      5000
        Nat.      0.319     0.717     0.442      5000
        Vil.      0.616     0.980     0.756      5000
        Ani.      0.646     0.470     0.544      5000
        Pla.      0.673     0.617     0.644      5000
        Alb.      0.926     0.906     0.916      5000
        Fil.      0.857     0.498     0.630      5000
        Wri.      0.522     0.686     0.593      5000

    accuracy                          0.638     70000
   macro avg      0.687     0.638     0.637     70000
weighted avg      0.687     0.638     0.637     70000

