In [1]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, logging
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow import keras
import tensorflow as tf
import gensim.downloader
from tqdm import tqdm
import numpy as np
import string
import time
import csv
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
np.random.seed(0)
logging.set_verbosity_error()

import datetime
t_delta = datetime.timedelta(hours=9)
JST = datetime.timezone(t_delta, 'JST')
now = datetime.datetime.now(JST)

In [3]:
# parameters
MODEL = "bert-base-uncased"
X_TRAIN = '../dataset/Proposed-Word2Vec-BERT_x_train.npy'
Y_TRAIN = '../dataset/Proposed-Word2Vec-BERT_y_train.npy'
X_TEST = '../dataset/Proposed-Word2Vec-BERT_x_test.npy'
Y_TEST = '../dataset/Proposed-Word2Vec-BERT_y_test.npy'
SAVED_MODEL = "../model/Proposed-Word2Vec-BERT_"+str(now.strftime('%Y%m%d%H%M%S'))+".h5"
THRESHOLD = 0.05
MAXLEN_GET_PSEUDO = 3000
EPOCH = 1
BATCH_SIZE = 8

In [4]:
print(SAVED_MODEL)

../model/Proposed-Word2Vec-BERT_20221111162257.h5


In [5]:
# 前処理
def preprocessing(text):
    # 括弧内文章の削除
    text = re.sub(r'\(.*\)',' ',text)
    text = re.sub(r'\[.*\]',' ',text)
    text = re.sub(r'\<.*\>',' ',text)
    text = re.sub(r'\{.*\}',' ',text)
    # 記号文字の削除
    text = text.translate(str.maketrans('','',string.punctuation))
    # スペースの調整
    text = re.sub(r'\s+',' ',text)
    return text

In [6]:
# 20 newsgroups datasets
from sklearn.datasets import fetch_20newsgroups
newsgroups = fetch_20newsgroups(subset="all")
newsgroups_datasets = list()

# example ------------------------------------------------
for texts in tqdm(newsgroups.data[:1000]):
  texts = texts.split("\n\n")
  texts = " ".join(texts[1:])
  newsgroups_datasets.append(preprocessing(texts))
# --------------------------------------------------------

# for texts in tqdm(newsgroups.data):
#   texts = texts.split("\n\n")
#   texts = " ".join(texts[1:])
#   newsgroups_datasets.append(preprocessing(texts))

100%|██████████| 1000/1000 [00:00<00:00, 20587.16it/s]


In [7]:
# yahoo topic datasets
with open('../data/topic/train_pu_half_v0.txt','r',encoding='utf-8') as f:
    texts_v0 = f.read()
with open('../data/topic/train_pu_half_v1.txt','r',encoding='utf-8') as f:
    texts_v1 = f.read()
texts = texts_v0 + texts_v1
topic_datasets = list()

# example ----------------------------------------------
for label_text in tqdm(texts.splitlines()[:1000]):
  _, text = label_text.split("\t")
  topic_datasets.append(preprocessing(text))
# -------------------------------------------------------

# for label_text in tqdm(texts.splitlines()):
#   _, text = label_text.split("\t")
#   topic_datasets.append(preprocessing(text))

100%|██████████| 1000/1000 [00:00<00:00, 54217.99it/s]


In [8]:
# reuters datasets
with open("../data/reuter/sourceall.txt", "r", encoding="utf-8") as f:
  reuter = f.read().split("\n")[:-1]

# example -----------------------------------
reuter = reuter[:1000]
# -------------------------------------------

reuters_datasets = list()
for label_text in tqdm(reuter):
  _, text = label_text.split("\t")
  reuters_datasets.append(preprocessing(text))

100%|██████████| 1000/1000 [00:00<00:00, 26070.36it/s]


In [9]:
# dbpedia datasets train
with open('../data/dbpedia_csv/train.csv','r',encoding='utf-8') as f:
    reader = [r for r in csv.reader(f)]
    
# example -------------------
reader = reader[:1000]
#----------------------------

dbpedia_train_datasets = list()
for _, auth, text in tqdm(reader):
    text = text.replace(auth,'')
    dbpedia_train_datasets.append(preprocessing(text))

100%|██████████| 1000/1000 [00:00<00:00, 64203.77it/s]


In [10]:
# dbpedia classes
with open("../data/dbpedia_csv/classes.txt", "r", encoding="utf-8") as f:
  classes = f.read().splitlines()

In [11]:
datasets_texts = newsgroups_datasets + topic_datasets + reuters_datasets + dbpedia_train_datasets

In [12]:
word2vec = gensim.downloader.load('word2vec-google-news-300')

def w2v_avg_vector(sentence):
  vector = np.zeros((300,), dtype="float32")
  count = 0
  for word in sentence.split():
    try:
      vector = np.add(vector, word2vec[word])
      count += 1
    except:
      pass
  if count > 0:
    vector = np.divide(vector, len(word))
  return vector

In [13]:
classes_vector = list()
for cls in classes:
  classes_vector.append(w2v_avg_vector(cls))

In [14]:
diff_datasets = {i:[] for i in range(len(classes))}
for texts in tqdm(datasets_texts):
  texts_vector = w2v_avg_vector(texts)
  similarity = cosine_similarity([texts_vector], classes_vector)[0]
  sim_argsorted = np.argsort(similarity)
  diff = similarity[sim_argsorted[-1]] - similarity[sim_argsorted[-2]]
  if diff > THRESHOLD:
    diff_datasets[sim_argsorted[-1]].append((similarity[sim_argsorted[-1]], texts))

pseudo_texts = list()
pseudo_labels = list()
for i in range(len(classes)):
  sorted_diff_data = sorted(diff_datasets[i], reverse=True)[:MAXLEN_GET_PSEUDO]
  pseudo_texts.extend([i[1] for i in sorted_diff_data])
  pseudo_labels.extend([i]*len(sorted_diff_data[:MAXLEN_GET_PSEUDO]))

100%|██████████| 4000/4000 [00:01<00:00, 2653.08it/s]


In [15]:
print("Number of all selected data")
for i in diff_datasets:
  print(classes[i][:3]+". : "+str(len(diff_datasets[i])))

Number of all selected data
Com. : 273
Edu. : 34
Art. : 1
Ath. : 2
Off. : 174
Mea. : 210
Bui. : 7
Nat. : 53
Vil. : 1
Ani. : 11
Pla. : 6
Alb. : 24
Fil. : 30
Wri. : 18


In [18]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)
x_train = tokenizer(pseudo_texts, truncation=True, return_tensors="tf", padding="max_length", max_length=512)
y_train = np.array(pseudo_labels)

model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
model.classifier = tf.keras.layers.Dense(units=14, activation="softmax", name="classifier")
model.compile(optimizer=keras.optimizers.Adam(3e-5), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=tf.metrics.SparseCategoricalAccuracy())
model.fit([x_train["input_ids"],x_train["attention_mask"]], y_train, batch_size=BATCH_SIZE, epochs=EPOCH)



<keras.callbacks.History at 0x7fc09ce26748>

In [23]:
# model.save_pretrained(SAVED_MODEL,saved_model=True)
model.save(SAVED_MODEL)

NotImplementedError: Saving the model to HDF5 format requires the model to be a Functional model or a Sequential model. It does not work for subclassed models, because such models are defined via the body of a Python method, which isn't safely serializable. Consider saving to the Tensorflow SavedModel format (by setting save_format="tf") or using `save_weights`.

In [22]:
model = tf.keras.models.load_model(SAVED_MODEL)

ValueError: No model found in config file.

In [None]:
model.summary()

In [21]:
model = TFAutoModelForSequenceClassification.from_pretrained(SAVED_MODEL)

pred = model.predict([x_train["input_ids"],x_train["attention_mask"]], batch_size=BATCH_SIZE)
y_pred = [np.argmax(i) for i in pred.logits]

target_names = [c[:3]+"." for c in classes]
rep = classification_report(y_train, y_pred, target_names=target_names, digits=3)
print(rep)

OSError: It looks like the config file at '../model/Proposed-Word2Vec-BERT_20221111162257.h5' is not a valid JSON file.

In [None]:
# load test data
# dbpedia datasets train
with open('../data/dbpedia_csv/test.csv','r',encoding='utf-8') as f:
    reader = [r for r in csv.reader(f)]
    
# example -------------------
import random
reader = random.sample(reader, 1000)
#----------------------------

test_texts = list()
test_labels = list()
for labels, auth, text in tqdm(reader):
    text = text.replace(auth,'')
    test_texts.append(preprocessing(text))
    test_labels.append(int(labels)-1)

In [None]:
x_test = tokenizer(test_texts, truncation=True, return_tensors="tf", padding="max_length", max_length=512)
y_test = np.array(test_labels)

In [None]:
pred = model.predict([x_test["input_ids"],x_test["attention_mask"]], batch_size=BATCH_SIZE)
y_pred = [np.argmax(i) for i in pred.logits]
target_names = [c[:3]+"." for c in classes]
rep = classification_report(y_test, y_pred, target_names=target_names, digits=3)
print(rep)

        Com.      0.565     0.388     0.460        67
        Edu.      0.404     0.984     0.573        64
        Art.      0.000     0.000     0.000        75
        Ath.      0.667     0.026     0.049        78
        Off.      0.413     0.816     0.548        87
        Mea.      0.474     0.458     0.466        59
        Bui.      0.919     0.466     0.618        73
        Nat.      0.366     0.810     0.504        79
        Vil.      0.000     0.000     0.000        74
        Ani.      0.786     0.775     0.780        71
        Pla.      0.000     0.000     0.000        75
        Alb.      0.717     0.973     0.826        73
        Fil.      0.562     0.818     0.667        55
        Wri.      0.500     0.643     0.563        70

    accuracy                          0.503      1000
   macro avg      0.455     0.511     0.432      1000
weighted avg      0.450     0.503     0.423      1000
