In [1]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, logging
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow import keras
import tensorflow as tf
import gensim.downloader
from tqdm import tqdm
import numpy as np
import string
import time
import csv
import re

np.random.seed(0)
logging.set_verbosity_error()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# parameters
MODEL = "bert-base-uncased"
THRESHOLD = 0.05
MAXLEN_GET_PSEUDO = 5000
EPOCH = 10
BATCH_SIZE = 8

In [3]:
# 前処理
def preprocessing(text):
    # 括弧内文章の削除
    text = re.sub(r'\(.*\)',' ',text)
    text = re.sub(r'\[.*\]',' ',text)
    text = re.sub(r'\<.*\>',' ',text)
    text = re.sub(r'\{.*\}',' ',text)
    # 記号文字の削除
    text = text.translate(str.maketrans('','',string.punctuation))
    # スペースの調整
    text = re.sub(r'\s+',' ',text)
    return text

In [4]:
# 20 newsgroups datasets
from sklearn.datasets import fetch_20newsgroups
newsgroups = fetch_20newsgroups(subset="all")
newsgroups_datasets = list()

# # example ------------------------------------------------
# for texts in tqdm(newsgroups.data[:10000]):
#   texts = texts.split("\n\n")
#   texts = " ".join(texts[1:])
#   newsgroups_datasets.append(preprocessing(texts))
# # --------------------------------------------------------

for texts in tqdm(newsgroups.data):
  texts = texts.split("\n\n")
  texts = " ".join(texts[1:])
  newsgroups_datasets.append(preprocessing(texts))

100%|██████████| 18846/18846 [00:01<00:00, 18710.03it/s]


In [5]:
# yahoo topic datasets
with open('../data/topic/train_pu_half_v0.txt','r',encoding='utf-8') as f:
    texts_v0 = f.read()
with open('../data/topic/train_pu_half_v1.txt','r',encoding='utf-8') as f:
    texts_v1 = f.read()
texts = texts_v0 + texts_v1
topic_datasets = list()

# # example ----------------------------------------------
# for label_text in tqdm(texts.splitlines()[:10000]):
#   _, text = label_text.split("\t")
#   topic_datasets.append(preprocessing(text))
# # -------------------------------------------------------

for label_text in tqdm(texts.splitlines()):
  _, text = label_text.split("\t")
  topic_datasets.append(preprocessing(text))

100%|██████████| 1300000/1300000 [00:27<00:00, 47508.63it/s]


In [6]:
# reuters datasets
with open("../data/reuter/sourceall.txt", "r", encoding="utf-8") as f:
  reuter = f.read().split("\n")[:-1]

# # example -----------------------------------
# reuter = reuter[:10000]
# # -------------------------------------------

reuters_datasets = list()
for label_text in tqdm(reuter):
  _, text = label_text.split("\t")
  reuters_datasets.append(preprocessing(text))

100%|██████████| 762027/762027 [00:29<00:00, 25763.80it/s]


In [7]:
# dbpedia datasets train
with open('../data/dbpedia_csv/train.csv','r',encoding='utf-8') as f:
    reader = [r for r in csv.reader(f)]
    
# # example -------------------
# reader = reader[:10000]
# #----------------------------

dbpedia_train_datasets = list()
for _, auth, text in tqdm(reader):
    text = text.replace(auth,'')
    dbpedia_train_datasets.append(preprocessing(text))

100%|██████████| 560000/560000 [00:08<00:00, 68177.47it/s]


In [8]:
# dbpedia classes
with open("../data/dbpedia_csv/classes.txt", "r", encoding="utf-8") as f:
  classes = f.read().splitlines()

In [9]:
datasets_texts = newsgroups_datasets + topic_datasets + reuters_datasets + dbpedia_train_datasets

In [10]:
word2vec = gensim.downloader.load('word2vec-google-news-300')

def w2v_avg_vector(sentence):
  vector = np.zeros((300,), dtype="float32")
  count = 0
  for word in sentence.split():
    try:
      vector = np.add(vector, word2vec[word])
      count += 1
    except:
      pass
  if count > 0:
    vector = np.divide(vector, len(word))
  return vector

In [11]:
classes_vector = list()
for cls in classes:
  classes_vector.append(w2v_avg_vector(cls))

In [12]:
diff_datasets = {i:[] for i in range(len(classes))}
for texts in tqdm(datasets_texts):
  texts_vector = w2v_avg_vector(texts)
  similarity = cosine_similarity([texts_vector], classes_vector)[0]
  sim_argsorted = np.argsort(similarity)
  diff = similarity[sim_argsorted[-1]] - similarity[sim_argsorted[-2]]
  if diff > THRESHOLD:
    diff_datasets[sim_argsorted[-1]].append((similarity[sim_argsorted[-1]], texts))

pseudo_texts = list()
pseudo_labels = list()
for i in range(len(classes)):
  sorted_diff_data = sorted(diff_datasets[i], reverse=True)[:MAXLEN_GET_PSEUDO]
  pseudo_texts.extend([i[1] for i in sorted_diff_data])
  pseudo_labels.extend([i]*len(sorted_diff_data[:MAXLEN_GET_PSEUDO]))

100%|██████████| 2640873/2640873 [13:50<00:00, 3181.23it/s]


In [13]:
print("Number of all selected data")
for i in diff_datasets:
  print(classes[i][:3]+". : "+str(len(diff_datasets[i])))

Number of all selected data
Com. : 61181
Edu. : 34672
Art. : 6818
Ath. : 16123
Off. : 126859
Mea. : 144368
Bui. : 10120
Nat. : 38562
Vil. : 35431
Ani. : 4742
Pla. : 13933
Alb. : 42379
Fil. : 26335
Wri. : 23263


In [14]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)
x_train = tokenizer(pseudo_texts, truncation=True, return_tensors="tf", padding="max_length", max_length=512)
y_train = np.array(pseudo_labels)

model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
model.classifier = tf.keras.layers.Dense(units=14, activation="softmax", name="classifer")
model.compile(optimizer=keras.optimizers.Adam(3e-5), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=tf.metrics.SparseCategoricalAccuracy())
model.fit(x_train["input_ids"], y_train, batch_size=BATCH_SIZE, epochs=EPOCH)

Epoch 1/10


  '"`sparse_categorical_crossentropy` received `from_logits=True`, but '


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f47bcc17a90>

In [15]:
pred = model.predict(x_train["input_ids"], batch_size=BATCH_SIZE)
y_pred = [np.argmax(i) for i in pred.logits]

target_names = [c[:3]+"." for c in classes]
rep = classification_report(y_train, y_pred, target_names=target_names, digits=3)
print(rep)

              precision    recall  f1-score   support

        Com.      0.997     0.230     0.373      5000
        Edu.      0.528     0.999     0.690      5000
        Art.      0.429     0.001     0.001      5000
        Ath.      0.997     0.998     0.997      5000
        Off.      0.999     0.455     0.625      5000
        Mea.      0.990     0.998     0.994      5000
        Bui.      0.963     0.113     0.202      5000
        Nat.      0.997     0.992     0.995      5000
        Vil.      0.701     0.999     0.824      5000
        Ani.      1.000     0.997     0.998      4742
        Pla.      0.999     0.999     0.999      5000
        Alb.      0.998     1.000     0.999      5000
        Fil.      0.435     0.998     0.606      5000
        Wri.      0.627     0.993     0.769      5000

    accuracy                          0.768     69742
   macro avg      0.833     0.769     0.720     69742
weighted avg      0.832     0.768     0.718     69742



In [16]:
# load test data
# dbpedia datasets train
with open('../data/dbpedia_csv/test.csv','r',encoding='utf-8') as f:
    reader = [r for r in csv.reader(f)]
    
# # example -------------------
# import random
# reader = random.sample(reader, 1000)
# #----------------------------

test_texts = list()
test_labels = list()
for labels, auth, text in tqdm(reader):
    text = text.replace(auth,'')
    test_texts.append(preprocessing(text))
    test_labels.append(int(labels)-1)

100%|██████████| 70000/70000 [00:01<00:00, 66318.44it/s]


In [17]:
x_test = tokenizer(test_texts, truncation=True, return_tensors="tf", padding="max_length", max_length=512)
y_test = np.array(test_labels)

In [18]:
pred = model.predict(x_test["input_ids"], batch_size=BATCH_SIZE)
y_pred = [np.argmax(i) for i in pred.logits]
target_names = [c[:3]+"." for c in classes]
rep = classification_report(y_test, y_pred, target_names=target_names, digits=3)
print(rep)

              precision    recall  f1-score   support

        Com.      0.780     0.297     0.430      5000
        Edu.      0.508     0.664     0.575      5000
        Art.      0.380     0.011     0.021      5000
        Ath.      0.878     0.918     0.898      5000
        Off.      0.698     0.352     0.468      5000
        Mea.      0.520     0.562     0.540      5000
        Bui.      0.816     0.269     0.404      5000
        Nat.      0.285     0.778     0.417      5000
        Vil.      0.585     0.962     0.727      5000
        Ani.      0.650     0.254     0.365      5000
        Pla.      0.779     0.562     0.653      5000
        Alb.      0.899     0.901     0.900      5000
        Fil.      0.605     0.630     0.618      5000
        Wri.      0.378     0.679     0.486      5000

    accuracy                          0.560     70000
   macro avg      0.626     0.560     0.536     70000
weighted avg      0.626     0.560     0.536     70000

