In [25]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, logging
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow import keras
import tensorflow as tf
import gensim.downloader
from tqdm import tqdm
import numpy as np
import string
import time
import csv
import re

np.random.seed(0)
logging.set_verbosity_error()

In [26]:
# parameters
MODEL = "bert-base-uncased"
MAXLEN_GET_PSEUDO = 1000
EPOCH = 4
BATCH_SIZE = 1


In [27]:
# 前処理
def preprocessing(text):
    # 括弧内文章の削除
    text = re.sub(r'\(.*\)',' ',text)
    text = re.sub(r'\[.*\]',' ',text)
    text = re.sub(r'\<.*\>',' ',text)
    text = re.sub(r'\{.*\}',' ',text)
    # 記号文字の削除
    text = text.translate(str.maketrans('','',string.punctuation))
    # スペースの調整
    text = re.sub(r'\s+',' ',text)
    return text

In [28]:
# 20 newsgroups datasets
from sklearn.datasets import fetch_20newsgroups
newsgroups = fetch_20newsgroups(subset="all")
newsgroups_datasets = list()

# example ------------------------------------------------
for texts in tqdm(newsgroups.data[:1000]):
  texts = texts.split("\n\n")
  texts = " ".join(texts[1:])
  newsgroups_datasets.append(preprocessing(texts))
# --------------------------------------------------------

# for texts in tqdm(newsgroups.data):
#   texts = texts.split("\n\n")
#   texts = " ".join(texts[1:])
#   newsgroups_datasets.append(preprocessing(texts))

100%|██████████| 10000/10000 [00:00<00:00, 16364.34it/s]


In [29]:
# yahoo topic datasets
with open('../data/topic/train_pu_half_v0.txt','r',encoding='utf-8') as f:
    texts_v0 = f.read()
with open('../data/topic/train_pu_half_v1.txt','r',encoding='utf-8') as f:
    texts_v1 = f.read()
texts = texts_v0 + texts_v1
topic_datasets = list()

# example ----------------------------------------------
for label_text in tqdm(texts.splitlines()[:1000]):
  _, text = label_text.split("\t")
  topic_datasets.append(preprocessing(text))
# -------------------------------------------------------

# for label_text in tqdm(texts.splitlines()):
#   _, text = label_text.split("\t")
#   topic_datasets.append(preprocessing(text))

100%|██████████| 10000/10000 [00:00<00:00, 48614.06it/s]


In [30]:
# reuters datasets
with open("../data/reuter/sourceall.txt", "r", encoding="utf-8") as f:
  reuter = f.read().split("\n")[:-1]

# example -----------------------------------
reuter = reuter[:1000]
# -------------------------------------------

# reuter = reuter

reuters_datasets = list()
for label_text in tqdm(reuter):
  _, text = label_text.split("\t")
  reuters_datasets.append(preprocessing(text))

100%|██████████| 10000/10000 [00:00<00:00, 22955.37it/s]


In [31]:
# dbpedia datasets train
with open('../data/dbpedia_csv/train.csv','r',encoding='utf-8') as f:
    reader = [r for r in csv.reader(f)]
    
# example -------------------
reader = reader[:1000]
#----------------------------

# reader = reader

dbpedia_train_datasets = list()
for _, auth, text in tqdm(reader):
    text = text.replace(auth,'')
    dbpedia_train_datasets.append(preprocessing(text))

100%|██████████| 10000/10000 [00:00<00:00, 57629.98it/s]


In [32]:
# dbpedia classes
with open("../data/dbpedia_csv/classes.txt", "r", encoding="utf-8") as f:
  classes = f.read().splitlines()

In [33]:
datasets_texts = newsgroups_datasets + topic_datasets + reuters_datasets + dbpedia_train_datasets

In [34]:
word2vec = gensim.downloader.load('word2vec-google-news-300')

def w2v_avg_vector(sentence):
  vector = np.zeros((300,), dtype="float32")
  count = 0
  for word in sentence.split():
    try:
      vector = np.add(vector, word2vec[word])
      count += 1
    except:
      pass
  if count > 0:
    vector = np.divide(vector, len(word))
  return vector

In [35]:
classes_vector = list()
for cls in classes:
  classes_vector.append(w2v_avg_vector(cls))

In [36]:
diff_datasets = {i:[] for i in range(len(classes))}
for texts in tqdm(datasets_texts):
  texts_vector = w2v_avg_vector(texts)
  similarity = cosine_similarity([texts_vector], classes_vector)[0]
  sim_argsorted = np.argsort(similarity)
  diff = similarity[sim_argsorted[-1]] - similarity[sim_argsorted[-2]]
  if diff > 0.005:
    diff_datasets[sim_argsorted[-1]].append((diff, texts))

pseudo_texts = list()
pseudo_labels = list()
for i in range(len(classes)):
  sorted_diff_data = sorted(diff_datasets[i], reverse=True)[:MAXLEN_GET_PSEUDO]
  pseudo_texts.extend([i[1] for i in sorted_diff_data])
  pseudo_labels.extend([i]*len(sorted_diff_data[:MAXLEN_GET_PSEUDO]))

100%|██████████| 40000/40000 [00:12<00:00, 3172.58it/s]


In [37]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)
x_train = tokenizer(pseudo_texts, truncation=True, return_tensors="tf", padding="max_length", max_length=512)
y_train = np.array(pseudo_labels)

model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
model.classifier = tf.keras.layers.Dense(units=14, activation="softmax", name="classifer")
model.compile(optimizer=keras.optimizers.Adam(3e-5))
model.fit(x_train["input_ids"], y_train, batch_size=BATCH_SIZE, epochs=EPOCH)

Epoch 1/4


  output, from_logits = _get_logits(


Epoch 2/4

In [None]:
pred = model.predict(x_train["input_ids"], batch_size=BATCH_SIZE)
y_pred = [np.argmax(i) for i in pred.logits]

target_names = ["Com.","Edu.","Art.","Ath.","Off.","Mea.","Bui.","Nat.","Vil.","Ani.","Pla.","Alb.","Fil.","Wri."]
rep = classification_report(y_train, y_pred, target_names=target_names, digits=3)
print(rep)

              precision    recall  f1-score   support

        Com.      0.000     0.000     0.000        50
        Edu.      0.000     0.000     0.000        50
        Art.      0.000     0.000     0.000        12
        Ath.      0.000     0.000     0.000         8
        Off.      0.000     0.000     0.000        50
        Mea.      0.800     0.080     0.145        50
        Bui.      0.098     1.000     0.178        50
        Nat.      0.000     0.000     0.000        50
        Vil.      0.000     0.000     0.000        13
        Ani.      0.000     0.000     0.000        23
        Pla.      0.500     0.292     0.368        24
        Alb.      0.000     0.000     0.000        50
        Fil.      0.000     0.000     0.000        50
        Wri.      0.000     0.000     0.000        50

    accuracy                          0.115       530
   macro avg      0.100     0.098     0.049       530
weighted avg      0.107     0.115     0.047       530



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pandas as pd

print(y_pred)

[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 10, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 10, 6, 6, 6, 6, 10, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 10, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 10, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 10, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 6, 5, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 

In [None]:
print()