In [1]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, logging
import tensorflow as tf
import numpy as np
import re
import string
from nltk.corpus import wordnet
from tqdm import tqdm
from sklearn import metrics
import csv
from tensorflow import keras

logging.set_verbosity_error()
np.random.seed(0)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# parameters
MODEL = "bert-base-uncased"
X_TRAIN = '../dataset/HF-BERT_x_train.npy'
Y_TRAIN = '../dataset/HF-BERT_y_train.npy'
X_TEST = '../dataset/HF-BERT_x_test.npy'
Y_TEST = '../dataset/HF-BERT_y_test.npy'
SAVED_MODEL = "../Baseline-HF-BERT.h5"
EPOCH = 1
BATCH_SIZE = 16

In [3]:
# 前処理
def preprocessing(text):
    # 括弧内文章の削除
    text = re.sub(r'\(.*\)',' ',text)
    text = re.sub(r'\[.*\]',' ',text)
    text = re.sub(r'\<.*\>',' ',text)
    text = re.sub(r'\{.*\}',' ',text)
    # 記号文字の削除
    text = text.translate(str.maketrans('','',string.punctuation))
    # スペースの調整
    text = re.sub(r'\s+',' ',text)
    return text

In [4]:
# preprocessing train data -----------------------------------------------------------------------
# load topic class labels
print("making train dataset...")
with open('../data/topic/classes.txt','r',encoding='utf-8') as f:
    labels = f.read().splitlines()
topic_class_hypothesis = dict()
for i,label in enumerate(labels):
    topic_class_hypothesis[i] = 'this text is about ' + ' or '.join([wordnet.synsets(word)[0].definition() for word in label.split(' & ')])

# load train data
with open('../data/topic/train_pu_half_v0.txt','r',encoding='utf-8') as f:
    texts_v0 = f.read()
with open('../data/topic/train_pu_half_v1.txt','r',encoding='utf-8') as f:
    texts_v1 = f.read()
texts = texts_v0 + texts_v1

# # ## example -------------------------------------
# import random
# texts = texts.splitlines()
# texts = random.sample(texts,10000)
# texts = "\n".join(texts)
# # ## ---------------------------------------------

tokenizer = AutoTokenizer.from_pretrained(MODEL)

x_train, y_train = [],[]
first, second = [],[]
for label_text in tqdm(texts.splitlines()):
    label,text = label_text.split('\t')
    rand_base = [0,1,2,3,4,5,6,7,8,9]
    rand_base.remove(int(label))
    label_rand = np.random.choice(rand_base)
    first.append(preprocessing(text))
    second.append(topic_class_hypothesis[int(label)])
    y_train.append(1)
    first.append(preprocessing(text))
    second.append(topic_class_hypothesis[int(label_rand)])
    y_train.append(0)

x_train = tokenizer(first, second, truncation=True, return_tensors="tf", padding="max_length", max_length=512)

making train dataset...


100%|██████████| 1300000/1300000 [01:01<00:00, 21165.02it/s]


In [5]:
y_train = np.array(y_train)

model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
model.classifier = tf.keras.layers.Dense(units=1, activation="sigmoid", name="classifier")
model.compile(optimizer=keras.optimizers.Adam(3e-5),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=tf.keras.metrics.BinaryAccuracy())
model.fit([x_train["input_ids"],x_train["attention_mask"]], y_train, epochs=EPOCH, batch_size=BATCH_SIZE)



<keras.callbacks.History at 0x7eff5cfec518>

In [6]:
pred = model.predict([x_train["input_ids"],x_train["attention_mask"]], batch_size=BATCH_SIZE)
print(pred.logits)
y_pred = np.where(pred.logits<0.5, 0, 1)

rep = metrics.classification_report(y_train,y_pred,digits=3)
print(rep)

[[0.48070368]
 [0.03339637]
 [0.9951605 ]
 ...
 [0.03262991]
 [0.9243892 ]
 [0.5926784 ]]
              precision    recall  f1-score   support

           0      0.907     0.919     0.913   1300000
           1      0.918     0.906     0.912   1300000

    accuracy                          0.912   2600000
   macro avg      0.912     0.912     0.912   2600000
weighted avg      0.912     0.912     0.912   2600000



In [7]:
# dbpedia class ------------------------------------------------------------------------------------------------------
with open('../data/dbpedia_csv/classes.txt','r',encoding='utf-8') as f:
    classes = f.read().splitlines()
    dbpedia_class = ['this text is about '+text for text in classes]

with open('../data/dbpedia_csv/test.csv','r',encoding='utf-8') as f:
    reader = [r for r in csv.reader(f)]
    
# # # example -------------------
# import random
# reader = random.sample(reader,1000)
# # #----------------------------

x_test, y_test = [],[]
first, second = [],[]
for cls_num,auth,readtext in tqdm(reader,total=len(reader)):
    for db_class in dbpedia_class:
        text = readtext.replace(auth, "")
        first.append(preprocessing(text))
        second.append(db_class)
    y_test.append(int(cls_num)-1)           

x_test = tokenizer(first, second, truncation=True, return_tensors="tf", padding="max_length", max_length=512)   

100%|██████████| 70000/70000 [00:13<00:00, 5342.05it/s]


In [8]:
pred = model.predict([x_test["input_ids"],x_test["attention_mask"]], batch_size=BATCH_SIZE)
split_pred = np.array_split(pred.logits,len(y_test))
y_pred = [np.argmax(p) for p in split_pred]

target_names = [c[:3]+"." for c in classes]
rep = metrics.classification_report(y_test,y_pred,target_names=target_names,digits=3)
print(rep)

              precision    recall  f1-score   support

        Com.      0.436     0.310     0.362      5000
        Edu.      0.187     0.606     0.285      5000
        Art.      0.232     0.095     0.135      5000
        Ath.      0.692     0.780     0.734      5000
        Off.      0.091     0.013     0.022      5000
        Mea.      0.379     0.069     0.117      5000
        Bui.      0.165     0.187     0.176      5000
        Nat.      0.133     0.394     0.199      5000
        Vil.      0.028     0.002     0.003      5000
        Ani.      0.361     0.479     0.412      5000
        Pla.      0.174     0.106     0.132      5000
        Alb.      0.772     0.235     0.361      5000
        Fil.      0.383     0.529     0.444      5000
        Wri.      0.138     0.055     0.078      5000

    accuracy                          0.276     70000
   macro avg      0.298     0.276     0.247     70000
weighted avg      0.298     0.276     0.247     70000

