In [None]:
# parameters
# Hugging Face checkpoint name; used for both the tokenizer and the encoder.
MODEL = "bert-base-uncased"

# Cached feature / label arrays (.npy) for the train and test splits.
X_TRAIN, Y_TRAIN = '../dataset/HF-BERT_x_train.npy', '../dataset/HF-BERT_y_train.npy'
X_TEST, Y_TEST = '../dataset/HF-BERT_x_test.npy', '../dataset/HF-BERT_y_test.npy'

# File the trained Keras classifier is saved to and later loaded from.
SAVED_MODEL = "../Baseline-HF-BERT.h5"

# Hyper-parameters passed to model.fit().
BATCH_SIZE = 64
EPOCH = 10

In [None]:
import csv
import random
import re
import string

import numpy as np
import tensorflow as tf
from nltk.corpus import wordnet
from sklearn import metrics
from tqdm import tqdm
from transformers import AutoTokenizer, TFAutoModel, logging

# Only show error-level messages from the transformers library.
logging.set_verbosity_error()

# Seed every random number generator the notebook draws from, so that
# Restart & Run All reproduces the same subsamples, negative labels and
# classifier initialisation.
SEED = 0
random.seed(SEED)         # stdlib RNG (random.sample subsampling)
np.random.seed(SEED)      # NumPy global RNG (np.random.choice negative labels)
tf.random.set_seed(SEED)  # TensorFlow global seed (e.g. Keras weight initialisation)

In [None]:
# Preprocessing
def preprocessing(text, auth):
    """Normalise a raw document before it is handed to the tokenizer.

    Steps, in order:
      1. remove parenthesised passages,
      2. remove ASCII punctuation,
      3. remove every occurrence of ``auth``,
      4. collapse each run of whitespace into a single space.

    ``auth`` goes through steps 1-2 as well before it is removed. Without
    that, a name containing punctuation or parentheses (e.g.
    ``"E. D. Abbott Ltd"``) could never match the already-cleaned text.

    Args:
        text: Raw document text.
        auth: String to delete from the text; pass ``''`` to delete nothing.

    Returns:
        The cleaned text. A leading or trailing single space is kept.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)

    def _strip_markup(raw):
        # Drop "(...)" passages (non-greedy, within one line), then punctuation.
        return re.sub(r'\(.*?\)', '', raw).translate(punctuation_table)

    text = _strip_markup(text)
    name = _strip_markup(auth)
    # Skip blank names: replacing '' is a no-op and replacing ' ' would
    # glue every word of the text together.
    if name.strip():
        text = text.replace(name, '')
    # Normalise spacing.
    return re.sub(r'\s+', ' ', text)

In [None]:
# preprocessing train data -----------------------------------------------------------------------
print("making train dataset...")

# load topic class labels
# Each line of classes.txt is one class name made of one or more words joined
# by ' & '. The line index is the integer class id used in the training files.
with open('../data/topic/classes.txt','r',encoding='utf-8') as f:
    labels = f.read().splitlines()


def _first_definition(word):
    """Return the definition of the first WordNet synset of `word`.

    Falls back to `word` itself when WordNet has no synset for it, instead
    of failing with an IndexError.
    """
    synsets = wordnet.synsets(word)
    return synsets[0].definition() if synsets else word


# One hypothesis sentence per class id:
# "this text is about <definition> or <definition> ...".
topic_class_hypothesis = {
    i: 'this text is about ' + ' or '.join(_first_definition(word) for word in label.split(' & '))
    for i, label in enumerate(labels)
}

# load train data
# The files are split into lines individually, so the last record of one file
# can never be fused with the first record of the next when a trailing
# newline is missing. Blank lines are skipped.
train_lines = []
for path in ('../data/topic/train_pu_half_v0.txt', '../data/topic/train_pu_half_v1.txt'):
    with open(path,'r',encoding='utf-8') as f:
        train_lines.extend(line for line in f.read().splitlines() if line.strip())

# ## example -------------------------------------
# Work on a reproducible random subsample. A dedicated, seeded RNG keeps the
# selection identical across runs; the size is capped at the number of
# available lines so small files do not raise.
N_TRAIN_SAMPLES = 10000
train_lines = random.Random(0).sample(train_lines, min(N_TRAIN_SAMPLES, len(train_lines)))
# ## ---------------------------------------------

# Also used by the test-feature cell further down.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
bert = TFAutoModel.from_pretrained(MODEL)


def _pair_feature(text, hypothesis):
    """Encode (text, hypothesis) as a sentence pair and return BERT's pooled output.

    The result is converted to a NumPy array, so no TensorFlow tensors are
    kept alive in the feature list.
    """
    tokenized = tokenizer(text, hypothesis, return_tensors="tf", truncation=True, padding="max_length")
    return bert(tokenized).pooler_output[0].numpy()


class_ids = list(range(len(labels)))
x_train, y_train = [], []
for label_text in tqdm(train_lines):
    # Each record is "<class id>\t<text>"; split once so tabs inside the
    # text do not break the unpacking.
    label, text = label_text.split('\t', 1)
    label = int(label)
    # Pick a different class uniformly at random for the negative example.
    label_rand = np.random.choice([c for c in class_ids if c != label])
    text = preprocessing(text,'')
    # Positive pair: the text with the hypothesis of its own class.
    x_train.append(_pair_feature(text, topic_class_hypothesis[label]))
    y_train.append(1)
    # Negative pair: the same text with the hypothesis of the random other class.
    x_train.append(_pair_feature(text, topic_class_hypothesis[int(label_rand)]))
    y_train.append(0)

np.save(X_TRAIN, np.asarray(x_train))
np.save(Y_TRAIN, np.asarray(y_train))
# Release the in-memory copies; the training cell reloads them from disk.
del x_train, y_train

In [None]:
# dbpedia class ------------------------------------------------------------------------------------------------------
# One hypothesis sentence per line of classes.txt, in file order.
with open('../data/dbpedia_csv/classes.txt','r',encoding='utf-8') as f:
    dbpedia_class = ['this text is about '+text for text in f.read().splitlines()]

# newline='' lets the csv module handle line breaks inside quoted fields itself.
with open('../data/dbpedia_csv/test.csv','r',encoding='utf-8',newline='') as f:
    reader = list(csv.reader(f))

# # example -------------------
# Evaluate on a reproducible random subsample. A dedicated, seeded RNG keeps
# the selection identical across runs; the size is capped at the number of
# available rows so small files do not raise.
N_TEST_SAMPLES = 1000
reader = random.Random(0).sample(reader, min(N_TEST_SAMPLES, len(reader)))
# #----------------------------

# Relies on `tokenizer` and `bert` created in the train-feature cell.
x_test, y_test = [], []
for cls_num, auth, readtext in tqdm(reader, total=len(reader)):
    # Clean the document once; the same text is paired with every class hypothesis.
    cleaned = preprocessing(readtext, auth)
    for db_class in dbpedia_class:
        tokenized = tokenizer(cleaned, db_class, return_tensors="tf", truncation=True, padding="max_length")
        # Store the pooled output as a NumPy array, one row per (document, class) pair.
        x_test.append(bert(tokenized).pooler_output[0].numpy())

    # One gold label per document (not per pair).
    y_test.append(int(cls_num))

np.save(X_TEST, np.asarray(x_test))
np.save(Y_TEST, np.asarray(y_test))
# Release the in-memory copies; the evaluation cell reloads them from disk.
del x_test, y_test

In [None]:
# Fit the classification head on the cached feature vectors.
x_train, y_train = np.load(X_TRAIN), np.load(Y_TRAIN)

# Logistic-regression style head: 768 input features -> one sigmoid unit.
inputs = tf.keras.layers.Input(shape=(768,))
outputs = tf.keras.layers.Dense(units=1, activation="sigmoid")(inputs)
model = tf.keras.models.Model(inputs=inputs, outputs=outputs)

model.compile(
    loss='binary_crossentropy',
    optimizer="Adam",
    metrics=['mae', 'mse', 'acc'],
)
model.fit(x=x_train, y=y_train, batch_size=BATCH_SIZE, epochs=EPOCH)

# Persist the trained head so the evaluation cell can reload it.
model.save(SAVED_MODEL)

In [None]:
# Evaluate: score every (document, class hypothesis) pair and predict, for
# each document, the class whose hypothesis received the highest score.
x_test = np.load(X_TEST)
y_test = np.load(Y_TEST)

model = tf.keras.models.load_model(SAVED_MODEL)

pred = model.predict(x_test)

# x_test stores the pairs of one document consecutively, so the scores must
# divide evenly into one row per document. Fail loudly otherwise instead of
# silently grouping scores from different documents together.
n_docs = len(y_test)
if n_docs == 0 or len(pred) % n_docs != 0:
    raise ValueError(
        f"cannot group {len(pred)} predictions into {n_docs} documents evenly"
    )
# Row i holds the per-class scores of document i; class numbers are 1-based.
y_pred = pred.reshape(n_docs, -1).argmax(axis=1) + 1

# Short display names, listed in class-number order (1..len(target_class)).
target_class = ["Com.","Edu.","Art.","Ath.","Off.","Mea.","Bui.","Nat.","Vil.","Ani.","Pla.","Alb.","Fil.","Wri."]
labels = list(range(1, len(target_class) + 1))
rep = metrics.classification_report(y_test,y_pred,labels=labels,target_names=target_class,digits=3)
print(rep)