In [1]:
import csv
import re
import string
import json
import numpy as np
from keras_bert import Tokenizer
from tqdm import tqdm
from nltk.corpus import wordnet

np.random.seed(0)
config = json.load(open('config.json','r'))
SEQ_LEN = config['SEQ_LEN']

with open('../uncased_L-12_H-768_A-12/vocab.txt','r',encoding='utf8') as f:
    token_dict = {token:i for i,token in enumerate(f.read().splitlines())}
tokenizer = Tokenizer(token_dict)

# 前処理
def preprocessing(text,auth):
    # 括弧内文章の削除
    text = re.sub(r'\(.*?\)','',text)
    # 記号文字の削除
    text = text.translate(str.maketrans('','',string.punctuation))
    # 著者名の削除
    text = text.replace(auth,'')
    # スペースの調整
    text = re.sub(r'\s+',' ',text)
    return text

In [2]:
# preprocessing train data -----------------------------------------------------------------------
# load topic class labels
print("make train dataset...")
with open('../data/topic/classes.txt','r',encoding='utf-8') as f:
    labels = f.read().splitlines()
topic_class_hypothesis = dict()
for i,label in enumerate(labels):
    topic_class_hypothesis[i] = 'this text is about ' + ' or '.join([wordnet.synsets(word)[0].definition() for word in label.split(' & ')])

# load train data
with open('../data/topic/train_pu_half_v0.txt','r',encoding='utf-8') as f:
    texts_v0 = f.read()
with open('../data/topic/train_pu_half_v1.txt','r',encoding='utf-8') as f:
    texts_v1 = f.read()
texts = texts_v0 + texts_v1

# # ---------------------------------------------
# import random
# texts = texts.splitlines()
# texts = random.sample(texts,10000)
# texts = "\n".join(texts)
# # ---------------------------------------------

y_train = []
indeces, segments = [],[]
for label_text in tqdm(texts.splitlines()):
    label,text = label_text.split('\t')
    rand_base = [0,1,2,3,4,5,6,7,8,9]
    rand_base.remove(int(label))
    label_rand = np.random.choice(rand_base)
    text = preprocessing(text,'')
    ids, segs = tokenizer.encode(first=text, second=topic_class_hypothesis[int(label)], max_len=SEQ_LEN)
    indeces.append(ids)
    segments.append(segs)
    y_train.append(1)
    ids, segs = tokenizer.encode(first=text, second=topic_class_hypothesis[int(label_rand)], max_len=SEQ_LEN)
    indeces.append(ids)
    segments.append(segs)
    y_train.append(0)
x_train = [np.array(indeces),np.array(segments)]

np.save('../dataset/BERT_x_train.npy', x_train)
np.save('../dataset/BERT_y_train.npy', y_train)
# np.save('../dataset/BERT_x_train_sample.npy', x_train)
# np.save('../dataset/BERT_y_train_sample.npy', y_train)
x_train,y_train=0,0

make train dataset...


100%|██████████| 1300000/1300000 [38:39<00:00, 560.42it/s] 


In [3]:
# dbpedia class ------------------------------------------------------------------------------------------------------
with open('../data/dbpedia_csv/classes.txt','r',encoding='utf-8') as f:
    dbpedia_class = ['this text is about '+text for text in f.read().splitlines()]

with open('../data/dbpedia_csv/test.csv','r',encoding='utf-8') as f:
    reader = [r for r in csv.reader(f)]
    
    # #----------------------------
    # import random
    # reader = random.sample(reader,1000)
    # #----------------------------

    y_test = []
    indeces, segments = [],[]
    for cls_num,auth,readtext in tqdm(reader,total=len(reader)):
        for db_class in dbpedia_class:
            ids, segs = tokenizer.encode(first=preprocessing(readtext,auth), second=db_class, max_len=SEQ_LEN)
            indeces.append(ids)
            segments.append(segs)
        y_test.append(int(cls_num))
    x_test = [np.array(indeces),np.array(segments)]

np.save('../dataset/BERT_x_test.npy', x_test)
np.save('../dataset/BERT_y_test.npy', y_test)
# np.save('../dataset/BERT_x_test_sample.npy', x_test)
# np.save('../dataset/BERT_y_test_sample.npy', y_test)

100%|██████████| 70000/70000 [06:12<00:00, 188.07it/s]


In [4]:
import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import json
from keras_bert import load_trained_model_from_checkpoint
from tensorflow.keras.layers import Dense,Input
from tensorflow.keras.models import Model
from keras_bert import AdamWarmup, calc_train_steps

# パラメータの読み込み
config = json.load(open('config.json'))
BATCH_SIZE = config['BATCH_SIZE']
EPOCHS = config['EPOCHS']
SEQ_LEN = config['SEQ_LEN']
LR = config['LR']

# 学習データ読み込み
print("load data...")
x_train = np.load('../dataset/BERT_x_train.npy').tolist()
y_train = np.load('../dataset/BERT_y_train.npy')
# x_train = np.load('../dataset/BERT_x_train_sample.npy').tolist()
# y_train = np.load('../dataset/BERT_y_train_sample.npy')
x_train =[np.array(x_train[0]),np.array(x_train[1])]

# BERTの読み込み
pretrained_path = '../uncased_L-12_H-768_A-12'
config_path = os.path.join(pretrained_path, 'bert_config.json')
checkpoint_path = os.path.join(pretrained_path, 'bert_model.ckpt')
vocab_path = os.path.join(pretrained_path, 'vocab.txt')

print("load pretrained model...")
bert = load_trained_model_from_checkpoint(config_path,checkpoint_path,training=True,seq_len=SEQ_LEN)

decay_steps, warmup_steps = calc_train_steps(x_train[0].shape[0],batch_size=BATCH_SIZE,epochs=EPOCHS)

# bert_nsp_dense = bert.get_layer('NSP-Dense').output
# bert_model = Model(bert.input[:2],bert_nsp_dense)
# print("bert predict...")
# bert_cls = bert_model.predict(x_train)

# inputs = Input(shape=(768,))
# output = Dense(units=1, activation='sigmoid')(inputs)
# model = Model(inputs,output)

inputs = bert.input[:2]
bert_nsp_dense = bert.get_layer('NSP-Dense').output
outputs = Dense(units=1, activation='sigmoid')(bert_nsp_dense)
model = Model(inputs,outputs)
model.compile(optimizer=AdamWarmup(decay_steps=decay_steps,warmup_steps=warmup_steps,learning_rate=LR),loss='binary_crossentropy',metrics=['mae','mse','acc'])
print("training...")
result = model.fit(x_train,y_train,epochs=EPOCHS,batch_size=BATCH_SIZE)

print("save...")
model.save('../BERT_matching_model.h5')
# model.save('../BERT_matching_model_sample.h5')

load data...
load pretrained model...
training...
save...




In [6]:
import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import json
from keras_bert import get_custom_objects
from keras.models import load_model
from sklearn import metrics
from keras_bert import load_trained_model_from_checkpoint
from tensorflow.keras.models import Model

# パラメータ読み込み
config = json.load(open('config.json'))
BATCH_SIZE = config['BATCH_SIZE']
EPOCHS = config['EPOCHS']
LR = config['LR']
SEQ_LEN = config['SEQ_LEN']

# テストデータ読み込み
x_test = np.load('../dataset/BERT_x_test.npy').tolist()
y_test = np.load('../dataset/BERT_y_test.npy')
# x_test = np.load('../dataset/BERT_x_test_sample.npy').tolist()
# y_test = np.load('../dataset/BERT_y_test_sample.npy')
x_test =[np.array(x_test[0]),np.array(x_test[1])]

# BERTの読み込み
pretrained_path = '../uncased_L-12_H-768_A-12'
config_path = os.path.join(pretrained_path, 'bert_config.json')
checkpoint_path = os.path.join(pretrained_path, 'bert_model.ckpt')
vocab_path = os.path.join(pretrained_path, 'vocab.txt')

print("load model...")

# bert = load_trained_model_from_checkpoint(config_path,checkpoint_path,training=True,seq_len=SEQ_LEN)
# bert_nsp_dense = bert.get_layer('NSP-Dense').output
# bert_model = Model(bert.input[:2],bert_nsp_dense)
# print("bert predict...")
# bert_cls = bert_model.predict(x_test)

print("predict...")
model = load_model('../BERT_matching_model.h5',custom_objects=get_custom_objects())
pred = model.predict(x_test)

split_pred = np.array_split(pred,len(y_test))
y_pred = [np.argmax(p)+1 for p in split_pred]

labels = [1,2,3,4,5,6,7,8,9,10,11,12,13,14]
target_class = ["Com.","Edu.","Art.","Ath.","Off.","Mea.","Bui.","Nat.","Vil.","Ani.","Pla.","Alb.","Fil.","Wri."]
rep = metrics.classification_report(y_test,y_pred,labels=labels,target_names=target_class,digits=3)
print(rep)
with open('result.txt','w') as f:
    f.write(rep)

load model...
predict...
              precision    recall  f1-score   support

        Com.      0.694     0.428     0.530      5000
        Edu.      0.377     0.979     0.544      5000
        Art.      0.341     0.114     0.171      5000
        Ath.      0.820     0.995     0.899      5000
        Off.      0.555     0.847     0.670      5000
        Mea.      0.310     0.019     0.035      5000
        Bui.      0.390     0.210     0.273      5000
        Nat.      0.150     0.020     0.035      5000
        Vil.      0.899     0.442     0.592      5000
        Ani.      0.337     0.542     0.416      5000
        Pla.      0.188     0.321     0.237      5000
        Alb.      0.511     0.753     0.609      5000
        Fil.      0.399     0.514     0.449      5000
        Wri.      0.566     0.232     0.329      5000

    accuracy                          0.458     70000
   macro avg      0.467     0.458     0.413     70000
weighted avg      0.467     0.458     0.413     70000



PermissionError: [Errno 13] Permission denied: 'result.txt'