In [1]:
from sklearn import metrics
from tensorflow import keras
import gensim.downloader
from tqdm import tqdm
import numpy as np
import string
import csv 
import re

In [2]:
np.random.seed(0)

import datetime
t_delta = datetime.timedelta(hours=9)
JST = datetime.timezone(t_delta, 'JST')
now = datetime.datetime.now(JST)

In [3]:
# parameters
SAVED_MODEL = "../model/Proposed-Word2Vec-TextCNN_"+str(now.strftime('%Y%m%d%H%M%S'))
THRESHOLD = 0.05
MAXLEN_GET_PSEUDO = 3000
EPOCH = 5
BATCH_SIZE = 64
MAX_LEN = 128
FILTERS = 100
FILTER_SIZES = [3,4,5]

In [4]:
print(SAVED_MODEL)

../model/Proposed-Word2Vec-TextCNN_20221127170635


In [5]:
# 前処理
def preprocessing(text):
    # 括弧内文章の削除
    text = re.sub(r'\(.*\)',' ',text)
    text = re.sub(r'\[.*\]',' ',text)
    text = re.sub(r'\<.*\>',' ',text)
    text = re.sub(r'\{.*\}',' ',text)
    # 記号文字の削除
    text = text.translate(str.maketrans('','',string.punctuation))
    # スペースの調整
    text = re.sub(r'\s+',' ',text)
    return text

In [6]:
word2vec = gensim.downloader.load('word2vec-google-news-300')

In [7]:
# dbpedia datasets train
with open('../data/dbpedia_csv/train.csv','r',encoding='utf-8') as f:
    reader = [r for r in csv.reader(f)]
    
# example -------------------
import random
reader = random.sample(reader, 10000)
#----------------------------

li = list()
dbpedia_train_datasets = list()
for l, auth, text in tqdm(reader):
    li.append(int(l)-1)
    text = text.replace(auth,'')
    dbpedia_train_datasets.append(preprocessing(text))

100%|██████████| 10000/10000 [00:00<00:00, 67517.21it/s]


In [8]:
# dbpedia classes
with open("../data/dbpedia_csv/classes.txt", "r", encoding="utf-8") as f:
  classes = f.read().splitlines()

In [9]:
# load test data
# dbpedia datasets train
with open('../data/dbpedia_csv/test.csv','r',encoding='utf-8') as f:
    reader = [r for r in csv.reader(f)]
    
# example -------------------
import random
reader = random.sample(reader, 1000)
#----------------------------

test_texts = list()
test_labels = list()
for labels, auth, text in tqdm(reader):
    text = text.replace(auth,'')
    test_texts.append(preprocessing(text))
    test_labels.append(int(labels)-1)

100%|██████████| 1000/1000 [00:00<00:00, 68943.31it/s]


In [27]:
# tokenizer = keras.preprocessing.text.Tokenizer()
# tokenizer.fit_on_texts(dbpedia_train_datasets)
# sequences = tokenizer.texts_to_sequences(dbpedia_train_datasets)
# x_train = keras.preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_LEN, padding='post')
# y_train = np.array(keras.utils.to_categorical(li))

# embedding_matrix = np.zeros((max(tokenizer.word_index.values())+1, word2vec.vector_size))
# for word, i in tokenizer.word_index.items():
#     if word in word2vec:
#         embedding_matrix[i] = word2vec[word]

In [11]:
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(dbpedia_train_datasets)
tokenizer.fit_on_texts(test_texts)
embedding_matrix = np.zeros((max(tokenizer.word_index.values())+1, word2vec.vector_size))
for word, i in tokenizer.word_index.items():
    if word in word2vec:
        embedding_matrix[i] = word2vec[word]

sequences = tokenizer.texts_to_sequences(dbpedia_train_datasets)
x_train = keras.preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_LEN, padding='post')
y_train = np.array(keras.utils.to_categorical(li))

sequences = tokenizer.texts_to_sequences(test_texts)
x_test = keras.preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_LEN, padding='post')
y_test = np.array(keras.utils.to_categorical(test_labels))

In [28]:
inputs = keras.layers.Input(shape=(MAX_LEN),dtype='float32')
embedding = keras.layers.Embedding(input_dim=len(embedding_matrix),output_dim=len(embedding_matrix[0]),weights=[embedding_matrix],trainable=False, mask_zero=True,name='embedding')(inputs)
conv1 = keras.layers.Conv1D(filters=FILTERS, kernel_size=FILTER_SIZES[0], kernel_initializer='normal', activation='relu')(embedding)
conv2 = keras.layers.Conv1D(filters=FILTERS, kernel_size=FILTER_SIZES[1], kernel_initializer='normal', activation='relu')(embedding)
conv3 = keras.layers.Conv1D(filters=FILTERS, kernel_size=FILTER_SIZES[2], kernel_initializer='normal', activation='relu')(embedding)
pool1 = keras.layers.MaxPooling1D(pool_size=int(conv1.shape[1]),strides=1)(conv1)
pool2 = keras.layers.MaxPooling1D(pool_size=int(conv2.shape[1]),strides=1)(conv2)
pool3 = keras.layers.MaxPooling1D(pool_size=int(conv3.shape[1]),strides=1)(conv3)
x = keras.layers.concatenate([pool1, pool2, pool3])
x = keras.layers.Flatten()(x)
x = keras.layers.Dropout(0.5)(x)
x = keras.layers.Dense(100,activation='relu')(x)
output = keras.layers.Dense(units=14, activation='softmax')(x)
model = keras.models.Model(inputs, output)
opt = keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=True)
model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])

In [30]:
# モデルの学習
result = model.fit(x=x_train,
                   y=y_train,
                   epochs=EPOCH,
                   batch_size=BATCH_SIZE,
                   validation_data=(x_test,y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [14]:
model.save(SAVED_MODEL)



INFO:tensorflow:Assets written to: ../model/Proposed-Word2Vec-TextCNN_20221127170635\assets


INFO:tensorflow:Assets written to: ../model/Proposed-Word2Vec-TextCNN_20221127170635\assets


# Test

In [15]:
model = keras.models.load_model(SAVED_MODEL)

In [23]:
# tokenizer = keras.preprocessing.text.Tokenizer()
# tokenizer.fit_on_texts(test_texts)
# sequences = tokenizer.texts_to_sequences(test_texts)
# x_test = keras.preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_LEN, padding='post')
# y_test = np.array(keras.utils.to_categorical(test_labels))

# embedding_matrix = np.zeros((max(tokenizer.word_index.values())+1, word2vec.vector_size))
# for word, i in tokenizer.word_index.items():
#     if word in word2vec:
#         embedding_matrix[i] = word2vec[word]

In [24]:
# model.embedding = keras.layers.Embedding(input_dim=len(embedding_matrix),output_dim=len(embedding_matrix[0]),weights=[embedding_matrix],trainable=False, mask_zero=True,name='embedding')

In [25]:
pred = model.predict(x_test)



In [26]:
from sklearn import metrics

y_test = [np.argmax(i) for i in y_test]
y_pred = [np.argmax(i) for i in pred]

target_names = [c[:3]+"." for c in classes]

rep = metrics.classification_report(y_test, y_pred, target_names=target_names, digits=3)
print(rep)

              precision    recall  f1-score   support

        Com.      0.127     0.206     0.157        68
        Edu.      0.065     0.074     0.069        81
        Art.      0.089     0.067     0.076        75
        Ath.      0.297     0.141     0.191        78
        Off.      0.148     0.104     0.122        77
        Mea.      0.065     0.042     0.051        72
        Bui.      0.070     0.130     0.091        69
        Nat.      0.215     0.293     0.248        58
        Vil.      0.250     0.054     0.089        74
        Ani.      0.033     0.045     0.038        66
        Pla.      0.056     0.015     0.023        68
        Alb.      0.878     0.890     0.884        73
        Fil.      0.722     0.675     0.698        77
        Wri.      0.063     0.125     0.084        64

    accuracy                          0.206      1000
   macro avg      0.220     0.204     0.202      1000
weighted avg      0.225     0.206     0.205      1000

