## Data Import and Preprocessing

In [None]:
import numpy as np
import pandas as pd
import scipy

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by later cells.
# This cell depends on the google.colab package, i.e. it is meant to run inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the dataset into a DataFrame. 'YOUR FILE NAME HERE' is a placeholder: replace it with
# the path to the CSV file before running.
# Later cells read the 'opinion' (free text) and 'DVT부위' (label) columns from this frame.
dvt = pd.read_csv('YOUR FILE NAME HERE')
# Preview the first rows to confirm the file was parsed as expected.
dvt.head()

## Preprocessing

In [None]:
import nltk
# Fetch the NLTK corpora needed by the preprocessing cell: the English stop-word list and the
# WordNet data behind WordNetLemmatizer.
# NOTE(review): 'omw-1.4' is presumably required by WordNet in recent NLTK releases — confirm.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re
# Shared NLTK helpers, kept at module level for any cell that wants stemming or lemmatization.
# preprocess() below returns the un-stemmed tokens and does not use them.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Patterns are compiled once here rather than on every call.
_KOREAN_RE = re.compile('[\u3131-\u3163\uac00-\ud7a3]+')
# The previous pattern '=#$-+<.*?>' contained an unescaped '$' anchor followed by more
# characters, so it could never match and no tag was ever removed. A tag must start with a
# letter right after '<' or '</', which leaves comparisons such as "< 3 cm" untouched.
_HTML_TAG_RE = re.compile(r'</?[a-z][^<>]*>')
_URL_RE = re.compile(r'http\S+')
_DIGITS_RE = re.compile('[0-9]+')
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess(sentence):
    """Normalize one free-text entry into a space-separated string of English tokens.

    Steps, in order: drop Hangul characters, lowercase, remove the literal '{html}'
    marker, HTML tags, URLs and digits, tokenize on runs of word characters, then
    discard tokens of two characters or fewer and English stop words.

    Args:
        sentence: Raw text. Non-string values are converted with str(). None and
            float NaN yield an empty string instead of the tokens "none" / "nan".

    Returns:
        str: The surviving tokens joined by single spaces. Tokens are returned
        without stemming or lemmatization (the stemmed/lemmatized lists computed
        previously were never returned, so that dead work has been removed).
    """
    # Missing values: NaN is the only float that is not equal to itself.
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return ""

    text = _KOREAN_RE.sub('', str(sentence)).lower()
    text = text.replace('{html}', "")
    text = _HTML_TAG_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _DIGITS_RE.sub('', text)

    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    # Set membership replaces re-reading the stop-word list for every single token.
    stop_words = _english_stopwords()
    return " ".join(w for w in tokens if len(w) > 2 and w not in stop_words)

# Add a cleaned-text column by running every raw 'opinion' entry through preprocess().
dvt['sentence'] = dvt['opinion'].map(preprocess)

In [None]:
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Vocabulary cap passed to the Tokenizer; the same value is reused as the Embedding input
# dimension by the models defined later.
vocab_size = 1000
tokenizer = Tokenizer(num_words = vocab_size)
# Build the word index from the cleaned sentences.
# NOTE(review): the index is fitted on all rows, including those later held out as test data.
tokenizer.fit_on_texts(dvt['sentence'])
# Encode each sentence as a list of integer word indices.
sequences = tokenizer.texts_to_sequences(dvt['sentence'])

In [None]:
# 80/20 split sizes: the training share is rounded down and the test set takes the remainder.
total_samples = len(sequences)
n_of_train = int(total_samples * 0.8)
n_of_test = total_samples - n_of_train
print('Number of Training data :', n_of_train)
print('Number of Test data:', n_of_test)

In [None]:
# Inspect the distribution of sequence lengths to help choose a padding length.
X_data = sequences
sample_lengths = [len(seq) for seq in X_data]
print('Max Length : %d' % max(sample_lengths))
print('Mean Length : %f' % (sum(sample_lengths)/len(X_data)))
plt.hist(sample_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
# Pad (or truncate) every sequence to max_len so the samples stack into one 2-D array.
# NOTE(review): 163 is hard-coded — presumably the "Max Length" printed by the previous cell;
# re-check it whenever the dataset or the preprocessing changes.
max_len = 163
data = pad_sequences(X_data, maxlen = max_len)
# `data` is the full padded dataset (train + test), so it must not be labelled "Test data".
print("Padded data(shape): ", data.shape)

In [None]:
# One-hot encode the class label. The dtype is requested explicitly because recent pandas
# releases return boolean dummy columns by default, while the softmax / categorical
# cross-entropy models below are trained against numeric targets.
y_data = pd.get_dummies(dvt['DVT부위'], dtype='float32')

# Sequential (unshuffled) split: the first n_of_train rows train, the remainder is held out.
X_train = data[:n_of_train]
y_train = y_data.iloc[:n_of_train].to_numpy()
X_test = data[n_of_train:]
y_test = y_data.iloc[n_of_train:].to_numpy()

print("Training data(shape): ", X_train.shape)
print("Test data(shape): ", X_test.shape)
print("Training label(shape): ", y_train.shape)
print("Test label(shape): ", y_test.shape)

## 1D CNN

In [None]:
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Embedding, Dropout, MaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Size the output layer from the one-hot labels instead of hard-coding the class count, so the
# model keeps matching y_train if the set of label values changes.
num_classes = y_train.shape[1]

# Single-kernel 1D CNN: embedding -> width-5 convolution -> global max pooling -> dense head.
model = Sequential()
model.add(Embedding(vocab_size, 32))
model.add(Dropout(0.2))
model.add(Conv1D(32, 5, strides=1, padding='valid', activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))
model.summary()
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [None]:
# Stop training once validation loss has gone 3 epochs without improving.
es = EarlyStopping(monitor='val_loss', mode='min', patience=3, verbose=1)
# Save the model only when validation accuracy reaches a new maximum.
mc = ModelCheckpoint('best_model.keras', monitor='val_acc', mode='max',
                     save_best_only=True, verbose=1)
# 20% of the training rows are set aside for validation by Keras.
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    batch_size=64,
    epochs=50,
    callbacks=[es, mc],
)

In [None]:
import tensorflow as tf

# Reload the model file written by the ModelCheckpoint callback in the training cell
# (the epoch with the highest validation accuracy).
new_model = tf.keras.models.load_model('best_model.keras')

In [None]:
# evaluate() returns the loss followed by the compiled metrics; index 1 is the accuracy.
eval_results = new_model.evaluate(X_test, y_test)
print("\n 테스트 정확도: %.4f" % (eval_results[1]))

In [None]:
from sklearn.metrics import f1_score

# Convert one-hot labels and softmax outputs to class indices, then compute micro-averaged F1.
y_pred = new_model.predict(X_test)
y_test_classes = np.argmax(y_test, axis=1)
y_pred_classes = np.argmax(y_pred, axis=1)
model_accuracy = f1_score(y_test_classes, y_pred_classes, average='micro')
print(f"f1 score : {model_accuracy}")

## Multi Kernel 1D CNN

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense, Input, Flatten, Concatenate

In [None]:
# Kernel widths: the model cell below builds one parallel Conv1D branch per entry.
filter_sizes = [2,3,5,7]
# Number of filters in each Conv1D branch.
num_filters = 512
# Dropout rate applied to the concatenated branch outputs.
drop = 0.5

In [None]:
# Size the output layer from the one-hot labels instead of hard-coding the class count.
num_classes = y_train.shape[1]

model_input = Input(shape = (max_len,))
z = Embedding(vocab_size, 32)(model_input)

# One convolution branch per kernel width. GlobalMaxPooling1D already yields a 2-D
# (batch, filters) tensor, so the former Flatten layer was a no-op and has been dropped.
conv_blocks = []
for sz in filter_sizes:
    conv = Conv1D(filters = num_filters,
                  kernel_size = sz,
                  padding = "valid",
                  activation = "relu",
                  strides = 1)(z)
    conv_blocks.append(GlobalMaxPooling1D()(conv))

# Concatenate needs at least two inputs; a single branch is passed through unchanged.
z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
z = Dropout(drop)(z)
model_output = Dense(num_classes, activation='softmax')(z)

model = Model(model_input, model_output)

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

model.summary()

In [None]:
# Stop training once validation loss has gone 3 epochs without improving.
es = EarlyStopping(monitor='val_loss', mode='min', patience=3, verbose=1)
# Save the model only when validation accuracy reaches a new maximum.
mc = ModelCheckpoint('best_model_multi.keras', monitor='val_acc', mode='max',
                     save_best_only=True, verbose=1)
# 20% of the training rows are set aside for validation by Keras.
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    batch_size=64,
    epochs=50,
    callbacks=[es, mc],
)

In [None]:
# Draw one train-vs-validation figure per tracked metric from the most recent fit() run.
curve_specs = (
    ('acc', 'val_acc', 'model accuracy', 'accuracy', 'lower right'),
    ('loss', 'val_loss', 'model loss', 'loss', 'upper right'),
)
for train_key, val_key, title, y_label, legend_loc in curve_specs:
    train_curve = history.history[train_key]
    epochs = range(1, len(train_curve) + 1)
    plt.plot(epochs, train_curve)
    plt.plot(epochs, history.history[val_key])
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel('epochs')
    plt.legend(['train', 'validation'], loc=legend_loc)
    plt.show()

In [None]:
# Reload the checkpointed multi-kernel model and score it on the held-out rows.
new_model = tf.keras.models.load_model('best_model_multi.keras')

# Convert one-hot labels and softmax outputs to class indices, then compute micro-averaged F1.
y_pred = new_model.predict(X_test)
y_test_classes = np.argmax(y_test, axis=1)
y_pred_classes = np.argmax(y_pred, axis=1)
model_accuracy = f1_score(y_test_classes, y_pred_classes, average='micro')
print(f"f1 score : {model_accuracy}")

## Multi Kernel 1D CNN with GLOVE

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

In [None]:
import os

# Parse the GloVe text file into {word: vector}. Each line is a word followed by its
# space-separated vector components.
embedding_dict = dict()
# The context manager closes the file even if a line fails to parse.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        word_vector = line.split()
        if len(word_vector) < 2:
            continue  # skip blank or vector-less lines instead of raising IndexError
        word = word_vector[0]
        # Convert the remaining fields into a float32 array (100 values per word in this file).
        embedding_dict[word] = np.asarray(word_vector[1:], dtype='float32')

print('%s개의 Embedding vector가 있습니다.' % len(embedding_dict))

In [None]:
# Word -> integer index mapping built by the Tokenizer that was fitted on dvt['sentence'].
word_index = tokenizer.word_index

In [None]:
# Keras Tokenizer indices start at 1 (0 is reserved for padding), so the largest index equals
# len(word_index). The embedding table needs one extra row to make that index addressable;
# without it, assigning the vector of the last word raises IndexError.
vocab_size_glove = len(word_index) + 1

In [None]:
# Vector size of the GloVe file loaded above (glove.6B.100d.txt).
embedding_dim = 100
# One row per vocabulary index, initialised to zero; rows stay all-zero for words that have
# no GloVe vector.
embedding_matrix = np.zeros((vocab_size_glove, embedding_dim))

In [None]:
# Sanity check: (number of vocabulary rows, embedding dimension).
embedding_matrix.shape

In [None]:
# Copy the pretrained vector of every vocabulary word that GloVe knows into its row.
# NOTE: Tokenizer indices run from 1 to len(word_index), so embedding_matrix must have at
# least len(word_index) + 1 rows for every assignment below to be in bounds.
for word, i in word_index.items():
    if word in embedding_dict:
        embedding_matrix[i] = embedding_dict[word]

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense, Input, Flatten, Concatenate

In [None]:
# Kernel widths: the model cell below builds one parallel Conv1D branch per entry.
filter_sizes = [2,3,5]
# Number of filters in each Conv1D branch.
num_filters = 512
# Dropout rate applied to the concatenated branch outputs.
drop = 0.5

In [None]:
# Size the output layer from the one-hot labels instead of hard-coding the class count.
num_classes = y_train.shape[1]

model_input = Input(shape = (max_len,))
# Frozen embedding initialised from the GloVe matrix. The deprecated `input_length` argument
# has been removed: the sequence length is already fixed by the Input layer above.
z = Embedding(vocab_size_glove, embedding_dim, weights=[embedding_matrix],
              trainable=False)(model_input)

# One convolution branch per kernel width. GlobalMaxPooling1D already yields a 2-D
# (batch, filters) tensor, so the former Flatten layer was a no-op and has been dropped.
conv_blocks = []
for sz in filter_sizes:
    conv = Conv1D(filters = num_filters,
                  kernel_size = sz,
                  padding = "valid",
                  activation = "relu",
                  strides = 1)(z)
    conv_blocks.append(GlobalMaxPooling1D()(conv))

# Concatenate needs at least two inputs; a single branch is passed through unchanged.
z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
z = Dropout(drop)(z)
model_output = Dense(num_classes, activation='softmax')(z)

model = Model(model_input, model_output)

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

model.summary()

In [None]:
# Stop training once validation loss has gone 3 epochs without improving.
es = EarlyStopping(monitor='val_loss', mode='min', patience=3, verbose=1)
# Save the model only when validation accuracy reaches a new maximum.
mc = ModelCheckpoint('best_model_multi_glove.keras', monitor='val_acc', mode='max',
                     save_best_only=True, verbose=1)
# 20% of the training rows are set aside for validation by Keras.
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    batch_size=64,
    epochs=50,
    callbacks=[es, mc],
)

In [None]:
# Draw one train-vs-validation figure per tracked metric from the most recent fit() run.
curve_specs = (
    ('acc', 'val_acc', 'model accuracy', 'accuracy', 'lower right'),
    ('loss', 'val_loss', 'model loss', 'loss', 'upper right'),
)
for train_key, val_key, title, y_label, legend_loc in curve_specs:
    train_curve = history.history[train_key]
    epochs = range(1, len(train_curve) + 1)
    plt.plot(epochs, train_curve)
    plt.plot(epochs, history.history[val_key])
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel('epochs')
    plt.legend(['train', 'validation'], loc=legend_loc)
    plt.show()

In [None]:
# Reload the checkpointed GloVe model and score it on the held-out rows.
new_model = tf.keras.models.load_model('best_model_multi_glove.keras')

# Convert one-hot labels and softmax outputs to class indices, then compute micro-averaged F1.
y_pred = new_model.predict(X_test)
y_test_classes = np.argmax(y_test, axis=1)
y_pred_classes = np.argmax(y_pred, axis=1)
model_accuracy = f1_score(y_test_classes, y_pred_classes, average='micro')
print(f"f1 score : {model_accuracy}")

## LSTM

In [None]:
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Size the output layer from the one-hot labels instead of hard-coding the class count.
num_classes = y_train.shape[1]

# Embedding -> single LSTM layer -> softmax classifier.
model = Sequential()
# The deprecated `input_length` argument has been removed; the sequence length is taken from
# the padded training data when the model is first fitted.
model.add(Embedding(vocab_size, 100))
# To try a bidirectional variant, replace the next layer with Bidirectional(LSTM(128)).
model.add(LSTM(128))
model.add(Dense(num_classes, activation='softmax'))

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

# Stop training once validation loss has gone 4 epochs without improving.
es = EarlyStopping(monitor='val_loss', mode='min', patience=4, verbose=1)
# Save the model only when validation accuracy reaches a new maximum.
mc = ModelCheckpoint('best_model_lstm.keras', monitor='val_acc', mode='max',
                     save_best_only=True, verbose=1)

# 20% of the training rows are set aside for validation by Keras.
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    batch_size=60,
    epochs=50,
    callbacks=[es, mc],
)

In [None]:
# Draw one train-vs-validation figure per tracked metric from the most recent fit() run.
curve_specs = (
    ('acc', 'val_acc', 'model accuracy', 'accuracy', 'lower right'),
    ('loss', 'val_loss', 'model loss', 'loss', 'upper right'),
)
for train_key, val_key, title, y_label, legend_loc in curve_specs:
    train_curve = history.history[train_key]
    epochs = range(1, len(train_curve) + 1)
    plt.plot(epochs, train_curve)
    plt.plot(epochs, history.history[val_key])
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel('epochs')
    plt.legend(['train', 'validation'], loc=legend_loc)
    plt.show()

In [None]:
# Reload the checkpointed LSTM model and score it on the held-out rows.
new_model = tf.keras.models.load_model('best_model_lstm.keras')

# Convert one-hot labels and softmax outputs to class indices, then compute micro-averaged F1.
y_pred = new_model.predict(X_test)
y_test_classes = np.argmax(y_test, axis=1)
y_pred_classes = np.argmax(y_pred, axis=1)
model_accuracy = f1_score(y_test_classes, y_pred_classes, average='micro')
print(f"f1 score : {model_accuracy}")