In [3]:
import pandas as pd
from pyvi import ViTokenizer, ViPosTagger
import nltk
import numpy as np
import warnings
import time
import re
warnings.filterwarnings('ignore')

# 1. Import data

In [4]:
# Excel workbooks holding the train / validation / test splits.
train_filename, valid_filename, test_filename = (
    "train_nor_811.xlsx",
    "valid_nor_811.xlsx",
    "test_nor_811.xlsx",
)

# Read every split with the same engine so the three frames are parsed alike.
train_data, valid_data, test_data = (
    pd.read_excel(path, engine="openpyxl")
    for path in (train_filename, valid_filename, test_filename)
)

In [5]:
from sklearn.preprocessing import LabelEncoder
def file_processing(data):
    """Drop the spreadsheet index column and add an integer-encoded emotion column.

    The frame is modified in place and also returned, so both
    ``file_processing(df)`` and ``df = file_processing(df)`` work.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an ``Emotion`` column. An ``"Unnamed: 0"`` column is
        removed when present.

    Returns
    -------
    pandas.DataFrame
        The same object, with a new ``emotion_encode`` column.
    """
    # errors="ignore" keeps the cell idempotent: re-running it after the
    # column has already been dropped no longer raises KeyError.
    data.drop(columns=["Unnamed: 0"], inplace=True, errors="ignore")

    # NOTE(review): a fresh LabelEncoder is fitted on every call, so the
    # integer codes only agree across splits when each split contains the
    # same set of Emotion labels -- confirm, or fit once on the train split.
    data["emotion_encode"] = LabelEncoder().fit_transform(data["Emotion"])
    return data

In [6]:
# Run the same column clean-up / label encoding on each split.
train_data, valid_data, test_data = (
    file_processing(split) for split in (train_data, valid_data, test_data)
)

# 2. Data preprocessing

In [7]:
def remove_duplicate(word):
    """Collapse every run of identical consecutive characters to one character.

    Example: ``"heeelloo"`` becomes ``"helo"``.
    """
    kept = []
    for ch in word:
        # Keep a character only when it differs from the one kept just before.
        if not kept or kept[-1] != ch:
            kept.append(ch)
    return "".join(kept)

In [8]:
def deEmojify(string):
    """Return ``string`` with emoji and related symbol code points removed.

    Every run of characters that falls inside the ranges below is deleted.
    Several ranges are deliberately kept as broad as in the original pattern
    (for example U+24C2..U+1F251 and U+10000..U+10FFFF), so CJK text and all
    supplementary-plane characters are stripped as well.
    """
    char_class_parts = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        "\U00002500-\U00002BEF",  # box drawing .. misc symbols and arrows
        "\U00002702-\U000027B0",  # dingbats
        "\U00002702-\U000027B0",  # (repeated in the original pattern)
        "\U000024C2-\U0001F251",  # very broad: also covers CJK blocks
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",  # every supplementary-plane code point
        "\u2640-\u2642",          # gender signs
        "\u2600-\u2B55",
        "\u200d",                 # zero width joiner
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",                 # variation selector-16
        "\u3030",
    )
    emoji_pattern = re.compile("[" + "".join(char_class_parts) + "]+", flags=re.UNICODE)
    return emoji_pattern.sub('', string)

def normalize_sentences(sentences):
    """Normalise a list of (already word-segmented) sentences.

    For each sentence: emoji are removed, the text is tokenised with
    ``nltk.word_tokenize``, every token is lower-cased and has repeated
    consecutive characters collapsed, punctuation and confusing words are
    dropped, known abbreviations are expanded, and purely numeric tokens are
    replaced by ``<NUM>``. The surviving tokens are re-joined with single
    spaces.

    Parameters
    ----------
    sentences : iterable of str

    Returns
    -------
    list of str
        One normalised sentence per input sentence (possibly empty strings).
    """
    punc_lst = {'.', ',', '...', '-', '“', '”', ':', '(', ')', '"', '!', '&', ';', '?', '*', ']', '>', '…', '’',"``","''", "=", "%", "^", "@", "<", ">"}
    confusing_words = {"per"}
    acronym_word = {
        "ko" : "không",
        "k" : "không",
        "z" : "vậy",
        "v" : "vậy",
        "dzậy" : "vậy",
        "dậy": "vậy",
        "t" : "tao",
        "m" : "mày",
        "sgk" : "sách_giáo_khoa",
        "zi" : "vậy",
        "dth" : "dễ_thương",
        "dume": "đụ mẹ"
    }

    normalized = []
    for sent in sentences:
        tokens = nltk.word_tokenize(deEmojify(sent))
        temp = []
        for raw in tokens:
            raw = raw.lower()
            word = remove_duplicate(raw)
            # Test the token both before and after de-duplication: the
            # multi-character entries "``" and "''" collapse to a single
            # character, so checking only the collapsed form let those quote
            # tokens through, while "!!!" only matches once collapsed to "!".
            if raw in punc_lst or word in punc_lst or word in confusing_words:
                continue
            if word in acronym_word:
                temp.append(acronym_word[word])
            elif word.isdigit():
                temp.append("<NUM>")
            else:
                temp.append(word)
        # Joining the kept tokens also normalises the whitespace between them.
        normalized.append(' '.join(temp))

    return normalized

In [9]:
def normalize_dataset(data):
    """Segment, normalise and filter the sentences of one data split.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide ``Sentence`` (text) and ``Emotion`` (label) columns.

    Returns
    -------
    (list of str, pandas.Series)
        The normalised, non-empty sentences and their emotion labels. Both
        are positionally aligned, and the Series is re-indexed from 0 so that
        ``sentences[i]`` and ``tags[i]`` always refer to the same sample.
        ``data`` itself is left unmodified.
    """
    segmented = [ViTokenizer.tokenize(sentence) for sentence in data.Sentence]
    normalized = normalize_sentences(segmented)

    # Select survivors by position instead of deleting while iterating:
    # deleting inside enumerate() skipped the element after every removal,
    # and `del series[idx]` removed by label on the frame's own column.
    kept_positions = [pos for pos, sent in enumerate(normalized) if sent.strip() != ""]
    sentences = [normalized[pos] for pos in kept_positions]
    encode_tags = data.Emotion.iloc[kept_positions].reset_index(drop=True)

    return sentences, encode_tags

In [10]:
# Normalise each split; every call yields a (sentences, labels) pair.
(
    (train_clean_sentences, train_encode_tags),
    (valid_clean_sentences, valid_encode_tags),
    (test_clean_sentences, test_encode_tags),
) = (normalize_dataset(split) for split in (train_data, valid_data, test_data))

In [11]:
print(test_clean_sentences[99])
print(test_encode_tags[99])

đần mặt ra nhìn nó y_như con ngu còn cười theo nó nữa xong bị nó phát_hiện
Enjoyment


In [41]:
# Dialogue files read from the `dataset/` directory by the loader cell;
# the part of each name before the first "." is used as the sample's tag.
filename =["bạn bè.txt","các câu hỏi phức tạp.txt","du lịch.txt","gia đình.txt","giải trí.txt",
           "học tập.txt","nghề nghiệp.txt","nghỉ lễ.txt","người yêu.txt","robot.txt","shoping.txt",
           "sở thích.txt","tdtu.txt","thông tin cá nhân.txt","trò chuyện về đi ăn.txt","tán gẫu.txt","đất nước.txt","địa chỉ.txt"]

In [42]:
# Import the question/answer pairs of every topic file.
# Each useful line has the form "<question> __eou__ <answer> [__eou__]".
EOU = '__eou__'
temp_ques = []
temp_ans = []
tag = []
for name in filename:
    with open('dataset/' + name, encoding='utf-8') as f:
        lines = f.readlines()
    topic = name.split(".")[0]
    for line in lines:
        if line.startswith(EOU):
            continue
        text = line.strip()
        # Remove only a trailing separator. str.strip('__eou__') treats its
        # argument as a set of characters and would also eat leading or
        # trailing 'e', 'o', 'u' and '_' that belong to the sentence.
        if text.endswith(EOU):
            text = text[:-len(EOU)]
        part = text.split(EOU)
        if len(part) < 2:
            # Blank line or no separator: there is no answer to pair with.
            continue
        temp_ques.append(ViTokenizer.tokenize(part[0].lower().strip()))
        temp_ans.append(ViTokenizer.tokenize(part[1].lower().strip()))
        tag.append(topic)

In [43]:
# Collect the parallel question / answer / tag lists into one frame and preview it.
data = pd.DataFrame({'Question':temp_ques,'Answer':temp_ans,'Tag':tag})
data.head()

Unnamed: 0,Question,Answer,Tag
0,thích đánh_lộn không ?,ngon nhà_vô,bạn bè
1,solo yasua không,chấp lun 2 mạng đầu,bạn bè
2,mai đi picnic không ?,mai bận học rồi,bạn bè
3,mai học ca mấy vậy ?,mai học ca 3,bạn bè
4,còn tiền không ?,còn chết liền,bạn bè


In [44]:
# Copy the question and answer columns into standalone NumPy arrays.
ques, ans = (np.array(data[column]) for column in ("Question", "Answer"))

In [45]:
# Positions whose answer is the empty string.
lst_empty_answer_index = [pos for pos, answer in enumerate(ans) if answer == ""]

In [46]:
# Remove every flagged position in a single call. Deleting one index at a
# time shifts all later elements, so the remaining indices would point at
# the wrong rows (or past the end of the array).
ques = np.delete(ques, lst_empty_answer_index)
ans = np.delete(ans, lst_empty_answer_index)

In [47]:
def clean_sentences(sentences):
    """Lower-case, strip punctuation characters and tidy spaces, in place.

    Parameters
    ----------
    sentences : mutable sequence of str
        Every element is overwritten with its cleaned form.

    Returns
    -------
    The same ``sentences`` object, for convenience.
    """
    # The check below is per character, so only the single-character entries
    # of this set can ever match; the multi-character ones are inert.
    Punc = {'.', ',', '...', '-', '“', '”', ':', '(', ')', '"', '!', '&', ';', '?', '*', ']', '>', '…', '’',"``","''"}
    for i, sent in enumerate(sentences):
        sent = "".join(char for char in sent.lower() if char not in Punc)
        # Collapse space runs of any length; two fixed str.replace passes
        # left doubled spaces behind for runs of five or more.
        sent = re.sub(r" {2,}", " ", sent).strip()
        sentences[i] = sent
    return sentences

In [18]:
# Model inputs are the normalised training sentences; the "answers" the
# decoder learns to emit are the Emotion labels returned by normalize_dataset.
clean_ques = train_clean_sentences
clean_ans = train_encode_tags

In [19]:
# Frequency of every whitespace-separated token, counted over the input
# sentences first and the target strings second.
word2count = {}
for corpus in (clean_ques, clean_ans):
    for sentence in corpus:
        for token in sentence.split():
            word2count[token] = word2count.get(token, 0) + 1

In [20]:
len(word2count)

6085

In [21]:
# Assign consecutive integer ids (starting at 0) to every token that occurs
# at least `thresh` times, in first-seen order.
# NOTE(review): id 0 is also the value pad_sequences pads with by default,
# so the first vocabulary word and padding share an id -- confirm intended.
thresh = 1
word2index = {}

for word, count in word2count.items():
    if count < thresh:
        continue
    word2index[word] = len(word2index)

word_num = len(word2index)

In [22]:
len(word2index)

6085

In [23]:
# Turn the label Series into a plain list and frame every target with the
# begin/end-of-sequence markers.
clean_ans = ['<BOS> ' + label + ' <EOS>' for label in clean_ans.tolist()]

In [24]:
len(clean_ans)

5547

In [25]:
# Append the special tokens after the regular vocabulary.
# Ids are handed out only to tokens that are still missing, so re-running
# this cell leaves the mapping unchanged instead of reassigning the markers
# to ever larger ids.
tokens = ['<BOS>', '<EOS>', '<OUT>']
for token in tokens:
    if token not in word2index:
        # Ids are consecutive from 0, so the current size is the next free id.
        word2index[token] = len(word2index)

In [26]:
len(word2index)

6086

In [27]:
# Reverse lookup: integer id -> token.
index2word = {idx: token for token, idx in word2index.items()}
len(index2word)

6086

In [28]:
# Encode every input sentence as a list of token ids; tokens missing from
# the vocabulary map to the <OUT> id.
encoder_input = [
    [word2index[tok] if tok in word2index else word2index["<OUT>"] for tok in sentence.split()]
    for sentence in clean_ques
]

In [29]:
len(encoder_input)

5547

In [30]:
# Encode every target string (already wrapped in <BOS>/<EOS>) as token ids;
# tokens missing from the vocabulary map to the <OUT> id.
decoder_input = [
    [word2index[tok] if tok in word2index else word2index["<OUT>"] for tok in sentence.split()]
    for sentence in clean_ans
]

In [31]:
len(decoder_input)

5547

In [41]:
from keras.preprocessing.sequence import pad_sequences
# Fixed sequence length: shorter sequences are zero-padded at the end and
# longer ones are cut at the end.
MAX_LEN = 150
encoder_input, decoder_input = (
    pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')
    for sequences in (encoder_input, decoder_input)
)

In [42]:
# Training targets are the decoder inputs shifted left by one step, i.e.
# each row without its first token.
decoder_final_output = [row[1:] for row in decoder_input]

In [43]:
decoder_final_output[:3]

[array([6085, 6077, 6086, 6086,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

In [44]:
# Dropping the first token left each row one element short of MAX_LEN;
# pad back to MAX_LEN so targets and decoder inputs have the same length.
decoder_final_output = pad_sequences(decoder_final_output, MAX_LEN, padding='post', truncating='post')
decoder_final_output[:3]

array([[6085, 6077, 6086, 6086,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

In [45]:
from keras.utils import to_categorical
# One-hot encode the target ids with len(word2index) classes; every id in
# decoder_final_output must be smaller than that or to_categorical raises
# IndexError (as in the recorded output, produced with a stale word2index).
# NOTE(review): the result is a dense (samples, MAX_LEN, vocab) array, which
# is very large -- consider integer targets with a sparse loss instead.
decoder_final_output = to_categorical(decoder_final_output, len(word2index))

IndexError: index 6086 is out of bounds for axis 1 with size 6086

In [28]:
decoder_final_output.shape

(5547, 150, 6086)

In [38]:
from keras.layers import Embedding
from keras.layers import Input, Dense, LSTM, TimeDistributed
from keras.models import Model

In [46]:
# Define input
enc_inp = Input(shape=(MAX_LEN, ))
dec_inp = Input(shape=(MAX_LEN, ))

VOCAB_SIZE = len(word2index)
HIDDEN_DIM = 50
embedding_dimention = 100

# Define embedding layer
embed = Embedding(VOCAB_SIZE + 1, output_dim = embedding_dimention, input_length = MAX_LEN, trainable = True)

In [47]:
# NOTE: later cells fetch layers by position (model.layers[2..5]), so the
# order in which layers are created and wired here must not change.

# Define encoder layers: embed the input ids and run an LSTM, keeping its
# final hidden and cell states.
enc_embed = embed(enc_inp)
enc_lstm = LSTM(HIDDEN_DIM, return_sequences = True, return_state = True)
enc_op, h, c = enc_lstm(enc_embed)
enc_states = [h, c]

# Define decoder layers: same embedding layer, a second LSTM that starts
# from the encoder's final states.
dec_embed = embed(dec_inp)
dec_lstm = LSTM(HIDDEN_DIM, return_sequences = True, return_state = True)
dec_op, _, _ = dec_lstm(dec_embed, initial_state = enc_states)

# Softmax over VOCAB_SIZE classes, applied to every decoder time step.
dense = Dense(VOCAB_SIZE, activation = "softmax")

dense_op = dense(dec_op)

# Training model: (encoder ids, decoder ids) -> per-step token distribution.
model = Model([enc_inp, dec_inp], dense_op)

In [38]:
import tensorflow as tf

# Train model
model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer='adam')

BATCH_SIZE = 32
EPOCHS = 10
# Fraction of the training samples held out for validation. EarlyStopping
# below watches `val_loss`, which Keras only reports when fit() is given
# validation data; without it the callback never triggers.
VALIDATION_SPLIT = 0.1

es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1)

model.fit(
    [encoder_input, decoder_input],
    decoder_final_output,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
    callbacks=[es],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1d2b2b0e748>

In [42]:
model.save("LSTM_model.h5")

In [34]:
import keras.models
model = keras.models.load_model("LSTM_model.h5")

In [35]:
model.save("LSTM_model.h5")

In [36]:
model.summary()

Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, 150)]        0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 150)]        0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 150, 100)     608700      input_5[0][0]                    
                                                                 input_6[0][0]                    
__________________________________________________________________________________________________
lstm_4 (LSTM)                   [(None, 150, 50), (N 30200       embedding_2[0][0]     

In [39]:
# Build the inference-time encoder from the (re)loaded training model:
# first model input -> final LSTM hidden and cell states.
enc_inp = model.input[0]
# NOTE(review): assumes model.layers[3] is the encoder LSTM; layer order in
# a loaded model depends on how it was built -- verify against model.summary().
enc_op, h, c = model.layers[3].output
enc_states = [h, c]

enc_model = Model([enc_inp], enc_states)

In [48]:
# Build the inference-time decoder used to predict the next word.
# It takes the decoder token ids plus explicit LSTM states and returns the
# LSTM outputs together with the updated states.
decoder_state_input_h = Input(shape=(HIDDEN_DIM,))
decoder_state_input_c = Input(shape=(HIDDEN_DIM,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the trained layers, fetched by position in the training model.
# NOTE(review): the indices below assume a fixed layer order -- verify
# against model.summary() for the model actually loaded.
dec_inp = model.input[1]
embed = model.layers[2] # Embedding layer
dec_lstm = model.layers[4] # Decoder layer
dense = model.layers[5] # Dense

dec_embed = embed(dec_inp)
decoder_outputs, state_h, state_c = dec_lstm(dec_embed , initial_state=decoder_states_inputs)

decoder_states = [state_h, state_c]
# The dense softmax layer is not part of dec_model; the decoding loops
# apply `dense` to dec_model's output themselves.
# output = dense(decoder_outputs)

dec_model = Model([dec_inp]+ decoder_states_inputs, [decoder_outputs] + decoder_states)

In [49]:
def clean_one_sent(sent):
    """Clean a single raw sentence and word-segment it with ViTokenizer.

    The text is lower-cased, punctuation characters are removed, runs of
    spaces are collapsed to one, surrounding whitespace is stripped, and the
    result is passed to ``ViTokenizer.tokenize``.

    Parameters
    ----------
    sent : str

    Returns
    -------
    str
        The segmented sentence as returned by ``ViTokenizer.tokenize``.
    """
    # The check below is per character, so only the single-character entries
    # of this set can ever match; the multi-character ones are inert.
    Punc = {'.', ',', '...', '-', '“', '”', ':', '(', ')', '"', '!', '&', ';', '?', '*', ']', '>', '…', '’',"``","''"}
    sent = "".join(char for char in sent.lower() if char not in Punc)

    # Collapse space runs of any length; two fixed str.replace passes left
    # doubled spaces behind for runs of five or more.
    sent = re.sub(r" {2,}", " ", sent).strip()

    return ViTokenizer.tokenize(sent)

In [50]:
# Interactive loop: read a sentence, encode it, then greedily decode the
# model's answer one token at a time. Typing 'quit' ends the loop.
prepro1 = ""
while True:
    prepro1 = input("Question : ")
    if prepro1 == 'quit':
        # Leave before decoding, so 'quit' is not fed to the model as a question.
        break
#     prepro1 = clean_one_sent(prepro1)

    # Map tokens to ids; anything outside the vocabulary becomes <OUT>.
    token_ids = [word2index.get(tok, word2index['<OUT>']) for tok in prepro1.split()]
    txt = pad_sequences([token_ids], MAX_LEN, padding='post')

    stat = enc_model.predict(txt)
    empty_target_seq = np.zeros((1, 1))
    empty_target_seq[0, 0] = word2index['<BOS>']

    decoded_translation = ''

    # Hard cap on the number of decoding steps (same limit as the original
    # word-count check), so the loop always ends even if <EOS> never appears.
    for _ in range(MAX_LEN + 2):
        dec_outputs, h, c = dec_model.predict([empty_target_seq] + stat)
        decoder_concat_input = dense(dec_outputs)
        sampled_word_index = int(np.argmax(decoder_concat_input[0, -1, :]))
        sampled_word = index2word[sampled_word_index]

        if sampled_word == '<EOS>':
            break
        decoded_translation += sampled_word + ' '

        # Feed the chosen token and the new states back into the decoder.
        empty_target_seq = np.zeros((1, 1))
        empty_target_seq[0, 0] = sampled_word_index
        stat = [h, c]

    print("Answer : ", decoded_translation)

Question : đần mặt ra nhìn nó y_như con ngu còn cười theo nó nữa xong bị nó phát_hiện


InvalidArgumentError:  indices[0,0] = 6087 is not in [0, 6087)
	 [[node functional_1/embedding_2/embedding_lookup (defined at <ipython-input-50-a169f43e476c>:17) ]] [Op:__inference_predict_function_3386]

Errors may have originated from an input operation.
Input Source operations connected to node functional_1/embedding_2/embedding_lookup:
 functional_1/embedding_2/embedding_lookup/2941 (defined at c:\python\python36\lib\contextlib.py:81)

Function call stack:
predict_function


In [52]:
def predict_emotion(sentence):
    """Greedily decode the model's output for one normalised sentence.

    Parameters
    ----------
    sentence : str
        Whitespace-separated tokens; unknown tokens are mapped to ``<OUT>``.

    Returns
    -------
    str
        The decoded tokens joined by single spaces (without ``<EOS>``).
    """
    token_ids = [word2index.get(tok, word2index['<OUT>']) for tok in sentence.split()]
    txt = pad_sequences([token_ids], MAX_LEN, padding='post')

    stat = enc_model.predict(txt)
    empty_target_seq = np.zeros((1, 1))
    empty_target_seq[0, 0] = word2index['<BOS>']

    decoded_words = []
    # Bounded loop: the previous version compared the sampled word with
    # '<EOS> ' (trailing space) and concatenated words without separators,
    # so neither stop condition could ever fire and decoding never ended.
    for _ in range(MAX_LEN + 2):
        dec_outputs, h, c = dec_model.predict([empty_target_seq] + stat)
        decoder_concat_input = dense(dec_outputs)
        sampled_word_index = int(np.argmax(decoder_concat_input[0, -1, :]))
        sampled_word = index2word[sampled_word_index]

        if sampled_word == '<EOS>':
            break
        decoded_words.append(sampled_word)

        # Feed the chosen token and the new states back into the decoder.
        empty_target_seq = np.zeros((1, 1))
        empty_target_seq[0, 0] = sampled_word_index
        stat = [h, c]

    return ' '.join(decoded_words)


# One prediction per test sentence, in the same order as test_encode_tags.
y_pred = [predict_emotion(sentence) for sentence in test_clean_sentences]
y_pred[:5]

KeyboardInterrupt: 

In [None]:
y_pred

In [None]:
# classification_report was never imported anywhere in the notebook.
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the decoded labels against the gold labels.
print(classification_report(test_encode_tags, y_pred))