In [1]:
import pandas as pd
from pyvi import ViTokenizer, ViPosTagger
import nltk
import numpy as np
import warnings
import time
import re
warnings.filterwarnings('ignore')

# 1. Import data

In [2]:
train_filename = "train_nor_811.xlsx"
valid_filename = "valid_nor_811.xlsx"
test_filename = "test_nor_811.xlsx"
train_data = pd.read_excel(train_filename, engine = "openpyxl")
valid_data = pd.read_excel(valid_filename, engine = "openpyxl")
test_data = pd.read_excel(test_filename, engine = "openpyxl")

In [3]:
from sklearn.preprocessing import LabelEncoder
def file_processing(data):
    data.drop(columns = {"Unnamed: 0"}, axis = 1, inplace = True)
    data["emotion_encode"] = data["Emotion"]
    encoder = LabelEncoder()
    data.emotion_encode = encoder.fit_transform(data.Emotion)
    return data

In [4]:
train_data = file_processing(train_data)
valid_data = file_processing(valid_data)
test_data = file_processing(test_data)

# 2. Data preprocessing

In [5]:
def remove_duplicate(word):
    prev_char = ""
    clean_word = ""
    for character in word:
        if(character != prev_char):
            clean_word += character
            prev_char = character
    return clean_word

In [6]:
def deEmojify(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

def normalize_sentences(sentences):
    punc_lst = {'.', ',', '...', '-', '“', '”', ':', '(', ')', '"', '!', '&', ';', '?', '*', ']', '>', '…', '’',"``","''", "=", "%", "^", "@", "<", ">"}
    confusing_words = {"per"}
    acronym_word = {
        "ko" : "không",
        "k" : "không",
        "z" : "vậy",
        "v" : "vậy",
        "dzậy" : "vậy",
        "dậy": "vậy",
        "t" : "tao",
        "m" : "mày",
        "sgk" : "sách_giáo_khoa",
        "zi" : "vậy",
        "dth" : "dễ_thương",
        "dume": "đụ mẹ"
    }
    
    clean_sentences = []
    
    # remove punctuation and lowercase
    for sent in sentences:
        
        # remove emojis
        sent = deEmojify(sent)
        
        sent = nltk.word_tokenize(sent)
        temp = []
        for word in sent:
            word = word.lower()
            word = remove_duplicate(word)
            if (word in punc_lst or word in confusing_words):
                continue
            elif(word in acronym_word):
                temp.append(acronym_word[word])
            elif(word.isdigit()):
                temp.append("<NUM>")
            else:
                temp.append(word)
        # remove whitespace
        sent = ' '.join(temp)
        
        clean_sentences.append(sent)
        
    return clean_sentences

In [7]:
def normalize_dataset(data):
    sentences = []
    for i in range(len(data)):
        sentences.append(ViTokenizer.tokenize(data.Sentence[i]))
    
    sentences = normalize_sentences(sentences)
    encode_tags = data.Emotion
    
    # remove empty sentences
    for idx, sent in enumerate(sentences):
        if sent.strip() == "":
            del sentences[idx]
            del encode_tags[idx]
    
    return sentences, encode_tags

In [8]:
train_clean_sentences, train_encode_tags = normalize_dataset(train_data)
valid_clean_sentences, valid_encode_tags = normalize_dataset(valid_data)
test_clean_sentences, test_encode_tags = normalize_dataset(test_data)

In [48]:
test_clean_sentences

['người_ta có bạn_bè nhìn vui thật',
 'cho nghỉ viêc mói đúng sao goi là kỷ_luật',
 'kinh vãi',
 'nhà thì không xa lắm nhưng chưa bao_giờ đi vì sợ bị đè_bẹp luôn người',
 'bố không thích nộp đấy mày thích ý_kiến không',
 'bắt_cóc bỏ đĩa quanh đi quẩn lại chỉ khổ dân đen thôi',
 'khi nào bạn có bằng xe ôtô thì bạn mới hiểu sai thì đã có ca',
 'một hành_động đẹp đầy tính nhân_văn',
 'anh nhỏ_nhẹ xin 10k rồi trước khi đi còn chào người đẹp đi cẩn_thận `',
 'trình mày vẫn còn thấp chán',
 'mấy ai được như_vậy',
 'có ai như tao vô đọc bình_luận tao cười tao xĩu',
 'ủa má <NUM> đứa yêu nhau vô_duyên và sân si giống nhau vậy',
 'góp vui cho chủ thớt',
 'sống <NUM> năm chưa crush ai bao_giờ',
 'ổng quay thiệt mà sao chửi ổng thế không quay sao máy bạn xem được đó',
 'thần_kinh cái đầu mày',
 'nghe ngọt thế',
 'sao lại cứ phải thế nhỉ',
 'mệt vãi mà cừời cũng vãi',
 'kinh_khủng thật',
 'hậu_duệ mặt_trời đấy chứ đùa đâu nhá',
 'tôi ở phú yên và anh này nói rất chuẩn <NUM>',
 'đứa nào bị táo_bón 

In [41]:
filename =["bạn bè.txt","các câu hỏi phức tạp.txt","du lịch.txt","gia đình.txt","giải trí.txt",
           "học tập.txt","nghề nghiệp.txt","nghỉ lễ.txt","người yêu.txt","robot.txt","shoping.txt",
           "sở thích.txt","tdtu.txt","thông tin cá nhân.txt","trò chuyện về đi ăn.txt","tán gẫu.txt","đất nước.txt","địa chỉ.txt"]

In [42]:
# import train data
temp_ques = []
temp_ans = []
tag = []
for k in range(len(filename)):
    with open('dataset/' + filename[k], encoding='utf-8') as f:
        lines = f.readlines()
    for i in range(len(lines)):
        if lines[i].startswith('__eou__'):
            continue
        else:
            part = lines[i].strip('__eou__').split('__eou__')
            temp_ques.append(ViTokenizer.tokenize(part[0].lower().strip()))
            temp_ans.append(ViTokenizer.tokenize(part[1].lower().strip()))
            tag.append(filename[k].split(".")[0])

In [43]:
data = pd.DataFrame({'Question':temp_ques,'Answer':temp_ans,'Tag':tag})
data.head()

Unnamed: 0,Question,Answer,Tag
0,thích đánh_lộn không ?,ngon nhà_vô,bạn bè
1,solo yasua không,chấp lun 2 mạng đầu,bạn bè
2,mai đi picnic không ?,mai bận học rồi,bạn bè
3,mai học ca mấy vậy ?,mai học ca 3,bạn bè
4,còn tiền không ?,còn chết liền,bạn bè


In [44]:
ques = np.array(data["Question"])
ans = np.array(data["Answer"])

In [45]:
lst_empty_answer_index = []
for i in range(len(ans)):
    if(ans[i] == ""):
        lst_empty_answer_index.append(i)

In [46]:
for i in lst_empty_answer_index:
    ques = np.delete(ques, i)
    ans = np.delete(ans, i)

In [47]:
def clean_sentences(sentences):
    Punc = {'.', ',', '...', '-', '“', '”', ':', '(', ')', '"', '!', '&', ';', '?', '*', ']', '>', '…', '’',"``","''"}
    for i, sent in enumerate(sentences):
        sent = sent.lower()
        
        sent = [char for char in sent if char not in Punc]
        sent = "".join(sent)
        
        sent = sent.replace("   ", " ")
        sent = sent.replace("  ", " ")
        sent = sent.strip()
        
        sentences[i] = sent
    return sentences

In [9]:
clean_ques = train_clean_sentences
clean_ans = train_encode_tags

In [10]:
word2count = {}
for sent in clean_ques:
    for word in sent.split():
        if word not in word2count:
            word2count[word] = 1
        else:
            word2count[word] += 1

for sent in clean_ans:
    for word in sent.split():
        if word not in word2count:
            word2count[word] = 1
        else:
            word2count[word] += 1

In [11]:
len(word2count)

6083

In [12]:
thresh = 1
word2index = {}
word_num = 0

for word, count in word2count.items():
    if (count >= thresh):
        word2index[word] = word_num
        word_num += 1

In [13]:
len(word2index)

6083

In [15]:
# clean_ans = clean_ans.tolist()
for i in range(len(clean_ans)):
    clean_ans[i] = '<BOS> ' + clean_ans[i] + ' <EOS>'

Other
0
Disgust
1
Disgust
2
Enjoyment
3
Enjoyment
4
Anger
5
Other
6
Surprise
7
Anger
8
Sadness
9
Enjoyment
10
Disgust
11
Other
12
Enjoyment
13
Disgust
14
Sadness
15
Other
16
Other
17
Fear
18
Enjoyment
19
Anger
20
Disgust
21
Enjoyment
22
Enjoyment
23
Sadness
24
Sadness
25
Anger
26
Fear
27
Fear
28
Other
29
Surprise
30
Fear
31
Sadness
32
Sadness
33
Fear
34
Enjoyment
35
Other
36
Other
37
Sadness
38
Enjoyment
39
Anger
40
Sadness
41
Sadness
42
Disgust
43
Other
44
Other
45
Disgust
46
Anger
47
Surprise
48
Disgust
49
Sadness
50
Sadness
51
Enjoyment
52
Sadness
53
Disgust
54
Sadness
55
Anger
56
Disgust
57
Other
58
Enjoyment
59
Anger
60
Sadness
61
Sadness
62
Enjoyment
63
Disgust
64
Disgust
65
Enjoyment
66
Sadness
67
Sadness
68
Enjoyment
69
Surprise
70
Disgust
71
Enjoyment
72
Enjoyment
73
Sadness
74
Sadness
75
Fear
76
Enjoyment
77
Enjoyment
78
Enjoyment
79
Surprise
80
Disgust
81
Enjoyment
82
Enjoyment
83
Surprise
84
Sadness
85
Disgust
86
Enjoyment
87
Enjoyment
88
Disgust
89
Enjoyment
90
Disgust
91


Other
804
Sadness
805
Enjoyment
806
Enjoyment
807
Enjoyment
808
Enjoyment
809
Enjoyment
810
Other
811
Anger
812
Enjoyment
813
Disgust
814
Enjoyment
815
Enjoyment
816
Enjoyment
817
Other
818
Enjoyment
819
Sadness
820
Enjoyment
821
Sadness
822
Disgust
823
Other
824
Enjoyment
825
Surprise
826
Sadness
827
Disgust
828
Fear
829
Disgust
830
Enjoyment
831
Sadness
832
Other
833
Enjoyment
834
Other
835
Sadness
836
Enjoyment
837
Other
838
Sadness
839
Disgust
840
Enjoyment
841
Disgust
842
Disgust
843
Enjoyment
844
Disgust
845
Enjoyment
846
Enjoyment
847
Other
848
Enjoyment
849
Other
850
Anger
851
Sadness
852
Disgust
853
Enjoyment
854
Sadness
855
Sadness
856
Enjoyment
857
Other
858
Anger
859
Enjoyment
860
Enjoyment
861
Enjoyment
862
Enjoyment
863
Sadness
864
Enjoyment
865
Enjoyment
866
Disgust
867
Sadness
868
Disgust
869
Other
870
Sadness
871
Sadness
872
Anger
873
Other
874
Disgust
875
Sadness
876
Disgust
877
Disgust
878
Fear
879
Sadness
880
Disgust
881
Sadness
882
Disgust
883
Sadness
884
Disgust
8

Sadness
1576
Anger
1577
Enjoyment
1578
Other
1579
Other
1580
Enjoyment
1581
Enjoyment
1582
Other
1583
Disgust
1584
Anger
1585
Other
1586
Other
1587
Enjoyment
1588
Enjoyment
1589
Enjoyment
1590
Other
1591
Fear
1592
Enjoyment
1593
Disgust
1594
Enjoyment
1595
Disgust
1596
Disgust
1597
Other
1598
Surprise
1599
Enjoyment
1600
Other
1601
Sadness
1602
Disgust
1603
Disgust
1604
Enjoyment
1605
Disgust
1606
Anger
1607
Enjoyment
1608
Disgust
1609
Surprise
1610
Disgust
1611
Enjoyment
1612
Enjoyment
1613
Other
1614
Enjoyment
1615
Other
1616
Fear
1617
Enjoyment
1618
Enjoyment
1619
Surprise
1620
Other
1621
Enjoyment
1622
Surprise
1623
Enjoyment
1624
Sadness
1625
Enjoyment
1626
Enjoyment
1627
Enjoyment
1628
Enjoyment
1629
Enjoyment
1630
Enjoyment
1631
Disgust
1632
Disgust
1633
Enjoyment
1634
Disgust
1635
Sadness
1636
Other
1637
Fear
1638
Sadness
1639
Sadness
1640
Enjoyment
1641
Surprise
1642
Enjoyment
1643
Disgust
1644
Sadness
1645
Enjoyment
1646
Enjoyment
1647
Sadness
1648
Other
1649
Other
1650
Enjoy

2322
Sadness
2323
Anger
2324
Disgust
2325
Disgust
2326
Enjoyment
2327
Enjoyment
2328
Sadness
2329
Enjoyment
2330
Enjoyment
2331
Disgust
2332
Other
2333
Enjoyment
2334
Other
2335
Other
2336
Other
2337
Enjoyment
2338
Other
2339
Enjoyment
2340
Disgust
2341
Anger
2342
Anger
2343
Sadness
2344
Disgust
2345
Enjoyment
2346
Enjoyment
2347
Sadness
2348
Enjoyment
2349
Disgust
2350
Sadness
2351
Other
2352
Other
2353
Enjoyment
2354
Sadness
2355
Fear
2356
Other
2357
Anger
2358
Enjoyment
2359
Disgust
2360
Surprise
2361
Anger
2362
Disgust
2363
Other
2364
Surprise
2365
Enjoyment
2366
Disgust
2367
Enjoyment
2368
Enjoyment
2369
Disgust
2370
Enjoyment
2371
Anger
2372
Other
2373
Anger
2374
Sadness
2375
Enjoyment
2376
Other
2377
Disgust
2378
Disgust
2379
Sadness
2380
Enjoyment
2381
Enjoyment
2382
Enjoyment
2383
Other
2384
Enjoyment
2385
Enjoyment
2386
Enjoyment
2387
Anger
2388
Disgust
2389
Enjoyment
2390
Disgust
2391
Surprise
2392
Disgust
2393
Other
2394
Other
2395
Disgust
2396
Disgust
2397
Disgust
2398
Dis

3025
Enjoyment
3026
Disgust
3027
Anger
3028
Other
3029
Enjoyment
3030
Other
3031
Surprise
3032
Enjoyment
3033
Enjoyment
3034
Enjoyment
3035
Enjoyment
3036
Sadness
3037
Enjoyment
3038
Enjoyment
3039
Other
3040
Sadness
3041
Other
3042
Enjoyment
3043
Sadness
3044
Enjoyment
3045
Sadness
3046
Other
3047
Other
3048
Enjoyment
3049
Disgust
3050
Enjoyment
3051
Surprise
3052
Sadness
3053
Anger
3054
Other
3055
Sadness
3056
Sadness
3057
Disgust
3058
Disgust
3059
Anger
3060
Disgust
3061
Sadness
3062
Enjoyment
3063
Other
3064
Enjoyment
3065
Disgust
3066
Enjoyment
3067
Surprise
3068
Anger
3069
Other
3070
Sadness
3071
Other
3072
Enjoyment
3073
Enjoyment
3074
Sadness
3075
Disgust
3076
Anger
3077
Enjoyment
3078
Disgust
3079
Other
3080
Enjoyment
3081
Other
3082
Disgust
3083
Sadness
3084
Other
3085
Enjoyment
3086
Fear
3087
Fear
3088
Other
3089
Sadness
3090
Sadness
3091
Disgust
3092
Enjoyment
3093
Fear
3094
Other
3095
Sadness
3096
Other
3097
Enjoyment
3098
Disgust
3099
Disgust
3100
Sadness
3101
Enjoyment
3

3801
Enjoyment
3802
Disgust
3803
Fear
3804
Disgust
3805
Sadness
3806
Sadness
3807
Other
3808
Sadness
3809
Enjoyment
3810
Sadness
3811
Disgust
3812
Other
3813
Fear
3814
Disgust
3815
Enjoyment
3816
Fear
3817
Disgust
3818
Other
3819
Other
3820
Surprise
3821
Enjoyment
3822
Sadness
3823
Enjoyment
3824
Enjoyment
3825
Enjoyment
3826
Fear
3827
Sadness
3828
Enjoyment
3829
Fear
3830
Enjoyment
3831
Other
3832
Enjoyment
3833
Surprise
3834
Fear
3835
Fear
3836
Sadness
3837
Sadness
3838
Disgust
3839
Fear
3840
Surprise
3841
Anger
3842
Anger
3843
Fear
3844
Sadness
3845
Other
3846
Enjoyment
3847
Anger
3848
Sadness
3849
Surprise
3850
Sadness
3851
Enjoyment
3852
Surprise
3853
Sadness
3854
Enjoyment
3855
Other
3856
Sadness
3857
Disgust
3858
Sadness
3859
Other
3860
Other
3861
Enjoyment
3862
Sadness
3863
Enjoyment
3864
Enjoyment
3865
Other
3866
Other
3867
Enjoyment
3868
Other
3869
Sadness
3870
Anger
3871
Enjoyment
3872
Enjoyment
3873
Disgust
3874
Sadness
3875
Disgust
3876
Disgust
3877
Anger
3878
Enjoyment
38

4653
Sadness
4654
Disgust
4655
Enjoyment
4656
Sadness
4657
Enjoyment
4658
Other
4659
Other
4660
Enjoyment
4661
Enjoyment
4662
Disgust
4663
Fear
4664
Other
4665
Other
4666
Enjoyment
4667
Other
4668
Anger
4669
Sadness
4670
Other
4671
Anger
4672
Enjoyment
4673
Sadness
4674
Sadness
4675
Enjoyment
4676
Enjoyment
4677
Surprise
4678
Anger
4679
Other
4680
Other
4681
Disgust
4682
Disgust
4683
Enjoyment
4684
Enjoyment
4685
Other
4686
Disgust
4687
Enjoyment
4688
Other
4689
Sadness
4690
Sadness
4691
Sadness
4692
Sadness
4693
Enjoyment
4694
Disgust
4695
Fear
4696
Other
4697
Other
4698
Other
4699
Fear
4700
Sadness
4701
Enjoyment
4702
Disgust
4703
Sadness
4704
Sadness
4705
Disgust
4706
Enjoyment
4707
Disgust
4708
Sadness
4709
Enjoyment
4710
Sadness
4711
Enjoyment
4712
Sadness
4713
Other
4714
Enjoyment
4715
Anger
4716
Enjoyment
4717
Other
4718
Disgust
4719
Enjoyment
4720
Sadness
4721
Disgust
4722
Fear
4723
Sadness
4724
Other
4725
Disgust
4726
Other
4727
Enjoyment
4728
Enjoyment
4729
Enjoyment
4730
Enj

Disgust
5312
Enjoyment
5313
Disgust
5314
Enjoyment
5315
Enjoyment
5316
Enjoyment
5317
Anger
5318
Disgust
5319
Enjoyment
5320
Sadness
5321
Other
5322
Enjoyment
5323
Surprise
5324
Surprise
5325
Sadness
5326
Enjoyment
5327
Enjoyment
5328
Enjoyment
5329
Other
5330
Anger
5331
Surprise
5332
Anger
5333
Disgust
5334
Enjoyment
5335
Enjoyment
5336
Enjoyment
5337
Enjoyment
5338
Anger
5339
Other
5340
Enjoyment
5341
Sadness
5342
Sadness
5343
Enjoyment
5344
Disgust
5345
Disgust
5346
Other
5347
Other
5348
Enjoyment
5349
Disgust
5350
Enjoyment
5351
Disgust
5352
Enjoyment
5353
Disgust
5354
Enjoyment
5355
Disgust
5356
Other
5357
Sadness
5358
Anger
5359
Other
5360
Other
5361
Enjoyment
5362
Enjoyment
5363
Enjoyment
5364
Enjoyment
5365
Enjoyment
5366
Enjoyment
5367
Other
5368
Disgust
5369
Enjoyment
5370
Enjoyment
5371
Fear
5372
Disgust
5373
Disgust
5374
Enjoyment
5375
Enjoyment
5376
Disgust
5377
Enjoyment
5378
Surprise
5379
Other
5380
Enjoyment
5381
Surprise
5382
Anger
5383
Sadness
5384
Disgust
5385
Sadnes

In [16]:
len(clean_ans)

5547

In [17]:
tokens = ['<BOS>', '<EOS>', '<OUT>']
x = len(word2index)
for token in tokens:
    word2index[token] = x
    x += 1

In [18]:
len(word2index)

6086

In [19]:
index2word = {w: v for v, w in word2index.items()}
len(index2word)

6086

In [20]:
encoder_input = []
for sent in clean_ques:
    lst = []
    for word in sent.split():
        if word not in word2index:
            lst.append(word2index["<OUT>"])
        else:
            lst.append(word2index[word])
    encoder_input.append(lst)

In [21]:
len(encoder_input)

5547

In [22]:
decoder_input = []
for sent in clean_ans:
    lst = []
    for word in sent.split():
        if word not in word2index:
            lst.append(word2index["<OUT>"])
        else:
            lst.append(word2index[word])
    decoder_input.append(lst)

In [23]:
len(decoder_input)

5547

In [24]:
from keras.preprocessing.sequence import pad_sequences
MAX_LEN = 150
encoder_input = pad_sequences(encoder_input, MAX_LEN, padding='post', truncating='post')
decoder_input = pad_sequences(decoder_input, MAX_LEN, padding='post', truncating='post')

In [25]:
decoder_final_output = []
for i in decoder_input:
    decoder_final_output.append(i[1:])

In [66]:
decoder_final_output[:3]

[array([ 986, 2522, 4767,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]),
 array([2523, 2252,  453,  490,  474, 4767,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]),
 array([   5,  803,    8,   15, 4767,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])]

In [26]:
decoder_final_output = pad_sequences(decoder_final_output, MAX_LEN, padding='post', truncating='post')
decoder_final_output[:3]

array([[6076, 6084,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

In [27]:
from keras.utils import to_categorical
decoder_final_output = to_categorical(decoder_final_output, len(word2index))

In [28]:
decoder_final_output.shape

(5547, 150, 6086)

In [29]:
from keras.layers import Embedding
from keras.layers import Input, Dense, LSTM, TimeDistributed
from keras.models import Model

In [36]:
# Define input
enc_inp = Input(shape=(MAX_LEN, ))
dec_inp = Input(shape=(MAX_LEN, ))

VOCAB_SIZE = len(word2index)
HIDDEN_DIM = 50
embedding_dimention = 100

# Define embedding layer
embed = Embedding(VOCAB_SIZE + 1, output_dim = embedding_dimention, input_length = MAX_LEN, trainable = True)

In [37]:
# Define encoder layers
enc_embed = embed(enc_inp)
enc_lstm = LSTM(HIDDEN_DIM, return_sequences = True, return_state = True)
enc_op, h, c = enc_lstm(enc_embed)
enc_states = [h, c]

#Define decoder layers
dec_embed = embed(dec_inp)
dec_lstm = LSTM(HIDDEN_DIM, return_sequences = True, return_state = True)
dec_op, _, _ = dec_lstm(dec_embed, initial_state = enc_states)

dense = Dense(VOCAB_SIZE, activation = "softmax")

dense_op = dense(dec_op)

model = Model([enc_inp, dec_inp], dense_op)

In [38]:
import tensorflow as tf

# Train model
model.compile(loss='categorical_crossentropy',metrics=['acc'],optimizer='adam')

BATCH_SIZE = 32
EPOCHS = 10

es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1)

model.fit([encoder_input, decoder_input],decoder_final_output,epochs=EPOCHS,batch_size=BATCH_SIZE, callbacks = [es])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1d2b2b0e748>

In [42]:
model.save("LSTM_model.h5")

In [43]:
import keras.models
model = keras.models.load_model("LSTM_model.h5")

In [44]:
model.save("LSTM_model.h5")

In [45]:
model.summary()

Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, 150)]        0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 150)]        0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 150, 100)     608700      input_5[0][0]                    
                                                                 input_6[0][0]                    
__________________________________________________________________________________________________
lstm_4 (LSTM)                   [(None, 150, 50), (N 30200       embedding_2[0][0]     

In [46]:
#Load encoder model
enc_inp = model.input[0]
enc_op, h, c = model.layers[3].output
enc_states = [h, c]

enc_model = Model([enc_inp], enc_states)

In [47]:
#Load decoder model to predict next word
decoder_state_input_h = Input(shape=(HIDDEN_DIM,))
decoder_state_input_c = Input(shape=(HIDDEN_DIM,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_inp = model.input[1]
embed = model.layers[2] # Embedding layer
dec_lstm = model.layers[4] # Decoder layer
dense = model.layers[5] # Dense

dec_embed = embed(dec_inp)
decoder_outputs, state_h, state_c = dec_lstm(dec_embed , initial_state=decoder_states_inputs)

decoder_states = [state_h, state_c]
# output = dense(decoder_outputs)

dec_model = Model([dec_inp]+ decoder_states_inputs, [decoder_outputs] + decoder_states)

In [126]:
def clean_one_sent(sent):
    Punc = {'.', ',', '...', '-', '“', '”', ':', '(', ')', '"', '!', '&', ';', '?', '*', ']', '>', '…', '’',"``","''"}
    sent = sent.lower()

    sent = [char for char in sent if char not in Punc]
    sent = "".join(sent)

    sent = sent.replace("   ", " ")
    sent = sent.replace("  ", " ")
    sent = sent.strip()

    sent = ViTokenizer.tokenize(sent)
    
    return sent

In [127]:
prepro1 = ""
while prepro1 != 'quit':
    prepro1  = input("Question : ")
    prepro1 = clean_one_sent(prepro1)
    prepro = [prepro1]
    txt = []
    for x in prepro:
        lst = []
        for y in x.split():
            try:
                lst.append(word2index[y])
            except:
                lst.append(word2index['<OUT>'])
        txt.append(lst)
        txt = pad_sequences(txt, MAX_LEN, padding='post')

    stat = enc_model.predict( txt )
    empty_target_seq = np.zeros( ( 1 , 1) )
    empty_target_seq[0, 0] = word2index['<BOS>']
    stop_condition = False

    decoded_translation = ''

    while not stop_condition :
        dec_outputs , h, c= dec_model.predict([empty_target_seq] + stat )
        decoder_concat_input = dense(dec_outputs)
        sampled_word_index = np.argmax( decoder_concat_input[0, -1, :] )
        sampled_word = index2word[sampled_word_index] + ' '

        if sampled_word != '<EOS> ':
            decoded_translation += sampled_word
        if sampled_word == '<EOS> ' or len(decoded_translation.split()) > MAX_LEN+1:
            stop_condition = True

        empty_target_seq = np.zeros( ( 1 , 1 ) )  
        empty_target_seq[ 0 , 0 ] = sampled_word_index

        stat = [h, c]
    print("Answer : ", decoded_translation )

Question : bạn có crush chưa?
Answer :  có rồi bạn 
Question : quit
Answer :  


In [None]:
y_pred = []
for prepro1 in test_clean_sentences:
    prepro = [prepro1]
    txt = []
    for x in prepro:
        lst = []
        for y in x.split():
            try:
                lst.append(word2index[y])
            except:
                lst.append(word2index['<OUT>'])
        txt.append(lst)
        txt = pad_sequences(txt, MAX_LEN, padding='post')

    stat = enc_model.predict( txt )
    empty_target_seq = np.zeros( ( 1 , 1) )
    empty_target_seq[0, 0] = word2index['<BOS>']
    stop_condition = False

    decoded_translation = ''

    while not stop_condition :
        dec_outputs , h, c= dec_model.predict([empty_target_seq] + stat )
        decoder_concat_input = dense(dec_outputs)
        sampled_word_index = np.argmax( decoder_concat_input[0, -1, :] )
        sampled_word = index2word[sampled_word_index]

        if sampled_word != '<EOS> ':
            decoded_translation += sampled_word
        if sampled_word == '<EOS> ' or len(decoded_translation.split()) > MAX_LEN+1:
            stop_condition = True

        empty_target_seq = np.zeros( ( 1 , 1 ) )  
        empty_target_seq[ 0 , 0 ] = sampled_word_index

        stat = [h, c]
    y_pred.append(decoded_translation)

In [None]:
y_pred

In [None]:
print(classification_report(test_encode_tags, y_pred))