## 1.Import Library

In [32]:
import pandas as pd
import numpy as np


# Modeling
import tensorflow as tf
from keras.utils import to_categorical
from keras.layers import LSTM, Dense, TimeDistributed, Embedding, Bidirectional, Dropout, SpatialDropout1D
from keras.models import Sequential, Model
from keras.callbacks import ModelCheckpoint
from keras_contrib.layers import CRF

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, make_scorer
from seqeval.metrics import precision_score, recall_score, f1_score as seqeval_f1_score, classification_report as seqeval_classification_report

from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences


## 2.1 Built model

### 2.1.1.Explore data analysis

In [33]:
df = pd.read_csv('../data/paper_data/train.csv', sep = '\t',header=None, names=["Word", "Pos", "Chunk", "Tag"])

> First 5 lines

In [34]:
print(df.head(20))

     Word Pos Chunk Tag
0     Mâu   _     _   O
1   thuẫn   _     _   O
2    tình   _     _   O
3     cảm   _     _   O
4       ,   _     _   O
5   thanh   _     _   O
6    niên   _     _   O
7     tạt   _     _   O
8    xăng   _     _   O
9     đốt   _     _   O
10    cửa   _     _   O
11   hàng   _     _   O
12    bạn   _     _   O
13    gái   _     _   O
14    Xảy   _     _   O
15     ra   _     _   O
16    cãi   _     _   O
17     vã   _     _   O
18      ,   _     _   O
19  trong   _     _   O


In [35]:
df['Word'] = df['Word'].astype(str)
df

Unnamed: 0,Word,Pos,Chunk,Tag
0,Mâu,_,_,O
1,thuẫn,_,_,O
2,tình,_,_,O
3,cảm,_,_,O
4,",",_,_,O
...,...,...,...,...
479248,Cao,_,_,B-PERSON
479249,Lực,_,_,I-PERSON
479250,(,_,_,O
479251,Theo,_,_,O


> Identify a sentence

In [36]:
sentence_list = []
index = 2
sentence_list.insert(0, "Sentence 1")
for i in range(1, len(df['Word'])+1):
    if df['Word'][i-1] == ".":
        sentence_list.append("Sentence " + str(index))
        index = index + 1
    else:
        sentence_list.append(" ")

sentence_list[0] = "Sentence 1"


In [37]:
df['Sentence'] = pd.Series(sentence_list)


In [38]:
df['Sentence'] = df['Sentence'].replace(' ', np.nan).ffill()

In [39]:
print(df.head(50))

         Word Pos Chunk             Tag    Sentence
0         Mâu   _     _               O  Sentence 1
1       thuẫn   _     _               O  Sentence 1
2        tình   _     _               O  Sentence 1
3         cảm   _     _               O  Sentence 1
4           ,   _     _               O  Sentence 1
5       thanh   _     _               O  Sentence 1
6        niên   _     _               O  Sentence 1
7         tạt   _     _               O  Sentence 1
8        xăng   _     _               O  Sentence 1
9         đốt   _     _               O  Sentence 1
10        cửa   _     _               O  Sentence 1
11       hàng   _     _               O  Sentence 1
12        bạn   _     _               O  Sentence 1
13        gái   _     _               O  Sentence 1
14        Xảy   _     _               O  Sentence 1
15         ra   _     _               O  Sentence 1
16        cãi   _     _               O  Sentence 1
17         vã   _     _               O  Sentence 1
18          

In [40]:
df1 = df[['Sentence', 'Word', 'Tag']]
df1

Unnamed: 0,Sentence,Word,Tag
0,Sentence 1,Mâu,O
1,Sentence 1,thuẫn,O
2,Sentence 1,tình,O
3,Sentence 1,cảm,O
4,Sentence 1,",",O
...,...,...,...
479248,Sentence 13014,Cao,B-PERSON
479249,Sentence 13014,Lực,I-PERSON
479250,Sentence 13014,(,O
479251,Sentence 13014,Theo,O


> Count tag

In [41]:
list_tags = list(set(df1["Tag"].values))
len_tags = len(list_tags)
print("List of tags: " + ', '.join([tag for tag in list_tags]))
print(f"Total Number of tags {len_tags}")

List of tags: I-DATETIME-SET, B-ORGANIZATION, I-DATETIME-DATE, B-DATETIME-SET, B-LOCATION-STRUC, B-DATETIME-DATERANGE, I-LOCATION-GPE, I-LOCATION-STRUC, B-ORGANIZATION-SPORTS, I-ORGANIZATION, I-LOCATION, I-DATETIME-DURATION, I-DATETIME-TIME, I-ORGANIZATION-MED, I-DATETIME-DATERANGE, I-DATETIME, I-LOCATION-GEO, I-ORGANIZATION-STOCK, B-LOCATION-GPE, B-ORGANIZATION-MED, B-DATETIME-TIME, B-ORGANIZATION-STOCK, B-LOCATION, I-ORGANIZATION-SPORTS, I-DATETIME-TIMERANGE, B-DATETIME-TIMERANGE, B-PERSON, O, I-PERSON, B-DATETIME-DURATION, B-LOCATION-GEO, B-DATETIME, B-DATETIME-DATE
Total Number of tags 33


# Encoding Data

In [42]:
class getsentence(object):
    def __init__(self, data):
        self.n_sent = 1.0
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                     s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence").apply(agg_func)
        self.sentences = [s for s in self.grouped]

In [43]:
getter = getsentence(df1)

  self.grouped = self.data.groupby("Sentence").apply(agg_func)


In [44]:
sentences = getter.sentences
#ths is how a sentence will look like.
print(sentences[10])

[('Nhiều', 'O'), ('người', 'O'), ('đồn', 'O'), ('đoán', 'O'), (',', 'O'), ('bầu', 'O'), ('Đức', 'B-PERSON'), ('sang', 'O'), ('Thái', 'B-LOCATION-GPE'), ('Lan', 'I-LOCATION-GPE'), ('để', 'O'), ('gặp', 'O'), ('gỡ', 'O'), ('Kiatisak', 'B-PERSON'), ('Mặc', 'O'), ('dù', 'O'), ('vậy', 'O'), (',', 'O'), ('người', 'O'), ('đứng', 'O'), ('đầu', 'O'), ('HAGL', 'B-ORGANIZATION-SPORTS'), ('khẳng', 'O'), ('định', 'O'), ('ông', 'O'), ('chưa', 'O'), ('có', 'O'), ('ý', 'O'), ('định', 'O'), ('mời', 'O'), ('Kiatisak', 'B-PERSON'), ('sang', 'O'), ('Việt', 'B-LOCATION-GPE'), ('Nam', 'I-LOCATION-GPE'), ('làm', 'O'), ('việc', 'O'), ('.', 'O')]


In [45]:
batch_size = 64
epochs = 50
max_len = 75
embedding = 40
hidden_size = 50

In [46]:
# Xây dựng vocab cho word và tag
words = list(df1['Word'].unique())
tags = list(df1['Tag'].unique())

# Tạo dict word to index, thêm 2 từ đặc biệt là Unknown và Padding
word2idx = {w : i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1
word2idx["PAD"] = 0

# Tạo dict tag to index, thêm 1 tag đặc biệt và Padding
tag2idx = {t : i + 1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0

# Tạo 2 dict index to word và index to tag
idx2word = {i: w for w, i in word2idx.items()}
idx2tag = {i: w for w, i in tag2idx.items()}

In [47]:
# Chuyển các câu về dạng vector of index
X = [[word2idx[w[0]] for w in s] for s in sentences]
# Padding các câu về max_len
X = pad_sequences(maxlen = max_len, sequences = X, padding = "post", value = word2idx["PAD"])
# Chuyển các tag về dạng index
y = [[tag2idx[w[1]] for w in s] for s in sentences]
# Tiền hành padding về max_len
y = pad_sequences(maxlen = max_len, sequences = y, padding = "post", value = tag2idx["PAD"])

In [48]:
#from keras.utils.np_utils import to_categorical
num_tag = df['Tag'].nunique()
y = [to_categorical(i, num_classes = num_tag + 1) for i in y]

In [49]:
#from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15)

# Build and Train Model - BiLSTM

In [50]:
input_word = Input(shape = (max_len,))
model = Embedding(input_dim = len(words)+2, output_dim = max_len, input_length = max_len)(input_word)
model = SpatialDropout1D(0.1)(model)
model = Bidirectional(LSTM(units = 100,return_sequences = True, recurrent_dropout = 0.1))(model)
out = TimeDistributed(Dense(num_tag+1,activation = 'softmax'))(model)
model = Model(input_word,out)

model.compile(optimizer = 'adam',loss = 'categorical_crossentropy',metrics = ['accuracy'])
model.summary()



In [51]:
checkpoint = ModelCheckpoint(filepath = 'model.keras',
                           verbose = 0,
                           mode = 'auto',
                           save_best_only = True,
                           monitor='val_loss')
history = model.fit(X_train, np.array(y_train), batch_size = 64, verbose = 1, epochs = 150, validation_split = 0.2, callbacks=[checkpoint])

Epoch 1/150
[1m139/139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 135ms/step - accuracy: 0.8060 - loss: 0.9996 - val_accuracy: 0.9512 - val_loss: 0.2412
Epoch 2/150
[1m124/139[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m1s[0m 125ms/step - accuracy: 0.9553 - loss: 0.2067

# Test and Report

In [None]:
# Test với toàn bộ tập test
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=-1)
y_test_true = np.argmax(y_test, -1)

# Kiểm thử F1-Score
y_pred = [[idx2tag[i] for i in row] for row in y_pred]
y_test_true = [[idx2tag[i] for i in row] for row in y_test_true]
print("F1-score is : {:.1%}".format(f1_score(y_test_true, y_pred)))

[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 58ms/step




F1-score is : 74.0%


In [None]:
idx = np.random.randint(0,X_test.shape[0])

p = model.predict(np.array([X_test[idx]]))
p = np.argmax(p, axis=-1)
true = np.argmax(y_test[idx], -1)

print("Example #{}".format(idx))

print("{:15}||{:5}||{}".format("Word", "True\t\t\t", "Pred"))
print(50 * "*")
for w, t, pred in zip(X_test[idx], true, p[0]):
    if w != 0:
        print("{:15}: {:15}\t {}".format(words[w-2], idx2tag[t], idx2tag[pred]))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
Example #751
Word           ||True			||Pred
**************************************************
Việc           : O              	 O
họ             : O              	 O
chơi           : O              	 O
không          : O              	 O
tốt            : O              	 O
trong          : O              	 O
ba             : O              	 O
trận           : O              	 O
trên           : O              	 O
sân            : O              	 O
nhà            : O              	 O
đang           : O              	 O
đặt            : O              	 O
ra             : O              	 O
những          : O              	 O
câu            : O              	 O
hỏi            : O              	 O
,              : O              	 O
nhưng          : O              	 O
không          : O              	 O
có             : O              	 O
gì             : O              	 O
phải           : O              	 O
nghi 

In [None]:
print(classification_report(y_test_true, y_pred))

  _warn_prf(average, modifier, msg_start, len(result))


                     precision    recall  f1-score   support

                 AD       1.00      1.00      1.00      1842
           DATETIME       0.31      0.27      0.29       159
      DATETIME-DATE       0.36      0.36      0.36        70
 DATETIME-DATERANGE       0.08      0.11      0.10         9
  DATETIME-DURATION       0.34      0.44      0.38        55
       DATETIME-SET       0.00      0.00      0.00         0
      DATETIME-TIME       0.20      0.21      0.21        19
 DATETIME-TIMERANGE       0.24      0.25      0.24        20
           LOCATION       0.47      0.46      0.47       526
       LOCATION-GEO       0.16      0.27      0.20        11
       LOCATION-GPE       0.49      0.58      0.53       453
     LOCATION-STRUC       0.23      0.31      0.26        39
       ORGANIZATION       0.54      0.60      0.57       637
   ORGANIZATION-MED       0.10      0.11      0.11         9
ORGANIZATION-SPORTS       0.81      0.78      0.80       134
             PERSON    