In [1]:
# https://towardsdatascience.com/named-entity-recognition-ner-using-keras-bidirectional-lstm-28cd3f301f54

In [68]:
import numpy as np
import pandas as pd
from collections import Counter


In [69]:
data = pd.read_csv("../Dataset/bert/trainval.csv")
data.rename(columns={"sentence_id":"Sentence #", "words":"Word","labels":"Tag"},inplace=True)
data.head()

Unnamed: 0,Sentence #,Word,Tag
0,1382601382042103808,Hidup,O
1,1382601382042103808,sesedih,O
2,1382601382042103808,dan,O
3,1382601382042103808,secaper,O
4,1382601382042103808,apa,O


In [None]:
data = pd.read_csv("../Dataset/bert/trainval.csv")
data.head()



In [70]:
from itertools import chain
def get_dict_map(data, token_or_tag):
    tok2idx = {}
    idx2tok = {}
    
    if token_or_tag == 'token':
        vocab = list(set(data['Word'].to_list()))
    else:
        vocab = list(set(data['Tag'].to_list()))
    
    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok


token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')

In [71]:
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)
data.head()

Unnamed: 0,Sentence #,Word,Tag,Word_idx,Tag_idx
0,1382601382042103808,Hidup,O,19390,11
1,1382601382042103808,sesedih,O,8977,11
2,1382601382042103808,dan,O,26426,11
3,1382601382042103808,secaper,O,4147,11
4,1382601382042103808,apa,O,16209,11


In [72]:
# Fill na
data_fillna = data.fillna(method='ffill', axis=0)
# Groupby and collect columns
data_group = data_fillna.groupby(
['Sentence #'],as_index=False
)['Word', 'Tag', 'Word_idx', 'Tag_idx'].agg(lambda x: list(x))
# Visualise data
data_group.head()

  


Unnamed: 0,Sentence #,Word,Tag,Word_idx,Tag_idx
0,1380681275594305537,"[@ridwankamil, @KickAndyShow, @Metro_TV, Upaya...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-P...","[8764, 1510, 2757, 7242, 20728, 2221, 9544, 14...","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1..."
1,1380685355897085952,"[@ShopeeID, Bismillah, yok, menang, ����, SHOP...","[O, O, O, O, O, B-PROD, O, O, O, O]","[3400, 26574, 2023, 26668, 27226, 9218, 3400, ...","[11, 11, 11, 11, 11, 4, 11, 11, 11, 11]"
2,1380686115007393792,"[udah, 2, mingguan, tinggal, bareng, teteh, ,,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-LOC,...","[11744, 5170, 24871, 17882, 11859, 28607, 1932...","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1..."
3,1380688640611786754,"[@ShopeeID, Potong, buah, semangka, pakai, ala...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-ORG,...","[3400, 28642, 17480, 27824, 19729, 14422, 2665...","[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1..."
4,1380688670282289153,"[RT, @kukluxcats, :, Selevel, gubernur, Jatim,...","[O, O, O, O, B-PER, I-PER, O, O, O, O, O, O, O...","[3126, 5039, 10579, 17351, 20935, 3070, 14484,...","[11, 11, 11, 11, 10, 0, 11, 11, 11, 11, 11, 11..."


In [73]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [74]:
def get_pad_train_test_val(data_group, data):

    #get max token and tag length
    n_token = len(list(set(data['Word'].to_list())))
    n_tag = len(list(set(data['Tag'].to_list())))

    #Pad tokens (X var)    
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= n_token - 1)

    #Pad Tags (y var) and convert it into one hot encoding
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value= tag2idx["O"])
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]
    
    #Split train, test and validation set
    tokens_, test_tokens, tags_, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(tokens_,tags_,test_size = 0.25,train_size =0.75, random_state=2020)

    print(
        'train_tokens length:', len(train_tokens),
        '\ntrain_tokens length:', len(train_tokens),
        '\ntest_tokens length:', len(test_tokens),
        '\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(val_tokens),
        '\nval_tags:', len(val_tags),
    )
    
    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags

train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags = get_pad_train_test_val(data_group, data)

train_tokens length: 4009 
train_tokens length: 4009 
test_tokens length: 594 
test_tags: 594 
val_tokens: 1337 
val_tags: 1337


In [75]:
test_tokens

array([[ 8865, 25863, 11574, ..., 30204, 30204, 30204],
       [ 5450, 29088, 10885, ..., 30204, 30204, 30204],
       [13685, 18057,  1681, ..., 30204, 30204, 30204],
       ...,
       [26609, 20871, 29804, ..., 30204, 30204, 30204],
       [23294, 15055, 22047, ..., 30204, 30204, 30204],
       [20522, 13284, 26426, ..., 30204, 30204, 30204]])

In [76]:
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model

In [77]:
from numpy.random import seed
seed(1)
tensorflow.random.set_seed(2)

In [78]:
input_dim = len(list(set(data['Word'].to_list())))+1
output_dim = 64
input_length = max([len(s) for s in data_group['Word_idx'].tolist()])
n_tags = len(tag2idx)
print('input_dim: ', input_dim, '\noutput_dim: ', output_dim, '\ninput_length: ', input_length, '\nn_tags: ', n_tags)

input_dim:  30206 
output_dim:  64 
input_length:  136 
n_tags:  13


In [79]:
def get_bilstm_lstm_model():
    model = Sequential()

    # Add Embedding layer
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))

    # Add bidirectional LSTM
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))

    # Add LSTM
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))

    # Add timeDistributed Layer
    model.add(TimeDistributed(Dense(n_tags, activation="relu")))

    #Optimiser 
    # adam = k.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999)

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    
    return model

In [80]:

def train_model(X, y, model):
    loss = list()
    for i in range(5):
        # fit model for one epoch on this sequence
        hist = model.fit(X, y, batch_size=1000, verbose=1, epochs=1, validation_split=0.2)
        loss.append(hist.history['loss'][0])
    return loss

In [81]:
results = pd.DataFrame()
model_bilstm_lstm = get_bilstm_lstm_model()
plot_model(model_bilstm_lstm)
results['with_add_lstm'] = train_model(train_tokens, np.array(train_tags), model_bilstm_lstm)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 136, 64)           1933184   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 136, 128)          66048     
_________________________________________________________________
lstm_9 (LSTM)                (None, 136, 64)           49408     
_________________________________________________________________
time_distributed_4 (TimeDist (None, 136, 13)           845       
Total params: 2,049,485
Trainable params: 2,049,485
Non-trainable params: 0
_________________________________________________________________
('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


In [51]:
results

Unnamed: 0,with_add_lstm
0,
1,
2,
3,
4,
