In [1]:
import tensorflow as tf
import numpy as np
import unicodedata
import re
import pandas as pd

In [2]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Conv1D, Input
from tensorflow.keras.layers import Bidirectional, concatenate, SpatialDropout1D, GlobalMaxPooling1D

from tensorflow.keras.utils import to_categorical



In [3]:
# !pip install sklearn-crfsuite
# !pip install seqeval
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn.metrics import f1_score
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report


In [4]:
# from tensorflow_addons.layers.crf import CRF

# import tensorflow as tf
# from keras_contrib.layers import CRF
# %load_ext autoreload
# %autoreload 2
# from crf import CRF
# !pip install tf2crf
# from tf2CRF import CRF

# from tf2crf import CRF, ModelWithCRFLoss
# from keras_contrib.metrics import crf_accuracy

import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
from tensorflow_addons.text import crf_log_likelihood, crf_decode


class CRF(L.Layer):
    def __init__(self,
                 output_dim,
                 sparse_target=True,
                 **kwargs):
        """    
        Args:
            output_dim (int): the number of labels to tag each temporal input.
            sparse_target (bool): whether the the ground-truth label represented in one-hot.
        Input shape:
            (batch_size, sentence length, output_dim)
        Output shape:
            (batch_size, sentence length, output_dim)
        """
        super(CRF, self).__init__(**kwargs)
        self.output_dim = int(output_dim) 
        self.sparse_target = sparse_target
        self.input_spec = L.InputSpec(min_ndim=3)
        self.supports_masking = False
        self.sequence_lengths = None
        self.transitions = None

    def build(self, input_shape):
        assert len(input_shape) == 3
        f_shape = tf.TensorShape(input_shape)
        input_spec = L.InputSpec(min_ndim=3, axes={-1: f_shape[-1]})

        if f_shape[-1] is None:
            raise ValueError('The last dimension of the inputs to `CRF` '
                             'should be defined. Found `None`.')
        if f_shape[-1] != self.output_dim:
            raise ValueError('The last dimension of the input shape must be equal to output'
                             ' shape. Use a linear layer if needed.')
        self.input_spec = input_spec
        self.transitions = self.add_weight(name='transitions',
                                           shape=[self.output_dim, self.output_dim],
                                           initializer='glorot_uniform',
                                           trainable=True)
        self.built = True

    def compute_mask(self, inputs, mask=None):
        # Just pass the received mask from previous layer, to the next layer or
        # manipulate it if this layer changes the shape of the input
        return mask

    def call(self, inputs, sequence_lengths=None, training=None, **kwargs):
        sequences = tf.convert_to_tensor(inputs, dtype=self.dtype)
        if sequence_lengths is not None:
            assert len(sequence_lengths.shape) == 2
            assert tf.convert_to_tensor(sequence_lengths).dtype == 'int32'
            seq_len_shape = tf.convert_to_tensor(sequence_lengths).get_shape().as_list()
            assert seq_len_shape[1] == 1
            self.sequence_lengths = K.flatten(sequence_lengths)
        else:
            self.sequence_lengths = tf.ones(tf.shape(inputs)[0], dtype=tf.int32) * (
                tf.shape(inputs)[1]
            )

        viterbi_sequence, _ = crf_decode(sequences,
                                         self.transitions,
                                         self.sequence_lengths)
        output = K.one_hot(viterbi_sequence, self.output_dim)
        return K.in_train_phase(sequences, output)

    @property
    def loss(self):
        def crf_loss(y_true, y_pred):
            y_pred = tf.convert_to_tensor(y_pred, dtype=self.dtype)
            log_likelihood, self.transitions = crf_log_likelihood(
                y_pred,
                tf.cast(K.argmax(y_true), dtype=tf.int32) if self.sparse_target else y_true,
                self.sequence_lengths,
                transition_params=self.transitions,
            )
            return tf.reduce_mean(-log_likelihood)
        return crf_loss

    @property
    def accuracy(self):
        def viterbi_accuracy(y_true, y_pred):
            # -1e10 to avoid zero at sum(mask)
            mask = K.cast(
                K.all(K.greater(y_pred, -1e10), axis=2), K.floatx())
            shape = tf.shape(y_pred)
            sequence_lengths = tf.ones(shape[0], dtype=tf.int32) * (shape[1])
            y_pred, _ = crf_decode(y_pred, self.transitions, sequence_lengths)
            if self.sparse_target:
                y_true = K.argmax(y_true, 2)
            y_pred = K.cast(y_pred, 'int32')
            y_true = K.cast(y_true, 'int32')
            corrects = K.cast(K.equal(y_true, y_pred), K.floatx())
            return K.sum(corrects * mask) / K.sum(mask)
        return viterbi_accuracy

    def compute_output_shape(self, input_shape):
        tf.TensorShape(input_shape).assert_has_rank(3)
        return input_shape[:2] + (self.output_dim,)

    def get_config(self):
        config = {
            'output_dim': self.output_dim,
            'sparse_target': self.sparse_target,
            'supports_masking': self.supports_masking,
            'transitions': K.eval(self.transitions)
        }
        base_config = super(CRF, self).get_config()
        return dict(base_config, **config)

# Pretrained Bert try

In [5]:
import tensorflow
from transformers import BertTokenizer, AutoModel
from transformers import TFAutoModel
from transformers import TFBertModel
from transformers import AutoTokenizer,AutoModelForTokenClassification,TFAutoModelForTokenClassification
# tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-large-p2")
# tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-lite-large-p2")
# model = AutoModel.from_pretrained("indobenchmark/indobert-large-p2")
# TFPreTrainedModel.from_pretrained("indobenchmark/indobert-large-p2")

In [6]:
# from transformers import file_utils
# print(file_utils.default_cache_path)

In [9]:
# tokenizer = AutoTokenizer.from_pretrained("cahya/xlm-roberta-large-indonesian-NER")
# model = AutoModelForTokenClassification.from_pretrained('cahya/xlm-roberta-large-indonesian-NER')
# model

In [6]:
# model = TFBertModel.from_pretrained("indobenchmark/indobert-large-p2")
# model

In [9]:
import transformers
transformers.__version__

'2.9.0'

In [7]:
# import tensorflow as tf
# import torch
# word = 'karawaci baru, kakap raya 156 rt 1 rw 3 karawaci wongosari'
# x = tokenizer.encode(word) 
# print(x)

In [8]:
# model.predict(x)[0].shape

In [9]:
# a= tokenizer.convert_ids_to_tokens(tokenizer.encode(word))
# print(a)

In [10]:
# tokenizer.convert_tokens_to_ids('[PAD]')

# add tokenized label

In [11]:
import string
if '###asd' in string.punctuation:
    print('yeah')

In [12]:
word = 'karawaci baru, kakap raya 156 rt 1 rw 3 karawaci wongosari'
x = tokenizer.encode(word)


def fix_label(list1,label,raw):
    label_fix = []
    enumerate_s = 0
    enumerate_label = 0
    for x in list1:
        if (('[' in x) or (x in string.punctuation) or ('#' in x)) and (x not in ['[CLS]','[PAD]','[SEP]']):
#             print('wo'+ x)
            if 'B' in label_fix[-1]:
                label_fix.append(label_fix[-1].replace('B','I'))
            else:
                label_fix.append(label_fix[-1])
        elif x in ['[CLS]','[SEP]','[UNK]','[PAD]']:
            label_fix.append('O')
        else:
            try:
                label_fix.append(label[enumerate_label])
            except Exception as e:
                raise Exception('An error occurred')
                
        if x not in ['[CLS]','[SEP]','[UNK]','[PAD]']:
            enumerate_s+=len(x.strip().replace('#',''))
#             print(x,enumerate_s,len(raw.split(' ')[enumerate_label]))
            if enumerate_s==len(raw.split(' ')[enumerate_label]):
                enumerate_s = 0
                enumerate_label += 1
        
    return label_fix

label = ['B-street','I-street','O','B-street','O','O','O','O','O','O','O','O']
print(word.split(' '))
print(label)
# print(a)
print(fix_label([tokenizer.convert_ids_to_tokens(x1) for x1 in x],label,word))

['karawaci', 'baru,', 'kakap', 'raya', '156', 'rt', '1', 'rw', '3', 'karawaci', 'wongosari']
['B-street', 'I-street', 'O', 'B-street', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'B-street', 'I-street', 'I-street', 'O', 'B-street', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [13]:
word = 'kampung.gudang areng,desa:anyer, kecamatan:anyar, kabupaten: serang, belakang bca anyar'
word_token = ['[CLS]', 'kampung', '.', 'gudang', 'aren', '##g', ',', 'desa', ':', 'anyer', ',', 'kecamatan', ':', 'anyar', ',', 'kabupaten', ':', 'serang', ',', 'belakang', 'bca', 'anyar', '[SEP]']
label = ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
print(fix_label(word_token,label,word))
print(len(fix_label(word_token,label,word)),len(word_token))

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
23 23


In [14]:
word = 'kampung. gudang areng,desa:anyer, kecamatan:anyar, kabupaten: serang, belakang bca anyar'
# word_token = ['[CLS]', 'kampung', '.', 'gudang', 'aren', '##g', ',', 'desa', ':', 'anyer', ',', 'kecamatan', ':', 'anyar', ',', 'kabupaten', ':', 'serang', ',', 'belakang', 'bca', 'anyar', '[SEP]']
word_token = tokenizer.convert_ids_to_tokens(tokenizer.encode(word))
label = ['B-street', 'I-street', 'B-street', 'O', 'O', 'O', 'O', 'O','O']
print(word_token)
print(fix_label(word_token,label,word))
print(len(fix_label(word_token,label,word)),len(word_token))

['[CLS]', 'kampung', '.', 'gudang', 'aren', '##g', ',', 'desa', ':', 'anyer', ',', 'kecamatan', ':', 'anyar', ',', 'kabupaten', ':', 'serang', ',', 'belakang', 'bca', 'anyar', '[SEP]']
['O', 'B-street', 'I-street', 'I-street', 'B-street', 'I-street', 'I-street', 'B-street', 'I-street', 'B-street', 'I-street', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
23 23


# data

In [15]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau


In [16]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,3,"toko dita, kertosono",toko dita/
4,4,jl. orde baru,/jl. orde baru


In [17]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,id,raw_address
0,0,s. par 53 sidanegara 4 cilacap tengah
1,1,"angg per, baloi indah kel. lubuk baja"
2,2,"asma laun, mand imog,"
3,3,"ud agung rej, raya nga sri wedari karanganyar"
4,4,"cut mutia, 35 baiturrahman"


# Street

## data

In [10]:
data_train_label = pd.read_csv('data_train_label.csv')
data_train_label.head()

Unnamed: 0,raw,street,raw_split,street_split,label
0,jl kapuk timur delta sili iii lippo cika 11 a ...,jl kapuk timur delta sili iii lippo cika,"['jl', 'kapuk', 'timur', 'delta', 'sili', 'iii...","['jl', 'kapuk', 'timur', 'delta', 'sili', 'iii...","['B-street', 'I-street', 'I-street', 'I-street..."
1,"aye, jati sampurna",,"['aye,', 'jati', 'sampurna']",[''],"['O', 'O', 'O']"
2,setu siung 119 rt 5 1 13880 cipayung,siung,"['setu', 'siung', '119', 'rt', '5', '1', '1388...",['siung'],"['O', 'B-street', 'O', 'O', 'O', 'O', 'O', 'O']"
3,"toko dita, kertosono",,"['toko', 'dita,', 'kertosono']",[''],"['O', 'O', 'O']"
4,jl. orde baru,jl. orde baru,"['jl.', 'orde', 'baru']","['jl.', 'orde', 'baru']","['B-street', 'I-street', 'I-street']"


In [11]:
import ast
# list_y = data_train_label['label'].apply(lambda x: ast.literal_eval(x)).values

data_train_label['label'] = data_train_label['label'].apply(lambda x: ast.literal_eval(x)).values
data_train_label['raw_split'] = data_train_label['raw'].apply(lambda x: [x1 for x1 in x.split(' ') if x1!='']).values
list_x = data_train_label['raw_split'].values

In [13]:
data_train_label[data_train_label['raw_split'].apply(lambda x: x[0]).isin(['kampung.gudang'])]

Unnamed: 0,raw,street,raw_split,street_split,label
15,"kampung.gudang areng,desa:anyer, kecamatan:any...",,"[kampung.gudang, areng,desa:anyer,, kecamatan:...",[''],"[O, O, O, O, O, O, O, O]"


In [20]:
data_train_label['street_split'] = data_train_label['street_split'].apply(lambda x: ast.literal_eval(x)).values

In [21]:
X = [tokenizer.encode(x1) for x1 in data_train_label['raw'].values.tolist()]
print(len(X))

300000


In [22]:
print(X[0])
len(data_train_label)

[2, 1637, 1012, 70, 1276, 16691, 18946, 4100, 20744, 6484, 29832, 1113, 253, 8019, 88, 10320, 1417, 3]


300000

In [23]:
data_train_label.head()

Unnamed: 0,raw,street,raw_split,street_split,label
0,jl kapuk timur delta sili iii lippo cika 11 a ...,jl kapuk timur delta sili iii lippo cika,"[jl, kapuk, timur, delta, sili, iii, lippo, ci...","[jl, kapuk, timur, delta, sili, iii, lippo, cika]","[B-street, I-street, I-street, I-street, I-str..."
1,"aye, jati sampurna",,"[aye,, jati, sampurna]",[],"[O, O, O]"
2,setu siung 119 rt 5 1 13880 cipayung,siung,"[setu, siung, 119, rt, 5, 1, 13880, cipayung]",[siung],"[O, B-street, O, O, O, O, O, O]"
3,"toko dita, kertosono",,"[toko, dita,, kertosono]",[],"[O, O, O]"
4,jl. orde baru,jl. orde baru,"[jl., orde, baru]","[jl., orde, baru]","[B-street, I-street, I-street]"


In [24]:
import copy
data_train_label['raw_token'] = copy.deepcopy(X)

In [25]:
data_train_label['raw_token_word'] = [[tokenizer.convert_ids_to_tokens(x12) for x12 in x1] for x1 in data_train_label['raw_token'].values.tolist()]

In [26]:
import copy
X_backup = copy.deepcopy(X)

In [27]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [28]:
data_train_label['label_fix_token'] = data_train_label[['raw','raw_token_word','label']].apply(lambda x: fix_label(x.raw_token_word,x.label,x.raw),axis=1)
data_train_label.head()

Unnamed: 0,raw,street,raw_split,street_split,label,raw_token,raw_token_word,label_fix_token
0,jl kapuk timur delta sili iii lippo cika 11 a ...,jl kapuk timur delta sili iii lippo cika,"[jl, kapuk, timur, delta, sili, iii, lippo, ci...","[jl, kapuk, timur, delta, sili, iii, lippo, cika]","[B-street, I-street, I-street, I-street, I-str...","[2, 1637, 1012, 70, 1276, 16691, 18946, 4100, ...","[[CLS], jl, kap, ##uk, timur, delta, sili, iii...","[O, B-street, I-street, I-street, I-street, I-..."
1,"aye, jati sampurna",,"[aye,, jati, sampurna]",[],"[O, O, O]","[2, 675, 29835, 29946, 4868, 372, 2951, 3]","[[CLS], ay, ##e, ,, jati, samp, ##urna, [SEP]]","[O, O, O, O, O, O, O, O]"
2,setu siung 119 rt 5 1 13880 cipayung,siung,"[setu, siung, 119, rt, 5, 1, 13880, cipayung]",[siung],"[O, B-street, O, O, O, O, O, O]","[2, 332, 29838, 27505, 17689, 4345, 418, 111, ...","[[CLS], set, ##u, siung, 119, rt, 5, 1, 138, #...","[O, O, O, B-street, O, O, O, O, O, O, O, O, O]"
3,"toko dita, kertosono",,"[toko, dita,, kertosono]",[],"[O, O, O]","[2, 1605, 18154, 29946, 21309, 7551, 1867, 3]","[[CLS], toko, dita, ,, kert, ##oso, ##no, [SEP]]","[O, O, O, O, O, O, O, O]"
4,jl. orde baru,jl. orde baru,"[jl., orde, baru]","[jl., orde, baru]","[B-street, I-street, I-street]","[2, 1637, 29948, 9057, 440, 3]","[[CLS], jl, ., orde, baru, [SEP]]","[O, B-street, I-street, I-street, I-street, O]"


In [29]:
def check_token_label(df2):
    df= df2.copy()
    df['check'] = df[['raw_token_word','label_fix_token']].apply(lambda x: len(x.raw_token_word)==len(x.label_fix_token),axis=1)
    print(df[df['check']==False])
check_token_label(data_train_label)   

Empty DataFrame
Columns: [raw, street, raw_split, street_split, label, raw_token, raw_token_word, label_fix_token, check]
Index: []


In [30]:
# data_train_label['label_join'] = [' '.join(x) for x in list_y]


list_y_token = data_train_label['label_fix_token'].values.tolist()
data_train_label['label_join_token'] = [' '.join(x) for x in list_y_token]

In [31]:
import random
data_train_label[['raw_split','label','raw_token_word','label_fix_token']].values[random.randint(0,len(data_train_label))]

array([list(['mayor', 'madm', 'hasi', '42', 'rt', '4', '4', 'margahayu', 'bekasi', 'timur']),
       list(['B-street', 'I-street', 'I-street', 'O', 'O', 'O', 'O', 'O', 'O', 'O']),
       list(['[CLS]', 'mayor', 'mad', '##m', 'has', '##i', '42', 'rt', '4', '4', 'marga', '##ha', '##yu', 'bekasi', 'timur', '[SEP]']),
       list(['O', 'B-street', 'I-street', 'I-street', 'I-street', 'I-street', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'])],
      dtype=object)

In [32]:
# raw_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')  # the filters ='' so that keras doesnot remove any punctuation in our data
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='',lower=False)

In [33]:
# raw data train and test concat so that there is no word that does not have index
target_data = data_train_label['label_join_token'].values
target_tokenizer.fit_on_texts(target_data)



In [34]:
tag2idx = {}
for key in target_tokenizer.word_index.keys():
    tag2idx[key] = target_tokenizer.word_index[key]

idx2tag = {}
for key in target_tokenizer.index_word.keys():
    idx2tag[key] = target_tokenizer.index_word[key]
idx2tag

tag2idx['PAD'] = 0
idx2tag[0]='PAD'

In [35]:
# Add 0 padding so all data has the same length
X_pad = tf.keras.preprocessing.sequence.pad_sequences(X,padding='post',value=tokenizer.convert_tokens_to_ids('[PAD]'))
print(X_pad[:3])

[[    2  1637  1012    70  1276 16691 18946  4100 20744  6484 29832  1113
    253  8019    88 10320  1417     3     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0]
 [    2   675 29835 29946  4868   372  2951     3     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0]
 [    2   332 29838 27505 17689  4345   418   111 20092  3193  5908 21562
      3     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0]]


In [36]:
len(X_pad[0])

47

In [37]:
data_target_in = [[tag2idx[w] for w in s] for s in list_y_token]

y_pad = tf.keras.preprocessing.sequence.pad_sequences(data_target_in,padding='post',value=tag2idx['PAD'])
print(y_pad[:3])


[[1 3 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 3 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0]]


In [38]:
len(data_target_in[0])

18

In [39]:
len(tag2idx)

4

In [40]:
n_tags = len(tag2idx)
# n_tags
y_pad = [to_categorical(i, num_classes=n_tags) for i in y_pad]
y_pad[:2]

[array([[0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1.,

In [41]:
# data_fix_train = pd.DataFrame({'X':X_pad.tolist(),'y':y_pad})
# data_fix_train.head()

In [42]:
# data_fix_train.to_csv('data_fix_train_street.csv',index=False)

In [43]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_pad, y_pad, test_size=0.15,shuffle=True,random_state=0)

In [44]:
X_train.shape

(255000, 47)

## train

In [45]:
# !pip install numba 

In [46]:
# from numba import cuda 
# device = cuda.get_current_device()
# device.reset()

In [47]:
target_vocab_size = len(idx2tag)
max_len = X_train.shape[1]

In [48]:
def build_model():
    input_ = Input(shape=(max_len,),dtype=tf.int32)
    bert = TFBertModel.from_pretrained("indobenchmark/indobert-lite-large-p2")
    model = bert(input_)[0]

    model = Bidirectional(LSTM(units=150, return_sequences=True, recurrent_dropout=0.1))(model)
    model = TimeDistributed(Dense(50, activation="relu"))(model)
    model = Dense(n_tags)(model)
    crf = CRF(n_tags)
    out = crf(model)
    model = Model(input_, out)
#     print(model.summary())
#     for layer in model.get_layer('tf_bert_model').layers:
#         layer.trainable=False
#     for w in model.get_layer('tf_bert_model').weights:
#         w._trainable=False

    opt = tf.keras.optimizers.Adam(lr=0.01, decay=1e-6)
    model.compile(optimizer="rmsprop", loss= crf.loss, metrics=[crf.accuracy])

    print(model.summary())
    return model

In [49]:
model = build_model()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 47)]              0         
_________________________________________________________________
tf_bert_model (TFBertModel)  ((None, 47, 1024), (None, 334607360 
_________________________________________________________________
bidirectional (Bidirectional (None, 47, 300)           1410000   
_________________________________________________________________
time_distributed (TimeDistri (None, 47, 50)            15050     
_________________________________________________________________
dense_1 (Dense)              (None, 47, 4)             204       
_________________________________________________________________
crf (CRF)                    (None, 47, 4)             16        
Total params: 336,032,630
Trainable params: 336,032,630
Non-trainable params: 0
_______________________________________________

In [50]:
earlyStopping = EarlyStopping(monitor='val_viterbi_accuracy', patience=3, verbose=0, mode='min')
mcp_save = ModelCheckpoint('best_street.h5', save_best_only=True,save_weights_only=True, monitor='val_viterbi_accuracy', mode='min')
reduce_lr_loss = ReduceLROnPlateau(monitor='val_viterbi_accuracy', factor=0.1, patience=2, verbose=1, epsilon=1e-4, mode='min')


history = model.fit(X_train, np.array(y_train), 
                    batch_size=64, 
                    epochs=50, 
                    validation_split=0.15,
                    callbacks=[earlyStopping,mcp_save,reduce_lr_loss],
                    verbose=1)

Train on 216750 samples, validate on 38250 samples
Epoch 1/50


ResourceExhaustedError: 2 root error(s) found.
  (0) Resource exhausted:  OOM when allocating tensor with shape[64,47,4096] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node model/tf_bert_model/bert/encoder/layer_._2/intermediate/activation/truediv (defined at C:\Users\BIGDATA02\AppData\Roaming\Python\Python37\site-packages\transformers\modeling_tf_bert.py:64) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[metrics/viterbi_accuracy/mul/_1048]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

  (1) Resource exhausted:  OOM when allocating tensor with shape[64,47,4096] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node model/tf_bert_model/bert/encoder/layer_._2/intermediate/activation/truediv (defined at C:\Users\BIGDATA02\AppData\Roaming\Python\Python37\site-packages\transformers\modeling_tf_bert.py:64) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

0 successful operations.
0 derived errors ignored. [Op:__inference_distributed_function_75176]

Errors may have originated from an input operation.
Input Source operations connected to node model/tf_bert_model/bert/encoder/layer_._2/intermediate/activation/truediv:
 model/tf_bert_model/bert/encoder/layer_._2/intermediate/dense/BiasAdd (defined at C:\Users\BIGDATA02\AppData\Roaming\Python\Python37\site-packages\transformers\modeling_tf_bert.py:320)

Input Source operations connected to node model/tf_bert_model/bert/encoder/layer_._2/intermediate/activation/truediv:
 model/tf_bert_model/bert/encoder/layer_._2/intermediate/dense/BiasAdd (defined at C:\Users\BIGDATA02\AppData\Roaming\Python\Python37\site-packages\transformers\modeling_tf_bert.py:320)

Function call stack:
distributed_function -> distributed_function


In [96]:
# import matplotlib.pyplot as plt

# def plot_graphs(history, string):
#     plt.plot(history.history[string])
#     plt.plot(history.history['val_'+string])
#     plt.xlabel("Epochs")
#     plt.ylabel(string)
#     plt.legend([string, 'val_'+string])
#     plt.show()
  

# plot_graphs(history, "accuracy")
# plot_graphs(history, "loss")

In [56]:
import random
model

<tensorflow.python.keras.engine.functional.Functional at 0x2342e9e2d90>

In [59]:
i = random.randint(0,len(X_test))
# i = 42087
p = model.predict(np.array([X_test[i]]))

p = np.argmax(p, axis=-1)


print("{:15} {:5} {:15}".format("Word",  "Pred", 'GT'))
for w, pred, gt in zip(X_test[i], p[0], list(np.argmax(y_test[i],axis=-1))):
    if w==0:
        continue
    print("{:15}: {:5} {:15} ".format(raw_tokenizer.index_word[w], idx2tag[pred], idx2tag[gt]))


Word            Pred  GT             
raya           : B-street B-street        
banj           : I-street I-street        
no             : O     O               
496            : O     O               
photo          : O     O               
copy           : O     O               
laris,         : O     O               
suka           : O     O               
sari           : O     O               


In [62]:
i

6602

In [76]:
# model.save('street_20200316.h5')
model.save_weights('street_20200316.h5')

In [78]:
word = 'wig ten iv, gununganyartambak kel. gununganyar'
word_idx = [[word2idx[w] for w in s] for s in [word.split(' ')]]
print(word_idx)
model_street = build_model(raw_vocab_size,target_vocab_size,max_len)
model_street.load_weights('street_20200316.h5')

# model_street = tf.keras.models.load_model('street_20200316.h5',custom_objects={'CRF':CRF(n_tags)})
model_street

[[5763, 175, 212, 12766, 13, 1316]]
Model: "functional_19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        [(None, 32)]              0         
_________________________________________________________________
embedding_9 (Embedding)      (None, 32, 50)            6612200   
_________________________________________________________________
bidirectional_9 (Bidirection (None, 32, 200)           120800    
_________________________________________________________________
time_distributed_9 (TimeDist (None, 32, 50)            10050     
_________________________________________________________________
dense_19 (Dense)             (None, 32, 4)             204       
_________________________________________________________________
crf_10 (CRF)                 (None, 32, 4)             16        
Total params: 6,743,270
Trainable params: 6,743,270
Non-trainable params: 0
_______

<tensorflow.python.keras.engine.functional.Functional at 0x235a33687f0>

In [79]:
# i = random.randint(0,len(X_test))
# i = 42087
p = model_street.predict(np.array([X_test[i]]))

p = np.argmax(p, axis=-1)


print("{:15} {:5} {:15}".format("Word",  "Pred", 'GT'))
for w, pred, gt in zip(X_test[i], p[0], list(np.argmax(y_test[i],axis=-1))):
    if w==0:
        continue
    print("{:15}: {:5} {:15} ".format(raw_tokenizer.index_word[w], idx2tag[pred], idx2tag[gt]))


Word            Pred  GT             
raya           : B-street B-street        
banj           : I-street I-street        
no             : O     O               
496            : O     O               
photo          : O     O               
copy           : O     O               
laris,         : O     O               
suka           : O     O               
sari           : O     O               


In [85]:
y_pred = model_street.predict(X_test)
y_pred = np.argmax(y_pred, axis=-1)
y_test_true = np.argmax(y_test, -1)

y_pred = [[idx2tag[i] for i in row] for row in y_pred]
y_test_true = [[idx2tag[i] for i in row] for row in y_test_true] 


In [86]:
print("F1-score is : {:.1%}".format(f1_score(y_test_true, y_pred)))
report = flat_classification_report(y_pred=y_pred, y_true=y_test_true)
print(report)



F1-score is : 91.2%




              precision    recall  f1-score   support

    B-street       0.81      0.83      0.82     29891
    I-street       0.82      0.85      0.83     40820
           O       0.95      0.95      0.95    236144
         PAD       1.00      1.00      1.00   1133145

    accuracy                           0.98   1440000
   macro avg       0.90      0.91      0.90   1440000
weighted avg       0.98      0.98      0.98   1440000

