# Library

In [1]:
import tensorflow as tf
import numpy as np
import unicodedata
import re
import pandas as pd

In [2]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Conv1D, Input
from tensorflow.keras.layers import Bidirectional, concatenate, SpatialDropout1D, GlobalMaxPooling1D

from tensorflow.keras.utils import to_categorical



In [84]:
!pip install sklearn-crfsuite
!pip install seqeval
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn.metrics import f1_score
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py): started
  Building wheel for seqeval (setup.py): finished with status 'done'
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16176 sha256=ffe58a1691d727828d8cbec67a9d7ac77fb97f3be6454ca908f013c739144ab0
  Stored in directory: c:\users\andreas m\appdata\local\pip\cache\wheels\ad\5c\ba\05fa33fa5855777b7d686e843ec07452f22a66a138e290e732
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


# crf

In [33]:
# from tensorflow_addons.layers.crf import CRF

# import tensorflow as tf
# from keras_contrib.layers import CRF
# %load_ext autoreload
# %autoreload 2
# from crf import CRF
# !pip install tf2crf
# from tf2CRF import CRF

# from tf2crf import CRF, ModelWithCRFLoss
# from keras_contrib.metrics import crf_accuracy

import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
from tensorflow_addons.text import crf_log_likelihood, crf_decode


class CRF(L.Layer):
    def __init__(self,
                 output_dim,
                 sparse_target=True,
                 **kwargs):
        """    
        Args:
            output_dim (int): the number of labels to tag each temporal input.
            sparse_target (bool): whether the the ground-truth label represented in one-hot.
        Input shape:
            (batch_size, sentence length, output_dim)
        Output shape:
            (batch_size, sentence length, output_dim)
        """
        super(CRF, self).__init__(**kwargs)
        self.output_dim = int(output_dim) 
        self.sparse_target = sparse_target
        self.input_spec = L.InputSpec(min_ndim=3)
        self.supports_masking = False
        self.sequence_lengths = None
        self.transitions = None

    def build(self, input_shape):
        assert len(input_shape) == 3
        f_shape = tf.TensorShape(input_shape)
        input_spec = L.InputSpec(min_ndim=3, axes={-1: f_shape[-1]})

        if f_shape[-1] is None:
            raise ValueError('The last dimension of the inputs to `CRF` '
                             'should be defined. Found `None`.')
        if f_shape[-1] != self.output_dim:
            raise ValueError('The last dimension of the input shape must be equal to output'
                             ' shape. Use a linear layer if needed.')
        self.input_spec = input_spec
        self.transitions = self.add_weight(name='transitions',
                                           shape=[self.output_dim, self.output_dim],
                                           initializer='glorot_uniform',
                                           trainable=True)
        self.built = True

    def compute_mask(self, inputs, mask=None):
        # Just pass the received mask from previous layer, to the next layer or
        # manipulate it if this layer changes the shape of the input
        return mask

    def call(self, inputs, sequence_lengths=None, training=None, **kwargs):
        sequences = tf.convert_to_tensor(inputs, dtype=self.dtype)
        if sequence_lengths is not None:
            assert len(sequence_lengths.shape) == 2
            assert tf.convert_to_tensor(sequence_lengths).dtype == 'int32'
            seq_len_shape = tf.convert_to_tensor(sequence_lengths).get_shape().as_list()
            assert seq_len_shape[1] == 1
            self.sequence_lengths = K.flatten(sequence_lengths)
        else:
            self.sequence_lengths = tf.ones(tf.shape(inputs)[0], dtype=tf.int32) * (
                tf.shape(inputs)[1]
            )

        viterbi_sequence, _ = crf_decode(sequences,
                                         self.transitions,
                                         self.sequence_lengths)
        output = K.one_hot(viterbi_sequence, self.output_dim)
        return K.in_train_phase(sequences, output)

    @property
    def loss(self):
        def crf_loss(y_true, y_pred):
            y_pred = tf.convert_to_tensor(y_pred, dtype=self.dtype)
            log_likelihood, self.transitions = crf_log_likelihood(
                y_pred,
                tf.cast(K.argmax(y_true), dtype=tf.int32) if self.sparse_target else y_true,
                self.sequence_lengths,
                transition_params=self.transitions,
            )
            return tf.reduce_mean(-log_likelihood)
        return crf_loss

    @property
    def accuracy(self):
        def viterbi_accuracy(y_true, y_pred):
            # -1e10 to avoid zero at sum(mask)
            mask = K.cast(
                K.all(K.greater(y_pred, -1e10), axis=2), K.floatx())
            shape = tf.shape(y_pred)
            sequence_lengths = tf.ones(shape[0], dtype=tf.int32) * (shape[1])
            y_pred, _ = crf_decode(y_pred, self.transitions, sequence_lengths)
            if self.sparse_target:
                y_true = K.argmax(y_true, 2)
            y_pred = K.cast(y_pred, 'int32')
            y_true = K.cast(y_true, 'int32')
            corrects = K.cast(K.equal(y_true, y_pred), K.floatx())
            return K.sum(corrects * mask) / K.sum(mask)
        return viterbi_accuracy

    def compute_output_shape(self, input_shape):
        tf.TensorShape(input_shape).assert_has_rank(3)
        return input_shape[:2] + (self.output_dim,)

    def get_config(self):
        config = {
            'output_dim': self.output_dim,
            'sparse_target': self.sparse_target,
            'supports_masking': self.supports_masking,
            'transitions': K.eval(self.transitions)
        }
        base_config = super(CRF, self).get_config()
        return dict(base_config, **config)

# data

In [4]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau


In [5]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,3,"toko dita, kertosono",toko dita/
4,4,jl. orde baru,/jl. orde baru


In [6]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,id,raw_address
0,0,s. par 53 sidanegara 4 cilacap tengah
1,1,"angg per, baloi indah kel. lubuk baja"
2,2,"asma laun, mand imog,"
3,3,"ud agung rej, raya nga sri wedari karanganyar"
4,4,"cut mutia, 35 baiturrahman"


# street

## data

In [7]:
data_train_label = pd.read_csv('data_train_label.csv')
data_train_label.head()

Unnamed: 0,raw,street,raw_split,street_split,label
0,jl kapuk timur delta sili iii lippo cika 11 a ...,jl kapuk timur delta sili iii lippo cika,"['jl', 'kapuk', 'timur', 'delta', 'sili', 'iii...","['jl', 'kapuk', 'timur', 'delta', 'sili', 'iii...","['B-street', 'I-street', 'I-street', 'I-street..."
1,"aye, jati sampurna",,"['aye,', 'jati', 'sampurna']",[''],"['O', 'O', 'O']"
2,setu siung 119 rt 5 1 13880 cipayung,siung,"['setu', 'siung', '119', 'rt', '5', '1', '1388...",['siung'],"['O', 'B-street', 'O', 'O', 'O', 'O', 'O', 'O']"
3,"toko dita, kertosono",,"['toko', 'dita,', 'kertosono']",[''],"['O', 'O', 'O']"
4,jl. orde baru,jl. orde baru,"['jl.', 'orde', 'baru']","['jl.', 'orde', 'baru']","['B-street', 'I-street', 'I-street']"


In [8]:
import ast
list_y = data_train_label['label'].apply(lambda x: ast.literal_eval(x)).values

data_train_label['raw_split'] = data_train_label['raw'].apply(lambda x: [x1 for x1 in x.split(' ') if x1!='']).values
list_x = data_train_label['raw_split'].values

In [9]:
print(list_y[:4])

[list(['B-street', 'I-street', 'I-street', 'I-street', 'I-street', 'I-street', 'I-street', 'I-street', 'O', 'O', 'O', 'O', 'O'])
 list(['O', 'O', 'O'])
 list(['O', 'B-street', 'O', 'O', 'O', 'O', 'O', 'O'])
 list(['O', 'O', 'O'])]


In [10]:
print(list_x[:4])

[list(['jl', 'kapuk', 'timur', 'delta', 'sili', 'iii', 'lippo', 'cika', '11', 'a', 'cicau', 'cikarang', 'pusat'])
 list(['aye,', 'jati', 'sampurna'])
 list(['setu', 'siung', '119', 'rt', '5', '1', '13880', 'cipayung'])
 list(['toko', 'dita,', 'kertosono'])]


In [11]:
data_train_label['label_join'] = [' '.join(x) for x in list_y]

In [12]:
raw_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')  # the filters ='' so that keras doesnot remove any punctuation in our data
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='',lower=False)

In [13]:
len(raw_tokenizer.word_index)

0

In [14]:
# raw data train and test concat so that there is no word that does not have index
raw_data = pd.concat([train['raw_address'],test['raw_address']],axis=0).values
target_data = data_train_label['label_join'].values


raw_tokenizer.fit_on_texts(raw_data)
target_tokenizer.fit_on_texts(target_data)



In [15]:
a = 'saya  mau makan'
print(a.split(' '))
raw_tokenizer.texts_to_sequences([a])

['saya', '', 'mau', 'makan']


[[5944, 11461, 269]]

In [16]:
tag2idx = {}
for key in target_tokenizer.word_index.keys():
    tag2idx[key] = target_tokenizer.word_index[key]

idx2tag = {}
for key in target_tokenizer.index_word.keys():
    idx2tag[key] = target_tokenizer.index_word[key]
idx2tag

tag2idx['PAD'] = 0
idx2tag[0]='PAD'

In [17]:
word2idx = {}
for key in raw_tokenizer.word_index.keys():
    word2idx[key] = raw_tokenizer.word_index[key]

idx2word = {}
for key in raw_tokenizer.index_word.keys():
    idx2word[key] = raw_tokenizer.index_word[key]
idx2word

word2idx['PAD'] = 0
idx2word[0]='PAD'

In [18]:
# X = raw_tokenizer.texts_to_sequences(train['raw_address'])
X = [[word2idx[w] for w in s] for s in list_x]
data_target_in = [[tag2idx[w] for w in s] for s in list_y]

# Add 0 padding so all data has the same length
X = tf.keras.preprocessing.sequence.pad_sequences(X,padding='post',value=word2idx['PAD'])
print(X[:3])

data_target_in = tf.keras.preprocessing.sequence.pad_sequences(data_target_in,padding='post',value=tag2idx['PAD'])
print(data_target_in[:3])


[[   59   275    10   886 11880    48  2171   774    31    60  8116   104
    309     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]
 [20376    47   476     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]
 [  415 22368  1529     2    11     4 10063   165     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]]
[[3 2 2 2 2 2 2 2 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 3 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [19]:
[idx2word[x] for x in X[0] if x!=0]

['jl',
 'kapuk',
 'timur',
 'delta',
 'sili',
 'iii',
 'lippo',
 'cika',
 '11',
 'a',
 'cicau',
 'cikarang',
 'pusat']

In [20]:
n_tags = len(tag2idx)
# n_tags
y = [to_categorical(i, num_classes=n_tags) for i in data_target_in]
y[:5]

[array([[0., 0., 0., 1.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.]], dtype=float32),
 array([[0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15,shuffle=True,random_state=0)

In [22]:
X_train.shape

(255000, 32)

In [23]:
len(y_train[0])

32

In [24]:
len(tag2idx)

4

## train

In [25]:
n_tags

4

In [26]:
raw_vocab_size = len(raw_tokenizer.word_index) + 1
target_vocab_size = len(idx2tag)

max_len = X_train.shape[1]

In [55]:
def build_model(raw_vocab_size,target_vocab_size,max_len):
    input_ = Input(shape=(max_len,))
    model = Embedding(input_dim=raw_vocab_size, output_dim=50, input_length=max_len, mask_zero = True)(input_)
    model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
    model = TimeDistributed(Dense(50, activation="relu"))(model)
    model = Dense(n_tags)(model)
    crf = CRF(n_tags)
    out = crf(model)
    model = Model(input_, out)

    opt = tf.keras.optimizers.Adam(lr=0.01, decay=1e-6)
    # model.compile(optimizer=opt, loss="categorical_crossentropy", metrics=["accuracy"])          
    model.compile(optimizer="rmsprop", loss= crf.loss, metrics=[crf.accuracy])

    print(model.summary())
    return model

In [34]:
input_ = Input(shape=(max_len,))
model = Embedding(input_dim=raw_vocab_size, output_dim=50, input_length=max_len, mask_zero = True)(input_)
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
model = TimeDistributed(Dense(50, activation="relu"))(model)
model = Dense(n_tags)(model)
crf = CRF(n_tags)
out = crf(model)
model = Model(input_, out)

opt = tf.keras.optimizers.Adam(lr=0.01, decay=1e-6)
# model.compile(optimizer=opt, loss="categorical_crossentropy", metrics=["accuracy"])          
model.compile(optimizer="rmsprop", loss= crf.loss, metrics=[crf.accuracy])

model.summary()

Model: "functional_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 32)]              0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 32, 50)            6612200   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 32, 200)           120800    
_________________________________________________________________
time_distributed_3 (TimeDist (None, 32, 50)            10050     
_________________________________________________________________
dense_7 (Dense)              (None, 32, 4)             204       
_________________________________________________________________
crf_3 (CRF)                  (None, 32, 4)             16        
Total params: 6,743,270
Trainable params: 6,743,270
Non-trainable params: 0
____________________________________________

In [35]:
earlyStopping = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='min')
mcp_save = ModelCheckpoint('best.h5', save_best_only=True,save_weights_only=True, monitor='val_loss', mode='min')
reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, epsilon=1e-4, mode='min')


history = model.fit(X_train, np.array(y_train), 
                    batch_size=512, 
                    epochs=20, 
                    validation_split=0.15,
                    callbacks=[earlyStopping, mcp_save, reduce_lr_loss],
                    verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
 39/424 [=>............................] - ETA: 2:18 - loss: 0.0609 - viterbi_accuracy: 0.9940

KeyboardInterrupt: 

In [96]:
# import matplotlib.pyplot as plt

# def plot_graphs(history, string):
#     plt.plot(history.history[string])
#     plt.plot(history.history['val_'+string])
#     plt.xlabel("Epochs")
#     plt.ylabel(string)
#     plt.legend([string, 'val_'+string])
#     plt.show()
  

# plot_graphs(history, "accuracy")
# plot_graphs(history, "loss")

In [56]:
import random
model

<tensorflow.python.keras.engine.functional.Functional at 0x2342e9e2d90>

In [59]:
i = random.randint(0,len(X_test))
# i = 42087
p = model.predict(np.array([X_test[i]]))

p = np.argmax(p, axis=-1)


print("{:15} {:5} {:15}".format("Word",  "Pred", 'GT'))
for w, pred, gt in zip(X_test[i], p[0], list(np.argmax(y_test[i],axis=-1))):
    if w==0:
        continue
    print("{:15}: {:5} {:15} ".format(raw_tokenizer.index_word[w], idx2tag[pred], idx2tag[gt]))


Word            Pred  GT             
raya           : B-street B-street        
banj           : I-street I-street        
no             : O     O               
496            : O     O               
photo          : O     O               
copy           : O     O               
laris,         : O     O               
suka           : O     O               
sari           : O     O               


In [62]:
i

6602

In [76]:
# model.save('street_20200316.h5')
model.save_weights('street_20200316.h5')

In [78]:
word = 'wig ten iv, gununganyartambak kel. gununganyar'
word_idx = [[word2idx[w] for w in s] for s in [word.split(' ')]]
print(word_idx)
model_street = build_model(raw_vocab_size,target_vocab_size,max_len)
model_street.load_weights('street_20200316.h5')

# model_street = tf.keras.models.load_model('street_20200316.h5',custom_objects={'CRF':CRF(n_tags)})
model_street

[[5763, 175, 212, 12766, 13, 1316]]
Model: "functional_19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        [(None, 32)]              0         
_________________________________________________________________
embedding_9 (Embedding)      (None, 32, 50)            6612200   
_________________________________________________________________
bidirectional_9 (Bidirection (None, 32, 200)           120800    
_________________________________________________________________
time_distributed_9 (TimeDist (None, 32, 50)            10050     
_________________________________________________________________
dense_19 (Dense)             (None, 32, 4)             204       
_________________________________________________________________
crf_10 (CRF)                 (None, 32, 4)             16        
Total params: 6,743,270
Trainable params: 6,743,270
Non-trainable params: 0
_______

<tensorflow.python.keras.engine.functional.Functional at 0x235a33687f0>

In [79]:
# i = random.randint(0,len(X_test))
# i = 42087
p = model_street.predict(np.array([X_test[i]]))

p = np.argmax(p, axis=-1)


print("{:15} {:5} {:15}".format("Word",  "Pred", 'GT'))
for w, pred, gt in zip(X_test[i], p[0], list(np.argmax(y_test[i],axis=-1))):
    if w==0:
        continue
    print("{:15}: {:5} {:15} ".format(raw_tokenizer.index_word[w], idx2tag[pred], idx2tag[gt]))


Word            Pred  GT             
raya           : B-street B-street        
banj           : I-street I-street        
no             : O     O               
496            : O     O               
photo          : O     O               
copy           : O     O               
laris,         : O     O               
suka           : O     O               
sari           : O     O               


In [85]:
y_pred = model_street.predict(X_test)
y_pred = np.argmax(y_pred, axis=-1)
y_test_true = np.argmax(y_test, -1)

y_pred = [[idx2tag[i] for i in row] for row in y_pred]
y_test_true = [[idx2tag[i] for i in row] for row in y_test_true] 


In [86]:
print("F1-score is : {:.1%}".format(f1_score(y_test_true, y_pred)))
report = flat_classification_report(y_pred=y_pred, y_true=y_test_true)
print(report)



F1-score is : 91.2%




              precision    recall  f1-score   support

    B-street       0.81      0.83      0.82     29891
    I-street       0.82      0.85      0.83     40820
           O       0.95      0.95      0.95    236144
         PAD       1.00      1.00      1.00   1133145

    accuracy                           0.98   1440000
   macro avg       0.90      0.91      0.90   1440000
weighted avg       0.98      0.98      0.98   1440000



# POI

## data

In [150]:
data_train_label = pd.read_csv('data_train_label_poi.csv')
data_train_label.head()

Unnamed: 0,raw,poi,raw_split,poi_split,label
0,jl kapuk timur delta sili iii lippo cika 11 a ...,,"['jl', 'kapuk', 'timur', 'delta', 'sili', 'iii...",[''],"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
1,"aye, jati sampurna",,"['aye,', 'jati', 'sampurna']",[''],"['O', 'O', 'O']"
2,setu siung 119 rt 5 1 13880 cipayung,,"['setu', 'siung', '119', 'rt', '5', '1', '1388...",[''],"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']"
3,"toko dita, kertosono",toko dita,"['toko', 'dita,', 'kertosono']","['toko', 'dita']","['B-poi', 'I-poi', 'O']"
4,jl. orde baru,,"['jl.', 'orde', 'baru']",[''],"['O', 'O', 'O']"


In [151]:
import ast
list_y = data_train_label['label'].apply(lambda x: ast.literal_eval(x)).values

data_train_label['raw_split'] = data_train_label['raw'].apply(lambda x: [x1 for x1 in x.split(' ') if x1!='']).values
list_x = data_train_label['raw_split'].values

In [152]:
print(list_y[:4])

[list(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'])
 list(['O', 'O', 'O']) list(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'])
 list(['B-poi', 'I-poi', 'O'])]


In [153]:
data_train_label['label_join'] = [' '.join(x) for x in list_y]

In [154]:
# raw_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')  # the filters ='' so that keras doesnot remove any punctuation in our data
target_tokenizer2 = tf.keras.preprocessing.text.Tokenizer(filters='',lower=False)

In [155]:
# raw data train and test concat so that there is no word that does not have index
# raw_data = pd.concat([train['raw_address'],test['raw_address']],axis=0).values
target_data = data_train_label['label_join'].values


# raw_tokenizer.fit_on_texts(raw_data)
target_tokenizer2.fit_on_texts(target_data)



In [156]:
target_tokenizer2.word_index

{'O': 1, 'I-poi': 2, 'B-poi': 3}

In [157]:
tag2idx2 = {}
for key in target_tokenizer2.word_index.keys():
    tag2idx2[key] = target_tokenizer2.word_index[key]

idx2tag2 = {}
for key in target_tokenizer2.index_word.keys():
    idx2tag2[key] = target_tokenizer2.index_word[key]
idx2tag2

tag2idx2['PAD'] = 0
idx2tag2[0]='PAD'
idx2tag2

{1: 'O', 2: 'I-poi', 3: 'B-poi', 0: 'PAD'}

In [158]:
# X = raw_tokenizer.texts_to_sequences(train['raw_address'])
# X = [[word2idx[w] for w in s] for s in list_x]
data_target_in = [[tag2idx2[w] for w in s] for s in list_y]

# Add 0 padding so all data has the same length
# X = tf.keras.preprocessing.sequence.pad_sequences(X,padding='post',value=word2idx['PAD'])
# print(X[:3])

data_target_in = tf.keras.preprocessing.sequence.pad_sequences(data_target_in,padding='post',value=tag2idx2['PAD'])
print(data_target_in[:10])


[[1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [3 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 3 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [159]:
n_tags = len(tag2idx2)
# n_tags
y = [to_categorical(i, num_classes=n_tags) for i in data_target_in]
y[:5]

[array([[0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.]], dtype=float32),
 array([[0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 

In [160]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15,shuffle=True,random_state=0)

In [161]:
X_train.shape

(255000, 32)

## train

In [162]:
n_tags

4

In [163]:
raw_vocab_size = len(raw_tokenizer.word_index) + 1
target_vocab_size = len(idx2tag)

max_len = X_train.shape[1]

In [164]:
model2 = build_model(raw_vocab_size,target_vocab_size,max_len)

Model: "functional_33"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_17 (InputLayer)        [(None, 32)]              0         
_________________________________________________________________
embedding_16 (Embedding)     (None, 32, 50)            6612200   
_________________________________________________________________
bidirectional_16 (Bidirectio (None, 32, 200)           120800    
_________________________________________________________________
time_distributed_16 (TimeDis (None, 32, 50)            10050     
_________________________________________________________________
dense_33 (Dense)             (None, 32, 4)             204       
_________________________________________________________________
crf_17 (CRF)                 (None, 32, 4)             16        
Total params: 6,743,270
Trainable params: 6,743,270
Non-trainable params: 0
___________________________________________

In [165]:
earlyStopping = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='min')
mcp_save = ModelCheckpoint('best_poi.h5', save_best_only=True,save_weights_only=True, monitor='val_loss', mode='min')
reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, epsilon=1e-4, mode='min')


history2 = model2.fit(X_train, np.array(y_train), 
                    batch_size=512, 
                    epochs=20, 
                    validation_split=0.15,
                    callbacks=[earlyStopping, mcp_save, reduce_lr_loss],
                    verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20

KeyboardInterrupt: 

In [204]:
import random
i

36684

In [249]:
i = random.randint(0,len(X_test))
# i = 42087
p = model2.predict(np.array([X_test[i]]))

p = np.argmax(p, axis=-1)


print("{:15} {:5} {:15}".format("Word",  "Pred", 'GT'))
for w, pred, gt in zip(X_test[i], p[0], list(np.argmax(y_test[i],axis=-1))):
    if w==0:
        continue
    print("{:15}: {:5} {:15} ".format(raw_tokenizer.index_word[w], idx2tag2[pred], idx2tag2[gt]))


Word            Pred  GT             
amb,           : O     O               
toko           : B-poi B-poi           
kelontong,     : I-poi I-poi           
wuluhan        : O     O               


In [166]:
model.save_weights('poi_20200316_last.h5')

In [202]:
word = 's. par 53 sidanegara 4 cilacap tengah'
word_idx = [[word2idx[w] for w in s] for s in [word.split(' ')]]
print(word_idx)
model_poi = build_model(raw_vocab_size,target_vocab_size,max_len)
# model_poi.load_weights('poi_20200316.h5')
model_poi.load_weights('best_poi.h5')

# model_street = tf.keras.models.load_model('street_20200316.h5',custom_objects={'CRF':CRF(n_tags)})
model_poi

[[5763, 175, 212, 12766, 13, 1316]]
Model: "functional_47"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_24 (InputLayer)        [(None, 32)]              0         
_________________________________________________________________
embedding_23 (Embedding)     (None, 32, 50)            6612200   
_________________________________________________________________
bidirectional_23 (Bidirectio (None, 32, 200)           120800    
_________________________________________________________________
time_distributed_23 (TimeDis (None, 32, 50)            10050     
_________________________________________________________________
dense_47 (Dense)             (None, 32, 4)             204       
_________________________________________________________________
crf_24 (CRF)                 (None, 32, 4)             16        
Total params: 6,743,270
Trainable params: 6,743,270
Non-trainable params: 0
_______

<tensorflow.python.keras.engine.functional.Functional at 0x2362c159cd0>

In [211]:
model_poi.save_weights('poi_20200316.h5')

In [253]:
# i = random.randint(0,len(X_test))
# i = 42087
# p = model_poi.predict(np.array([X_test[i]]))
# p = model_street.predict(np.array([X_test[i]]))

p = np.argmax(p, axis=-1)


print("{:15} {:5} {:15}".format("Word",  "Pred", 'GT'))
for w, pred, gt in zip(X_test[i], p[0], list(np.argmax(y_test[i],axis=-1))):
    if w==0:
        continue
    print("{:15}: {:5} {:15} ".format(raw_tokenizer.index_word[w], idx2tag2[pred], idx2tag2[gt]))


Word            Pred 
amb,           : O     
toko           : O     
kelontong,     : O     
wuluhan        : O     


In [277]:
# i = random.randint(0,len(X_test))
word = 's. par 53 sidanegara 4 cilacap tengah'
word_idx = [[word2idx[w] for w in s] for s in [word.split(' ')]]
# p = model_poi.predict(np.array(word_idx))
p = model_street.predict(np.array(word_idx))

# i = 42087
# p = model_poi.predict(np.array([X_test[i]]))
# p = model_street.predict(np.array([X_test[i]]))

p = np.argmax(p, axis=-1)


print("{:15} {:5}".format("Word",  "Pred"))
for w, pred in zip(word_idx[0], p[0]):
    if w==0:
        continue
    print("{:15}: {:5} ".format(raw_tokenizer.index_word[w], idx2tag2[pred]))


Word            Pred 
s.             : B-poi 
par            : I-poi 
53             : O     
sidanegara     : O     
4              : O     
cilacap        : O     
tengah         : O     


In [278]:
p

array([[3, 2, 1, 1, 1, 1, 1]], dtype=int64)

In [173]:
y_pred = model_poi.predict(X_test)
y_pred = np.argmax(y_pred, axis=-1)
y_test_true = np.argmax(y_test, -1)

y_pred = [[idx2tag2[i] for i in row] for row in y_pred]
y_test_true = [[idx2tag2[i] for i in row] for row in y_test_true] 


In [174]:
print("F1-score is : {:.1%}".format(f1_score(y_test_true, y_pred)))
report = flat_classification_report(y_pred=y_pred, y_true=y_test_true)
print(report)



F1-score is : 91.9%




              precision    recall  f1-score   support

       B-poi       0.76      0.76      0.76     13880
       I-poi       0.75      0.78      0.76     25938
           O       0.97      0.96      0.97    267037
         PAD       1.00      1.00      1.00   1133145

    accuracy                           0.99   1440000
   macro avg       0.87      0.88      0.87   1440000
weighted avg       0.99      0.99      0.99   1440000



In [258]:
word_idx

[[440, 394, 399, 3495, 8, 695, 43]]

# predict

In [262]:
test.head()

Unnamed: 0,id,raw_address,raw_split
0,0,s. par 53 sidanegara 4 cilacap tengah,"[s., par, 53, sidanegara, 4, cilacap, tengah]"
1,1,"angg per, baloi indah kel. lubuk baja","[angg, per,, baloi, indah, kel., lubuk, baja]"
2,2,"asma laun, mand imog,","[asma, laun,, mand, imog,]"
3,3,"ud agung rej, raya nga sri wedari karanganyar","[ud, agung, rej,, raya, nga, sri, wedari, kara..."
4,4,"cut mutia, 35 baiturrahman","[cut, mutia,, 35, baiturrahman]"


In [133]:
raw_vocab_size = len(raw_tokenizer.word_index) + 1
target_vocab_size = len(idx2tag)

max_len = X_train.shape[1]

In [25]:
# def build_model(raw_vocab_size,target_vocab_size,max_len):
#     input_ = Input(shape=(max_len,))
#     model = Embedding(input_dim=raw_vocab_size, output_dim=50, input_length=max_len)(input_)
#     model = Dropout(0.5)(model)
#     model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
#     out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)  
#     model = Model(input_, out)
#     return model

In [212]:
model_street = build_model(raw_vocab_size,target_vocab_size,max_len)
model_poi = build_model(raw_vocab_size,target_vocab_size,max_len)

Model: "functional_49"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_25 (InputLayer)        [(None, 32)]              0         
_________________________________________________________________
embedding_24 (Embedding)     (None, 32, 50)            6612200   
_________________________________________________________________
bidirectional_24 (Bidirectio (None, 32, 200)           120800    
_________________________________________________________________
time_distributed_24 (TimeDis (None, 32, 50)            10050     
_________________________________________________________________
dense_49 (Dense)             (None, 32, 4)             204       
_________________________________________________________________
crf_25 (CRF)                 (None, 32, 4)             16        
Total params: 6,743,270
Trainable params: 6,743,270
Non-trainable params: 0
___________________________________________

In [213]:
model_street.load_weights('street_20200316.h5')
model_poi.load_weights('poi_20200316.h5')

In [191]:
tag2idx

{'O': 1, 'I-street': 2, 'B-street': 3, 'PAD': 0}

In [228]:
test

Unnamed: 0,id,raw_address,raw_split
0,0,s. par 53 sidanegara 4 cilacap tengah,"[s., par, 53, sidanegara, 4, cilacap, tengah]"
1,1,"angg per, baloi indah kel. lubuk baja","[angg, per,, baloi, indah, kel., lubuk, baja]"
2,2,"asma laun, mand imog,","[asma, laun,, mand, imog,]"
3,3,"ud agung rej, raya nga sri wedari karanganyar","[ud, agung, rej,, raya, nga, sri, wedari, kara..."
4,4,"cut mutia, 35 baiturrahman","[cut, mutia,, 35, baiturrahman]"
...,...,...,...
49995,49995,toko mbak farid semboro semboro,"[toko, mbak, farid, semboro, semboro]"
49996,49996,"vie - tk. ridho kids, vete 3 cari, 16720 ciawi","[vie, -, tk., ridho, kids,, vete, 3, cari,, 16..."
49997,49997,"mart dan roti bakar malabar, nasio,","[mart, dan, roti, bakar, malabar,, nasio,]"
49998,49998,graha indah pamulang jl. mujair raya bambu apu...,"[graha, indah, pamulang, jl., mujair, raya, ba..."


In [269]:
test['raw_split'] = test['raw_address'].apply(lambda x: [x1 for x1 in x.split(' ') if x1!='']).values
list_x_test = test['raw_split'].values
X_test_ori = [[word2idx[w] for w in s] for s in list_x_test]
X_test_ori = tf.keras.preprocessing.sequence.pad_sequences(X_test_ori,padding='post',maxlen=32)

In [274]:
X_test_ori[0]

array([ 440,  394,  399, 3495,    8,  695,   43,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0])

In [271]:
word_idx

[[440, 394, 399, 3495, 8, 695, 43]]

In [276]:
a = model_street.predict(np.array(word_idx))
a = np.argmax(a, axis=-1)
a

array([[3, 2, 1, 1, 1, 1, 1]], dtype=int64)

In [196]:
predict_street = model_street.predict(X_test_ori)
predict_street_final = np.argmax(predict_street, axis=-1)


In [214]:
predict_poi = model_poi.predict(X_test_ori)
predict_poi_final = np.argmax(predict_poi, axis=-1)

In [224]:
(predict_poi==predict_street).all()

False

In [241]:
predict_street_final[3000]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [242]:
predict_poi_final[3000]

array([1, 1, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [215]:
predict_street_final.shape

(50000, 32)

In [216]:
raw_address_split_test = test['raw_address'].apply(lambda x: np.array(x.split(' '))).values


In [279]:
# print(raw_address_split_test[0])
# print(' '.join(raw_address_split_test[0][np.argwhere(predict_street_final[0] > 0).reshape(-1)]))
# predict_street_final[0][np.argwhere(x > 0.01)]

def get_prediction_word(list_x,predict):
#     print(list_x,predict)
    try:
        return ' '.join(np.array(list_x)[np.argwhere(np.array(predict) > 1).reshape(-1)])
    except:
        print(list_x,predict)
        return ' '.join(np.array(list_x)[np.argwhere(np.array(predict) > 1).reshape(-1)[:-1]])

In [285]:
final = pd.DataFrame()
final['raw_address'] = test['raw_address'].values
final['raw_address_split'] = test['raw_split'].values
final['predict_street_raw'] = predict_street_final.tolist()
final['predict_poi_raw'] = predict_poi_final.tolist()


final['predict_street'] = final[['raw_address_split','predict_street_raw']].apply(lambda x: get_prediction_word(x['raw_address_split'],x['predict_street_raw']), axis=1)
final['predict_poi'] = final[['raw_address_split','predict_poi_raw']].apply(lambda x: get_prediction_word(x['raw_address_split'],x['predict_poi_raw']), axis=1)
final.head()

Unnamed: 0,raw_address,raw_address_split,predict_street_raw,predict_poi_raw,predict_street,predict_poi
0,s. par 53 sidanegara 4 cilacap tengah,"[s., par, 53, sidanegara, 4, cilacap, tengah]","[3, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",s. par,
1,"angg per, baloi indah kel. lubuk baja","[angg, per,, baloi, indah, kel., lubuk, baja]","[3, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","angg per,",
2,"asma laun, mand imog,","[asma, laun,, mand, imog,]","[1, 1, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","mand imog,",
3,"ud agung rej, raya nga sri wedari karanganyar","[ud, agung, rej,, raya, nga, sri, wedari, kara...","[1, 1, 1, 3, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[3, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...",raya nga,ud agung
4,"cut mutia, 35 baiturrahman","[cut, mutia,, 35, baiturrahman]","[3, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","cut mutia,",


In [286]:
pd.read_csv('sampleSubmission.csv')

Unnamed: 0,id,POI/street
0,0,/
1,1,/angg per
2,2,asma laundry/mand imogiri
3,3,ud agung rejeki/raya ngawi-
4,4,/cut mutia


In [287]:
final['final_predict'] = final['predict_poi']+'/'+final['predict_street']
final['final_predict'] = final['final_predict'].apply(lambda x: x.strip())
final = final.reset_index().rename(columns={'index':'id','final_predict':'POI/street'})


In [288]:
final.head()

Unnamed: 0,id,raw_address,raw_address_split,predict_street_raw,predict_poi_raw,predict_street,predict_poi,POI/street
0,0,s. par 53 sidanegara 4 cilacap tengah,"[s., par, 53, sidanegara, 4, cilacap, tengah]","[3, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",s. par,,/s. par
1,1,"angg per, baloi indah kel. lubuk baja","[angg, per,, baloi, indah, kel., lubuk, baja]","[3, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","angg per,",,"/angg per,"
2,2,"asma laun, mand imog,","[asma, laun,, mand, imog,]","[1, 1, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","mand imog,",,"/mand imog,"
3,3,"ud agung rej, raya nga sri wedari karanganyar","[ud, agung, rej,, raya, nga, sri, wedari, kara...","[1, 1, 1, 3, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[3, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...",raya nga,ud agung,ud agung/raya nga
4,4,"cut mutia, 35 baiturrahman","[cut, mutia,, 35, baiturrahman]","[3, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","cut mutia,",,"/cut mutia,"


In [289]:
submission = final[['id','POI/street']]
submission.head()

Unnamed: 0,id,POI/street
0,0,/s. par
1,1,"/angg per,"
2,2,"/mand imog,"
3,3,ud agung/raya nga
4,4,"/cut mutia,"


In [290]:
submission.to_csv('submission_20210316.csv',index=False)