In [3]:
import tensorflow as tf
import numpy as np
import unicodedata
import re
import pandas as pd

In [4]:

def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)   # it is used to convert all acented characters è to e etc2
        if unicodedata.category(c) != 'Mn')


def normalize_string(s):
    s = unicode_to_ascii(s)   # why we use \r? so that the python does not process it as a for example treat \n as enter etc but as a raw string which regex exactly needed
    s = re.sub(r'([!.?])', r' \1', s)  # the !.? is grouped, so if there is a match, the grouped will be position at 1. So using " \1" will be outputted " [?!.]"
    s = re.sub(r'[^a-zA-Z.!?]+', r' ', s)
    s = re.sub(r'\s+', r' ', s) # + means one or more repetition
    return s

In [5]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,3,"toko dita, kertosono",toko dita/
4,4,jl. orde baru,/jl. orde baru


In [6]:
train.columns

Index(['id', 'raw_address', 'POI/street'], dtype='object')

In [7]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,id,raw_address
0,0,s. par 53 sidanegara 4 cilacap tengah
1,1,"angg per, baloi indah kel. lubuk baja"
2,2,"asma laun, mand imog,"
3,3,"ud agung rej, raya nga sri wedari karanganyar"
4,4,"cut mutia, 35 baiturrahman"


In [14]:
street = train['POI/street'].apply(lambda x: x.split('/')[1])
data_train = pd.DataFrame({'raw':train['raw_address'].values,'street':street}).values
data_train[:10]
# street_split = street.apply(lambda x: x.split(' '))
# street_split

array([['jl kapuk timur delta sili iii lippo cika 11 a cicau cikarang pusat',
        'jl kapuk timur delta sili iii lippo cika'],
       ['aye, jati sampurna', ''],
       ['setu siung 119 rt 5 1 13880 cipayung', 'siung'],
       ['toko dita, kertosono', ''],
       ['jl. orde baru', 'jl. orde baru'],
       ['raya samb gede, 299 toko bb kids', 'raya samb gede'],
       ['kem mel raya, no 4 bojong rawalumbu rt 1 36 rawalumbu',
        'kem mel raya'],
       ['tela keuramat kuta alam', 'tela'],
       ['gg. i wates magersari', 'gg. i'],
       ['bunga ncole ix 2', 'bunga ncole ix']], dtype=object)

In [17]:
raw_data, target_data = list(zip(*data_train))  # remember zip will map first row to first row on each list
raw_data, target_data = list(raw_data), list(target_data)

raw_data_in = ['<start> ' + data for data in target_data]
raw_data_out = [data + ' <end>' for data in target_data]

In [18]:
raw_data_in[:5]

['<start> jl kapuk timur delta sili iii lippo cika',
 '<start> ',
 '<start> siung',
 '<start> ',
 '<start> jl. orde baru']

In [19]:
raw_data_out[:5]

['jl kapuk timur delta sili iii lippo cika <end>',
 ' <end>',
 'siung <end>',
 ' <end>',
 'jl. orde baru <end>']

In [20]:
raw_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')  # the filters ='' so that keras doesnot remove any punctuation in our data
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')


In [21]:
raw_tokenizer.fit_on_texts(raw_data)
target_tokenizer.fit_on_texts(raw_data_in)
target_tokenizer.fit_on_texts(raw_data_out)


In [23]:
# print(target_tokenizer.word_index)


In [30]:
data_raw = raw_tokenizer.texts_to_sequences(raw_data)
data_target_in = target_tokenizer.texts_to_sequences(raw_data_in)
data_target_out = target_tokenizer.texts_to_sequences(raw_data_out)

# Add 0 padding so all data has the same length
data_raw = tf.keras.preprocessing.sequence.pad_sequences(data_raw,padding='post')
print(data_raw[:3])

data_target_in = tf.keras.preprocessing.sequence.pad_sequences(data_target_in,padding='post')
print(data_target_in[:3])
data_target_out = tf.keras.preprocessing.sequence.pad_sequences(data_target_out,padding='post')


[[   59   276    10   885 11600    48  2256   775    31    60  7404   105
    305     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]
 [20286    47   472     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]
 [  410 20287  1613     2    11     4 10110   161     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]]
[[   1   22  334   12  740 2966    8 4506  253    0    0    0    0    0
     0    0]
 [   1    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]
 [   1 7233    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]]


In [26]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, lstm_size): # vocab size is the input max size, embedding size is for the vector. ltsm size for the model
        super(Encoder, self).__init__()
        self.lstm_size = lstm_size
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)  # here the input is integer. as we retrain the embedding
        self.lstm = tf.keras.layers.LSTM(
            lstm_size, return_sequences=True, return_state=True)

    def call(self, sequence, states):
        embed = self.embedding(sequence)
        output, state_h, state_c = self.lstm(embed, initial_state=states) # the output usually notused, we only need state h and c for the decoder

        return output, state_h, state_c

    def init_states(self, batch_size):
        return (tf.zeros([batch_size, self.lstm_size]),
                tf.zeros([batch_size, self.lstm_size]))

In [27]:

class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, lstm_size):
        super(Decoder, self).__init__()
        self.lstm_size = lstm_size
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.lstm = tf.keras.layers.LSTM(
            lstm_size, return_sequences=True, return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, sequence, state):
        embed = self.embedding(sequence)
        lstm_out, state_h, state_c = self.lstm(embed, state)
        logits = self.dense(lstm_out)

        return logits, state_h, state_c

In [108]:
EMBEDDING_SIZE = 32
LSTM_SIZE = 64

raw_vocab_size = len(raw_tokenizer.word_index) + 1
encoder = Encoder(raw_vocab_size, EMBEDDING_SIZE, LSTM_SIZE)

target_vocab_size = len(target_tokenizer.word_index) + 1
decoder = Decoder(target_vocab_size, EMBEDDING_SIZE, LSTM_SIZE)

print((raw_vocab_size))
print((target_vocab_size))

source_input = tf.constant([[1, 3, 5, 7, 2, 0, 0, 0]])
initial_state = encoder.init_states(1)
encoder_output, en_state_h, en_state_c = encoder(source_input, initial_state)

target_input = tf.constant([[1, 4, 6, 9, 2, 0, 0]])
decoder_output, de_state_h, de_state_c = decoder(target_input, (en_state_h, en_state_c))

print('Source sequences', source_input.shape)
print('Encoder outputs', encoder_output.shape)
print('Encoder state_h', en_state_h.shape)
print('Encoder state_c', en_state_c.shape)

print('\nDestination vocab size', target_vocab_size)
print('Destination sequences', target_input.shape)
print('Decoder outputs', decoder_output.shape)
print('Decoder state_h', de_state_h.shape)
print('Decoder state_c', de_state_c.shape)

'''
Source sequences (1, 8)
Encoder outputs (1, 8, 64)
Encoder state_h (1, 64)
Encoder state_c (1, 64)
Destination vocab size 107
Destination sequences (1, 7)
Decoder outputs (1, 7, 107)
Decoder state_h (1, 64)
Decoder state_c (1, 64)
'''

121342
23870
Source sequences (1, 8)
Encoder outputs (1, 8, 64)
Encoder state_h (1, 64)
Encoder state_c (1, 64)

Destination vocab size 23870
Destination sequences (1, 7)
Decoder outputs (1, 7, 23870)
Decoder state_h (1, 64)
Decoder state_c (1, 64)


'\nSource sequences (1, 8)\nEncoder outputs (1, 8, 64)\nEncoder state_h (1, 64)\nEncoder state_c (1, 64)\nDestination vocab size 107\nDestination sequences (1, 7)\nDecoder outputs (1, 7, 107)\nDecoder state_h (1, 64)\nDecoder state_c (1, 64)\n'

In [109]:

def loss_func(targets, logits):
    crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True)
    mask = tf.math.logical_not(tf.math.equal(targets, 0))   # we donot need to compute the zero padding and such we create a mask
    mask = tf.cast(mask, dtype=tf.int64)
    loss = crossentropy(targets, logits, sample_weight=mask)

    return loss

In [110]:
optimizer = tf.keras.optimizers.Adam()


In [111]:
@tf.function   # tf.fuction is for static graph. If you want to debugging, remove it
def train_step(source_seq, target_seq_in, target_seq_out, en_initial_states):
    with tf.GradientTape() as tape:
        en_outputs = encoder(source_seq, en_initial_states)
        en_states = en_outputs[1:]
        de_states = en_states

        de_outputs = decoder(target_seq_in, de_states)
        logits = de_outputs[0]
        loss = loss_func(target_seq_out, logits)

    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return loss

In [113]:
# predict does not need padding as no need batching
def predict():
    idx = np.random.choice(len(raw_data))
    test_source_text = raw_data[idx]
    target_source_text = target_data[idx]
    print(test_source_text)
    test_source_seq = raw_tokenizer.texts_to_sequences([test_source_text])
#     print(test_source_seq)
    print('Ground truth :')
    print(target_source_text)

    en_initial_states = encoder.init_states(1)
    en_outputs = encoder(tf.constant(test_source_seq), en_initial_states)

    de_input = tf.constant([[target_tokenizer.word_index['<start>']]])
    de_state_h, de_state_c = en_outputs[1:]
    out_words = []

    while True:
        de_output, de_state_h, de_state_c = decoder(
            de_input, (de_state_h, de_state_c))
        de_input = tf.argmax(de_output, -1)
        out_words.append(target_tokenizer.index_word[de_input.numpy()[0][0]])

        if out_words[-1] == '<end>' or len(out_words) >= 20:
            break
    
    print('Prediction :')
    print(' '.join(out_words))

In [114]:
from tqdm.notebook import tqdm
NUM_EPOCHS = 250
BATCH_SIZE = 128
# Create a Tensorflow dataset instances
dataset = tf.data.Dataset.from_tensor_slices(
    (data_raw, data_target_in, data_target_out))
dataset = dataset.shuffle(20).batch(BATCH_SIZE)  # shape of data and batch
total = len(list(dataset.as_numpy_iterator()))

for e in range(NUM_EPOCHS):
    for batch, (source_seq, target_seq_in, target_seq_out) in tqdm(enumerate(dataset.take(-1)),total=total): # get all
        en_initial_states = encoder.init_states(source_seq.shape[0])
#         print(source_seq.shape,source.target_seq_in.shape,target_seq_out.seq.shape)
        loss = train_step(source_seq, target_seq_in,
                          target_seq_out, en_initial_states)

    print('Epoch {} Loss {:.4f}'.format(e + 1, loss.numpy()))
    
    predict()
    

HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 1 Loss 0.7232
jalan baru underpass, springville residence blok d34, duren jaya
Ground truth :
jalan baru underpass
Prediction :
<end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 2 Loss 0.5703
rotimanis, cis kulon, no 14a 40293 arcamanik
Ground truth :
cis kulon
Prediction :
<end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 3 Loss 0.4608
sal indah 4, no 17
Ground truth :
sal indah 4
Prediction :
taman kar road <end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 4 Loss 0.4153
imam bon 64 80112
Ground truth :
imam bon
Prediction :
imam umar <end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 5 Loss 0.3835
siliw, no 31 jawa cell, panyingkiran
Ground truth :
siliw
Prediction :
<end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 6 Loss 0.2825
permi 39 nusaniwe (nusanive)
Ground truth :

Prediction :
<end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 7 Loss 0.2207
warung tegal, kolo sugi, duren sawit
Ground truth :
kolo sugiono
Prediction :
ir. s. riyadi <end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 8 Loss 0.1849
hypermart -lippo plaza lubuk linggau lantai lg
Ground truth :

Prediction :
<end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 9 Loss 0.1656
tirta mumpuni, kamb, sendangadi
Ground truth :
kamb
Prediction :
<end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 10 Loss 0.1340
flo 14 ulak karang utara padang utara
Ground truth :
flo
Prediction :
<end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 11 Loss 0.1123
masjid nurul iman jetisharjo, jetish jt ii, jetis
Ground truth :
jetisharjo jt ii
Prediction :
j dhar <end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 12 Loss 0.0960
tebet barat vii, tebet barat
Ground truth :

Prediction :
tebet barat <end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 13 Loss 0.0831
raya man 3 mangunreja mangunreja
Ground truth :
raya man
Prediction :
raya let ars <end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 14 Loss 0.0735
pt pelni kantor pelni surabaya jl perak timur no.564 perak utara, surabaya
Ground truth :
jl perak timur
Prediction :
jl sido road <end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 15 Loss 0.0707
tk cinde laras raya wonotu,
Ground truth :
raya wonotunggal
Prediction :
<end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 16 Loss 0.0732
es kel muda seberang bidakara, ters derw,
Ground truth :
ters derw
Prediction :
ra <end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 17 Loss 0.0524
masjid an nur, pengad barat, no 38 rt 9 8 pengadegan
Ground truth :
pengad barat
Prediction :
<end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 18 Loss 0.0521
kart jaya v cikeas udik gunung putri
Ground truth :
kart jaya v
Prediction :
h. regge iii <end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 19 Loss 0.0460
papar kerto - tul 73 64153 papar
Ground truth :
kerto - tul
Prediction :
<end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 20 Loss 0.0446
bonto bula, sumber rejo balikpapan tengah
Ground truth :
bonto bula
Prediction :
<end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 21 Loss 0.0384
yant ayu,
Ground truth :

Prediction :
<end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 22 Loss 0.0403
arif supriyadi,
Ground truth :

Prediction :
brig o. alam kus <end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 23 Loss 0.0406
suka suka tani kel. rajeg
Ground truth :
suka
Prediction :
<end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 24 Loss 0.0414
mera ii cimekar cileunyi
Ground truth :

Prediction :
jamil lubis <end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 25 Loss 0.0382
pt fardana berlian papua pantai mentari jl.abdul latif blok b no. 28, kenjeran
Ground truth :
jl.abdul latif
Prediction :
p tanjung duren <end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 26 Loss 0.0254
mak 48 bojong rawalumbu 4
Ground truth :
mak
Prediction :
taman ged ab <end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 27 Loss 0.0275
man dua 14a rt 1 2 pinangsia taman sari
Ground truth :
man dua
Prediction :
man dua a5 <end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 28 Loss 0.0298
ruby, curug sangereng kel. kelapa dua
Ground truth :
ruby
Prediction :
taman duren gad <end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 29 Loss 0.0228
tanjungsari, the hel shop,
Ground truth :
tanjungsari
Prediction :
ruko dalem id <end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 30 Loss 0.0249
r merd, pabuaran karawaci
Ground truth :
r merd
Prediction :
<end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 31 Loss 0.0177
penampungan pasar senen blok.6 sebrang stasiun,bakmi balige.lantai 2 blok-a aks
Ground truth :

Prediction :
<end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 32 Loss 0.0214
ger betlehem bete,
Ground truth :

Prediction :
<end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 33 Loss 0.0219
gg. taul, kampung melayu rt 7 12 jatinegara
Ground truth :
gg. taul
Prediction :
gg. lang <end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 34 Loss 0.0236
bengkel uyip, bantarujeg
Ground truth :

Prediction :
<end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 35 Loss 0.0190
jl. seng rt 1 muncul setu
Ground truth :
jl. seng
Prediction :
jl. bha indah <end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 36 Loss 0.0208
kembaran kulon warung mendoan mak nyess, 53319
Ground truth :

Prediction :
brig o. kuni barat gg. 14 <end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 37 Loss 0.0197
duri selatan gg. setia masa v 1 rt 6 1 tambora
Ground truth :
gg. setia masa v
Prediction :
gg. tuba x <end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 38 Loss 0.0146
medya salon, raya cica gunung putri
Ground truth :
raya cica
Prediction :
<end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 39 Loss 0.0167
pt sari lautan gresik, indo, ujung pangkah
Ground truth :
indo
Prediction :
ujung sukam <end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 40 Loss 0.0124
apotek edha farma, kelapa, lagaligo
Ground truth :
kelapa
Prediction :
<end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 41 Loss 0.0104
rorotan roro 2 10 rt 1 4 14140 cilincing
Ground truth :
roro 2
Prediction :
roro 2 <end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 42 Loss 0.0107
duri kepa man 24 118 rt 2 rw 3 kebon jeruk
Ground truth :
man 24
Prediction :
man 24 <end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 43 Loss 0.0154
bapak nailul qitmi, hos cokr,
Ground truth :
hos cokr
Prediction :
cut mas mutta lr. gede 14 <end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))


Epoch 44 Loss 0.0121
panti pijat usaha mand t pang polem
Ground truth :
t pang polem
Prediction :
kapt. t brigjen. mas <end>


HBox(children=(FloatProgress(value=0.0, max=2344.0), HTML(value='')))




KeyboardInterrupt: 

In [117]:
encoder.save('encoder_20210314')
decoder.save('decoder_20210314')

TypeError: call() missing 1 required positional argument: 'states'