In [1]:
import tensorflow as tf
import numpy as np
import unicodedata
import re
import pandas as pd

In [2]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Conv1D, Input
from tensorflow.keras.layers import Bidirectional, concatenate, SpatialDropout1D, GlobalMaxPooling1D

from tensorflow.keras.utils import to_categorical



In [3]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau


In [4]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,3,"toko dita, kertosono",toko dita/
4,4,jl. orde baru,/jl. orde baru


In [5]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,id,raw_address
0,0,s. par 53 sidanegara 4 cilacap tengah
1,1,"angg per, baloi indah kel. lubuk baja"
2,2,"asma laun, mand imog,"
3,3,"ud agung rej, raya nga sri wedari karanganyar"
4,4,"cut mutia, 35 baiturrahman"


# street

## data

In [6]:
data_train_label = pd.read_csv('data_train_label.csv')
data_train_label.head()

Unnamed: 0,raw,street,raw_split,street_split,label
0,jl kapuk timur delta sili iii lippo cika 11 a ...,jl kapuk timur delta sili iii lippo cika,"['jl', 'kapuk', 'timur', 'delta', 'sili', 'iii...","['jl', 'kapuk', 'timur', 'delta', 'sili', 'iii...","['B-street', 'I-street', 'I-street', 'I-street..."
1,"aye, jati sampurna",,"['aye,', 'jati', 'sampurna']",[''],"['O', 'O', 'O']"
2,setu siung 119 rt 5 1 13880 cipayung,siung,"['setu', 'siung', '119', 'rt', '5', '1', '1388...",['siung'],"['O', 'B-street', 'O', 'O', 'O', 'O', 'O', 'O']"
3,"toko dita, kertosono",,"['toko', 'dita,', 'kertosono']",[''],"['O', 'O', 'O']"
4,jl. orde baru,jl. orde baru,"['jl.', 'orde', 'baru']","['jl.', 'orde', 'baru']","['B-street', 'I-street', 'I-street']"


In [7]:
import ast
list_y = data_train_label['label'].apply(lambda x: ast.literal_eval(x)).values

In [8]:
print(list_y[:4])

[list(['B-street', 'I-street', 'I-street', 'I-street', 'I-street', 'I-street', 'I-street', 'I-street', 'O', 'O', 'O', 'O', 'O'])
 list(['O', 'O', 'O'])
 list(['O', 'B-street', 'O', 'O', 'O', 'O', 'O', 'O'])
 list(['O', 'O', 'O'])]


In [9]:
data_train_label['label_join'] = [' '.join(x) for x in list_y]

In [10]:
raw_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')  # the filters ='' so that keras doesnot remove any punctuation in our data
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='',lower=False)

In [11]:
len(raw_tokenizer.word_index)

0

In [12]:
# raw data train and test concat so that there is no word that does not have index
raw_data = pd.concat([train['raw_address'],test['raw_address']],axis=0).values
target_data = data_train_label['label_join'].values


raw_tokenizer.fit_on_texts(raw_data)
target_tokenizer.fit_on_texts(target_data)



In [13]:
len(raw_tokenizer.word_index)

132243

In [14]:
tag2idx = {}
for key in target_tokenizer.word_index.keys():
    tag2idx[key] = target_tokenizer.word_index[key]-1

idx2tag = {0: 'o', 1: 'i-street', 2: 'b-street'}
for key in target_tokenizer.index_word.keys():
    idx2tag[key-1] = target_tokenizer.index_word[key]
idx2tag

{0: 'O', 1: 'I-street', 2: 'B-street'}

In [15]:
X = raw_tokenizer.texts_to_sequences(train['raw_address'])
data_target_in = [[tag2idx[w] for w in s] for s in list_y]

# Add 0 padding so all data has the same length
X = tf.keras.preprocessing.sequence.pad_sequences(X,padding='post')
print(X[:3])

data_target_in = tf.keras.preprocessing.sequence.pad_sequences(data_target_in,padding='post',value=tag2idx['O'])
print(data_target_in[:3])


[[   59   275    10   886 11880    48  2171   774    31    60  8116   104
    309     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]
 [20376    47   476     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]
 [  415 22368  1529     2    11     4 10063   165     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0]]
[[2 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [19]:
[raw_tokenizer.index_word[x] for x in X[0] if x!=0]

['jl',
 'kapuk',
 'timur',
 'delta',
 'sili',
 'iii',
 'lippo',
 'cika',
 '11',
 'a',
 'cicau',
 'cikarang',
 'pusat']

In [20]:
n_tags = len(tag2idx)
# n_tags
y = [to_categorical(i, num_classes=n_tags) for i in data_target_in]
y[:5]

[array([[0., 0., 1.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.]], dtype=float32),
 array([[1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0.

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15,shuffle=True,random_state=0)

In [22]:
X_train.shape

(255000, 32)

In [23]:
len(y_train[0])

32

## train

In [79]:
n_tags

3

In [23]:
raw_vocab_size = len(raw_tokenizer.word_index) + 1
target_vocab_size = len(idx2tag)

max_len = X_train.shape[1]

In [24]:
input_ = Input(shape=(max_len,))
model = Embedding(input_dim=raw_vocab_size, output_dim=50, input_length=max_len)(input_)
model = Dropout(0.5)(model)
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)  
model = Model(input_, out)




In [26]:
opt = tf.keras.optimizers.Adam(lr=0.01, decay=1e-6)
model.compile(optimizer=opt, loss="categorical_crossentropy", metrics=["accuracy"])          
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 32, 50)            6612200   
_________________________________________________________________
dropout (Dropout)            (None, 32, 50)            0         
_________________________________________________________________
bidirectional (Bidirectional (None, 32, 200)           120800    
_________________________________________________________________
time_distributed (TimeDistri (None, 32, 3)             603       
Total params: 6,733,603
Trainable params: 6,733,603
Non-trainable params: 0
_________________________________________________________________


In [94]:
earlyStopping = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='min')
mcp_save = ModelCheckpoint('best.h5', save_best_only=True, monitor='val_loss', mode='min')
reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, epsilon=1e-4, mode='min')


history = model.fit(X_train, np.array(y_train), 
                    batch_size=512, 
                    epochs=100, 
                    validation_split=0.15,
                    callbacks=[earlyStopping, mcp_save, reduce_lr_loss],
                    verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
 16/424 [>.............................] - ETA: 1:45 - loss: 0.0213 - accuracy: 0.9924

KeyboardInterrupt: 

In [96]:
# import matplotlib.pyplot as plt

# def plot_graphs(history, string):
#     plt.plot(history.history[string])
#     plt.plot(history.history['val_'+string])
#     plt.xlabel("Epochs")
#     plt.ylabel(string)
#     plt.legend([string, 'val_'+string])
#     plt.show()
  

# plot_graphs(history, "accuracy")
# plot_graphs(history, "loss")

In [98]:
import random


43884

In [121]:
# i = random.randint(0,len(X_test))
i = 42087
p = model.predict(np.array([X_test[i]]))

p = np.argmax(p, axis=-1)


print("{:15} {:5} {:15}".format("Word",  "Pred", 'GT'))
for w, pred, gt in zip(X_test[i], p[0], list(np.argmax(y_test[i],axis=-1))):
    if w==0:
        continue
    print("{:15}: {:5} {:15} ".format(raw_tokenizer.index_word[w], idx2tag[pred], idx2tag[gt]))


Word            Pred  GT             
wig            : B-street B-street        
ten            : I-street I-street        
iv,            : I-street I-street        
gununganyartambak: O     O               
kel.           : O     O               
gununganyar    : O     O               


In [119]:
model.save('street_20200315.h5')

In [30]:
word = 'wig ten iv, gununganyartambak kel. gununganyar'
word_idx = raw_tokenizer.texts_to_sequences([word])
print(word_idx)
model = tf.keras.models.load_model('street_20200315.h5')

[[5763, 175, 212, 12766, 13, 1316]]


In [43]:
p = model.predict(np.array(word_idx))

p = np.argmax(p, axis=-1)


print("{:15} {:5} ".format("Word",  "Pred"))
for w, pred in zip(word_idx[0], p[0]):
    if w==0:
        continue
    print("{:15}: {:5} ".format(raw_tokenizer.index_word[w], idx2tag[pred]))


Word            Pred  
wig            : B-street 
ten            : I-street 
iv,            : I-street 
gununganyartambak: O     
kel.           : O     
gununganyar    : O     


In [41]:
w

[5763, 175, 212, 12766, 13, 1316]

# POI

## data

In [30]:
data_train_label = pd.read_csv('data_train_label_poi.csv')
data_train_label.head()

Unnamed: 0,raw,poi,raw_split,poi_split,label
0,jl kapuk timur delta sili iii lippo cika 11 a ...,,"['jl', 'kapuk', 'timur', 'delta', 'sili', 'iii...",[''],"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
1,"aye, jati sampurna",,"['aye,', 'jati', 'sampurna']",[''],"['O', 'O', 'O']"
2,setu siung 119 rt 5 1 13880 cipayung,,"['setu', 'siung', '119', 'rt', '5', '1', '1388...",[''],"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']"
3,"toko dita, kertosono",toko dita,"['toko', 'dita,', 'kertosono']","['toko', 'dita']","['B-poi', 'I-poi', 'O']"
4,jl. orde baru,,"['jl.', 'orde', 'baru']",[''],"['O', 'O', 'O']"


In [31]:
import ast
list_y = data_train_label['label'].apply(lambda x: ast.literal_eval(x)).values

In [32]:
print(list_y[:4])

[list(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'])
 list(['O', 'O', 'O']) list(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'])
 list(['B-poi', 'I-poi', 'O'])]


In [33]:
data_train_label['label_join'] = [' '.join(x) for x in list_y]

In [35]:
# raw_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')  # the filters ='' so that keras doesnot remove any punctuation in our data
target_tokenizer2 = tf.keras.preprocessing.text.Tokenizer(filters='',lower=False)

In [36]:
# raw data train and test concat so that there is no word that does not have index
# raw_data = pd.concat([train['raw_address'],test['raw_address']],axis=0).values
target_data = data_train_label['label_join'].values


# raw_tokenizer.fit_on_texts(raw_data)
target_tokenizer2.fit_on_texts(target_data)



In [37]:
target_tokenizer2.word_index.keys()

dict_keys(['O', 'I-poi', 'B-poi'])

In [38]:
tag2idx2 = {}
for key in target_tokenizer2.word_index.keys():
    tag2idx2[key] = target_tokenizer2.word_index[key]-1

idx2tag2 = {}
for key in target_tokenizer2.index_word.keys():
    idx2tag2[key-1] = target_tokenizer2.index_word[key]
idx2tag2

{0: 'O', 1: 'I-poi', 2: 'B-poi'}

In [39]:
# X = raw_tokenizer.texts_to_sequences(train['raw_address'])
data_target_in = [[tag2idx2[w] for w in s] for s in list_y]

# Add 0 padding so all data has the same length
# X = tf.keras.preprocessing.sequence.pad_sequences(X,padding='post')
# print(X[:3])

data_target_in = tf.keras.preprocessing.sequence.pad_sequences(data_target_in,padding='post',value=tag2idx2['O'])
print(data_target_in[:6])


[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [57]:
n_tags = len(tag2idx2)
# n_tags
y = [to_categorical(i, num_classes=n_tags) for i in data_target_in]
y[:5]

[array([[1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.]], dtype=float32),
 array([[1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0.

In [58]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15,shuffle=True,random_state=0)

In [59]:
X_train.shape

(255000, 32)

In [60]:
len(y_train[0])

32

## train

In [61]:
n_tags

3

In [62]:
raw_vocab_size = len(raw_tokenizer.word_index) + 1
target_vocab_size = len(idx2tag)

max_len = X_train.shape[1]

In [64]:
input_2 = Input(shape=(max_len,))
model2 = Embedding(input_dim=raw_vocab_size, output_dim=50, input_length=max_len)(input_2)
model2 = Dropout(0.5)(model2)
model2= Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model2)
out2 = TimeDistributed(Dense(n_tags, activation="softmax"))(model2)  
model2 = Model(input_2, out2)




In [65]:
opt = tf.keras.optimizers.Adam(lr=0.01, decay=1e-6)
model2.compile(optimizer=opt, loss="categorical_crossentropy", metrics=["accuracy"])          
model2.summary()

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 32)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 32, 50)            6612200   
_________________________________________________________________
dropout_1 (Dropout)          (None, 32, 50)            0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 32, 200)           120800    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 32, 3)             603       
Total params: 6,733,603
Trainable params: 6,733,603
Non-trainable params: 0
_________________________________________________________________


In [66]:
earlyStopping = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='min')
mcp_save = ModelCheckpoint('best_poi.h5', save_best_only=True, monitor='val_loss', mode='min')
reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, epsilon=1e-4, mode='min')


history = model2.fit(X_train, np.array(y_train), 
                    batch_size=512, 
                    epochs=100, 
                    validation_split=0.15,
                    callbacks=[earlyStopping, mcp_save, reduce_lr_loss],
                    verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
 20/424 [>.............................] - ETA: 1:42 - loss: 0.0086 - accuracy: 0.9969

KeyboardInterrupt: 

In [96]:
# import matplotlib.pyplot as plt

# def plot_graphs(history, string):
#     plt.plot(history.history[string])
#     plt.plot(history.history['val_'+string])
#     plt.xlabel("Epochs")
#     plt.ylabel(string)
#     plt.legend([string, 'val_'+string])
#     plt.show()
  

# plot_graphs(history, "accuracy")
# plot_graphs(history, "loss")

In [68]:
import random


In [93]:
i = random.randint(0,len(X_test))
# i = 42087
p = model2.predict(np.array([X_test[i]]))

p = np.argmax(p, axis=-1)


print("{:15} {:5} {:15}".format("Word",  "Pred", 'GT'))
for w, pred, gt in zip(X_test[i], p[0], list(np.argmax(y_test[i],axis=-1))):
    if w==0:
        continue
    print("{:15}: {:5} {:15} ".format(raw_tokenizer.index_word[w], idx2tag2[pred], idx2tag2[gt]))


Word            Pred  GT             
ud             : B-poi B-poi           
syinta         : I-poi I-poi           
jaya           : I-poi I-poi           
cilin          : O     O               
lama           : O     O               


In [94]:
# model.save('poi_20200315.h5')

In [30]:
word = 'wig ten iv, gununganyartambak kel. gununganyar'
word_idx = raw_tokenizer.texts_to_sequences([word])
print(word_idx)
model2 = tf.keras.models.load_model('street_20200315.h5')

[[5763, 175, 212, 12766, 13, 1316]]


In [43]:
p = model.predict(np.array(word_idx))

p = np.argmax(p, axis=-1)


print("{:15} {:5} ".format("Word",  "Pred"))
for w, pred in zip(word_idx[0], p[0]):
    if w==0:
        continue
    print("{:15}: {:5} ".format(raw_tokenizer.index_word[w], idx2tag[pred]))


Word            Pred  
wig            : B-street 
ten            : I-street 
iv,            : I-street 
gununganyartambak: O     
kel.           : O     
gununganyar    : O     


In [41]:
w

[5763, 175, 212, 12766, 13, 1316]

# predict

In [16]:
test.head()

Unnamed: 0,id,raw_address
0,0,s. par 53 sidanegara 4 cilacap tengah
1,1,"angg per, baloi indah kel. lubuk baja"
2,2,"asma laun, mand imog,"
3,3,"ud agung rej, raya nga sri wedari karanganyar"
4,4,"cut mutia, 35 baiturrahman"


In [24]:
raw_vocab_size = len(raw_tokenizer.word_index) + 1
target_vocab_size = len(idx2tag)

max_len = X_train.shape[1]

In [25]:
def build_model(raw_vocab_size,target_vocab_size,max_len):
    input_ = Input(shape=(max_len,))
    model = Embedding(input_dim=raw_vocab_size, output_dim=50, input_length=max_len)(input_)
    model = Dropout(0.5)(model)
    model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
    out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)  
    model = Model(input_, out)
    return model

In [26]:
model_street = build_model(raw_vocab_size,target_vocab_size,max_len)
model_poi = build_model(raw_vocab_size,target_vocab_size,max_len)



In [29]:
model_street = tf.keras.models.load_model('street_20200315.h5')
model_poi = tf.keras.models.load_model('poi_20200315.h5')



In [40]:
tag2idx

{'O': 0, 'I-street': 1, 'B-street': 2}

In [41]:
tag2idx2

{'O': 0, 'I-poi': 1, 'B-poi': 2}

In [56]:
X_test_ori = raw_tokenizer.texts_to_sequences(test['raw_address'])
X_test_ori = tf.keras.preprocessing.sequence.pad_sequences(X_test_ori,padding='post',maxlen=32)

In [57]:
X_test_ori.shape

(50000, 32)

In [58]:
predict_street = model_street.predict(X_test_ori)
predict_poi = model_poi.predict(X_test_ori)

In [61]:
predict_street_final = np.argmax(predict_street, axis=-1)
predict_poi_final = np.argmax(predict_poi, axis=-1)

In [82]:
predict_street_final.shape

(50000, 32)

In [101]:
' '.join(np.array(['1','2','3']))

'1 2 3'

In [68]:
raw_address_split_test = test['raw_address'].apply(lambda x: np.array(x.split(' '))).values


In [123]:
print(raw_address_split_test[0])
print(' '.join(raw_address_split_test[0][np.argwhere(predict_street_final[0] > 0).reshape(-1)]))
# predict_street_final[0][np.argwhere(x > 0.01)]

def get_prediction_word(list_x,predict):
#     print(list_x,predict)
    try:
        return ' '.join(np.array(list_x)[np.argwhere(np.array(predict) > 0).reshape(-1)])
    except:
        print(list_x,predict)
        return ' '.join(np.array(list_x)[np.argwhere(np.array(predict) > 0).reshape(-1)[:-1]])

['s.' 'par' '53' 'sidanegara' '4' 'cilacap' 'tengah']
s. par


In [124]:
final = pd.DataFrame()
final['raw_address'] = test['raw_address'].values
final['raw_address_split'] = final['raw_address'].apply(lambda x: np.array(x.split(' '))).values
final['predict_street_raw'] = predict_street_final.tolist()
final['predict_poi_raw'] = predict_poi_final.tolist()


final['predict_street'] = final[['raw_address_split','predict_street_raw']].apply(lambda x: get_prediction_word(x['raw_address_split'],x['predict_street_raw']), axis=1)
final['predict_poi'] = final[['raw_address_split','predict_poi_raw']].apply(lambda x: get_prediction_word(x['raw_address_split'],x['predict_poi_raw']), axis=1)
final.head()

['rm' 'salero' 'ajo,' 'asem' 'baris' 'raya'] [0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['hotel' 'wina' 'beach,' 'kom'] [0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


Unnamed: 0,raw_address,raw_address_split,predict_street_raw,predict_poi_raw,predict_street,predict_poi
0,s. par 53 sidanegara 4 cilacap tengah,"[s., par, 53, sidanegara, 4, cilacap, tengah]","[2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",s. par,
1,"angg per, baloi indah kel. lubuk baja","[angg, per,, baloi, indah, kel., lubuk, baja]","[2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","angg per,",
2,"asma laun, mand imog,","[asma, laun,, mand, imog,]","[0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","mand imog,",
3,"ud agung rej, raya nga sri wedari karanganyar","[ud, agung, rej,, raya, nga, sri, wedari, kara...","[0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",raya nga,ud agung
4,"cut mutia, 35 baiturrahman","[cut, mutia,, 35, baiturrahman]","[2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","cut mutia,",


In [125]:
len(final)

50000

In [126]:
pd.read_csv('sampleSubmission.csv')

Unnamed: 0,id,POI/street
0,0,/
1,1,/angg per
2,2,asma laundry/mand imogiri
3,3,ud agung rejeki/raya ngawi-
4,4,/cut mutia


In [127]:
final['final_predict'] = final['predict_poi']+'/'+final['predict_street']
final['final_predict'] = final['final_predict'].apply(lambda x: x.strip())
final = final.reset_index().rename(columns={'index':'id','final_predict':'POI/street'})


In [128]:
final.head()

Unnamed: 0,id,raw_address,raw_address_split,predict_street_raw,predict_poi_raw,predict_street,predict_poi,POI/street
0,0,s. par 53 sidanegara 4 cilacap tengah,"[s., par, 53, sidanegara, 4, cilacap, tengah]","[2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",s. par,,/s. par
1,1,"angg per, baloi indah kel. lubuk baja","[angg, per,, baloi, indah, kel., lubuk, baja]","[2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","angg per,",,"/angg per,"
2,2,"asma laun, mand imog,","[asma, laun,, mand, imog,]","[0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","mand imog,",,"/mand imog,"
3,3,"ud agung rej, raya nga sri wedari karanganyar","[ud, agung, rej,, raya, nga, sri, wedari, kara...","[0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",raya nga,ud agung,ud agung/raya nga
4,4,"cut mutia, 35 baiturrahman","[cut, mutia,, 35, baiturrahman]","[2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","cut mutia,",,"/cut mutia,"


In [129]:
submission = final[['id','POI/street']]
submission.head()

Unnamed: 0,id,POI/street
0,0,/s. par
1,1,"/angg per,"
2,2,"/mand imog,"
3,3,ud agung/raya nga
4,4,"/cut mutia,"


In [130]:
submission.to_csv('submission_20210315.csv',index=False)