In [1]:
import pandas as pd
import numpy as np

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, optimizers, Sequential
from tensorflow.keras.models import Model

In [3]:
# debugging
pd.set_option('display.max_rows', 50)

In [4]:
# read data from input
training_df = pd.read_csv('..\\Resources\\train.csv')
N = len(training_df.index)

In [5]:
training_df

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,3,"toko dita, kertosono",toko dita/
4,4,jl. orde baru,/jl. orde baru
...,...,...,...
299995,299995,jend ahmad yani 331 kertasari ciamis,/jend ahmad yani
299996,299996,"raya cila kko, cilandak timur kel.",/raya cila kko
299997,299997,tanjung gusta jl. yaya 2 no 17,/
299998,299998,jalan cipadu jaya taman asri gang bijaksana 3 ...,taman asri/


Preprocessing
-------------
Convert addresses into usable vectors

In [6]:
# env variables for preprocessing
num_words = 100000
max_length = 36 # should be even
max_label_length = int(max_length / 2)
trunc_type='pre'
padding_type='pre'
oov_tok = "<OOV>"

In [7]:
# filter training and testing data
training_sentences = training_df['raw_address'].values

raw_labels = training_df['POI/street'].values
training_labels = np.array([tuple(s.split('/')) for s in raw_labels])

In [8]:
# initialize tokenizers
# main tokenizer for training data
tokenizer = Tokenizer(num_words=num_words, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

# tokenizer for training labels
label_tokenizer = Tokenizer(num_words=num_words, oov_token=oov_tok)
label_tokenizer.fit_on_texts(training_labels.flat)

word_index = {**tokenizer.word_index, **label_tokenizer.word_index}

In [9]:
# function for converting to sequences (tokenization + padding)
def convert(sentences):
    converted_sequences = tokenizer.texts_to_sequences(sentences)
    padded_sequences = pad_sequences(converted_sequences, padding=padding_type, truncating=trunc_type, maxlen=max_length)
    return padded_sequences
def convert_labels(labels):
    converted_labels = [label_tokenizer.texts_to_sequences(label) for label in labels]
    padded_labels = [pad_sequences(label, padding=padding_type, truncating=trunc_type, maxlen=max_label_length) for label in converted_labels]
    return np.array([np.concatenate(label_pair) for label_pair in padded_labels])

In [10]:
# convert training data
converted_training_sequences = convert(training_sentences)
converted_training_labels = convert_labels(training_labels)

ML Model
-------------

In [11]:
# debugging plot graph
import matplotlib.pyplot as plt

def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()

In [12]:
# ml env variables
num_of_epochs = 20
validation_split = 0.2
embedding_dim = 30

In [13]:
# prep data into batches
x = np.array([converted_training_sequences])
y = np.array([converted_training_labels])

x_shape = x[0].shape
y_shape = y[0].shape

In [31]:
# model - basic straightforward model
model_base = Sequential([
    layers.Embedding(num_words, embedding_dim, input_length=max_length),
    layers.Bidirectional(layers.LSTM(15, return_sequences=True)),
    layers.Bidirectional(layers.LSTM(20)),
    layers.Dense(y_shape[1], activation='softmax')
])

opt = optimizers.RMSprop(learning_rate=0.005)
model_base.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
model_base.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 50, 30)            3000000   
_________________________________________________________________
bidirectional_6 (Bidirection (None, 50, 30)            5520      
_________________________________________________________________
bidirectional_7 (Bidirection (None, 40)                8160      
_________________________________________________________________
dense_3 (Dense)              (None, 50)                2050      
Total params: 3,015,730
Trainable params: 3,015,730
Non-trainable params: 0
_________________________________________________________________


In [32]:
# run model
history_base = model_base.fit(converted_training_sequences, converted_training_labels, epochs=num_of_epochs, validation_split=validation_split)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
1469/7500 [====>.........................] - ETA: 2:41 - loss: 860904.0861 - accuracy: 0.2642

KeyboardInterrupt: 

In [35]:
# debugging: show progression of model
plot_graphs(history_base, "accuracy")
plot_graphs(history_base, "loss")

NameError: name 'history_base' is not defined

Encoding <-> Decoding Model (WIP)
--------------------------------

In [28]:
# encoder
encoder_inputs = layers.Input(shape=(None,))
x = layers.Embedding(num_words, embedding_dim, input_length=max_length)(encoder_inputs)
x, state_h, state_c = layers.LSTM(x_shape[1], return_state=True)(x)
encoder_states = [state_h, state_c]

In [30]:
# decoder
decoder_inputs = layers.Input(shape=(None,))
y = layers.Embedding(num_words, embedding_dim, input_length=max_length)(encoder_inputs)
y = layers.LSTM(y_shape[1], return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

NameError: name 'LSTM' is not defined

In [24]:
# model - encoder <-> decoder
# (source: https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html)

# encoder
encoder_inputs = layers.Input(shape=(None,))
x = layers.Embedding(num_words, embedding_dim, input_length=max_length)(encoder_inputs)
x, state_h, state_c = layers.LSTM(x_shape[1], return_state=True)(x)
encoder_states = [state_h, state_c]

# decoder
decoder_inputs = Input(shape=y_shape)
y = layers.Embedding(num_words, embedding_dim, input_length=max_length)(encoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# compile into model
enc_dec_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
enc_dec_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
enc_dec_model.summary()

ResourceExhaustedError: OOM when allocating tensor with shape[1200000,300000] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu [Op:RandomStandardNormal]

In [None]:
# run model - enc_dec_model
enc_dec_history = enc_dec_model.fit(converted_training_sequences, converted_training_labels, epochs=num_of_epochs, validation_split=validation_split)

In [None]:
# debugging: show progression of enc_dec_model
plot_graphs(enc_dec_history, "accuracy")
plot_graphs(enc_dec_history, "loss")