In [None]:
import json
import os
import numpy as np
import tensorflow as tf
import random
from keras.models import Sequential
from keras import layers
from keras.regularizers import l2
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
train_data = []
trainset_path = './train/'
for file in os.listdir(trainset_path):
    if file == 'schema.json':
        continue
        
    with open(os.path.join(trainset_path, file), 'r') as json_file:
        for elem in json.load(json_file):
            train_data.append(elem)

In [None]:
test_data = []
testset_path = './test/'
for file in os.listdir(testset_path):
    if file == 'schema.json':
        continue
        
    with open(os.path.join(testset_path, file), 'r') as json_file:
        for elem in json.load(json_file):
            test_data.append(elem)

In [None]:
labels = ["o", "restaurant_name", "city", "time", "cuisine", "date"]

def slot2label(slot):
  if slot == "restaurant_name":
    return 2
  elif slot == "city":
    return 3
  elif slot == "time":
    return 4
  elif slot == "cuisine":
    return 5
  elif slot == "date":
    return 6
  else:
    return 1;

In [None]:
train_labels = []
test_labels = []
train_sentences = []
test_sentences = []

for dialogue in train_data:
    for idx in range(0, len(dialogue['turns']), 2):
        sentence = dialogue['turns'][idx]['utterance']
        labels = np.ones(len(sentence))
        slots = dialogue['turns'][idx]['frames'][0]['slots']

        for slot in slots:
          if slot2label(slot['slot']) != 1:
            labels = np.concatenate([labels[:slot['start']], np.array([slot2label(slot['slot'])]*(slot['exclusive_end']-slot['start'])), labels[slot['exclusive_end']:]])
            
            train_sentences.append(sentence)
            train_labels.append(labels);
        
for dialogue in test_data:
    for idx in range(0, len(dialogue['turns']), 2):
        sentence = dialogue['turns'][idx]['utterance']
        labels = np.ones(len(sentence))
        slots = dialogue['turns'][idx]['frames'][0]['slots']

        for slot in slots:
          if slot2label(slot['slot']) != 1:
            labels = np.concatenate([labels[:slot['start']], np.array([slot2label(slot['slot'])]*(slot['exclusive_end']-slot['start'])), labels[slot['exclusive_end']:]])
          
            test_sentences.append(sentence)
            test_labels.append(labels);

In [None]:
all_text = " ".join([" ".join(x) for x in train_sentences+test_sentences])
vocab = sorted(set(all_text))
char2idx = {u:i+1 for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

train_sentences = list(map(lambda s:list(map(lambda c: char2idx[c],s)),train_sentences))
test_sentences = list(map(lambda s:list(map(lambda c: char2idx[c],s)),test_sentences))

In [None]:
BATCH_SIZE = 128
BUFFER_SIZE = 1000

def gen_train_series():
    for eg in zip(train_sentences, train_labels):
      yield eg[0],eg[1]

def gen_test_series():
  for eg in zip(test_sentences, test_labels):
      yield eg[0],eg[1]

series = tf.data.Dataset.from_generator(gen_train_series,output_types=(tf.int32, tf.int32),output_shapes = ((None, None)))
series_test = tf.data.Dataset.from_generator(gen_test_series,output_types=(tf.int32, tf.int32),output_shapes = ((None, None)))

ds_series_batch = series.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE, padded_shapes=([None], [None]), drop_remainder=True)
ds_series_batch_test = series_test.padded_batch(BATCH_SIZE, padded_shapes=([None], [None]), drop_remainder=True)


In [None]:
vocab_size = len(vocab)+1

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

label_size = len(labels)  

# build LSTM model
def build_model(vocab_size,label_size, embedding_dim, rnn_units, batch_size):
      model = tf.keras.Sequential([
          tf.keras.layers.Embedding(vocab_size, embedding_dim,
                            batch_input_shape=[batch_size, None],mask_zero=True),
          tf.keras.layers.LSTM(rnn_units,
                      return_sequences=True,
                      stateful=True,
                      recurrent_initializer='glorot_uniform'),
          tf.keras.layers.Dense(label_size)
          ])
      return model

model = build_model(
      vocab_size = len(vocab)+1,
      label_size=len(labels)+1,
      embedding_dim=embedding_dim,
      rnn_units=rnn_units,
      batch_size=BATCH_SIZE)

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (128, None, 256)          19968     
_________________________________________________________________
lstm_1 (LSTM)                (128, None, 1024)         5246976   
_________________________________________________________________
dense_1 (Dense)              (128, None, 27)           27675     
Total params: 5,294,619
Trainable params: 5,294,619
Non-trainable params: 0
_________________________________________________________________


In [None]:
# define loss function
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

model.compile(optimizer='adam', loss=loss,metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [None]:
EPOCHS=10
history = model.fit(ds_series_batch, epochs=EPOCHS, validation_data=ds_series_batch_test,callbacks=[checkpoint_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
preds = np.array([])
y_trues= np.array([])

for input_example_batch, target_example_batch in ds_series_batch_test:

  pred=model.predict_on_batch(input_example_batch)
  pred_max=tf.argmax(tf.nn.softmax(pred),2).numpy().flatten()
  y_true=target_example_batch.numpy().flatten()

  preds=np.concatenate([preds,pred_max])
  y_trues=np.concatenate([y_trues,y_true])

remove_padding = [(p,y) for p,y in zip(preds,y_trues) if y!=0]

r_p = [x[0] for x in remove_padding]
r_t = [x[1] for x in remove_padding]

print(confusion_matrix(r_p,r_t))
print(classification_report(r_p,r_t))

[[156288    422   2387    799      0   1134]
 [   162    920    256      0      0      1]
 [  1954     47  13899      0      0      6]
 [  1925      0      4   6374      0      7]
 [   246     44     54      0      0      2]
 [  4363     16     42     12      0  13340]]


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         1.0       0.95      0.97      0.96    161030
         2.0       0.63      0.69      0.66      1339
         3.0       0.84      0.87      0.85     15906
         4.0       0.89      0.77      0.82      8310
         5.0       0.00      0.00      0.00       346
         6.0       0.92      0.75      0.83     17773

    accuracy                           0.93    204704
   macro avg       0.70      0.67      0.69    204704
weighted avg       0.93      0.93      0.93    204704

