## 1. Preparing Data

In [1]:
import pandas as pd
import numpy as np
import nltk
from keras.preprocessing.sequence import pad_sequences
# from gensim.test.utils import datapath, get_tmpfile
# from gensim.models import KeyedVectors
# from gensim.scripts.glove2word2vec import glove2word2vec

Using TensorFlow backend.


In [None]:
# word_embeds = KeyedVectors.load_word2vec_format('word_vectors.txt')
# print("Initialized Word Embeddings...")

In [None]:
# def vectorize(sentence):
#     unk = word_embeds.wv['unknown']
#     matrix = []
#     tokens = nltk.word_tokenize(sentence)
#     tokens = [i.lower() for i in tokens]
#     for i in tokens:
#         if i not in word_embeds.vocab:
#             matrix.append(unk)
#         else:
#             matrix.append(word_embeds.wv[i])            
#     matrix = pad_sequences(maxlen=18, sequences=np.array([matrix]), padding="post", value=unk,dtype='float32')
#     return matrix[0]

In [2]:
# Load the example queries; later cells read the 'Query' and 'Action' columns.
data = pd.read_csv('./data/data.csv')

In [3]:
# Preview the first rows as a quick check of the loaded columns.
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Query,Action
0,0,0,0,What is the price for the event?,get_event_fees
1,1,1,1,Due to unavoidable reasons I'm unable to atten...,is_refundable
2,2,2,2,When is the deadline to make the payment?,get_registration_date
3,3,3,3,How much does it cost to register for the event?,get_event_fees
4,4,4,4,Do you have any online payment options for the...,get_payment_method


In [4]:
# Build the vocabulary: every distinct token of every query, in order of first
# appearance. `vocab` stays a list because the position of a token is its index;
# the companion set only makes the "already seen?" check O(1) instead of a
# linear scan of the list for each token.
vocab = []
seen_tokens = set()
for query in data['Query']:
    for token in nltk.word_tokenize(query):
        if token not in seen_tokens:
            seen_tokens.add(token)
            vocab.append(token)

# Special entries, appended last: 'UNK' stands in for out-of-vocabulary tokens
# and 'PAD' fills sequences up to the fixed length.
vocab.append('UNK')
vocab.append('PAD')

In [5]:
# Vocabulary size (the special 'UNK' and 'PAD' entries are counted too).
n_words = len(vocab)
print("Number of unique tokens: ", n_words, sep="")

Number of unique tokens: 391


In [6]:
# Distinct action labels, in order of first appearance in the data.
actions = data['Action'].unique().tolist()

In [7]:
# Number of target classes; printed together with the label list.
n_actions = len(actions)
print(actions)
print("Number of unique actions : ", n_actions, sep="")

['get_event_fees', 'is_refundable', 'get_registration_date', 'get_payment_method', 'get_prizes', 'get_discounts', 'greet', 'show_schedule', 'get_event_date', 'get_event_time', 'show_accomodation', 'show_speakers', 'speaker_details_extra', 'show_food_arrangements', 'get_distance', 'get_location', 'show_contact_info', 'about_chatbot']
Number of unique actions : 18


In [8]:
# Summarise column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291 entries, 0 to 290
Data columns (total 5 columns):
Unnamed: 0        291 non-null int64
Unnamed: 0.1      291 non-null int64
Unnamed: 0.1.1    291 non-null int64
Query             291 non-null object
Action            291 non-null object
dtypes: int64(3), object(2)
memory usage: 11.4+ KB


In [9]:
# Two-way lookup between class index and action name:
#   action_index_1: index -> action name
#   action_index_2: action name -> index
action_index_1 = dict(enumerate(actions))
action_index_2 = {name: position for position, name in action_index_1.items()}

In [10]:
def get_index_matrix(sentence, maxlen=18):
    """Encode a sentence as a fixed-length array of vocabulary indices.

    The sentence is tokenized with ``nltk.word_tokenize``; each token is
    replaced by its position in the module-level ``vocab`` list, and tokens
    that are not in ``vocab`` are mapped to the index of ``'UNK'``.

    Args:
        sentence: Text to encode.
        maxlen: Length of the returned sequence. Shorter sequences are padded
            at the end with the index of ``'PAD'``. Defaults to 18, the
            ``input_length`` the notebook's Embedding layer is built with.

    Returns:
        The single row of the ``pad_sequences`` result: a 1-D array holding
        ``maxlen`` indices.

    Note:
        Tokens are matched case-sensitively. Sequences longer than ``maxlen``
        are cut by ``pad_sequences`` with its default ``truncating`` setting
        (Keras' default is ``'pre'``, i.e. the leading tokens are dropped);
        this function does not override it.
    """
    # Look the special indices up once instead of once per token.
    unk_index = vocab.index('UNK')
    pad_index = vocab.index('PAD')

    indices = []
    for token in nltk.word_tokenize(sentence):
        try:
            # One scan of the list per token (instead of `in` plus `.index`).
            indices.append(vocab.index(token))
        except ValueError:
            indices.append(unk_index)

    padded = pad_sequences(maxlen=maxlen, sequences=[indices], padding="post", value=pad_index)
    return padded[0]

In [11]:
def get_categorical_array(action):
    """Return the one-hot encoding of an action name.

    The result is a float array of length ``n_actions`` that is 1 at the
    position ``action_index_2`` assigns to ``action`` and 0 everywhere else.
    An action missing from ``action_index_2`` raises ``KeyError``.
    """
    position = action_index_2[action]
    encoded = np.zeros(n_actions)
    encoded[position] = 1
    return encoded

In [12]:
# Spot-check the one-hot encoding for one known action.
print(get_categorical_array('is_refundable'))

[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [13]:
# Feature matrix: one row of padded vocabulary indices per query.
X = np.array([get_index_matrix(query) for query in data['Query']])
# Target matrix: one one-hot row per action label, aligned with X row by row.
Y = np.array([get_categorical_array(action) for action in data['Action']])

In [14]:
# Shapes of the feature and target matrices, one per line.
print(X.shape, Y.shape, sep="\n")

(291, 18)
(291, 18)


In [15]:
from sklearn.model_selection import train_test_split

# Hold out 5% of the examples for evaluation. The fixed random_state makes the
# split identical on every run, so results are reproducible after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.05, random_state=42)

In [16]:
# Confirm the split sizes: (features, labels) for the training set, then the test set.
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

(276, 18) (276, 18)
(15, 18) (15, 18)


## 2. Training Model

In [None]:
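# Uncomment this block to load a previously saved model instead of training a new one.
# If you do, comment out the model-building and training cells below.
# Note: a model restored this way must be compiled before calling model.evaluate.

# from keras.models import model_from_json
# json_file = open('Model/model.json', 'r')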
# loaded_model_json = json_file.read()
# json_file.close()
# model = model_from_json(loaded_model_json)
# # load weights into new model
# model.load_weights("Model/model.h5")
# print("Loaded model from disk")

In [17]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional

In [18]:
# Intent classifier: token embeddings -> bidirectional LSTM -> softmax over actions.
model = Sequential([
    # One 15-dimensional vector per vocabulary index; inputs are 18 indices long.
    Embedding(input_dim=n_words, output_dim=15, input_length=18),
    # 20 LSTM units in each direction.
    Bidirectional(LSTM(units=20)),
    # One output probability per action.
    Dense(n_actions, activation='softmax'),
])
# Targets are one-hot rows, hence categorical cross-entropy.
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [19]:
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 18, 15)            5865      
_________________________________________________________________
bidirectional_1 (Bidirection (None, 40)                5760      
_________________________________________________________________
dense_1 (Dense)              (None, 18)                738       
Total params: 12,363
Trainable params: 12,363
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
# Fit on the training split only. Fitting on the full X, Y would also train on
# x_test/y_test, so the evaluation cell would be scoring examples the model has
# already seen and its accuracy would say nothing about generalisation.
model.fit(x_train, y_train, batch_size=2, epochs=15, verbose=1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f701b1b72b0>

In [22]:
# evaluate() returns the loss followed by the compiled metric (accuracy).
score, acc = model.evaluate(x_test, y_test, batch_size=2, verbose=1)
print("score: %.2f" % score)
print("acc: %.2f" % acc)

score: 0.06
acc: 1.00


In [23]:
import os

# Make sure the output directory exists; open() fails if it is missing.
os.makedirs("Model", exist_ok=True)

# Serialize the architecture to JSON.
model_json = model.to_json()
with open("Model/model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("Model/model.h5")
print("Saved model to disk")

Saved model to disk
