## 1. Preparing Data

In [52]:
import pandas as pd
import numpy as np
import nltk

In [53]:
data = pd.read_csv('./data/data.csv')

In [54]:
data.head()

Unnamed: 0,Query,Action
0,What is the price for the event?,get_event_fees
1,Due to unavoidable reasons I'm unable to atten...,is_refundable
2,When is the deadline to make the payment?,get_registration_date
3,How much does it cost to register for the event?,get_event_fees
4,Do you have any online payment options for the...,get_payment_method


In [55]:
from Text import TextAugmentor as ta
import time

In [56]:
data_aug = pd.DataFrame(columns=['Query','Action'])
querys={}

In [57]:
def augmenting(sen,ele):
    augmentor = ta(sen)
    augmentor.GenerateSentences()
    
    for item in augmentor.generated_sentences:
        querys[item]=ele

In [58]:
count=0
for sen in zip(data['Query'],data['Action']):
    #try:
        count += 1
        if count%25 == 0:
            time.sleep(120)
        augmenting(sen[0],sen[1])
    #except:
     #   print(count)

In [61]:
for key, value in querys.items():
    data = data.append(pd.DataFrame([list((key,value))],columns=['Query','Action']))

In [62]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 558 entries, 0 to 0
Data columns (total 2 columns):
Query     558 non-null object
Action    549 non-null object
dtypes: object(2)
memory usage: 13.1+ KB


In [63]:
vocab = []

for index,row in data.iterrows():
    
    tokens = nltk.word_tokenize(row['Query'])
    for i in tokens :
        
        if i not in vocab:
            
            vocab.append(i)
vocab.append('UNK') #Unknown token
vocab.append('PAD') #Pad token

In [64]:
n_words = len(vocab)
print("Number of unique tokens: " + str(n_words))

Number of unique tokens: 457


In [65]:
actions = list(data['Action'].unique())

In [66]:
print(actions)
n_actions = len(actions)
print("Number of unique actions : " + str(n_actions))

['get_event_fees', 'is_refundable', 'get_registration_date', 'get_payment_method', 'get_prizes', 'get_discounts', nan, 'show_schedule', 'get_event_date', 'get_event_time', 'get_event_domain', 'show_accomodation', 'show_speakers', 'speaker_details_extra', 'show_speakers_count', 'show_speakers_details', 'show_speaker_details', 'show_food_arrangements', 'show_food_cost', 'show_accomodation_address', 'get_distance', 'show_accomodation_price']
Number of unique actions : 22


In [67]:
action_index_1 = {}
action_index_2 = {}

for i,v in enumerate(actions):
    action_index_1[i] = v
    action_index_2[v] = i

In [68]:
def get_categorical_array(action):
    
    z = np.zeros(n_actions)
    z[action_index_2[action]] = 1
    
    return z

In [69]:
print(get_categorical_array('is_refundable'))

[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [70]:
def get_embed_matrix(sentence):
    
    embeds = []
    tokens = nltk.word_tokenize(sentence)
    
    for i in tokens:
        
        if i in vocab:
            
            n = vocab.index(i)
        else :
            
            n = vocab.index('UNK')
        embeds.append(n)
    return np.array(embeds)

In [71]:
print(get_embed_matrix('What time is the event?'))

[  0 116   1   2   5   6]


In [72]:
X = []
Y = []

for index,row in data.iterrows():
    
    X.append(get_embed_matrix(row['Query']))
    Y.append(get_categorical_array(row['Action']))
X = np.array(X)
Y = np.array(Y)

In [73]:
from keras.preprocessing.sequence import pad_sequences
X = pad_sequences(maxlen=18, sequences=X, padding="post", value=vocab.index('PAD'))

ImportError: No module named 'keras'

In [15]:
print(X[1])
print(Y[1])

[ 7  8  9 10 11 12 13  8 14  2  5 15 16 11 17 18 19  6]
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


## 2. Training Model

In [None]:
# Uncomment this to load the model if you don't wish to train
# Make sure you comment out the training part if you are uncommenting this block

# json_file = open('Model/model.json', 'r')
# loaded_model_json = json_file.read()
# json_file.close()
# model = model_from_json(loaded_model_json)
# # load weights into new model
# model.load_weights("Model/model.h5")
# print("Loaded model from disk")

In [16]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional

In [17]:
model = Sequential()
model.add(Embedding(input_dim=n_words, output_dim=15, input_length=18))
model.add(Bidirectional(LSTM(units=25, recurrent_dropout=0.2)))
model.add(Dense(n_actions,activation='softmax'))

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [18]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 18, 15)            3840      
_________________________________________________________________
bidirectional_1 (Bidirection (None, 50)                8200      
_________________________________________________________________
dense_1 (Dense)              (None, 22)                1122      
Total params: 13,162
Trainable params: 13,162
Non-trainable params: 0
_________________________________________________________________
None


In [19]:
model.fit(X,Y,batch_size=2,epochs=75,verbose=1)

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 43/75
Epoch 44/75
Epoch 45/75
Epoch 46/75
Epoch 47/75
Epoch 48/75
Epoch 49/75
Epoch 50/75
Epoch 51/75
Epoch 52/75
Epoch 53/75
Epoch 54/75
Epoch 55/75
Epoch 56/75
Epoch 57/75
Epoch 58/75
Epoch 59/75
Epoch 60/75
Epoch 61/75
Epoch 62/75
Epoch 63/75
Epoch 64/75
Epoch 65/75
Epoch 66/75
Epoch 67/75
Epoch 68/75
Epoch 69/75
Epoch 70/75
Epoch 71/75
Epoch 72/75
Epoch 73/75
Epoch 74/75
Epoch 75/75


<keras.callbacks.History at 0x7f59919d8b38>

In [20]:
model_json = model.to_json()
with open("Model/model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("Model/model.h5")
print("Saved model to disk")

Saved model to disk


In [21]:
def get_prediction(query):
    
    mat = get_embed_matrix(query)
    x = pad_sequences(maxlen=18, sequences=[mat], padding="post", value=vocab.index('PAD'))
    ans = np.argmax(model.predict(x)[0])
    return action_index_1[ans]

In [22]:
print(get_prediction('When is the event?'))

get_event_date


In [23]:
print(get_prediction('What is on day 2?'))

show_schedule


In [24]:
print(get_prediction('What about the food and stuff?'))

show_food_arrangements


In [25]:
print(get_prediction('Show me all the speakers for this event'))

show_speakers


In [27]:
print(get_prediction('I want a refund'))

is_refundable


In [42]:
print(get_prediction('When can I sign up?'))

get_registration_date


In [45]:
print(get_prediction('What time does the event start?'))

get_event_date


In [46]:
print(get_prediction('Who are the speakers?'))

show_speakers
