## 1. Preparing Data

In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
# Load the Query/Action examples from the CSV file into a DataFrame.
data = pd.read_csv('./data/data.csv')

In [3]:
# Preview the first rows of the loaded data.
data.head()

Unnamed: 0,Query,Action
0,What is the price for the event?,get_event_fees
1,Due to unavoidable reasons I'm unable to atten...,is_refundable
2,When is the deadline to make the payment?,get_registration_date
3,How much does it cost to register for the event?,get_event_fees
4,Do you have any online payment options for the...,get_payment_method


In [4]:
from Text import TextAugmentor as ta
import time

In [5]:
# Containers used by the augmentation step: a dict keyed by sentence text
# and an empty frame with the Query/Action column layout.
querys = dict()
data_aug = pd.DataFrame(columns=['Query', 'Action'])

In [6]:
def augmenting(sen, ele):
    """Generate sentences from ``sen`` and register them in ``querys``.

    A ``TextAugmentor`` is built for ``sen`` and asked to generate sentences;
    every entry of its ``generated_sentences`` becomes a key of the
    module-level ``querys`` dict with ``ele`` as its value. A sentence that is
    already present has its value overwritten.

    Args:
        sen: Source sentence handed to the augmentor.
        ele: Value (the action label at the call site) stored for each
            generated sentence.
    """
    generator = ta(sen)
    generator.GenerateSentences()

    querys.update((sentence, ele) for sentence in generator.generated_sentences)

In [None]:
# Augment every (Query, Action) pair. `count` is the 1-based position of the
# current pair; on every 25th pair the loop sleeps for 120 seconds before
# augmenting it. Exceptions raised by augmenting() are not caught here.
count = 0
for count, sen in enumerate(zip(data['Query'], data['Action']), start=1):
    if count % 25 == 0:
        time.sleep(120)
    augmenting(*sen)

In [None]:
# Add every generated sentence (key) with its action label (value) to the
# training data in one concatenation. DataFrame.append was removed in
# pandas 2.0, and calling it once per row re-copied the whole frame each time.
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
# NOTE: re-running this cell appends the same rows again.
if querys:
    augmented_rows = pd.DataFrame(list(querys.items()), columns=['Query', 'Action'])
    data = pd.concat([data, augmented_rows], ignore_index=True)

In [None]:
# Print a summary of the DataFrame after the augmented rows were added.
data.info()

In [None]:
# Build the vocabulary: every distinct token of every query, in order of first
# appearance. dict.fromkeys de-duplicates while keeping insertion order, which
# avoids the linear `token not in vocab` list scan for each token.
vocab = list(dict.fromkeys(
    token
    for query in data['Query']
    for token in nltk.word_tokenize(query)
))
vocab.append('UNK') #Unknown token
vocab.append('PAD') #Pad token

In [None]:
# Size of the vocabulary list (this is also the Embedding layer's input_dim).
n_words = len(vocab)
print(f"Number of unique tokens: {n_words}")

In [None]:
# Distinct action labels, in order of first appearance in the data.
actions = data['Action'].unique().tolist()

In [None]:
# Show the label set and record how many classes the model has to predict.
print(actions)
n_actions = len(actions)
print(f"Number of unique actions : {n_actions}")

In [None]:
# Two lookup tables over `actions`:
#   action_index_1: class index -> action label
#   action_index_2: action label -> class index
action_index_1 = dict(enumerate(actions))
action_index_2 = {label: position for position, label in action_index_1.items()}

In [None]:
def get_categorical_array(action):
    """Return the one-hot encoding of ``action``.

    Args:
        action: An action label that is a key of ``action_index_2``.

    Returns:
        A float NumPy vector of length ``n_actions`` that is 1 at the label's
        class index and 0 everywhere else.

    Raises:
        KeyError: If ``action`` is not a key of ``action_index_2``.
    """
    position = action_index_2[action]
    one_hot = np.zeros(n_actions)
    one_hot[position] = 1
    return one_hot

In [None]:
# Quick check: print the one-hot vector for the 'is_refundable' action.
print(get_categorical_array('is_refundable'))

In [None]:
def get_embed_matrix(sentence):
    """Convert a sentence into a 1-D array of vocabulary indices.

    The sentence is tokenized with ``nltk.word_tokenize``; each token is
    replaced by its position in ``vocab``, and tokens that are not in
    ``vocab`` are replaced by the position of ``'UNK'``.

    Args:
        sentence: Text to encode.

    Returns:
        NumPy array holding one vocabulary index per token.
    """
    return np.array([
        vocab.index(word) if word in vocab else vocab.index('UNK')
        for word in nltk.word_tokenize(sentence)
    ])

In [None]:
# Quick check: print the vocabulary indices for a sample sentence.
print(get_embed_matrix('What time is the event?'))

In [None]:
# Encode the whole dataset.
# X stays a plain list of index arrays: the queries have different token
# counts, and np.array() on such a ragged list raises ValueError on
# NumPy >= 1.24. pad_sequences accepts a list of sequences directly.
X = [get_embed_matrix(query) for query in data['Query']]

# Every label vector has length n_actions, so Y is a regular 2-D array.
Y = np.array([get_categorical_array(action) for action in data['Action']])

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Bring every index sequence to exactly 18 entries: shorter ones are filled at
# the end ("post") with the index of the 'PAD' token.
# NOTE(review): sequences longer than 18 are cut by pad_sequences using its
# default truncation side - confirm that is acceptable for long queries.
pad_token_id = vocab.index('PAD')
X = pad_sequences(sequences=X, maxlen=18, padding="post", value=pad_token_id)

In [None]:
# Inspect the second example: its padded token indices and its one-hot label.
print(X[1])
print(Y[1])

## 2. Training Model

In [None]:
# Uncomment this block to load a previously saved model instead of training one.
# If you do, comment out the training cells below.

# from keras.models import model_from_json
# json_file = open('Model/model.json', 'r')
# loaded_model_json = json_file.read()
# json_file.close()
# model = model_from_json(loaded_model_json)
# # load weights into new model
# model.load_weights("Model/model.h5")
# print("Loaded model from disk")

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional

In [None]:
# Intent classifier: token indices -> 15-d embeddings -> bidirectional LSTM
# (25 units per direction) -> softmax over the n_actions classes.
model = Sequential([
    Embedding(input_dim=n_words, output_dim=15, input_length=18),
    Bidirectional(LSTM(units=25, recurrent_dropout=0.2)),
    Dense(n_actions, activation='softmax'),
])

# The targets are one-hot vectors, hence categorical cross-entropy.
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

In [None]:
# Train on the full dataset: 75 epochs, mini-batches of 2, progress output on.
model.fit(x=X, y=Y, epochs=75, batch_size=2, verbose=1)

In [None]:
import os
from keras.models import model_from_json

# Persist the architecture as JSON and the weights as HDF5 under Model/.
os.makedirs("Model", exist_ok=True)  # open() below fails if the folder is missing
model_json = model.to_json()
with open("Model/model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("Model/model.h5")
print("Saved model to disk")

# Round-trip check: rebuild the network from the saved architecture and load
# the saved weights into it. Previously my_model held the return value of
# load_weights(), which is not a model.
my_model = model_from_json(model_json)
my_model.load_weights("Model/model.h5")
print(my_model)



In [None]:
def get_prediction(query):
    """Return the action label the trained model predicts for ``query``.

    The query is encoded with ``get_embed_matrix``, padded to 18 indices in
    the same way as the training data, and passed to ``model.predict``; the
    class with the highest score is mapped back to its label through
    ``action_index_1``.

    Args:
        query: Text of the user query.

    Returns:
        The predicted action label.
    """
    token_ids = get_embed_matrix(query)
    padded = pad_sequences(sequences=[token_ids], maxlen=18, padding="post", value=vocab.index('PAD'))
    scores = model.predict(padded)[0]
    best_class = np.argmax(scores)
    return action_index_1[best_class]

In [None]:
# Print the action predicted for a sample query.
print(get_prediction('When is the event?'))

In [None]:
# Print the action predicted for a sample query.
print(get_prediction('What is on day 2?'))

In [None]:
# Print the action predicted for a sample query.
print(get_prediction('What about the food and stuff?'))

In [None]:
# Print the action predicted for a sample query.
print(get_prediction('Show me all the speakers for this event'))

In [None]:
# Print the action predicted for a sample query.
print(get_prediction('I want a refund'))

In [None]:
# Print the action predicted for a sample query.
print(get_prediction('When can I sign up?'))

In [None]:
# Print the action predicted for a sample query.
print(get_prediction('What time does the event start?'))

In [None]:
# Print the action predicted for a sample query.
print(get_prediction('Who are the speakers?'))

In [None]:
# Print the object that the save/load cell stored in my_model.
print(my_model)