In [139]:
#imports 
from tensorflow import keras
import pandas as pd
import numpy as np
from keras.models import Model
from keras.layers import LSTM,Dense,Input,Bidirectional
from nltk.tokenize.treebank import TreebankWordTokenizer
import pickle
import gensim.models as gm

<h2 style="color:red">Preparing our Data</h2>

In [140]:
# Load the pickled training sentences (X) and their labels (Y).
# NOTE(review): pickle.load can execute arbitrary code -- only open files
# that come from a trusted source.
with open('dekhteX_train100', 'rb') as x_file:
    X_train = pickle.load(x_file)
with open('dekhteY_train100', 'rb') as y_file:
    Y_train = pickle.load(y_file)

In [141]:
# Cast the labels to a NumPy array and display its shape.
# The second axis has 7 entries, matching the 7-unit softmax output of the
# models below -- presumably one-hot intent labels (TODO confirm).
Y_train = np.array(Y_train)
Y_train.shape

(700, 7)

<h2 style="color:red">Reading our word embedding</h2>
and preparing it

In [252]:
# Parse the GloVe text file into a {word: list of floats} lookup table.
# Each line holds a word followed by its vector components.
GLOVE_PATH = '/Users/mohammad/Documents/Internship-IAI/indian hotel/glove.6B.300d.txt'

embeddings_index = {}
# Decode explicitly as UTF-8 rather than relying on the locale-dependent
# default of open(); assumes the GloVe file is UTF-8 text.
with open(GLOVE_PATH, encoding='utf-8') as f:
    for line in f:
        word, *coefs = line.split()
        embeddings_index[word] = [float(c) for c in coefs]

# Special tokens: padding is all zeros, unknown words are all ones.
embeddings_index['<PAD>'] = [0] * 300
embeddings_index['<UNK>'] = [1] * 300

# Word2vec vectors and fallback vectors used by the cells below: sentence
# encoding, prepare(), return_entity() and the entity centroids all call
# word_embedding.word_vec and fall back to unk_index. These definitions must
# be live, otherwise a fresh kernel fails with NameError.
word_embedding = gm.KeyedVectors.load_word2vec_format('/Users/mohammad/Documents/Internship-IAI/indian hotel/GoogleNews-vectors-negative300.bin', binary=True)
unk_index = [1] * 300
pad_index = [0] * 300

<h2 style="color: red">Processing sentences and replacing words with their word vectors</h2>

In [312]:
MAX_SEQ = 20  # every sentence is truncated or right-padded to this many tokens
for s in range(len(X_train)):
    # Truncate long sentences, pad short ones with the '<PAD>' token.
    tokens = list(X_train[s][:MAX_SEQ])
    tokens.extend(['<PAD>'] * (MAX_SEQ - len(tokens)))
    for v, token in enumerate(tokens):
        try:
            tokens[v] = list(word_embedding.word_vec(token))
        except KeyError:
            # Only out-of-vocabulary tokens fall back to the UNK vector. Any
            # other error (e.g. re-running this cell on already-encoded data)
            # now surfaces instead of silently turning every token into UNK.
            # NOTE(review): '<PAD>' is looked up like any other token, so
            # unless the embedding has that key it is encoded as unk_index,
            # not as zeros -- confirm this is intended before changing it,
            # since saved weights were trained on this encoding.
            tokens[v] = unk_index
    X_train[s] = tokens

In [313]:
# Stack the per-sentence lists of word vectors into a single NumPy array and
# display its shape: (number of sentences, MAX_SEQ, embedding size).
X_train = np.array(X_train)
X_train.shape
# word_embedding.word_vec(X_train[0][0])

(640, 20, 300)

<h2 style="color:red">Building model with Batch size 64</h2>

In [106]:
# Training model: fixed batch of 64 sentences, each MAX_SEQ x 300.
BATCH_SIZE = 64
input_layer = Input(batch_shape=(BATCH_SIZE, MAX_SEQ, 300))

# Bidirectional LSTM encoder; the hidden size reuses MAX_SEQ as its unit count.
encoder = Bidirectional(LSTM(units=MAX_SEQ))
lstm_layer = encoder(input_layer)

# Softmax over the 7 intent classes.
classifier = Dense(7, activation="softmax")
output_layer = classifier(lstm_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='categorical_crossentropy')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (64, 20, 300)             0         
_________________________________________________________________
bidirectional_2 (Bidirection (64, 40)                  51360     
_________________________________________________________________
dense_2 (Dense)              (64, 7)                   287       
Total params: 51,647
Trainable params: 51,647
Non-trainable params: 0
_________________________________________________________________


In [144]:
# Checkpoint callback: writes the model to a file whose name embeds the
# two-digit epoch number (weight_dekhte.01.hdf5, weight_dekhte.02.hdf5, ...).
# callbacks_list is what the (currently commented-out) fit calls pass in.
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint('weight_dekhte.{epoch:02d}.hdf5')
callbacks_list = [checkpoint]

In [317]:
# #training resize
# X_train = X_train[0:640]
# Y_train = Y_train[0:640]

In [318]:
#load previous weights
# model.load_weights('weight_dekhte.60.hdf5')

In [319]:
# #fit the model
# EPOCH_SIZE = 60
# model.fit(X_train, Y_train, epochs=EPOCH_SIZE, batch_size=BATCH_SIZE, callbacks=callbacks_list)

<h2 style="color:red">Building model with Batch size 1</h2>

In [152]:
# Inference model: same architecture as the batch-64 model, but with a fixed
# batch of one sentence so single inputs can be classified.
BATCH_SIZE = 1
input_layer = Input(batch_shape=(BATCH_SIZE, MAX_SEQ, 300))

# Bidirectional LSTM encoder; the hidden size reuses MAX_SEQ as its unit count.
encoder = Bidirectional(LSTM(units=MAX_SEQ))
lstm_layer = encoder(input_layer)

# Softmax over the 7 intent classes.
classifier = Dense(7, activation="softmax")
output_layer = classifier(lstm_layer)

dekhtemodel = Model(inputs=input_layer, outputs=output_layer)
dekhtemodel.compile(optimizer='adam', loss='categorical_crossentropy')
dekhtemodel.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (1, 20, 300)              0         
_________________________________________________________________
bidirectional_4 (Bidirection (1, 40)                   51360     
_________________________________________________________________
dense_4 (Dense)              (1, 7)                    287       
Total params: 51,647
Trainable params: 51,647
Non-trainable params: 0
_________________________________________________________________


In [154]:
# Restore trained weights into the batch-size-1 model from disk.
# Disabled alternative: copy them directly from the batch-size-64 model:
# we = model.get_weights()
# dekhtemodel.set_weights(we)
dekhtemodel.load_weights('last_weights.hdf5')



In [17]:
#load previous weights
# dekhtemodel.load_weights('weight_dekhte.40.hdf5')

In [155]:
# Compile the batch-size-1 model again (same loss and optimizer as in the
# cell that built it) and print its layer summary.
dekhtemodel.compile(loss='categorical_crossentropy',
              optimizer='adam')
dekhtemodel.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (1, 20, 300)              0         
_________________________________________________________________
bidirectional_4 (Bidirection (1, 40)                   51360     
_________________________________________________________________
dense_4 (Dense)              (1, 7)                    287       
Total params: 51,647
Trainable params: 51,647
Non-trainable params: 0
_________________________________________________________________


In [316]:
# # #fit model
# EPOCH_SIZE = 12
# dekhtemodel.fit(X_train, Y_train, epochs=EPOCH_SIZE, batch_size=BATCH_SIZE, callbacks=callbacks_list)

<h2 style="color:red">Preparing Sentences for testing</h2>

In [178]:
def prepare(sentence):
    """Tokenize a sentence and encode it as model input.

    The sentence is split with TreebankWordTokenizer, truncated or
    right-padded with '<PAD>' to exactly MAX_SEQ tokens, and each token is
    replaced by its word_embedding vector. Tokens missing from the embedding
    are replaced by unk_index.

    Parameters
    ----------
    sentence : str
        Raw input text; may be empty.

    Returns
    -------
    numpy.ndarray
        Array of shape (1, MAX_SEQ, 300).
    """
    tokens = TreebankWordTokenizer().tokenize(sentence)

    # Truncate / pad unconditionally. The previous version did this inside a
    # loop over the tokens, so an empty sentence was never padded and the
    # reshape below failed.
    tokens = tokens[:MAX_SEQ]
    tokens.extend(['<PAD>'] * (MAX_SEQ - len(tokens)))

    vectors = []
    for token in tokens:
        try:
            vectors.append(list(word_embedding.word_vec(token)))
        except KeyError:
            # Out-of-vocabulary token; other errors are real bugs and propagate.
            vectors.append(unk_index)
    return np.array(vectors).reshape((1, MAX_SEQ, 300))


In [179]:
def classify(sent):
    """Predict the intent of a sentence and print the result.

    The sentence is encoded with prepare(), scored by dekhtemodel, and the
    index of the highest score selects the intent. For index 2 (the weather
    intent) the city and day are extracted with return_entity() and a
    confirmation question is printed; for every other index the intent name
    is printed. Nothing is returned.
    """
    scores = dekhtemodel.predict(prepare(sent))
    intent = np.argmax(scores)

    if intent == 2:
        # Weather intent ('GetWeather'): echo the recognized city and day.
        city = return_entity(sent, entity_city_iran)
        day = return_entity(sent, entity_day)
        print('you requested ' + city + "'s weather for " + day + "?")
        return

    intent_names = {
        0: 'AddToPlaylist',
        1: 'BookRestaurant',
        3: 'PlayMusic',
        4: 'RateBook',
        5: 'SearchCreativeWork',
        6: 'SearchScreeningEvent',
    }
    if intent in intent_names:
        print(intent_names[intent])

<h2 style="color:red">Entity recognition</h2>

In [180]:
#similarity function
from scipy import spatial
def sim(dataSetI , dataSetII):
    """Return the cosine similarity of two vectors.

    Computed as one minus SciPy's cosine distance, so identical directions
    give 1 and orthogonal vectors give 0.
    """
    cosine_distance = spatial.distance.cosine(dataSetI, dataSetII)
    return 1 - cosine_distance

In [242]:
#recognize entity
def return_entity(sent , entity, min_similarity=.1):
    """Return the token of `sent` most similar to an entity vector.

    The sentence is lower-cased and tokenized. Stop words and tokens missing
    from word_embedding are skipped. The remaining tokens are scored against
    `entity` with sim(), and the best-scoring token wins.

    Parameters
    ----------
    sent : str
        Sentence to search.
    entity : sequence of float
        Reference vector for the entity type (e.g. entity_day).
    min_similarity : float, optional
        Minimum best score required to accept a match (default .1).

    Returns
    -------
    str
        The best-matching token, or "nothing" when no token scores at least
        `min_similarity`.
    """
    tokens = TreebankWordTokenizer().tokenize(sent.lower())
    best_score = 0
    best_token = ""
    for token in tokens:
        if token in stop_words:
            continue
        try:
            vector = list(word_embedding.word_vec(token))
        except KeyError:
            # Out-of-vocabulary token: skip it. Other exceptions are real
            # errors and propagate instead of silently yielding "nothing".
            continue
        score = sim(vector, entity)  # computed once per token
        if score > best_score:
            best_score = score
            best_token = token
    if best_score < min_similarity:
        return "nothing"
    return best_token

In [302]:
#defining entities
from nltk.corpus import stopwords
embeddings_size = 300

# English stop words plus '?', which return_entity() skips when matching.
stop_words = set(stopwords.words('english')) | {'?'}

# Example words for each entity type; their mean vector represents the type.
entity_lists = {
    "cloth": ['t_shirt', 'shirts', 'jeans'],
    "city_iran": ['tehran', 'karaj', 'san_francisco'],
    "name_foreign": ['john', 'jack', 'paul'],
    "music_genre": ['pop', 'rap', 'jazz', 'rock', 'classical'],
    "day": ['tomorrow', 'today', 'yesterday', 'friday', 'sunday', 'saturdays'],
    "adverb": ['sometimes', 'usually', 'never'],
}

# Publish one module-level variable per entity type (entity_cloth,
# entity_city_iran, entity_day, ...), each holding the mean embedding of its
# example words as a plain list. classify() reads some of these by name.
for entity_name, examples in entity_lists.items():
    centroid = np.zeros(embeddings_size)
    for example in examples:
        centroid += word_embedding.word_vec(example)
#         centroid += np.array(embeddings_index[example])
    centroid /= len(examples)
    globals()['entity_{}'.format(entity_name)] = list(centroid)

<h2 style="color:red">Test</h2>

In [303]:
# Sample sentences for manual testing. Only sent1 is classified below.
# `sent` is reassigned several times, so only the last value survives; it is
# used by the commented-out return_entity checks that follow.
sent1 = 'is it going to rain in texas often sunday with a R&B and some pants?'
sent = 'is it cold tomorrow in tehran'
sent = 'add song to playlist'
sent = 'i want to hear something from micheal jackson'
sent = "how is the weather in newyork tomorrow"
classify(sent1)
# print("city in iran: " ,return_entity(sent , entity_city_iran))
# print("adverb: ", return_entity(sent, entity_adverb))
# print("time: ", return_entity(sent, entity_day))
# print("genre: ", return_entity(sent, entity_music_genre))
# print("clothes: ", return_entity(sent, entity_cloth))

you requested sunday's weather for sunday?
