In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_path in (os.path.join(root, name) for name in files):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import numpy as np
import pandas as pd

from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, Dropout, Dense, Activation, Flatten, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential,Model,load_model
from tensorflow.keras.optimizers import SGD
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping

import re
from nltk.corpus import stopwords
from nltk import word_tokenize
# English stop words from the NLTK corpus, stored as a set for membership tests.
stopwordset = set()
stopwordset.update(stopwords.words('english'))

In [3]:
df = pd.read_csv("/kaggle/input/intentdata/intent_data.csv")

In [4]:
df.shape

Check for null values and fill them with 0.

In [5]:
df.isnull().sum()

In [6]:
df.fillna(0,inplace=True)

In [7]:
df.isnull().sum()

In [8]:
df.to_csv('intent_data2.csv')

Clean the text by removing special characters and stop words

In [9]:
# Preprocessing for text

# Punctuation that is replaced by a single space before stop-word removal.
# Written as a raw string so that the regex escapes (\[, \], \|) are not parsed
# as invalid Python string escapes; the compiled pattern is unchanged.
replace_by_space_re = re.compile(r'[/(){}\[\]\|@,;]')
# Same stop-word set as the one built in the imports cell, re-created here.
stopwordset = set(stopwords.words('english'))

def clean_text(text):
    """Return `text` with separator punctuation and English stop words removed.

    Characters matched by `replace_by_space_re` are replaced with spaces, the
    result is split on whitespace, and every word whose lower-cased form is in
    `stopwordset` is dropped. The remaining words are joined by single spaces
    and keep their original casing.

    Args:
        text: The sentence to clean. A non-string value yields an empty string
            instead of raising; this happens when a missing sentence has been
            filled with the integer 0 by `fillna(0)`.

    Returns:
        The cleaned sentence as a `str`.
    """
    if not isinstance(text, str):
        # A filled-in placeholder (e.g. 0) carries no text to clean.
        return ''

    text = replace_by_space_re.sub(' ', text)
    # Compare in lower case: the stop-word list is lower-case, so "The" or "I"
    # would otherwise slip through.
    return ' '.join(word for word in text.split() if word.lower() not in stopwordset)

In [10]:
df['sentence'] = df['sentence'].apply(clean_text)

In [11]:
# Shuffle the data to make random distribution
df = shuffle(df)

In [12]:
df.head()

In [13]:
# Inputs are the cleaned sentences; targets are the five intent label columns.
X = df.loc[:, "sentence"]
y = df.loc[:, ["BookRestaurant", "GetWeather", "PlayMusic", "RateBook", "GetFact"]].to_numpy()
X.shape, y.shape

In [14]:
docs = df['sentence'].values
# create the tokenizer
max_nwords = 50000
max_sentence_len = 4000
embedding_dim = 32

tokenizer = Tokenizer(num_words=max_nwords, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(docs)
word_index = tokenizer.word_index
token_vocab_size = len(word_index)
print('Found %s unique tokens.' % token_vocab_size)

In [15]:
X = tokenizer.texts_to_sequences(docs)
X = pad_sequences(X, maxlen=max_sentence_len)
print('Shape of data tensor:', X.shape)

In [16]:
# Hold out 20% of the padded sequences for testing; the split itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=22)
for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, targets.shape)


In [17]:
#X_test.shape

In [18]:
Tokenizer_vocab_size = len(tokenizer.word_index) + 1

In [19]:
model = Sequential()

model.add(Embedding(Tokenizer_vocab_size, embedding_dim, input_length = max_sentence_len)) 

model.add(LSTM(10))
model.add(Dropout(0.5))
model.add(Dense(800, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(200, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(5, activation='softmax'))

model.summary()

In [20]:
epochs = 5
batch_size = 64

In [21]:
Nadam = tf.keras.optimizers.Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-08, schedule_decay=0.004)
model.compile(loss='categorical_crossentropy', optimizer=Nadam, metrics=['accuracy'])


history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

In [38]:
model.save("Intent_Classification2.h5")

In [39]:
accr = model.evaluate(X_test,y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test some sentences

In [47]:
def intention_predict_input(sentence):
    """Print the predicted intent for a single raw sentence.

    The sentence goes through the same steps as the training data: it is
    cleaned with `clean_text`, converted to word indices by the fitted
    `tokenizer`, and padded to `max_sentence_len`. The saved model is loaded
    from disk on every call.

    Args:
        sentence: The raw user utterance as a string.

    Returns:
        None. The predicted intent name is printed.
    """
    # Order must match the label columns used to build `y`.
    classes = ['BookRestaurant','GetWeather','PlayMusic','RateBook','GetFact']

    intent_model = load_model("/kaggle/working/Intent_Classification2.h5")

    # Apply the training-time cleaning so inference sees the same kind of text.
    cleaned_sentence = clean_text(sentence)
    tokens = tokenizer.texts_to_sequences([cleaned_sentence])
    # Pad to the length the model was trained on (previously a hard-coded 6000,
    # which did not match max_sentence_len).
    tokens = pad_sequences(tokens, maxlen=max_sentence_len)

    prediction = intent_model.predict(np.array(tokens))
    # One sentence in, one row of class probabilities out.
    pred = int(np.argmax(prediction[0]))
    intent = classes[pred]
    print(intent)

In [49]:
# Restaurant-booking request; presumably exercises the BookRestaurant intent.
sentence = "i would like to book a table at hotel Orion for 29th june"
intention_predict_input(sentence)

In [50]:
# Weather question; presumably exercises the GetWeather intent.
sentence = "What the weather in New York city today?"
intention_predict_input(sentence)

In [51]:
# Music request; presumably exercises the PlayMusic intent.
sentence = "I want to listen to the song hello"
intention_predict_input(sentence)

In [54]:
# Factual question; presumably exercises the GetFact intent.
sentence = "When did Chopin die ?"
intention_predict_input(sentence)

In [55]:
# Another factual question; presumably exercises the GetFact intent.
sentence = "What is the name of the most famous video game nowadays ?"
intention_predict_input(sentence)

In [53]:
# Rating statement; presumably exercises the RateBook intent.
sentence = "I rate this ticket book 5 stars"
intention_predict_input(sentence)