In [159]:
!pip install h5py==2.10.0



In [160]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint


In [161]:
def load_dataset(filename):
  df = pd.read_csv(filename, encoding = "latin1", names = ["Sentence", "Intent"])
  df.drop(index=df.index[0], axis=0, inplace=True)
  print(df.head())
  intent = df["Intent"]
  unique_intent = list(set(intent))
  sentences = list(df["Sentence"])
  
  return (intent, unique_intent, sentences)
  


In [162]:
df = pd.read_csv('data/train.csv')
df.head(100)

Unnamed: 0,Text,Intents
0,hi,GREETINGS
1,hello,GREETINGS
2,hey,GREETINGS
3,helloo,GREETINGS
4,hellooo,GREETINGS
...,...,...
78,Famous woman,CODE_WOMAN
79,Role models,CODE_WOMAN
80,list some iconic women in STEM who have had a ...,CODE_WOMAN
81,Can you name some famous female scientists in ...,CODE_WOMAN


In [163]:
intent, unique_intent, sentences = load_dataset("data/train.csv")

   Sentence     Intent
1        hi  GREETINGS
2     hello  GREETINGS
3       hey  GREETINGS
4    helloo  GREETINGS
5   hellooo  GREETINGS


In [164]:
print(sentences[:5])

['hi', ' hello', ' hey', ' helloo', ' hellooo']


In [165]:
import os
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/azureuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [166]:
#define stemmer
stemmer = LancasterStemmer()

In [167]:
def cleaning(sentences):
  words = []
  for s in sentences:
    clean = re.sub(r'[^ a-z A-Z 0-9]', " ", s)
    w = word_tokenize(clean)
    #stemming
    words.append([i.lower() for i in w])
    
  return words  

In [168]:
cleaned_words = cleaning(sentences)
print(len(cleaned_words))
print(cleaned_words[:2])  
  


83
[['hi'], ['hello']]


In [169]:
def create_tokenizer(words, filters = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'):
  token = Tokenizer(filters = filters)
  token.fit_on_texts(words)
  return token

In [170]:
def max_length(words):
  return(len(max(words, key = len)))
  

In [197]:
word_tokenizer = create_tokenizer(cleaned_words)
vocab_size = len(word_tokenizer.word_index) + 1
# max_length = max_length(cleaned_words)
max_length = 1000

print("Vocab Size = %d and Maximum length = %d" % (vocab_size, max_length))

Vocab Size = 176 and Maximum length = 1000


In [198]:
import pickle

# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(word_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [199]:
def encoding_doc(token, words):
  return(token.texts_to_sequences(words))

In [200]:
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)

In [201]:
def padding_doc(encoded_doc, max_length):
  return(pad_sequences(encoded_doc, maxlen = max_length, padding = "post"))

In [202]:
padded_doc = padding_doc(encoded_doc, max_length)

In [203]:
padded_doc[:5]

array([[58,  0,  0, ...,  0,  0,  0],
       [59,  0,  0, ...,  0,  0,  0],
       [60,  0,  0, ...,  0,  0,  0],
       [61,  0,  0, ...,  0,  0,  0],
       [62,  0,  0, ...,  0,  0,  0]], dtype=int32)

In [204]:
print("Shape of padded docs = ",padded_doc.shape)

Shape of padded docs =  (83, 1000)


In [205]:
#tokenizer with filter changed
output_tokenizer = create_tokenizer(unique_intent, filters = '!"#$%&()*+,-/:;<=>?@[\]^`{|}~')


In [206]:
output_tokenizer.word_index

{'greetings': 1, 'code_woman': 2, 'conversation': 3, 'code_career': 4}

In [207]:
encoded_output = encoding_doc(output_tokenizer, intent)

In [208]:
x=[]
for i in encoded_output:
    x.append(i[0])
encoded_output=x

In [209]:
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)

In [210]:
encoded_output

array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [3],
       [4],
       [4],
       [4],
       [4],
       [4],
       [4],
       [2],
       [2],
       [2],
       [2],
       [2],
       [2]])

In [211]:
def one_hot(encode):
  o = OneHotEncoder(sparse = False)
  return(o.fit_transform(encode))

In [212]:
output_one_hot = one_hot(encoded_output)

In [213]:
output_one_hot

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],


In [214]:
from sklearn.model_selection import train_test_split

In [215]:
train_X, val_X, train_Y, val_Y = train_test_split(padded_doc, output_one_hot, shuffle = True, test_size = 0.2)


In [216]:
print("Shape of train_X = %s and train_Y = %s" % (train_X.shape, train_Y.shape))
print("Shape of val_X = %s and val_Y = %s" % (val_X.shape, val_Y.shape))

Shape of train_X = (66, 1000) and train_Y = (66, 4)
Shape of val_X = (17, 1000) and val_Y = (17, 4)


In [217]:
from tensorflow.keras import layers
import tensorflow as tf

max_features = 4650
embedding_dim = 128
sequence_length = 500
# A integer input for vocab indices.
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Next, we add a layer to map those vocab indices into a space of dimensionality
# 'embedding_dim'.
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)

# Conv1D + global max pooling
x = layers.Conv1D(128, 6, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(128, 6, padding="valid", activation="relu", strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)

# We add a vanilla hidden layer:
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

# We project onto a single unit output layer, and squash it with a sigmoid:
predictions = layers.Dense(4, activation="sigmoid", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)

# Compile the model with binary crossentropy loss and an adam optimizer.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [218]:
#model = create_model(vocab_size, max_length)
import tensorflow
opt=tensorflow.keras.optimizers.Adam(learning_rate=1e-5)
model.compile(loss = "categorical_crossentropy", optimizer = opt, metrics = ["accuracy"])
model.summary()

Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_6 (Embedding)      (None, None, 128)         595200    
_________________________________________________________________
dropout_12 (Dropout)         (None, None, 128)         0         
_________________________________________________________________
conv1d_12 (Conv1D)           (None, None, 128)         98432     
_________________________________________________________________
conv1d_13 (Conv1D)           (None, None, 128)         98432     
_________________________________________________________________
global_max_pooling1d_6 (Glob (None, 128)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 128)               1651

In [219]:
filename = 'intent.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

hist = model.fit(train_X, train_Y, epochs = 100, batch_size = 32, validation_data = (val_X, val_Y), callbacks = [checkpoint])

Epoch 1/100
Epoch 00001: val_loss improved from inf to 1.38417, saving model to intent.h5
Epoch 2/100
Epoch 00002: val_loss improved from 1.38417 to 1.38392, saving model to intent.h5
Epoch 3/100
Epoch 00003: val_loss improved from 1.38392 to 1.38366, saving model to intent.h5
Epoch 4/100
Epoch 00004: val_loss improved from 1.38366 to 1.38340, saving model to intent.h5
Epoch 5/100
Epoch 00005: val_loss improved from 1.38340 to 1.38311, saving model to intent.h5
Epoch 6/100
Epoch 00006: val_loss improved from 1.38311 to 1.38281, saving model to intent.h5
Epoch 7/100
Epoch 00007: val_loss improved from 1.38281 to 1.38251, saving model to intent.h5
Epoch 8/100
Epoch 00008: val_loss improved from 1.38251 to 1.38221, saving model to intent.h5
Epoch 9/100
Epoch 00009: val_loss improved from 1.38221 to 1.38193, saving model to intent.h5
Epoch 10/100
Epoch 00010: val_loss improved from 1.38193 to 1.38166, saving model to intent.h5
Epoch 11/100
Epoch 00011: val_loss improved from 1.38166 to 1.3

In [220]:
model.save("intent.h5")

In [None]:
# model = load_model("intent.h5")

In [221]:
def predictions(text):
  clean = re.sub(r'[^ a-z A-Z 0-9]', " ", text)
  test_word = word_tokenize(clean)
  test_word = [w.lower() for w in test_word]
  test_ls = word_tokenizer.texts_to_sequences(test_word)
  print(test_word)
  #Check for unknown words
  if [] in test_ls:
    test_ls = list(filter(None, test_ls))
    
  test_ls = np.array(test_ls).reshape(1, len(test_ls))
 
  x = padding_doc(test_ls, max_length)
  
  pred = model.predict(x)
  
  
  return pred


  

In [222]:
def get_final_output(pred, classes):
  predictions = pred[0]
 
  classes = np.array(classes)
  ids = np.argsort(-predictions)
  classes = classes[ids]
  predictions = -np.sort(-predictions)
 
  for i in range(pred.shape[1]):
    print("%s has confidence = %s" % (classes[i], (predictions[i])))



In [223]:
text = "I want to be a doctor when I growup"
pred = predictions(text)
get_final_output(pred, unique_intent)

['i', 'want', 'to', 'be', 'a', 'doctor', 'when', 'i', 'growup']
CONVERSATION has confidence = 0.5416126
GREETINGS has confidence = 0.5298082
CODE_CAREER has confidence = 0.48259538
CODE_WOMAN has confidence = 0.46001616


In [None]:
pred

array([[0.50607824, 0.5036283 , 0.485618  ]], dtype=float32)