In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import re
from sklearn.preprocessing import OneHotEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
# Fetch the "punkt" data package for NLTK (presumably what word_tokenize relies on; confirm against the NLTK docs).
nltk.download("punkt")

Using TensorFlow backend.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anmoljeet\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [48]:
def clean(sentences):
    """Turn raw sentences into lists of lower-cased word tokens.

    Every character that is not a space, an ASCII letter or a digit is
    replaced by a space before the sentence is handed to ``word_tokenize``.

    Args:
        sentences: iterable of strings.

    Returns:
        A list with one list of lower-cased tokens per input sentence.
    """
    def _normalise(sentence):
        # Blank out punctuation/symbols first so the tokenizer only sees words.
        stripped = re.sub(r'[^ a-z A-Z 0-9]', " ", sentence)
        return [tok.lower() for tok in word_tokenize(stripped)]

    return [_normalise(sentence) for sentence in sentences]

def tokens(words, filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'):
    """Build a Tokenizer and fit it on ``words``.

    Args:
        words: the texts (or token lists) passed to ``Tokenizer.fit_on_texts``.
        filters: characters the Tokenizer should strip. The backslash is
            written as ``\\\\`` in the source so the literal contains exactly
            one backslash without relying on the invalid escape ``\\]``.

    Returns:
        The fitted Tokenizer instance.
    """
    token = Tokenizer(filters = filters)
    token.fit_on_texts(words)
    return token

def maxLength(words):
    """Return the length of the longest element in ``words``.

    Raises:
        ValueError: if ``words`` is empty.
    """
    return max(map(len, words))

def encoder(tokens, words):
    """Convert ``words`` to integer sequences using the fitted tokenizer ``tokens``.

    Args:
        tokens: an object exposing ``texts_to_sequences``.
        words: the texts (or token lists) to encode.

    Returns:
        Whatever ``tokens.texts_to_sequences(words)`` returns.
    """
    sequences = tokens.texts_to_sequences(words)
    return sequences

def padder(words, maxLen):
    """Pad (at the end) every sequence in ``words`` to ``maxLen`` entries.

    Args:
        words: sequences of integer token ids.
        maxLen: target length passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` with ``padding="post"``.
    """
    # Use the caller-supplied length; the previous version silently read the
    # notebook global ``maxLengthz`` and ignored this parameter.
    return pad_sequences(words, maxlen = maxLen, padding = "post")

def oneHotEncoder(encode):
    """One-hot encode ``encode`` and return the dense result.

    A fresh ``OneHotEncoder`` (``sparse = False``) is fitted on ``encode``
    and immediately used to transform it.
    """
    return OneHotEncoder(sparse = False).fit_transform(encode)


def create_model(vocabSizez, numClasses = 5):
    """Build the (uncompiled) bidirectional-LSTM intent classifier.

    Layer stack: Embedding(vocabSizez, 128) -> Bidirectional(LSTM(128)) ->
    Dense(64, relu) -> Dropout(0.5) -> Dense(32, relu) -> Dropout(0.5) ->
    Dense(numClasses, softmax).

    Args:
        vocabSizez: input dimension of the embedding layer (vocabulary size).
        numClasses: width of the softmax output layer. Defaults to 5, the
            value that used to be hard-coded, so existing calls are unchanged.

    Returns:
        The assembled ``Sequential`` model; the caller compiles it.
    """
    model = Sequential()
    model.add(Embedding(vocabSizez, 128))
    model.add(Bidirectional(LSTM(128)))
    model.add(Dense(64, activation = "relu"))
    model.add(Dropout(0.5))
    model.add(Dense(32, activation = "relu"))
    model.add(Dropout(0.5))
    # One output unit per intent class.
    model.add(Dense(numClasses, activation = "softmax"))

    return model

def predict(text):
    """Return the model's class probabilities for one raw sentence.

    The sentence gets the same treatment as in ``clean``: non-alphanumeric
    characters are replaced by spaces, the text is tokenized and lower-cased.
    The token list is printed, encoded, padded and fed to the model.

    Relies on the notebook globals ``wordTokens`` (fitted tokenizer),
    ``maxLengthz`` (padding length) and ``model`` (trained network).

    Args:
        text: the raw sentence to classify.

    Returns:
        The array produced by ``model.predict`` for the single padded input.
    """
    cleanedSentence = re.sub(r'[^ a-z A-Z 0-9]', " ", text)
    testWord = [w.lower() for w in word_tokenize(cleanedSentence)]
    # Encode the whole sentence as ONE token list, exactly as the training
    # sentences are encoded. This yields a single sequence directly, so the
    # per-word encoding + empty-list filtering + reshape is no longer needed.
    testTokens = wordTokens.texts_to_sequences([testWord])
    print(testWord)
    x = padder(testTokens, maxLengthz)
    # predict() returns the softmax probabilities; predict_proba() was only a
    # deprecated alias for it and no longer exists in recent Keras releases.
    pred = model.predict(x)

    return pred

def getIntent(predicts, intents):
    """Print every intent with its confidence and return the top-scoring one(s).

    Args:
        predicts: 2-D array of scores; only the first row is used.
        intents: labels, positionally matching the columns of ``predicts``.

    Returns:
        NumPy array holding the label(s) whose score equals the maximum.
    """
    scores = predicts[0]
    # Rank labels and scores from most to least confident.
    order = np.argsort(-scores)
    rankedIntents = np.array(intents)[order]
    rankedScores = -np.sort(-scores)

    for idx in range(predicts.shape[1]):
        print("%s has confidence = %s" % (rankedIntents[idx], (rankedScores[idx])))

    best = np.where(rankedScores == np.amax(rankedScores))
    return rankedIntents[best]

In [29]:
# Fixed seed so the train/test/validation splits are the same on every run.
SPLIT_SEED = 42

dataSet = pd.read_csv('merged.csv', encoding = "latin1", names = ["Sentence", "Intent"])

# Hold out 10% of the rows as a test set; the rest is used for training/validation.
dataSet, test = train_test_split(dataSet, shuffle = True, test_size=0.1, random_state = SPLIT_SEED)

intent = dataSet["Intent"]
# Sorted, not list(set(...)): the iteration order of a set of strings can differ
# between Python processes, so an unsorted label list may no longer line up with
# the outputs of a model reloaded from disk after a kernel restart.
keyIntent = sorted(set(intent))
sentences = list(dataSet["Sentence"])

cleanedSetences = clean(sentences)

wordTokens = tokens(cleanedSetences)

# +1 for index 0 (assumed to be the padding slot; confirm against the Tokenizer docs).
vocabSize = len(wordTokens.word_index) + 1
maxLengthz = maxLength(cleanedSetences)

encodedSentences = encoder(wordTokens, cleanedSetences)

paddedSentences = padder(encodedSentences, maxLengthz)

# Same filter set as the sentence tokenizer minus '.' and '_'. The backslash is
# escaped explicitly; the string value is unchanged.
intentTokens = tokens(keyIntent, filters = '!"#$%&()*+,-/:;<=>?@[\\]^`{|}~')
tokenOutput = encoder(intentTokens, intent)
tokenOutput = np.array(tokenOutput).reshape(len(tokenOutput), 1)
oneHotOutput = oneHotEncoder(tokenOutput)

# 1/9 of the remaining 90% is roughly 10% of the full data set for validation.
xTrain, xValid, yTrain, yValid = train_test_split(paddedSentences, oneHotOutput, shuffle = True, test_size = 0.11111111111111, random_state = SPLIT_SEED)



In [30]:
input_shape = xTrain.shape
#print(input_shape)
model = create_model(vocabSize)
model.summary()
model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])

# The checkpoint callback writes the model to modelFileName each time val_loss
# reaches a new minimum (save_best_only=True).
modelFileName = 'modelLSTM.h5'
checkpoint = ModelCheckpoint(modelFileName, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

hist = model.fit(xTrain, yTrain, epochs = 150, batch_size = 32, validation_data = (xValid, yValid), callbacks = [checkpoint])

# Save the last-epoch model under its own name. Saving it to modelFileName
# (as before) overwrote the best-val_loss checkpoint with the final epoch.
finalModelFileName = 'modelLSTM_final.h5'
model.save(finalModelFileName)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 128)         257792    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_4 (Dense)              (None, 64)                16448     
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 32)                2080      
_________________________________________________________________
dropout_4 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 5)                 165       
Total para


Epoch 00036: val_loss did not improve from 0.00000
Epoch 37/150

Epoch 00037: val_loss did not improve from 0.00000
Epoch 38/150

Epoch 00038: val_loss did not improve from 0.00000
Epoch 39/150

Epoch 00039: val_loss did not improve from 0.00000
Epoch 40/150

Epoch 00040: val_loss did not improve from 0.00000
Epoch 41/150

Epoch 00041: val_loss did not improve from 0.00000
Epoch 42/150

Epoch 00042: val_loss did not improve from 0.00000
Epoch 43/150

Epoch 00043: val_loss did not improve from 0.00000
Epoch 44/150

Epoch 00044: val_loss did not improve from 0.00000
Epoch 45/150

Epoch 00045: val_loss did not improve from 0.00000
Epoch 46/150

Epoch 00046: val_loss did not improve from 0.00000
Epoch 47/150

Epoch 00047: val_loss did not improve from 0.00000
Epoch 48/150

Epoch 00048: val_loss did not improve from 0.00000
Epoch 49/150

Epoch 00049: val_loss did not improve from 0.00000
Epoch 50/150

Epoch 00050: val_loss did not improve from 0.00000
Epoch 51/150

Epoch 00051: val_loss di


Epoch 00078: val_loss did not improve from 0.00000
Epoch 79/150

Epoch 00079: val_loss did not improve from 0.00000
Epoch 80/150

Epoch 00080: val_loss did not improve from 0.00000
Epoch 81/150

Epoch 00081: val_loss did not improve from 0.00000
Epoch 82/150

Epoch 00082: val_loss did not improve from 0.00000
Epoch 83/150

Epoch 00083: val_loss did not improve from 0.00000
Epoch 84/150

Epoch 00084: val_loss did not improve from 0.00000
Epoch 85/150

Epoch 00085: val_loss did not improve from 0.00000
Epoch 86/150

Epoch 00086: val_loss did not improve from 0.00000
Epoch 87/150

Epoch 00087: val_loss did not improve from 0.00000
Epoch 88/150

Epoch 00088: val_loss did not improve from 0.00000
Epoch 89/150

Epoch 00089: val_loss did not improve from 0.00000
Epoch 90/150

Epoch 00090: val_loss did not improve from 0.00000
Epoch 91/150

Epoch 00091: val_loss did not improve from 0.00000
Epoch 92/150

Epoch 00092: val_loss did not improve from 0.00000
Epoch 93/150

Epoch 00093: val_loss di


Epoch 00120: val_loss did not improve from 0.00000
Epoch 121/150

Epoch 00121: val_loss did not improve from 0.00000
Epoch 122/150

Epoch 00122: val_loss did not improve from 0.00000
Epoch 123/150

Epoch 00123: val_loss did not improve from 0.00000
Epoch 124/150

Epoch 00124: val_loss did not improve from 0.00000
Epoch 125/150

Epoch 00125: val_loss did not improve from 0.00000
Epoch 126/150

Epoch 00126: val_loss did not improve from 0.00000
Epoch 127/150

Epoch 00127: val_loss did not improve from 0.00000
Epoch 128/150

Epoch 00128: val_loss did not improve from 0.00000
Epoch 129/150

Epoch 00129: val_loss did not improve from 0.00000
Epoch 130/150

Epoch 00130: val_loss did not improve from 0.00000
Epoch 131/150

Epoch 00131: val_loss did not improve from 0.00000
Epoch 132/150

Epoch 00132: val_loss did not improve from 0.00000
Epoch 133/150

Epoch 00133: val_loss did not improve from 0.00000
Epoch 134/150

Epoch 00134: val_loss did not improve from 0.00000
Epoch 135/150

Epoch 001

In [5]:
# Rebind `model` to the network stored in 'modelLSTM.h5'.
model = load_model('modelLSTM.h5')

In [49]:
# Classify one hand-written sentence and show the winning intent.
text = "Create the cent"
pred = predict(text)
topIntent = getIntent(pred, keyIntent)
print(topIntent)

['create', 'the', 'cent']
add_class has confidence = 1.0
add_attribute has confidence = 0.0
create_inheritance has confidence = 0.0
create_composition has confidence = 0.0
create_association has confidence = 0.0
['add_class']


In [60]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

complexTest = pd.read_csv('complexTestData.csv', encoding = "latin1", names = ["Sentence", "Intent"])

yPred = []
yTrue = []
#plug in test or complexTest
for _, row in complexTest.iterrows():
    pred = predict(row['Sentence'])
    intents = getIntent(pred, keyIntent)
    # getIntent returns an array (more than one entry on an exact tie); keep a
    # single label per sentence so yPred is a flat list like yTrue.
    yPred.append(intents[0])
    yTrue.append(row['Intent'])
    
    
    


['create', 'a', 'school']
add_class has confidence = 1.0
add_attribute has confidence = 0.0
create_inheritance has confidence = 0.0
create_composition has confidence = 0.0
create_association has confidence = 0.0
['create', 'a', 'playing', 'card']
add_class has confidence = 1.0
add_attribute has confidence = 0.0
create_inheritance has confidence = 0.0
create_composition has confidence = 0.0
create_association has confidence = 0.0
['create', 'an', 'alumni']
add_class has confidence = 1.0
add_attribute has confidence = 0.0
create_inheritance has confidence = 0.0
create_composition has confidence = 0.0
create_association has confidence = 0.0
['make', 'a', 'lose', 'turn', 'action', 'card']
add_class has confidence = 1.0
add_attribute has confidence = 0.0
create_inheritance has confidence = 0.0
create_composition has confidence = 0.0
create_association has confidence = 0.0
['add', 'work', 'in', 'person']
create_composition has confidence = 1.0
create_association has confidence = 3.0052626e-3

In [61]:
# Overall accuracy on the evaluated sentences.
accuracy = accuracy_score(yTrue, yPred)
print(accuracy)
# Macro-averaged F1: unweighted mean of the per-class F1 scores.
macroF1 = f1_score(yTrue, yPred, average='macro')
print(macroF1)

0.5833333333333334
0.5091687979539642


  'precision', 'predicted', average, warn_for)


In [34]:
#test.to_csv (r'testData.csv', index = None, header = None)