In [1]:
import keras

Using TensorFlow backend.


In [2]:
from keras.layers.core import Activation, Dense, Dropout, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import collections
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
from nltk.corpus import nps_chat
from sklearn.preprocessing import LabelBinarizer

In [3]:
def get_dialogue_data(output_file):
    """Export the NPS Chat corpus to a tab-separated text file.

    Every post is written as one line: the post text, a TAB, then the
    post's ``class`` attribute in lower case. Any existing file at
    ``output_file`` is overwritten.

    Args:
        output_file: Path of the file to create or overwrite.
    """
    posts = nps_chat.xml_posts()
    # One "w" open both truncates and writes, and the context manager closes
    # the handle even if a write fails. UTF-8 is used explicitly so the export
    # does not depend on the platform's default encoding.
    with open(output_file, 'w', encoding='utf-8') as fileWrite:
        for post in posts:
            # Collapse whitespace runs (including tabs and newlines) to single
            # spaces so the text can never break the TAB-separated,
            # one-record-per-line layout.
            text = " ".join(post.text.split())
            fileWrite.write(text + '\t' + post.get('class').lower() + '\n')
            
# Write the corpus export to "nps_chat_data.txt" in the working directory.
get_dialogue_data("nps_chat_data.txt")

In [4]:
# First pass over the exported corpus: collect token frequencies, the longest
# sentence length, the record count and the set of labels.
maxlen = 0                          # longest sentence seen, in tokens
word_freqs = collections.Counter()  # token -> number of occurrences
num_recs = 0                        # number of records (lines) read
labels = []
# The file is read as bytes and decoded as ASCII with errors ignored, so any
# non-ASCII characters are silently dropped.
with open(os.path.join("nps_chat_data.txt"), 'rb') as ftrain:
    for line in ftrain:
        # Only the line terminator is removed here: stripping all surrounding
        # whitespace would also eat the TAB separator when the text is empty.
        line = line.decode('ascii', "ignore").lower().rstrip("\r\n")
        # The label is the last TAB-separated field; rsplit keeps this true
        # even if the text itself contains a TAB. A line without any TAB
        # raises ValueError.
        sentence, label = line.rsplit("\t", 1)
        words = nltk.word_tokenize(sentence)
        labels.append(label.strip())
        maxlen = max(maxlen, len(words))
        word_freqs.update(words)
        num_recs += 1
# Sorted so the list of distinct labels has the same order on every run.
labels = sorted(set(labels))
classes = len(labels)


In [5]:
# Upper bound on the number of distinct words kept in the vocabulary
# (the most frequent ones are selected when the word index is built).
MAX_FEATURES = 6000
# Fixed length, in tokens, that every sentence is padded to before training;
# also passed to the Embedding layer as its input length.
MAX_SENTENCE_LENGTH = 85

In [6]:
# Ids 0 and 1 are reserved for the "PAD" and "UNK" entries, so the most
# frequent words are numbered from 2 upwards in order of frequency.
word2index = {
    token: rank
    for rank, (token, _count) in enumerate(word_freqs.most_common(MAX_FEATURES), start=2)
}
word2index.update({"PAD": 0, "UNK": 1})
# Inverse lookup: id -> word.
index2word = dict(zip(word2index.values(), word2index.keys()))
# Number of kept words plus the two reserved ids.
vocab_size = 2 + min(len(word_freqs), MAX_FEATURES)

In [7]:
# Second pass over the exported corpus: convert every sentence into a list of
# word ids and collect its label. Lists are grown as lines are read, so the
# result never depends on a record count computed elsewhere.
unk_id = word2index["UNK"]
encoded_sentences = []
sentence_labels = []
# The file is read as bytes and decoded as ASCII with errors ignored, so any
# non-ASCII characters are silently dropped.
with open(os.path.join("nps_chat_data.txt"), 'rb') as ftrain:
    for line in ftrain:
        # Only the line terminator is removed here: stripping all surrounding
        # whitespace would also eat the TAB separator when the text is empty.
        line = line.decode('ascii', "ignore").lower().rstrip("\r\n")
        # The label is the last TAB-separated field; a line without any TAB
        # raises ValueError.
        sentence, label = line.rsplit("\t", 1)
        words = nltk.word_tokenize(sentence)
        # Words outside the vocabulary map to the shared "UNK" id.
        encoded_sentences.append([word2index.get(word, unk_id) for word in words])
        sentence_labels.append(label.strip())
# Bring every id sequence to exactly MAX_SENTENCE_LENGTH entries.
X = sequence.pad_sequences(encoded_sentences, maxlen=MAX_SENTENCE_LENGTH)
# 1-D object array of label strings, one per record.
y = np.array(sentence_labels, dtype=object)
# Binarize the string labels into the indicator matrix used as the target.
encoder = LabelBinarizer()
Y = encoder.fit_transform(y)

In [8]:
# Hold out 20% of the samples as a test set; the fixed random_state keeps the
# split identical from run to run.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.2, random_state=42)

In [9]:
# Hyper-parameters for the classifier and its training run.
EMBEDDING_SIZE = 128
HIDDEN_LAYER_SIZE = 64
BATCH_SIZE = 32
NUM_EPOCHS = 10
# Width of the output layer, taken from the binarized target matrix so it
# always matches the number of label columns instead of a hard-coded 15.
NUM_CLASSES = ytrain.shape[1]

# Embedding -> LSTM -> Dense -> softmax over the label columns.
model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_SIZE, input_length=MAX_SENTENCE_LENGTH))
model.add(LSTM(HIDDEN_LAYER_SIZE, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(NUM_CLASSES))
model.add(Activation("softmax"))

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [10]:
# validation_data is passed as the (x_val, y_val) tuple that Keras documents;
# a list is not a supported form across Keras versions.
# NOTE(review): the held-out test split doubles as the validation set here, so
# the later test score is not measured on data unseen during training
# monitoring — consider a separate validation split.
model.fit(Xtrain, ytrain, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, validation_data=(Xtest, ytest))

Train on 8453 samples, validate on 2114 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2312848a6d8>

In [11]:
# Evaluate the trained model on the held-out split. The two returned values
# are unpacked as loss ("score") and accuracy, matching the loss and the
# single "accuracy" metric the model was compiled with.
score, acc = model.evaluate(Xtest, ytest, batch_size=BATCH_SIZE)
print("Test score: %.3f, accuracy: %.3f" % (score, acc))

Test score: 0.847, accuracy: 0.805
