In [92]:
import pandas as pd
import numpy as np

from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

In [22]:
dataset = pd.read_csv('data/04_dataset.tsv', sep='\t')
print('Shape: ' + str(dataset.shape))
dataset.head(20)

Shape: (250394, 6)


Unnamed: 0.1,Unnamed: 0,user,sess_id,token,token_type,line_id
0,0,USER8,0,X,cmd,0
1,1,USER8,0,z,cmd,1
2,2,USER8,2,cd,cmd,0
3,3,USER8,2,<1>,args,0
4,4,USER8,2,cd,cmd,1
5,5,USER8,2,<1>,args,1
6,6,USER8,2,ll,cmd,2
7,7,USER8,2,vi,cmd,3
8,8,USER8,2,<1>,args,3
9,9,USER8,2,vi,cmd,4


## Model 1: One-Word-In, One-Word-Out Sequences
This is what we would use if we want to predict the next word as you're typing.

In [120]:
%%time
# make corpus
corpus = list()

def make_sentence(df):
    sentence = ''
    for t in df.token:
        sentence = sentence + str(t) + ' '
    sentence = sentence + '\n'
    corpus.append(sentence)
    
dataset.groupby(['user', 'sess_id', 'line_id']).apply(make_sentence)

CPU times: user 6.94 s, sys: 42 ms, total: 6.98 s
Wall time: 6.99 s


In [122]:
corpus = ''.join(corpus)

In [141]:
%%time
# integer encode text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([corpus])
encoded = tokenizer.texts_to_sequences([corpus])[0]

CPU times: user 181 ms, sys: 16.1 ms, total: 198 ms
Wall time: 197 ms


In [142]:
# determine the vocabulary size
vocab_size = len(tokenizer.word_index)+1
print('Vocabulary Size: %d'  % vocab_size)

Vocabulary Size: 1864


In [143]:
%%time
# create word -> word sequences
sequences = list()
for i in range(1, len(encoded)):
    sequence = encoded[i-1:i+1]
    sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 248732
CPU times: user 351 ms, sys: 9.3 ms, total: 360 ms
Wall time: 359 ms


In [144]:
# split into X and y elements
sequences = array(sequences)
X, y = sequences[:,0], sequences[:,1]

In [145]:
# one hot encode outputs
y = to_categorical(y, num_classes=vocab_size)

In [146]:
# define model
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))
model.add(LSTM(100))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 1, 10)             18640     
_________________________________________________________________
lstm_9 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dense_9 (Dense)              (None, 1864)              188264    
Total params: 251,304
Trainable params: 251,304
Non-trainable params: 0
_________________________________________________________________
None


In [147]:
%%time

# compile net
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# fit net
model.fit(X, y, epochs=10, verbose=2)

Epoch 1/10
 - 38s - loss: 3.1863 - acc: 0.3772
Epoch 2/10
 - 36s - loss: 2.7910 - acc: 0.4232
Epoch 3/10
 - 36s - loss: 2.6940 - acc: 0.4317
Epoch 4/10
 - 36s - loss: 2.6493 - acc: 0.4356
Epoch 5/10
 - 40s - loss: 2.6215 - acc: 0.4371
Epoch 6/10
 - 44s - loss: 2.6028 - acc: 0.4382
Epoch 7/10
 - 43s - loss: 2.5897 - acc: 0.4388
Epoch 8/10
 - 43s - loss: 2.5793 - acc: 0.4391
Epoch 9/10
 - 44s - loss: 2.5716 - acc: 0.4398
Epoch 10/10
 - 40s - loss: 2.5642 - acc: 0.4402
CPU times: user 22min 23s, sys: 4min 32s, total: 26min 56s
Wall time: 6min 40s


In [148]:
# generate sequence of commands

def generate_seq(model, tokenizer, seed_text, n_words):
	in_text, result = seed_text, seed_text
	
    # generate a fixed number of words
	for _ in range(n_words):
		
        # encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		encoded = array(encoded)
		
        # predict a word in the vocabulary
		yhat = model.predict(encoded, verbose=0)
		
        # map predicted word index to word
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		
        # append to input
		in_text, result = out_word, result + ' ' + out_word
	return result

In [149]:
# evaluate
result = generate_seq(model, tokenizer, 'cd', 3)
result

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [87]:
# evaluate
in_text = 'cd <1>\nls'
print(in_text)
encoded = tokenizer.texts_to_sequences([in_text])[0]
encoded = array(encoded)
yhat = model.predict(encoded, verbose=0)
for word, index in tokenizer.word_index.items():
	if index == yhat:
		print(word)

cd <1>
ls


## Model 2: Line by Line Sequence
This approach may allow the model to use the context of each line to help the model in those cases where a simple one-word-in-and-out model creates ambiguity.

In [93]:
# prepare the tokenizer on the source text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([corpus])

# determine the vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# create line-based sequences
sequences = list()
for line in ''.join(corpus).split('\n'):
	encoded = tokenizer.texts_to_sequences([line])[0]
	for i in range(1, len(encoded)):
		sequence = encoded[:i+1]
		sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

#pad input sequences
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

# split into input and output elements
sequences = array(sequences)
X, y = sequences[:,:-1],sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)

# define model
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_length-1))
model.add(LSTM(100))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# fit network
model.fit(X, y, epochs=10, verbose=2)

Vocabulary Size: 1864
Total Sequences: 103014
Max Sequence Length: 32
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 31, 10)            18640     
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dense_2 (Dense)              (None, 1864)              188264    
Total params: 251,304
Trainable params: 251,304
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
 - 79s - loss: 1.5611 - acc: 0.7301
Epoch 2/10
 - 78s - loss: 1.3113 - acc: 0.7492
Epoch 3/10
 - 78s - loss: 1.2021 - acc: 0.7504
Epoch 4/10
 - 80s - loss: 1.1349 - acc: 0.7564
Epoch 5/10
 - 79s - loss: 1.0909 - acc: 0.7630
Epoch 6/10
 - 78s - loss: 1.0533 - acc: 0.7665
Epoch 7/10
 - 78s - loss: 1.0267 - acc: 0.7

<keras.callbacks.History at 0xb2e58bd30>

## Model 3: Two-Words-In, One-Word-Out Sequence
Intermediate between the one-word-in and the whole-sentence-in approaches and pass in sub-sequences of sentences as input. This will provide a trade-off between the two previous models allowing new lines to be generated and for generation to be picked up mid line.

In [132]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts([corpus])
encoded = tokenizer.texts_to_sequences([corpus])[0]

# retrieve vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# encode 2 words -> 1 word
sequences = list()
for i in range(2, len(encoded)):
	sequence = encoded[i-2:i+1]
	sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

# pad sequences
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

# split into input and output elements
sequences = array(sequences)
X, y = sequences[:,:-1],sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)

# define model
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_length-1))
model.add(LSTM(100))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# fit network
model.fit(X, y, epochs=10, verbose=2)

Vocabulary Size: 1864
Total Sequences: 248731
Max Sequence Length: 3
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 2, 10)             18640     
_________________________________________________________________
lstm_7 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dense_7 (Dense)              (None, 1864)              188264    
Total params: 251,304
Trainable params: 251,304
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
 - 41s - loss: 3.0479 - acc: 0.3972
Epoch 2/10
 - 40s - loss: 2.4773 - acc: 0.4740
Epoch 3/10
 - 41s - loss: 2.3231 - acc: 0.4913
Epoch 4/10
 - 40s - loss: 2.2477 - acc: 0.4995
Epoch 5/10
 - 39s - loss: 2.2017 - acc: 0.5035
Epoch 6/10
 - 41s - loss: 2.1700 - acc: 0.5072
Epoch 7/10
 - 39s - loss: 2.1459 - acc: 0.50

<keras.callbacks.History at 0xb337e8c88>