In [1]:
import pandas as pd
import numpy as np

from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

In [22]:
dataset = pd.read_csv('data/04_dataset.tsv', sep='\t')
print('Shape: ' + str(dataset.shape))
dataset.head(20)

Shape: (250394, 6)


Unnamed: 0.1,Unnamed: 0,user,sess_id,token,token_type,line_id
0,0,USER8,0,X,cmd,0
1,1,USER8,0,z,cmd,1
2,2,USER8,2,cd,cmd,0
3,3,USER8,2,<1>,args,0
4,4,USER8,2,cd,cmd,1
5,5,USER8,2,<1>,args,1
6,6,USER8,2,ll,cmd,2
7,7,USER8,2,vi,cmd,3
8,8,USER8,2,<1>,args,3
9,9,USER8,2,vi,cmd,4


## Model 1: One-Word-In, One-Word-Out Sequences
This is what we would use if we want to predict the next word as you're typing.

In [38]:
%%time
# make corpus
corpus = list()

def make_sentence(df):
    sentence = ''
    for t in df.token:
        sentence = sentence + str(t) + ' '
    sentence = sentence + '\n'
    corpus.append(sentence)
    

dataset.groupby(['user', 'sess_id', 'line_id']).apply(make_sentence)

CPU times: user 7.51 s, sys: 64.6 ms, total: 7.58 s
Wall time: 7.6 s


In [48]:
%%time
# integer encode text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([corpus])
encoded = tokenizer.texts_to_sequences([corpus])[0]

CPU times: user 120 ms, sys: 5.8 ms, total: 126 ms
Wall time: 126 ms


In [50]:
# determine the vocabulary size
vocab_size = len(tokenizer.word_index)+1
print('Vocabulary Size: %d'  % vocab_size)

Vocabulary Size: 3909


In [51]:
%%time
# create word -> word sequences
sequences = list()
for i in range(1, len(encoded)):
    sequence = encoded[i-1:i+1]
    sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 147275
CPU times: user 184 ms, sys: 6.77 ms, total: 191 ms
Wall time: 190 ms


In [52]:
# split into X and y elements
sequences = array(sequences)
X, y = sequences[:,0], sequences[:,1]

In [53]:
# one hot encode outputs
y = to_categorical(y, num_classes=vocab_size)

In [54]:
# define model
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))
model.add(LSTM(100))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 10)             39090     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dense_1 (Dense)              (None, 3909)              394809    
Total params: 478,299
Trainable params: 478,299
Non-trainable params: 0
_________________________________________________________________
None


In [57]:
%%time
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(X, y, epochs=1, verbose=2)

Epoch 1/1
 - 42s - loss: 3.3402 - acc: 0.2984
CPU times: user 2min 41s, sys: 40.9 s, total: 3min 22s
Wall time: 42.7 s


The best accuracy after a total of just under 20 epochs of training is **0.2984**.

In [62]:
# generate sequence of commands

def generate_seq(model, tokenizer, seed_text, n_words):
	in_text, result = seed_text, seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		encoded = array(encoded)
        print()
		# predict a word in the vocabulary
		yhat = model.predict_classes(encoded, verbose=0)
		# map predicted word index to word
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		# append to input
		in_text, result = out_word, result + ' ' + out_word
	return result

In [63]:
# evaluate
result = generate_seq(model, tokenizer, 'cd', 3)

AttributeError: 'list' object has no attribute 'shape'

In [68]:
# evaluate
in_text = 'cd'
print(in_text)
encoded = tokenizer.texts_to_sequences([in_text])[0]
encoded = array(encoded)
yhat = model.predict_classes(encoded, verbose=0)
for word, index in tokenizer.word_index.items():
	if index == yhat:
		print(word)

cd


AttributeError: 'list' object has no attribute 'shape'

## Model 2: Line by Line Sequence

In [67]:
??model.predict

[0;31mSignature:[0m [0mmodel[0m[0;34m.[0m[0mpredict[0m[0;34m([0m[0mx[0m[0;34m,[0m [0mbatch_size[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mverbose[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m [0msteps[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0m
[0;31mSource:[0m   
    [0;32mdef[0m [0mpredict[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mx[0m[0;34m,[0m[0;34m[0m
[0;34m[0m                [0mbatch_size[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m                [0mverbose[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m                [0msteps[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;34m"""Generates output predictions for the input samples.[0m
[0;34m[0m
[0;34m        Computation is done in batches.[0m
[0;34m[0m
[0;34m        # Arguments[0m
[0;34m            x: The input data, as a Numpy array[0m
[0;34m                (or list of Numpy arrays if the model 