In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Our Helper Functions
from helper import *

Using TensorFlow backend.


# Read Dataset

In [2]:
# Load the pre-processed command lines (one row per shell command).
# NOTE(review): read_pickle un-pickles the file, which can execute arbitrary
# code — only load this from a trusted, locally produced artefact.
lines = pd.read_pickle("./data/02_bin/01_lines.bin")

In [3]:
# Peek at the first five rows to check the columns that were loaded.
lines.head(n=5)

Unnamed: 0,user,sess_id,line_id,line_text
0,USER1+0,USER0->0,0,whoami
1,USER1+0,USER0->0,1,pwd
2,USER1+0,USER0->0,2,ls
3,USER1+0,USER0->0,3,dir
4,USER1+0,USER0->0,4,vi


# Tokenize Dataset

In [4]:
# Every whitespace-separated token of a command is a vocabulary item:
# case is preserved and no characters are filtered out.
corpus = lines["line_text"]
tokenizer = Tokenizer(filters="", lower=False, split=' ')
tokenizer.fit_on_texts(corpus)

# Integer-encode each command line with the fitted vocabulary.
encoded = tokenizer.texts_to_sequences(corpus)

# One extra slot on top of the learned words (id 0 shows up as the padding
# value in the padded pairs, so it is presumably not a real token).
vocab_size = 1 + len(tokenizer.word_index)

In [5]:
# Show the first ten commands side by side with their integer encodings.
corpus.head(10).to_frame().assign(encoded=encoded[:10])

Unnamed: 0,line_text,encoded
0,whoami,[485]
1,pwd,[42]
2,ls,[3]
3,dir,[22]
4,vi,[6]
5,source <1>,"[100, 1]"
6,source <1>,"[100, 1]"
7,exit,[13]
8,whereis <1>,"[293, 1]"
9,mkdir <1>,"[74, 1]"


# Model 1: One-Word-In, One-Word-Out Sequences

In [6]:
# Illustrate, for a handful of commands, how each encoded line is segmented
# into model inputs — once decoded back to text and once as raw token ids.
preview_rows = slice(60, 65)
corpus.iloc[preview_rows].to_frame().assign(
    model_inputs=[
        tokenizer.sequences_to_texts(segment_and_pad(token_ids))
        for token_ids in encoded[preview_rows]
    ],
    model_inputs_as_nums=list(map(segment_and_pad, encoded[preview_rows])),
)

Unnamed: 0,line_text,model_inputs,model_inputs_as_nums
60,inger <1>,[inger <1>],"[[1177, 1]]"
61,r -l <2>,"[r -l, -l <2>]","[[79, 15], [15, 4]]"
62,exit,[exit],"[[13, 0]]"
63,elm,[elm],"[[7, 0]]"
64,elm,[elm],"[[7, 0]]"


In [7]:
# Segment (and pad) every encoded command with size=2; the helper comes from
# helper.py and its implementation is not visible here.
# This step is slow: the recorded run took about 3m41s for 145,687 lines.
sequences = segment_and_pad_all(encoded, size=2)

Segmenting and padding...: 100%|██████████| 145687/145687 [03:41<00:00, 658.52it/s] 


In [8]:
# Stack the segments into a 2-D array (one row per segment) and show its shape.
sequences = np.array(sequences)
np.shape(sequences)

(162049, 2)

In [None]:
# First column is the model input; second column is the target, one-hot
# encoded over the whole vocabulary.
X, y = sequences[:, 0], to_categorical(sequences[:, 1], num_classes=vocab_size)

# Keep half of the pairs as a hold-out set; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=2019)

In [None]:
# Single input token -> 100-d embedding -> LSTM -> softmax over the vocabulary.
model1 = Sequential()
model1.add(Embedding(vocab_size, 100, input_length=1))
model1.add(LSTM(100, dropout=0.2))
model1.add(Dense(vocab_size, activation='softmax'))
model1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# A quarter of the training split is held back for per-epoch validation.
history = model1.fit(
    X_train,
    y_train,
    validation_split=0.25,
    epochs=10,
    batch_size=16,
    verbose=1,
)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Train on 60768 samples, validate on 20256 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10

In [None]:
# Keras < 2.3 records the 'accuracy' metric under 'acc'/'val_acc'; later
# releases (and tf.keras 2.x) use 'accuracy'/'val_accuracy'. Accept either so
# the plots do not fail with KeyError after a library upgrade.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

# Plot training & validation accuracy values
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Plot training & validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [None]:
# `Sequential.predict_classes` has been removed from recent TensorFlow/Keras
# releases. Taking the argmax of the softmax output is its documented
# replacement and gives the same class ids on older versions too.
y_pred = model1.predict(X_test).argmax(axis=-1)

# Recover the integer labels from the one-hot targets.
y_truth = y_test.argmax(axis=1)
print(
    f"  Holdout Set Accuracy = {round(accuracy_score(y_truth, y_pred)*100,2)}%"
)

# Model 2: Padded Lines

In [None]:
# Show how a few commands are segmented for the second model
# (size=4, pad_pre=True), as raw token ids.
preview_rows = slice(100, 110)
corpus.iloc[preview_rows].to_frame().assign(
    model_inputs_as_nums=[
        segment_and_pad(token_ids, size=4, pad_pre=True)
        for token_ids in encoded[preview_rows]
    ],
)

In [None]:
# Re-segment every encoded command for Model 2, overwriting `sequences`.
# NOTE(review): pad_pre=True presumably pads short commands on the left —
# confirm against helper.py, which is not visible here.
sequences = segment_and_pad_all(encoded, size=4, pad_pre=True)

In [None]:
# Stack the segments into an array: every column but the last is model input,
# the last column is the target (one-hot encoded over the vocabulary).
sequences = np.array(sequences)
X, y = sequences[:, :-1], to_categorical(sequences[:, -1], num_classes=vocab_size)

# Keep half of the samples as a hold-out set; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=2019)

In [None]:
# Number of context tokens per sample, read from the data rather than
# hard-coded so it stays correct if the segment size changes.
context_len = X_train.shape[1]

# NOTE(review): `max_length` was not defined anywhere in this notebook, so on a
# fresh kernel this cell raised NameError unless the name leaked in through
# `from helper import *`. It is defined here as the full padded segment length
# (context tokens + 1 target token) — confirm this is the intended LSTM width.
max_length = context_len + 1

# Context tokens -> 10-d embeddings -> LSTM -> softmax over the vocabulary.
model2 = Sequential([
    Embedding(vocab_size, 10, input_length=context_len),
    LSTM(max_length, dropout = 0.2),
    Dense(vocab_size, activation='softmax')
])
model2.compile(
    loss='categorical_crossentropy', 
    optimizer='adam', 
    metrics=['accuracy']
)

# A quarter of the training split is held back for per-epoch validation.
history = model2.fit(X_train, y_train, validation_split=0.25, epochs=10, batch_size=16, verbose=1)

In [None]:
# Keras < 2.3 records the 'accuracy' metric under 'acc'/'val_acc'; later
# releases (and tf.keras 2.x) use 'accuracy'/'val_accuracy'. Accept either so
# the plots do not fail with KeyError after a library upgrade.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

# Plot training & validation accuracy values
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Plot training & validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [None]:
# `Sequential.predict_classes` has been removed from recent TensorFlow/Keras
# releases. Taking the argmax of the softmax output is its documented
# replacement and gives the same class ids on older versions too.
y_pred = model2.predict(X_test).argmax(axis=-1)

# Recover the integer labels from the one-hot targets.
y_truth = y_test.argmax(axis=1)
print(
    f"  Holdout Set Accuracy = {round(accuracy_score(y_truth, y_pred)*100,2)}%"
)