### Exploration of LSTMs


In [27]:
from __future__ import print_function
import re
import urllib.request
import zipfile
import lxml.etree
import itertools
import numpy as np
import tensorflow as tf
import time
import pickle
import os
import random
import sys
import h5py
import pickle
import pandas as pd
from utils import *

from keras.models import Sequential, load_model, model_from_json
from keras.layers import Dense, Activation, Dropout, LSTM, GRU
from keras.optimizers import RMSprop, Adam
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.layers.wrappers import TimeDistributed
from keras.callbacks import LearningRateScheduler
from keras import regularizers

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
time_percentage = 0.9
explained_variance = 0.9
df = pd.read_csv("data/rescuetime_data-ac-min.csv")
data_pd = Clean_DF(df)
data_pd.clean_data(time_percentage=time_percentage)
data_pd.clean_df = data_pd.clean_df.reset_index()
data_pd.get_pca(explained_variance=explained_variance)
data_pd.get_day_time()

In [None]:
# Saving the objects:
# with open('data_pd_80.pickle', 'wb') as f:  # Python 3: open(..., 'wb')
#     pickle.dump(data_pd, f)

# # Getting back the objects:
with open('data_pd.pickle', 'rb') as f:  # Python 3: open(..., 'rb')
    data_pd = pickle.load(f)


In [3]:
print("Dataset size:", data_pd.clean_df.shape,'\n')
print("Number of apps that consume", time_percentage*100, "% of all users time: ",len(data_pd.popular_apps), '\n')
print("Cleaned dataset columns:",'\n', data_pd.clean_df.columns.values, '\n')
print("Number of components that explain", explained_variance*100,"% of the data: ",data_pd.pca_data.shape[1], '\n')

Dataset size: (16704, 9) 

Number of apps that consume 90.0 % of all users time:  99 

Cleaned dataset columns: 
 ['Date' 'Time Spent (seconds)' 'Activity' 'Category' 'Productivity'
 'Activity Vector' 'Productivity Score' 'Day' 'Time'] 

Number of components that explain 90.0 % of the data:  30 



In [17]:
FLAGS = tf.flags
FLAGS.look_back = 24
FLAGS.batch_size = 8
FLAGS.inputlength = data_pd.activity_vector.shape[1]
np.random.seed(7)
dataset = data_pd.activity_vector

In [5]:
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back), :]
        dataX.append(a)
        dataY.append(dataset[i + look_back, :])
    return np.array(dataX), np.array(dataY)

In [10]:
# split into train and test sets
train_size = int(len(dataset) * 0.8)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]
print(len(train), len(test))

13363 3341


In [11]:
trainX, trainY = create_dataset(train, FLAGS.look_back)
testX, testY = create_dataset(test, FLAGS.look_back)

In [12]:
trainX.shape

(13338, 24, 99)

## Build model

In [40]:
# RNN parameters
N_HIDDEN = 64
LEARNING_RATE = 0.001
EPOCHS = 20

In [41]:
print('Building training model...')
model = Sequential()
model.add(GRU(N_HIDDEN, dropout_U=0.2, dropout_W=0.2, input_shape=(FLAGS.look_back, FLAGS.inputlength)))
model.add(Dense(N_HIDDEN, activation='relu'))
model.add(Dense(FLAGS.inputlength, activation='softmax'))  # Add another dense layer with the desired output size.
model.compile(loss='mean_squared_error', optimizer = RMSprop(lr=LEARNING_RATE, clipnorm=5))

print(model.summary()) # Convenient function to see details about the network model.

Building training model...


  This is separate from the ipykernel package so we can avoid doing imports until


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_12 (GRU)                 (None, 64)                31488     
_________________________________________________________________
dense_19 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_20 (Dense)             (None, 99)                6435      
Total params: 42,083.0
Trainable params: 42,083
Non-trainable params: 0.0
_________________________________________________________________
None


## Build inference model
Note: the inference model will have only one time step as we will feed each predicted character back into the rnn as a seed for predicting the next character. It will also be stateful so as to 'remember' previous states.

In [42]:
model.fit(x=trainX, y=trainY, epochs=EPOCHS, batch_size=FLAGS.batch_size, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fb0e34d7898>

In [46]:
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)

In [56]:
np.sqrt(np.mean( (trainPredict- trainY)**2, axis=0))

array([ 0.19882072,  0.27684878,  0.16375223,  0.0431559 ,  0.14130197,
        0.14666278,  0.12639584,  0.08613823,  0.09059908,  0.10579388,
        0.0844413 ,  0.06987463,  0.03658202,  0.08005445,  0.04340203,
        0.07851077,  0.06595309,  0.07102945,  0.06807439,  0.079595  ,
        0.0531204 ,  0.04994744,  0.04911935,  0.02983522,  0.04010495,
        0.04750076,  0.0421062 ,  0.05969562,  0.03622231,  0.04754525,
        0.03022813,  0.04791903,  0.04165391,  0.03771969,  0.03483052,
        0.04465509,  0.01087982,  0.02505208,  0.04163561,  0.04135814,
        0.04643211,  0.01780086,  0.02397728,  0.0343868 ,  0.03971455,
        0.00884566,  0.00098255,  0.04289035,  0.04329126,  0.01467267,
        0.02386329,  0.03317891,  0.02456031,  0.02826623,  0.00098222,
        0.01958819,  0.01607286,  0.02521249,  0.03053943,  0.01932337,
        0.02376817,  0.03543101,  0.01542163,  0.02286555,  0.00098236,
        0.02057724,  0.03210474,  0.01567693,  0.0270482 ,  0.02

In [48]:
rmse_test = np.sqrt(((testY - testPredict) ** 2).mean(axis=0))