### Exploration of LSTMs


In [6]:
from __future__ import print_function
import re
import urllib.request
import zipfile
import lxml.etree
import itertools
import numpy as np
import tensorflow as tf
import time
import pickle
import os
import random
import sys
import h5py
import pickle
import pandas as pd
from utils import *

from keras.models import Sequential, load_model, model_from_json
from keras.layers import Dense, Activation, Dropout, LSTM, GRU
from keras.optimizers import RMSprop, Adam
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.layers.wrappers import TimeDistributed
from keras.callbacks import LearningRateScheduler
from keras import regularizers

%load_ext autoreload
%autoreload 2

time_percentage = 0.9
explained_variance = 0.9

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
df = pd.read_csv("data/rescuetime_data-ac-min.csv")
data_pd = Clean_DF(df)
data_pd.clean_data(time_percentage=time_percentage)
data_pd.clean_df = data_pd.clean_df.reset_index()
data_pd.get_pca(explained_variance=explained_variance)
data_pd.get_day_time()

In [7]:
# Saving the objects:
# with open('data_pd_80.pickle', 'wb') as f:  # Python 3: open(..., 'wb')
#     pickle.dump(data_pd, f)

# # Getting back the objects:
with open('data_pd.pickle', 'rb') as f:  # Python 3: open(..., 'rb')
    data_pd = pickle.load(f)


In [8]:
print("Dataset size:", data_pd.clean_df.shape,'\n')
print("Number of apps that consume", time_percentage*100, "% of all users time: ",len(data_pd.popular_apps), '\n')
print("Cleaned dataset columns:",'\n', data_pd.clean_df.columns.values, '\n')
print("Number of components that explain", explained_variance*100,"% of the data: ",data_pd.pca_data.shape[1], '\n')

Dataset size: (16704, 9) 

Number of apps that consume 90.0 % of all users time:  99 

Cleaned dataset columns: 
 ['Date' 'Time Spent (seconds)' 'Activity' 'Category' 'Productivity'
 'Activity Vector' 'Productivity Score' 'Day' 'Time'] 

Number of components that explain 90.0 % of the data:  30 



In [9]:
FLAGS = tf.flags
FLAGS.look_back = 24
FLAGS.batch_size = 8
FLAGS.inputlength = data_pd.activity_vector.shape[1]
np.random.seed(7)
dataset = data_pd.activity_vector

In [10]:
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back), :]
        dataX.append(a)
        dataY.append(dataset[i + look_back, :])
    return np.array(dataX), np.array(dataY)

In [20]:
# split into train and test sets
train_size = int(len(dataset) * 0.9)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]
print(len(train), len(test))

15033 1671


In [21]:
trainX, trainY = create_dataset(train, FLAGS.look_back)
testX, testY = create_dataset(test, FLAGS.look_back)

In [22]:
trainX.shape

(15008, 24, 99)

## Build model

In [None]:
N_HIDDEN = 32
N_DENSE = 64
LEARNING_RATE = 0.005
DECAY = 0.001
EPOCHS = 10

In [26]:
print('Building training model...')
model = Sequential()
model.add(GRU(N_HIDDEN, dropout_U=0.2, dropout_W=0.2, input_shape=(FLAGS.look_back, FLAGS.inputlength)))
model.add(Dense(N_HIDDEN, activation='sigmoid'))
model.add(Dense(FLAGS.inputlength, activation='softmax'))  # Add another dense layer with the desired output size.
model.compile(loss='mean_squared_error', optimizer = RMSprop(lr=LEARNING_RATE, clipnorm=5))

print(model.summary()) # Convenient function to see details about the network model.

Building training model...


  This is separate from the ipykernel package so we can avoid doing imports until


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_4 (GRU)                  (None, 32)                12672     
_________________________________________________________________
dense_7 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_8 (Dense)              (None, 99)                3267      
Total params: 16,995.0
Trainable params: 16,995
Non-trainable params: 0.0
_________________________________________________________________
None


## Build inference model
Note: the inference model will have only one time step as we will feed each predicted character back into the rnn as a seed for predicting the next character. It will also be stateful so as to 'remember' previous states.

In [25]:
model.fit(x=trainX, y=trainY, validation_data=(testX, testY), epochs=EPOCHS, batch_size=FLAGS.batch_size, verbose=1)

Train on 15008 samples, validate on 1646 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
  776/15008 [>.............................] - ETA: 31s - loss: 0.0031

KeyboardInterrupt: 

In [27]:
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)

In [29]:
trainPredict.shape

(15008, 99)

In [28]:
np.sqrt(np.mean( (trainPredict- trainY)**2, axis=1))

array([ 0.08623799,  0.09671605,  0.09967631, ...,  0.09974472,
        0.08680497,  0.09438508])

In [48]:
rmse_test = np.sqrt(((testY - testPredict) ** 2).mean(axis=0))