<h1 align="center" style="background-color:#616161;color:white">RNN Model 4</h1>

Adapted from: https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/recurrent_network.py

<h3 style="background-color:#616161;color:white">0. Setup</h3>

<div style="background-color:white; color:#008000; font-family: 'Courier New', Monospace; font-weight: bold">Input Parameters</div>

In [1]:
# Root path
# The project root defaults to the original Linux checkout location. Set the
# EVENTPREDICTION_ROOT environment variable to run from another machine or
# checkout (e.g. "C:/DS/Github/MusicRecommendation" on Windows) without
# editing this cell.
import os
root = os.environ.get("EVENTPREDICTION_ROOT", "/home/badrul/git/EventPrediction")

<div style="background-color:white; color:#008000; font-family: 'Courier New', Monospace; font-weight: bold">Common Libraries</div>

In [2]:
# Core
import numpy as np
import pandas as pd
from IPython.core.debugger import Tracer    # Used for debugging
import logging
from random import *

# File and database management
import csv
import os
import sys
import json
import sqlite3
from pathlib import Path

# Date/Time
import datetime
import time
#from datetime import timedelta # Deprecated

# Visualization
import matplotlib.pyplot as plt             # Quick
%matplotlib inline

# Misc
# NOTE: the earlier `from random import *` also binds the name `random`
# (to the function); this import rebinds `random` to the module itself.
import random
import importlib
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')
import logging
# Send DEBUG-and-above log records to RNN.log. The filename is relative, and
# this runs before the os.chdir(root) below.
logging.basicConfig(filename='RNN.log',level=logging.DEBUG)

#-------------- Custom Libs -----------------#
# Work from the project root; later cells use relative paths such as
# './3_Data/saves/model.ckpt'.
os.chdir(root)

# Import the codebase module
# Put the project's code directory on sys.path (only once, so re-running this
# cell does not add duplicates) so the custom modules below can be imported.
fPath = root + "/1_codemodule"
if fPath not in sys.path: sys.path.append(fPath)

# Custom Libs
# Project-local helper modules, resolved through the sys.path entry added above.
import coreCode as cc
import lastfmCode as fm

<div style="background-color:white; color:#008000; font-family: 'Courier New', Monospace; font-weight: bold">Page Specific Libraries</div>

In [3]:
# Data science (comment out if not needed)
#from sklearn.manifold import TSNE
# This notebook uses the TensorFlow 1.x graph/session API.
import tensorflow as tf
# tf.contrib (which provides the LSTM cells and static_rnn used below) only
# exists in TensorFlow 1.x.
from tensorflow.contrib import rnn
from tensorflow.python.framework import ops
# Start from an empty default graph so re-running the notebook in the same
# kernel does not pile new ops onto an old graph.
ops.reset_default_graph()
# sklearn.metrics supplies the classification report / confusion matrix used
# in the test cells.
from sklearn import metrics
from sklearn import preprocessing

<div style="background-color:white; color:#008000; font-family: 'Courier New', Monospace; font-weight: bold">Load settings</div>

In [4]:
# Read the project settings once, then derive everything else from them.
settingsDict = cc.loadSettings()

# Database locations: each settings value is appended to the project root.
dbPath = root + settingsDict["mainDbPath_sml"]
fmSimilarDbPath = root + settingsDict["fmSimilarDbPath"]
fmTagsDbPath = root + settingsDict["fmTagsDbPath"]
trackMetaDbPath = root + settingsDict["trackmetadata"]

# Period granularity is stored in the settings and used here as an integer.
periodGranularity = int(settingsDict["periodGranularity"])

<div style="background-color:white; color:#008000; font-family: 'Courier New', Monospace; font-weight: bold">Set parameters</div>

In [5]:
# ---- Model setup ----
loadFromSave = False            # True: restore weights from the saved checkpoint
n_steps = 1                     # timesteps per row
n_hidden = 160                  # number of features in the hidden layer
n_classes = 2                   # size of the one-hot label
batch_size = 336                # number of history rows fed per training/prediction step
learning_rate = 0.002
cellType = "BasicLSTMCell"      # Choose: TimeFreqLSTMCell BasicLSTMCell

# Columns requested from the database. UserID and t are dropped before
# modelling, so the model input width is the column count minus two.
# Alternative, wider field list kept for reference:
#fieldList="UserID, t, HrsFrom5pm, isSun,isMon,isTue,isWed,isThu,isFri,isSat, t1,t2,t3,t4,t5,t10,t12hrs,t23_5hrs,t24hrs,t24_5hrs,t1wk,t2wks,t3wks,t4wks"
fieldList = "UserID, t, HrsFrom6pm, isSun,isMon,isTue,isWed,isThu,isFri,isSat, t10,t12hrs,t24hrs,t1wk,t2wks,t3wks,t4wks"

# ---- Training parameters ----
training_iterations = 5         # passes over each user's data
sample_iteration = 1            # number of user samples to draw
display_step = 5
userSample = 1                  # users per sample
timeStepSkip = 4                # stride between consecutive training positions

<h3 style="background-color:#616161;color:white">1. Build Model</h3>

In [6]:
def RNN(x, weights, biases,n_steps):
    """Build the recurrent network and return the logits of its last output.

    The input tensor is unstacked along axis 0 into `batch_size` tensors, so
    the rows of a fed batch are what the recurrent cell iterates over, in
    order. See
    https://stackoverflow.com/questions/45278276/tensorflow-lstm-dropout-implementation-shape-problems/45279243#45279243

    Reads the module-level settings `cellType`, `n_hidden` and `batch_size`.

    Args:
        x: input tensor; buildGraph passes a placeholder of shape
            (None, n_steps, n_input).
        weights: dict whose 'out' entry is the output projection matrix.
        biases: dict whose 'out' entry is the output bias.
        n_steps: kept for interface compatibility; not used in the body.

    Returns:
        Tensor: matmul(last RNN output, weights['out']) + biases['out'].

    Raises:
        ValueError: if `cellType` is not one of the supported cell names.
    """
    # The leading dimension of the placeholder is None, so the number of
    # slices must be given explicitly. Use the configured batch_size rather
    # than a literal, so changing batch_size does not break the graph.
    x = tf.unstack(x, batch_size, 0)

    # Choose the recurrent cell
    if cellType == "BasicLSTMCell":
        lstm_cell = rnn.BasicLSTMCell(n_hidden, forget_bias=1.0)
    elif cellType == "TimeFreqLSTMCell":
        lstm_cell = rnn.TimeFreqLSTMCell(n_hidden, use_peepholes=True, feature_size=22, forget_bias=1.0)
    elif cellType == "GridLSTMCell":
        lstm_cell = rnn.GridLSTMCell(n_hidden, forget_bias=1.0)
    else:
        # Fail here with a clear message instead of printing and then hitting
        # a NameError on the undefined `outputs` below.
        raise ValueError("Did not recognize {}".format(cellType))

    # Get lstm cell output
    outputs, states = rnn.static_rnn(lstm_cell, x, dtype=tf.float32)

    # Linear activation, using rnn inner loop last output
    return tf.matmul(outputs[-1], weights['out']) + biases['out']

def buildGraph(n_steps,n_input):
    """Reset the default graph and build the model, loss, optimizer and accuracy ops.

    The resulting tensors/ops are published as module-level globals
    (x, y, pred, cost, optimizer, accuracy) for the training and testing
    cells to use.

    Args:
        n_steps: second dimension of the input placeholder.
        n_input: third dimension of the input placeholder (features per row).
    """
    global x, y, pred, cost, optimizer,accuracy

    # Start from a clean graph every time the model is rebuilt
    tf.reset_default_graph()

    # Graph inputs: features and labels
    x = tf.placeholder("float", [None, n_steps, n_input])
    y = tf.placeholder("float", [None, n_classes])

    # Output layer parameters, randomly initialised
    out_weights = tf.Variable(tf.random_normal([n_hidden, n_classes]))
    out_biases = tf.Variable(tf.random_normal([n_classes]))

    # Logits from the last output of the recurrent network
    pred = RNN(x, {'out': out_weights}, {'out': out_biases}, n_steps)

    # Loss: mean softmax cross-entropy, minimised with Adam
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y)
    cost = tf.reduce_mean(cross_entropy)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

    # Accuracy: share of rows whose arg-max logit matches the arg-max label
    is_hit = tf.equal(tf.argmax(pred,1), tf.argmax(y,1))
    accuracy = tf.reduce_mean(tf.cast(is_hit, tf.float32))

In [7]:
# Number of model inputs: every requested field except UserID and t
n_input = len(fieldList.split(",")) - 2

# Construct the graph for the configured dimensions
buildGraph(n_steps, n_input=n_input)

# Session plus the ops/objects needed to initialise or restore variables
sess = tf.Session()
init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Either start from fresh random weights or pick up a previous checkpoint
if not loadFromSave:
    sess.run(init)
else:
    saver.restore(sess,'./3_Data/saves/model.ckpt')

<h3 style="background-color:#616161;color:white">2. Train Model</h3>

In [None]:
# Launch the graph
def trainModel(_X, _Y, sess,training_iterations = 5):
    """Train the graph built by buildGraph on one dataset, in row order.

    For every `timeStepSkip`-th position `pos` (starting at `batch_size`), the
    previous `batch_size` rows of `_X` are fed as input and row `pos` of `_Y`
    as the label. After each pass, loss and accuracy are printed for one
    randomly chosen position.

    Reads the module-level globals batch_size, n_steps, n_input, n_classes,
    timeStepSkip, x, y, optimizer, accuracy and cost.

    Args:
        _X: array of feature rows; first dimension is the number of rows.
        _Y: array of labels, indexable by row position.
        sess: the tf.Session holding the model variables.
        training_iterations: number of full passes over the data.

    Returns:
        None. If there are not more than `batch_size` rows, nothing is trained.
    """
    l = np.shape(_X)[0]
    if l <= batch_size:
        # No position has both a full history window and a label row; the
        # sampling below would otherwise fail.
        logging.info("Skipping training: {} rows is not more than batch_size {}".format(l, batch_size))
        return

    def _window(pos):
        """Return (inputs, label) for position `pos`, shaped for the placeholders."""
        # History: the batch_size rows before pos, reshaped into 3d even though n_steps is 1
        batch_x = _X[pos-batch_size:pos].reshape((batch_size, n_steps, n_input))
        # Label: the single row at pos, as a (1, n_classes) array
        batch_y = _Y[pos].reshape((-1, n_classes))
        return batch_x, batch_y

    # Training cycle
    for i in range(training_iterations):
        # Progress is keyed on the current iteration, not the total count
        if (i % 10) == 0: print("Now on iteration {}".format(i))

        # Loop over all rows in order of earliest to latest
        for pos in range(batch_size, l, timeStepSkip):
            if (pos % 10000) == 0:
                logging.info("Now on pos {} of {} ({}%)".format(pos,l,round((pos/l)*100,2)))

            batch_x, batch_y = _window(pos)
            sess.run(optimizer, feed_dict={x: batch_x, y: batch_y})

        # Calculate train accuracy on a randomly selected position.
        # randint includes both endpoints, so the upper bound must be l-1 to
        # keep _Y[pos] in range.
        pos = randint(batch_size, l - 1)
        batch_x, batch_y = _window(pos)

        # Fetch loss & accuracy in one run over the same feed
        acc, loss = sess.run([accuracy, cost], feed_dict={x: batch_x, y: batch_y})
        print ("Iter {}. Minibatch Loss={:.6f}".format(i, loss) + ", Training Accuracy= " + "{:.5f}".format(acc))

In [None]:
# Run-specific overrides of the training parameters set earlier
training_iterations =10  # 20
sample_iteration = 10
display_step = 5
userSample =20

# Draw `sample_iteration` random user samples and train on each user in turn
for s in range(sample_iteration):
    print('Now processing sample {}'.format(s))
    logging.info('Now processing sample {}'.format(s))

    users = cc.getUsers(dbPath).sample(userSample)
    for usr in users.itertuples():
        print('Now processing User {}'.format(usr.userID))
        logging.info('Now processing User {}'.format(usr.userID))

        xTrain, yTrain_onehot, xTest, yTest_onehot = cc.getHiddenPeriodsData(
            dbPath,
            fieldList,
            oneHot=True,
            periodGranularity=periodGranularity,
            userIDs=[usr.userID],
        )

        # Train only when data came back and the one-hot labels have more than
        # one column (results have to have both 0's and 1's in them)
        if xTrain is not None and np.shape(yTrain_onehot)[1] != 1:
            trainModel(xTrain, yTrain_onehot, sess, training_iterations)

        # Checkpoint after every user
        saver.save(sess,"./3_Data/saves/model.ckpt")
print('Ok')

<h3 style="background-color:#616161;color:white">3. Test Model</h3>

In [None]:
def TestPredictions(X,Y):
    """Predict a class index for every row of X using the trained graph.

    X is left-padded with batch_size-1 zero rows so that each original row,
    including the first, has a full history window of `batch_size` rows. For
    each row the window ending at that row is fed to the network and the last
    row of the returned logits is kept.

    Reads the module-level globals batch_size, n_steps, n_input, n_classes,
    sess, pred and x.

    Args:
        X: 2-D array of feature rows (rows x features).
        Y: labels. Accepted for interface compatibility only; fetching `pred`
            does not depend on the label placeholder, so Y is not fed.

    Returns:
        1-D array with the arg-max class index for each row of X
        (empty if X has no rows).
    """
    l = np.shape(X)[0]

    # Testing cycle
    print("Now testing {} rows".format(l))
    logging.info("Now testing {} rows".format(l))

    # Pad rows at the beginning so we can get a prediction for every entry
    padX = np.zeros([batch_size-1, X.shape[1]])
    X = np.append(padX, X, axis=0)
    l = np.shape(X)[0]  # Update length

    # Collect per-row logits in a list and join once at the end. This avoids
    # comparing an ndarray with [] and re-copying the array on every step.
    logits = []

    # Loop over all rows in order of earliest to latest
    for pos in range(batch_size, l+1):

        if (pos % 20000) == 0:
            print("Now on pos {} of {} ({}%)".format(pos,l,round((pos/l)*100,2)))
            logging.info("Now on pos {} of {} ({}%)".format(pos,l,round((pos/l)*100,2)))

        # For each row, bring up the history of length batch_size
        _x = X[pos-batch_size:pos].reshape((batch_size, n_steps, n_input))  # Reshape into 3d, even though n_steps is 1

        # Predict! Keep the last row of the logits as a (1, n_classes) array
        p = sess.run(pred, feed_dict={x: _x})[-1]
        logits.append(p.reshape(-1, n_classes))

    if not logits:
        # No rows to predict
        return np.array([], dtype=np.int64)

    # One class index per original row (the padding rows are only ever used
    # as history, never as prediction targets)
    return np.argmax(np.concatenate(logits, axis=0), 1)

<div style="background-color:white; color:#008000; font-family: 'Courier New', Monospace; font-weight: bold">Test hidden periods</div>

In [None]:
# Header: which configuration produced the numbers below
print('Hidden Periods\n')
print ("Cell type= {}, learning_rate = {}, Iterations = {}, batch size = {}, Steps = {}, Hidden Layers = {}, Classes = {}\n".format(
    cellType,
    learning_rate,
    training_iterations,
    batch_size,
    n_steps,
    n_hidden,
    n_classes,
))

# NOTE(review): xTrain / yTrain_onehot are whatever the training loop assigned
# last, i.e. the data of the last user processed.
predictions = TestPredictions(xTrain, yTrain_onehot)

# The sklearn metrics take plain class labels, so pass column 1 of the
# one-hot labels rather than the one-hot matrix itself.
print(metrics.classification_report(yTrain_onehot[:,1], predictions))
print(metrics.confusion_matrix(yTrain_onehot[:,1], predictions))

# Shapes, shown as the cell output
np.shape(predictions), np.shape(yTrain_onehot),np.shape(xTrain)

<div style="background-color:white; color:#008000; font-family: 'Courier New', Monospace; font-weight: bold">Test hidden users</div>

In [None]:
# Pick two users at random and fetch their hidden-period test split
users = cc.getUsers(dbPath).sample(2)
u = users.userID.values
_, _, xTest, yTest_onehot = cc.getHiddenPeriodsData(
    dbPath,
    fieldList,
    oneHot=True,
    periodGranularity=periodGranularity,
    userIDs=u,
)
print ('{} users selected for testing. Total rows {}'.format(len(u), len(xTest)))

# Fetch the hidden-users data set used by the next cell
xTest2, yTest2_onehot, testDf2 = cc.getHiddenUsersData(
    dbPath,
    fieldList,
    oneHot=True,
    firstNPerc=0.5,
    periodGranularity=periodGranularity,
)

In [None]:
print('\nHidden Users')
# Use TestPredictions, the prediction helper defined in section 3; no
# function named getTestPredictions exists in this notebook.
predictions = TestPredictions(xTest2,yTest2_onehot)
# The report takes plain class labels, so pass column 1 of the one-hot labels
print(metrics.classification_report(yTest2_onehot[:,1],predictions))
print(np.shape(xTest2),np.shape(yTest2_onehot))

<h3 style="background-color:#616161;color:white">Appendices</h3>

<h4 style="background-color:#616161;color:white">Results</h4>

In [None]:
def load_data(seq_len, normalise_window, filename="sinwave.csv"):
    """Slice a one-value-per-line file into sliding windows and split train/test.

    The file is read as text and split on newlines; values are kept as the
    strings read from the file (no numeric conversion is done here). Each
    window holds seq_len + 1 consecutive lines: the first seq_len are the
    inputs and the last one is the target. The first 90% of the windows
    (rounded) form the training set, which is shuffled in place with
    np.random.shuffle; the remainder is the test set, left in file order.

    Args:
        seq_len: number of input values per window.
        normalise_window: accepted for interface compatibility; currently ignored.
        filename: path of the file to read. Defaults to "sinwave.csv".

    Returns:
        [x_train, y_train, x_test, y_test], where the x arrays have shape
        (windows, seq_len, 1) and the y arrays have shape (windows,).

    Raises:
        ValueError: if the file has too few lines to form a single window.
    """
    # Context manager so the file handle is always closed
    with open(filename, 'rb') as fh:
        data = fh.read().decode().split('\n')

    sequence_length = seq_len + 1
    windows = [data[index: index + sequence_length]
               for index in range(len(data) - sequence_length)]
    if not windows:
        raise ValueError("{} has too few rows for seq_len={}".format(filename, seq_len))

    result = np.array(windows)

    # First 90% of the windows train (shuffled), the rest test
    row = int(round(0.9 * result.shape[0]))
    train = result[:row, :]
    np.random.shuffle(train)
    x_train = train[:, :-1]
    y_train = train[:, -1]
    x_test = result[row:, :-1]
    y_test = result[row:, -1]

    # Add a trailing feature axis of size 1
    x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
    x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))

    return [x_train, y_train, x_test, y_test]

# One input value per window for the dummy experiment
seq_len = 1
xdTrain, ydTrain, Xd_test, yd_test = load_data(seq_len=seq_len, normalise_window=True)

In [None]:
# Dummy test
# Rebuild the model with a single input and a single class, then train and
# predict on the windows returned by load_data.
n_steps = 1 # timesteps
n_hidden = 160 # hidden layer num of features
n_classes = 1
batch_size = 20 #1344
training_iterations=100
learning_rate = 0.001
cellType = "BasicLSTMCell"  # Choose: TimeFreqLSTMCell BasicLSTMCell

#fieldList="UserID, t, HrsFrom5pm, isSun,isMon,isTue,isWed,isThu,isFri,isSat, t1,t2,t3,t4,t5,t10,t12hrs,t23_5hrs,t24hrs,t24_5hrs,t1wk,t2wks,t3wks,t4wks"
fieldList="UserID, t, HrsFrom6pm, isSun,isMon,isTue,isWed,isThu,isFri,isSat, t10,t12hrs,t24hrs,t1wk,t2wks,t3wks,t4wks"
n_input = 1

# Build graph
buildGraph(n_steps,n_input = n_input)
# Initializing the variables
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

trainModel(xdTrain,ydTrain,sess,training_iterations)

# TestPredictions pads X with 2-D zero rows (it uses X.shape[1]), so give it
# the 2-D view of the training inputs rather than the 3-D array.
xdDummy = xdTrain.reshape(-1,1)
# TestPredictions is the prediction helper defined in section 3; no function
# named getTestPredictions exists in this notebook.
predictions = TestPredictions(xdDummy,ydTrain)
# ydTrain is 1-D (load_data returns the last column of each window), so it is
# passed as is; indexing it with [:,1] raises an IndexError.
# NOTE(review): classification_report expects discrete class labels, while
# ydTrain holds the raw series values - TODO confirm this check is meaningful.
print(metrics.classification_report(ydTrain,predictions))