<h1 align="center" style="background-color:#616161;color:white">RNN Model</h1>

Adapted from: https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/recurrent_network.py

<h3 style="background-color:#616161;color:white">0. Setup</h3>

<div style="background-color:white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Input Parameters</div>

In [None]:
# Root path
#root = "C:/DS/Github/MusicRecommendation"  # BA, Windows
root = "/home/badrul/git/EventPrediction" # BA, Linux

<div style="background-color:white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Common Libraries</div>

In [None]:
# Core
import numpy as np
import pandas as pd
from IPython.core.debugger import Tracer    # Used for debugging
import logging
from random import *

# File and database management
import csv
import os
import sys
import json
import sqlite3
from pathlib import Path

# Date/Time
import datetime
import time
#from datetime import timedelta # Deprecated

# Visualization
import matplotlib.pyplot as plt             # Quick
%matplotlib inline

# Misc
import random

#-------------- Custom Libs -----------------#
os.chdir(root)

# Import the codebase module
fPath = root + "/1_codemodule"
if fPath not in sys.path: sys.path.append(fPath)

# Custom Libs
import coreCode as cc
import lastfmCode as fm

<div style="background-color:white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Page Specific Libraries</div>

In [3]:
# Data science (comment out if not needed)
#from sklearn.manifold import TSNE
import tensorflow as tf
from tensorflow.contrib import rnn
from tensorflow.python.framework import ops
ops.reset_default_graph()
from sklearn import metrics
from sklearn import preprocessing

<div style="background-color:#white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Declare Functions</div>

In [4]:
fieldList="t, UserID, HrsFrom5pm, isSun,isMon,isTue,isWed,isThu,isFri,isSat,t1,t2,t3,t4,t5,t10,t12hrs,t23_5hrs,t24hrs,t24_5hrs,t1wk,t2wks,t3wks,t4wks"

def getTrainAndTestData():
    con = sqlite3.connect(dbPath)
    c = con.cursor()

    # Get list of UserIDs 
    users = pd.read_sql_query("Select UserID from tblUsers Where tblUsers.TestUser = 0",con)

    trainDf=pd.DataFrame(columns=[fieldList])  # Create an emmpty df
    testDf=pd.DataFrame(columns=[fieldList])  # Create an emmpty df
    periodsInAMonth=int(60/periodGranularity)*24*7*4

    totalRows=0
    
    for user in users.itertuples():
        
        # Get training dataset
        SqlStr="SELECT {} from tblTimeSeriesData where UserID = {}".format(fieldList,user.userID)
        df = pd.read_sql_query(SqlStr, con)
        
        if len(df)>int(periodsInAMonth*3):  # user must have at least 3 months worth of data
            totalRows += len(df)
    
            # Cut-off 1
            k = random.randint(periodsInAMonth, len(df))

            testDf = testDf.append(df.iloc[k:k+periodsInAMonth])[df.columns.tolist()]

            tmp = df.drop(df.index[k:k+periodsInAMonth])

            # Cut-off 2
            k = random.randint(periodsInAMonth, len(tmp))
            testDf = testDf.append(tmp.iloc[k:k+periodsInAMonth])[df.columns.tolist()]
            trainDf = trainDf.append(tmp.drop(tmp.index[k:k+periodsInAMonth]))[df.columns.tolist()]
        else:
            print('Skipping user {} as not enough periods ({})'.format(user.userID,len(df)))
    
    if len(trainDf)+len(testDf) == totalRows:
        print('Ok')
    else:
        print("Incorrect. Total Rows = {}. TestDf+TrainDf rows = {}+{}={}".format(totalRows,len(testDf),len(trainDf),len(testDf)+len(trainDf)))
        
    return trainDf, testDf

def getHiddenTestUsers(firstNPerc=1.0):
    con = sqlite3.connect(dbPath)
    c = con.cursor()

    # Get list of UserIDs 
    users = pd.read_sql_query("Select UserID from tblUsers Where tblUsers.TestUser = 1",con)

    #fieldList="t, PeriodID, UserID, HrsFrom6pm, isSun,isMon,isTue,isWed,isThu,isFri,isSat,t1,t2,t3,t4,t5,t10,t12hrs,t24hrs,t1wk,t2wks,t3wks,t4wks"
    testDf=pd.DataFrame(columns=[fieldList])  # Create an emmpty df
    periodsInAMonth=int(60/periodGranularity)*24*7*4

    totalRows=0
    
    for user in users.itertuples():
        # Get training dataset
        
        SqlStr="SELECT {} from tblTimeSeriesData where UserID = {}".format(fieldList + ",PeriodID",user.userID)    
        df = pd.read_sql_query(SqlStr, con)
        df["PeriodID"] = df["PeriodID"].astype(int)
        df.sort_values(['PeriodID'])
        totalRows += len(df)
        # Caluclate period cutt-off
        cutoff = int(len(df)*firstNPerc)
        testDf = testDf.append(df.iloc[0:cutoff])[df.columns.tolist()]
    
    testDf["PeriodID"] =  testDf["PeriodID"].astype(int)
    testDf["UserID"] =  testDf["UserID"].astype(int)
    testDf.sort_values(['UserID','PeriodID'], inplace=True)
    return testDf


<div style="background-color:#white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Load settings</div>

In [5]:
settingsDict =  cc.loadSettings()
dbPath = root + settingsDict['mainDbPath']
fmSimilarDbPath = root + settingsDict['fmSimilarDbPath']
fmTagsDbPath = root + settingsDict['fmTagsDbPath']
trackMetaDbPath = root + settingsDict['trackmetadata']
periodGranularity = int(settingsDict['periodGranularity'])

<h3 style="background-color:#616161;color:white">1. Load data</h3>

In [13]:
# Train Data
trainDf,testDf = getTrainAndTestData()
xTrain = trainDf.drop(['t','t1','t2','t3','t4','t5','t10','t12hrs','t23_5hrs','t24hrs','t24_5hrs'], 1).values
yTrain_onehot = pd.get_dummies(trainDf['t']).values.astype(float) # One-Hot version

# Test data
xTest= testDf.drop(['t','UserID','t1','t2','t3','t4','t5','t10','t12hrs','t23_5hrs','t24hrs','t24_5hrs'], 1).values
yTest = testDf['t'].values.astype(int)
#yTest = np.array([1 if y==1 else -1 for y in y_vals_test])
yTest=yTest.reshape(len(yTest),1)

# One-Hot version
yTest_onehot = pd.get_dummies(testDf['t']).values.astype(float)

# Get hidden users data
testDf = getHiddenTestUsers(firstNPerc=0.5)  # Get the first half of everyones history
xTest2= testDf.drop(['t','UserID','t1','t2','t3','t4','t5','t10','t12hrs','t23_5hrs','t24hrs','t24_5hrs','PeriodID'], 1).values
yTest2 = testDf['t'].values.astype(int)
yTest2=yTest2.reshape(len(yTest2),1)

# One-Hot version
yTest2_onehot = pd.get_dummies(testDf['t']).values.astype(float)

<div style="background-color:#white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Confirm dimensions</div>

In [14]:
np.shape(xTrain),np.shape(yTrain_onehot)

((3516965, 13), (3516965, 2))

In [15]:
np.shape(xTest), np.shape( yTest),np.shape(yTest_onehot)

((240862, 13), (240862, 1), (240862, 2))

<h3 style="background-color:#616161;color:white">3. RNN Model</h3>

<div style="background-color:#white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Define the model</div>

In [None]:
# Key parameters
batch_rows = 1344
batch_depth = 1
hiddenLayers =128
learning_rate =0.001
training_iteration = 1
global x, y,optimizer,model

In [17]:
def RNN(x, weights, biases,batch_depth,hiddenLayers):

    # Prepare data shape to match `rnn` function requirements
    # Current data input shape: (batch_size, n_steps, n_input)
    # Required shape: 'n_steps' tensors list of shape (batch_size, n_input)
    
    # Unstack to get a list of 'n_steps' tensors of shape (batch_size, n_input)
    x = tf.unstack(x, batch_depth, 1)

    # Define a lstm cell with tensorflow
    lstm_cell = rnn.BasicLSTMCell(hiddenLayers, forget_bias=1.0)

    # Get lstm cell output
    outputs, states = rnn.static_rnn(lstm_cell, x, dtype=tf.float32)

    # Linear activation, using rnn inner loop last output
    return tf.matmul(outputs[-1], weights['out']) + biases['out']

def buildGraph(_batchX_cols, _batchY_cols, _batch_rows = 24, _batch_depth = 7, _hiddenLayers =128,_learning_rate =0.001):
    tf.reset_default_graph()
    global x,y,optimizer
    # Define input variables:
    x = tf.placeholder("float", [None, batch_depth, batchX_cols])
    y = tf.placeholder("float", [None, batchY_cols])

    # Define weights:
    weights = {
        'out': tf.Variable(tf.random_normal([hiddenLayers, batchY_cols]))
    }
    biases = {
        'out': tf.Variable(tf.random_normal([batchY_cols]))
    }

    # Define main model:
    model = RNN(x, weights, biases,batch_depth,hiddenLayers)

    # Define accuracy
    correct_pred = tf.equal(tf.argmax(model,1), tf.argmax(y,1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    
    # Define loss and optimizer
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=model, labels=y))
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
    return model, cost, optimizer

batchX_cols=np.shape(xTrain)[1]
batchY_cols=np.shape(yTrain_onehot)[1]
buildGraph(batchX_cols, batchY_cols, _batch_rows = batch_rows, _batch_depth = batch_depth, _hiddenLayers =hiddenLayers,_learning_rate =learning_rate)

(<tf.Tensor 'add:0' shape=(?, 2) dtype=float32>,
 <tf.Tensor 'Mean_1:0' shape=() dtype=float32>,
 <tf.Operation 'Adam' type=NoOp>)

<div style="background-color:#white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Train the model</div>

<h4>def trainSequence</h4>
* Takes a a list of data in which the lis is ordered in a sequential way and turns it into a cube of data for RNNs (rows x depth x columns)
* rows are what you may think of as batch size. It's the max that we feed into the model at any time.
* for each batch,the depth (aka time_steps) represents a lead training example (say row n of the original seq)  and the preceding examples that will be linked to it in the RNN (see RNN literature)
* columns are the number of feature columns
* Here's the important thing. As this is sequential data (presumably time) we want to give each entry of our sequentiual
 list the chance to be the 'leader'. So for each training example i, we will read in i-1, i-2 ... i-depth to form one slice of the cube then move onto i-2 and repeat for the next slice, stacking them up on top till we reach our rows (i.e. batch size) limit
* And to be even more explicit - this means almost all rows in our origianl sequence will be used more than once as a 'leader' of their own slice as well as a 'follower' in other slices
    
     How we do this in code:
     
     S = Training data with x rows and y columns
     Iterate, i,through S from top to bottom. Leader = row i
    
         for each leader, 
             grab their followers (i+1 .. i+24) and reshape from a 24 by 1 by y matrix, into 1 by 24 by y 
             add the slice onto cube
             if cube_size has reached the bach_rows limit, transmit the cube to the RNN model and reset cube
         move onto the next i


<div style="background-color:#white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Test hidden periods</div>

In [None]:
def testSequence(X, Y, sess,batch_rows = 7, batch_depth=48):     
    totalRows = int(X.shape[0])
    cube_x=[]
    cube_y=[]
    predictions=[]
    print(batch_depth,batch_rows)
    for leaderPos in range(0+batch_depth,batch_rows):
        print(2)
        # Scan through each minibatch and turn it into slices for the cube
        slice_x = X[leaderPos-batch_depth : leaderPos].reshape(1,batch_depth,-1)
        slice_y = Y[leaderPos-batch_depth : leaderPos].reshape(1,batch_depth,-1)  
        print(2)
        if len(cube_x) == 0:
            cube_x=slice_x
            cube_y=slice_y
        else:
            
            cube_x=np.append(cube_x,slice_x, axis=0)
            cube_y=np.append(cube_y,slice_y, axis=0)
        print(3)
        if np.shape(cube_x)[0] == batch_rows:
            p= 1*sess.run(model, feed_dict={x: batch_xs, y: batch_ys})
            if predictions == []:
                predictions = p
            else:
                predictions= np.append(predictions,p,axis=0)
            
            cube_x =[]
            cube_y =[]
    return predictions

predictions = testSequence(xTest,yTest_onehot,sess,batch_rows=7,batch_depth=24)
print ("Test Finished!")

In [None]:
def getTestPredictions(X,Y):
    predictions=[]
    # Testing cycle
    total_batch = int(len(X)/batch_rows)

    if np.mod(len(X),batch_rows) != 0:
        # tf requires consistent inputs so need to pad
        rowsToPad=batch_size-np.mod(len(X),batch_rows)
        padRows=np.zeros([rowsToPad,X.shape[1]])
        X = np.append(X,padRows,axis=0)

        padRows=np.zeros([rowsToPad,Y.shape[1]])
        Y = np.append(Y,padRows,axis=0)
        total_batch += 1

    # Loop over all batches
    for i in range(total_batch):
        Tracer()()
        batch_xs = X[i*batch_size:(i*batch_rows)+batch_rows]
        batch_ys = Y[i*batch_size:(i*batch_rows)+batch_rows]                
        
        test_len = np.shape(batch_xs)[0]
        
        batch_xs = batch_xs[:test_len].reshape((-1, batch_depth, np.shape(batch_xs)[1]))
        batch_ys = batch_ys[:test_len]

        p= 1*sess.run(pred, feed_dict={x: batch_xs, y: batch_ys})
        if predictions == []:
            predictions = p
        else:
            predictions= np.append(predictions,p,axis=0)
        
              
    n=len(predictions)-rowsToPad
    predictions=predictions[0:n]
    predictions = np.argmax(predictions,1)
    return predictions

getTestPredictions(xTrain,yTrain_onehot)

<div style="background-color:#white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Test hidden periods</div>

In [None]:
predictions = getTestPredictions(xTest,yTest_onehot)
print(metrics.classification_report(yTest,predictions))  # Need to feed it yTest not yTest_OneHot here
print(metrics.confusion_matrix(yTest,predictions))
print("* Precision = labelled as x / how many were actually x in the ones that were labelled")
print("* Recall = labelled as x / how many were actually x in the dataset")

In [None]:
predictions = getTestPredictions(xTest2,yTest2_onehot)
print(metrics.classification_report(yTest2,predictions))  # Need to feed it yTest not yTest_OneHot here
print(metrics.confusion_matrix(yTest2,predictions))
print("* Precision = labelled as x / how many were actually x in the ones that were labelled")
print("* Recall = labelled as x / how many were actually x in the dataset")

In [27]:
#Archive

# We want to predict time-stamp t, using last 7 days worth of features, feeding the data in, in batches of a 1 month
# In half-hourly model, 7 days = 2*24*7 = 336. Batch depth = 336 .    # 1 month = 1344
# So shape needed is 1344 x 7 x features. 

def trainSequence(X, Y, sess,batch_row, batch_depth):     
    totalRows = int(X.shape[0])
    cube_x=[]
    cube_y=[]
    
    
    for leaderPos in range(0+batch_depth,np.shape(X)[0]):
        # Scan through each minibatch and turn it into slices for the cube
        slice_x = X[leaderPos-batch_depth : leaderPos].reshape(1,batch_depth,-1)
        slice_y = Y[leaderPos].reshape(1,-1)
        
        if len(cube_x) == 0:
            cube_x=slice_x
            cube_y=slice_y
        else:
            
            cube_x=np.append(cube_x,slice_x, axis=0)
            cube_y=np.append(cube_y,slice_y, axis=0)
        
        if np.shape(cube_x)[0] == batch_rows:
            # Process then reset cube
            #Tracer()()
            
            sess.run(optimizer, feed_dict={x: cube_x, y: cube_y})
            print('  Just processed row number {} of {}. {}%'.format(leaderPos,np.shape(X)[0],round(leaderPos/np.shape(X)[0],2)))
            cube_x =[]
            cube_y =[]
        
                    
# Training cycle
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
for iteration in range(training_iteration):
    print("Iteration {}".format(iteration))
    trainSequence(xTrain,yTrain_onehot,sess,batch_rows,batch_depth)  # This is the main line
    
    #if iteration % display_step == 0:
    #    # Calculate train accuracy
    #    i = randint(1, total_batch)  # Randomly select a batch
    #    batch_x = xTrain[i*batch_size:(i*batch_size)+batch_size]
    #    batch_y = yTrain_onehot[i*batch_size:(i*batch_size)+batch_size]                        
    #    batch_x = batch_x.reshape((batch_size, n_steps, n_input))        

    #    acc = sess.run(accuracy, feed_dict={x: batch_x, y: batch_y})
    #    loss = sess.run(cost, feed_dict={x: batch_x, y: batch_y})
    #    # Calculate loss
    #    print ("Iter {}. Minibatch Loss={:.6f}".format(iteration, loss) + ", Training Accuracy= " + "{:.5f}".format(acc))

print ("Optimization Finished!")

Iteration 0
  Just processed row number 1679 of 3516965. 0.0%
  Just processed row number 3023 of 3516965. 0.0%
  Just processed row number 4367 of 3516965. 0.0%
  Just processed row number 5711 of 3516965. 0.0%
  Just processed row number 7055 of 3516965. 0.0%
  Just processed row number 8399 of 3516965. 0.0%
  Just processed row number 9743 of 3516965. 0.0%
  Just processed row number 11087 of 3516965. 0.0%
  Just processed row number 12431 of 3516965. 0.0%
  Just processed row number 13775 of 3516965. 0.0%
  Just processed row number 15119 of 3516965. 0.0%
  Just processed row number 16463 of 3516965. 0.0%
  Just processed row number 17807 of 3516965. 0.01%
  Just processed row number 19151 of 3516965. 0.01%
  Just processed row number 20495 of 3516965. 0.01%
  Just processed row number 21839 of 3516965. 0.01%
  Just processed row number 23183 of 3516965. 0.01%
  Just processed row number 24527 of 3516965. 0.01%
  Just processed row number 25871 of 3516965. 0.01%
  Just processed ro

  Just processed row number 212687 of 3516965. 0.06%
  Just processed row number 214031 of 3516965. 0.06%
  Just processed row number 215375 of 3516965. 0.06%
  Just processed row number 216719 of 3516965. 0.06%
  Just processed row number 218063 of 3516965. 0.06%
  Just processed row number 219407 of 3516965. 0.06%


KeyboardInterrupt: 