<h1 align="center" style="background-color:#616161;color:white">RNN Model</h1>

Adapted from: https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/recurrent_network.py

<h3 style="background-color:#616161;color:white">0. Setup</h3>

<div style="background-color:white; color:#008000; font-family: 'Courier New', Monospace; font-weight: bold">Input Parameters</div>

In [1]:
# Root path of the project checkout.
# Override per machine by setting the EVENTPREDICTION_ROOT environment variable
# instead of editing this cell, e.g.
#   Windows: C:/DS/Github/MusicRecommendation
#   Linux:   /home/badrul/git/EventPrediction   (default used below)
import os

root = os.environ.get("EVENTPREDICTION_ROOT", "/home/badrul/git/EventPrediction")

<div style="background-color:white; color:#008000; font-family: 'Courier New', Monospace; font-weight: bold">Common Libraries</div>

In [2]:
# Core
import numpy as np
import pandas as pd
from IPython.core.debugger import Tracer    # Used for debugging
import logging
from random import *

# File and database management
import csv
import os
import sys
import json
import sqlite3
from pathlib import Path

# Date/Time
import datetime
import time
#from datetime import timedelta # Deprecated

# Visualization
import matplotlib.pyplot as plt             # Quick
%matplotlib inline

# Misc
import random
import importlib
import warnings
warnings.filterwarnings('ignore')
import logging
logging.basicConfig(filename='RNN.log',level=logging.DEBUG)

#-------------- Custom Libs -----------------#
os.chdir(root)

# Import the codebase module
fPath = root + "/1_codemodule"
if fPath not in sys.path: sys.path.append(fPath)

# Custom Libs
import coreCode as cc
import lastfmCode as fm
print ('Ok')

Ok


<div style="background-color:white; color:#008000; font-family: 'Courier New', Monospace; font-weight: bold">Page Specific Libraries</div>

In [3]:
# Data science (comment out if not needed)
#from sklearn.manifold import TSNE
import tensorflow as tf
from tensorflow.contrib import rnn
from tensorflow.python.framework import ops
ops.reset_default_graph()
from sklearn import metrics
from sklearn import preprocessing
print ('Ok')

Ok


<div style="background-color:white; color:#008000; font-family: 'Courier New', Monospace; font-weight: bold">Load settings</div>

In [4]:
# Load the project settings through the custom coreCode module (imported as cc).
settingsDict =  cc.loadSettings()
# Each path-like setting is appended to `root` by plain string concatenation.
dbPath = root + settingsDict['mainDbPath_sml']
fmSimilarDbPath = root + settingsDict['fmSimilarDbPath']
fmTagsDbPath = root + settingsDict['fmTagsDbPath']
trackMetaDbPath = root + settingsDict['trackmetadata']
# Converted to int here; handed on as `periodGranularity` to the cc.SelectUserData_* calls.
periodGranularity = int(settingsDict['periodGranularity'])
print ('Ok')

Ok


<h3 style="background-color:#616161;color:white">1. Build Model</h3>

In [6]:
def RNN(x, weights, biases,n_steps):
    """Build the recurrent part of the graph and return the output logits.

    Reads the notebook-level settings `cellType` (which recurrent cell to use)
    and `n_hidden` (number of units in the cell).

    Parameters
    ----------
    x : tensor of shape (batchRows, n_steps, n_input)
    weights, biases : dicts holding the output-layer variables under key 'out'
    n_steps : number of time steps to unroll

    Returns
    -------
    Tensor produced by applying the linear output layer to the output of the
    last time step.

    Raises
    ------
    ValueError
        If `cellType` is not one of "BasicLSTMCell", "TimeFreqLSTMCell" or
        "GridLSTMCell". (Previously this case only printed a message and then
        failed on the undefined `outputs` variable.)
    """
    # Pick the cell first so an invalid cellType fails before any op is added.
    if cellType == "BasicLSTMCell":
        lstm_cell = rnn.BasicLSTMCell(n_hidden, forget_bias=1.0)
    elif cellType == "TimeFreqLSTMCell":
        # NOTE(review): feature_size=22 presumably has to match the number of
        # input features (the default fieldList yields 22) - confirm when
        # changing fieldList.
        lstm_cell = rnn.TimeFreqLSTMCell(n_hidden, use_peepholes=True, feature_size=22, forget_bias=1.0)
    elif cellType == "GridLSTMCell":
        lstm_cell = rnn.GridLSTMCell(n_hidden, forget_bias=1.0)
    else:
        raise ValueError("Did not recognize {}".format(cellType))

    # Current data input shape: (batchRows, n_steps, n_input)
    # static_rnn needs a list of 'n_steps' tensors of shape (batchRows, n_input)
    # See https://stackoverflow.com/questions/45278276/tensorflow-lstm-dropout-implementation-shape-problems/45279243#45279243
    x = tf.unstack(x, n_steps, 1)

    outputs, _ = rnn.static_rnn(lstm_cell, x, dtype=tf.float32)

    # Linear activation, using rnn inner loop last output
    return tf.matmul(outputs[-1], weights['out']) + biases['out']

In [7]:
def _buildGraph(n_steps,n_input, classWeights = None):
    """Reset the default TensorFlow graph and build the classifier in it.

    The created tensors/ops are published as notebook-level globals
    (x, y, _pred, _predProb, _logits, _cost, optimizer, _accuracy,
    _correct_pred) so the training and testing functions can feed and run them.

    Reads the notebook-level settings n_hidden, n_classes and learning_rate.

    Parameters
    ----------
    n_steps : number of time steps in each input sequence
    n_input : number of features per time step
    classWeights : accepted but not used by this function
    """
    global x, y, _pred, _predProb, _logits, _cost, optimizer, _accuracy,_correct_pred
    
    # Start from an empty graph so rebuilding does not accumulate ops.
    tf.reset_default_graph()
    # tf Graph input
    x = tf.placeholder("float", [None, n_steps, n_input])
    # Labels are integer class ids (one per row), not one-hot vectors.
    y = tf.placeholder("int64", [None])

    # Define weights
    weights = {'out': tf.Variable(tf.random_normal([n_hidden, n_classes]))}
    biases = {'out': tf.Variable(tf.random_normal([n_classes]))}

    # Evaluate model
    _logits = RNN(x, weights, biases,n_steps)
    # Mean sparse softmax cross-entropy over the batch, minimised with Adam.
    _cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=_logits, labels=y))
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(_cost)
    
    _predProb =tf.nn.softmax(_logits)  # Convert to proper probs
    _pred =tf.argmax(_predProb,1)  # Take the highest prob
    # Accuracy = fraction of rows whose predicted class equals the label.
    _correct_pred = tf.equal(_pred, y)
    _accuracy = tf.reduce_mean(tf.cast(_correct_pred, tf.float32))
print('Ok')

Ok


In [8]:
def ResetModel():
    """Close the notebook-level session `sess` and reset the default graph.

    If `sess` has never been created, the NameError raised by looking it up
    is caught and the function returns without resetting the graph.
    """
    try:
        sess.close()
        tf.reset_default_graph()
    except NameError:
        # Raised when `sess` does not exist yet: nothing to close.
        return
    
def initializeModel():    
    """Build the graph, open a fresh session and initialise or restore weights.

    Reads the notebook-level settings fieldList, n_steps and loadFromSave and
    stores the new session in the notebook-level global `sess`.

    When loadFromSave is true the variables are restored from
    './3_Data/saves/model.ckpt'; otherwise they are freshly initialised.
    """
    global sess

    # Release the session left over from an earlier call (if any) before it is
    # replaced; otherwise its resources stay allocated until the kernel exits.
    previous_session = globals().get('sess')
    if previous_session is not None:
        previous_session.close()

    n_input = len(fieldList.split(",")) - 2  # -2 as we drop UserID and t

    # Build graph (this also resets the default graph)
    _buildGraph(n_steps, n_input=n_input)

    # The init op and the Saver must be created after the variables exist, and
    # the session after the graph has been rebuilt so it is bound to the new graph.
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    sess = tf.Session()
    if loadFromSave:
        saver.restore(sess, './3_Data/saves/model.ckpt')
    else:
        sess.run(init)
    print('Model initialized')

<h3 style="background-color:#616161;color:white">2. Model Training Functions</h3>

In [9]:
# Launch the graph
def randomSelectFromData(_X, _Y,_batchRows = 10, depth = None):
    """Draw a random batch of fixed-length history windows and their labels.

    For each randomly chosen row position p, the sample is the `depth` rows
    immediately before p (``_X[p-depth:p]``) and the label is ``_Y[p]``.

    Parameters
    ----------
    _X : 2-D array (rows, features)
    _Y : labels, one per row; either 1-D or 2-D with a single column
    _batchRows : number of windows to draw (sampled without replacement)
    depth : window length; defaults to the notebook-level setting `n_steps`

    Returns
    -------
    batch_x : float array of shape (_batchRows, depth, features)
    batch_y : float array of shape (_batchRows,)

    Raises
    ------
    ValueError
        If there are not enough rows to draw `_batchRows` distinct positions.
    """
    if depth is None:
        depth = n_steps

    totalRows = np.shape(_X)[0]
    XCols = np.shape(_X)[1]

    # Only positions with enough history are eligible. The lower bound uses this
    # call's own batch size rather than the notebook-level `batchRows`; the
    # existing callers pass _batchRows=batchRows, so their behaviour is unchanged.
    firstPeriod = _batchRows + depth
    if totalRows - firstPeriod < _batchRows:
        raise ValueError(
            "Need at least {} rows to draw {} windows of depth {}, got {}".format(
                firstPeriod + _batchRows, _batchRows, depth, totalRows))
    periodsList = random.sample(range(firstPeriod, totalRows), _batchRows)

    # Pre-Initialize batch arrays
    batch_x = np.zeros([_batchRows, depth, XCols])
    batch_y = np.zeros([_batchRows])

    for batch_row, periodPos in enumerate(periodsList):
        batch_x[batch_row] = _X[periodPos - depth:periodPos]
        # .item() accepts a scalar or a single-element row and still raises
        # ValueError when a label row holds more than one value.
        batch_y[batch_row] = np.asarray(_Y[periodPos]).item()
    return batch_x, batch_y
print ('Ok')

Ok


In [10]:
##############################################
###           MODEL 1    Train             ###
##############################################
def trainModel_1(batch_x, batch_y):        
    """Run one optimisation step on a batch and report its loss and accuracy.

    Uses the notebook-level session `sess` and the graph handles x, y,
    optimizer, _cost and _accuracy.

    Returns
    -------
    (loss, acc) evaluated on the same batch after the weight update.
    """
    feed = {x: batch_x, y: batch_y}

    sess.run(optimizer, feed_dict=feed)

    # Fetch both metrics in a single forward pass instead of one pass each.
    loss, acc = sess.run([_cost, _accuracy], feed_dict=feed)

    return loss, acc
print('Ok')

Ok


<h3 style="background-color:#616161;color:white">3. Model Testing Functions</h3>

In [11]:
# Launch the graph
def TestPredictions1(_X, _Y, sess, _batchRows = 10, testPeriods = None):
    """Predict the label of selected periods from the `n_steps` rows before each.

    Reads the notebook-level settings n_steps and batchRows and the graph
    handles x, y and _pred.

    Parameters
    ----------
    _X : 2-D array (rows, features)
    _Y : labels, indexable by row position
    sess : TensorFlow session used to run the prediction
    _batchRows : number of random periods to test when `testPeriods` is None
    testPeriods : optional sequence/array of period offsets; each is shifted by
        ``batchRows + n_steps - 1`` to obtain the row position, and
        `_batchRows` is then set to the number of periods supplied

    Returns
    -------
    (predictions, batch_y) : predicted class per period and the true labels.
    """
    totalRows = np.shape(_X)[0]
    XCols = np.shape(_X)[1]
    depth = n_steps

    # If no test periods were provided generate your own
    if testPeriods is None:
        # Select periods where we will always get enough history to go with it
        testPeriods = random.sample(range(batchRows+depth, totalRows), _batchRows)
    else:
        # np.asarray lets plain lists be offset too (list + int would raise TypeError)
        testPeriods = np.asarray(testPeriods) + batchRows+depth-1
        _batchRows = len(testPeriods)

    # Pre-Initialize batch arrays
    batch_x = np.zeros([_batchRows, depth, XCols])
    batch_y = np.zeros([_batchRows])

    for batch_row, idx1 in enumerate(testPeriods):
        timeNow = str(datetime.datetime.now())
        logging.info("{} Now testing period {} ({}%)".format(timeNow, idx1,round((batch_row/_batchRows)*100,2)))

        batch_x[batch_row] = _X[idx1-depth:idx1].reshape(1,depth,XCols)
        batch_y[batch_row] = _Y[idx1]

    print ("Processed {}".format(_batchRows))
    # Predict for all selected periods in one run (the probabilities that used
    # to be fetched here were never used, so that extra forward pass is gone).
    predictions = sess.run(_pred, feed_dict={x: batch_x, y: batch_y})

    return predictions, batch_y



<div style="background-color:white; color:#008000; font-family: 'Courier New', Monospace; font-weight: bold">Test hidden periods</div>

In [12]:
def TestHiddenPeriods(hiddenTestPeriods=50, useTestData = False):
    """Predict `hiddenTestPeriods` random periods and print a classification report.

    Parameters
    ----------
    hiddenTestPeriods : number of random periods to predict
    useTestData : when False the notebook-level xTrain/yTrain are used,
        otherwise xTest/yTest

    Returns
    -------
    (labels, predictions)

    NOTE(review): xTrain/yTrain/xTest/yTest are read as notebook-level names,
    but the visible cells only assign them inside TrainModel/TestModel -
    confirm they are set at notebook level before calling this.
    """
    print('{} Hidden Periods\n'.format(hiddenTestPeriods))
    # `batch_size` is not defined in the visible cells; the batch size setting is `batchRows`.
    print ("Cell type= {}, learning_rate = {}, Iterations = {}, batch size = {}, Steps = {}, Hidden Layers = {}, Classes = {}\n".format(cellType,learning_rate,period_Iterations,batchRows, n_steps ,n_hidden,n_classes))

    if useTestData == False:
        data_x, data_y = xTrain, yTrain
    else:
        data_x, data_y = xTest, yTest

    # TestPredictions1 takes the number of periods as `_batchRows`
    # (it has no `numOfPeriods` parameter).
    predictions,labels = TestPredictions1(data_x, data_y, sess, _batchRows=hiddenTestPeriods)

    print(np.shape(labels),np.shape(predictions))
    print(metrics.classification_report(labels,predictions))  # Labels are class ids, not one-hot
    return labels, predictions

In [28]:
def TrainModel():
    """Train the model on random batches drawn from randomly selected users.

    For each of `user_iteration` users, `period_Iterations` batches are drawn
    (each must contain at least one non-zero label) and every batch is trained
    on `batch_Iterations` times. Every 10th period iteration a classification
    report for the current batch is printed and the model is checkpointed to
    './3_Data/saves/model.ckpt'.

    Reads the notebook-level settings user_iteration, period_Iterations,
    batch_Iterations, batchRows, dbPath, tblName, fieldList, periodGranularity
    and (optionally) dummyTest, plus the session `sess` and the graph handles
    x, y and _pred.
    """
    maxBatchDraws = 50  # give up looking for a batch with a positive label after this many draws

    totalSteps = user_iteration*period_Iterations*batch_Iterations
    loss = np.zeros([totalSteps])
    acc = np.zeros([totalSteps])
    counter = 0

    # Create the Saver once: each tf.train.Saver() adds save/restore ops to the
    # graph, so building one per checkpoint makes the graph grow.
    saver = tf.train.Saver()

    for userCount in range(user_iteration):  # Iterate through user selection
        # dummyTest is optional; treat it as off when the notebook has not set it.
        if globals().get('dummyTest', False):  # If working on the dummy data then set to user
            users = pd.DataFrame(data={'userID': [3]})
        else:
            users = cc.getUsers(dbPath).sample(1)  # Randomly select 1 user

        userID = users.iloc[0].userID
        timeNow = datetime.datetime.now().strftime('%D %H:%M:%S')

        # Select user data (the test split is not needed for training)
        xTrain, yTrain, _, _ = cc.SelectUserData_TrainTest(dbPath,tblName,fieldList,userIDs=[userID],oneHot=False,periodGranularity=periodGranularity)

        # Check for missing data before touching yTrain.
        if xTrain is None:
            logging.info('{} No training data for random user {}, skipping'.format(timeNow, userID))
            continue

        print('{} User Iteration {} Random user {} (Total plays {})'.format(timeNow, userCount,userID,sum(yTrain)))
        logging.info('{} Now processing random user {}'.format(timeNow, userID))

        for i in range(period_Iterations):
            # Draw a fresh batch for every period iteration, retrying until it
            # holds at least one non-zero label. (Previously the first batch
            # ever drawn was silently reused for all iterations and users.)
            for _ in range(maxBatchDraws):
                batch_x, batch_y = randomSelectFromData(xTrain, yTrain, _batchRows=batchRows)
                if sum(batch_y) != 0:
                    break
            else:
                s = "  User {} Mini-batch {} skipped: no batch with a positive label in {} draws".format(userCount, i, maxBatchDraws)
                print(s)
                logging.warning(s)
                continue

            for j in range(batch_Iterations):
                loss[counter],acc[counter] = trainModel_1(batch_x,batch_y)
                s="  User {} Mini-batch {} Iteration {} Loss={:.6f}, Training Accuracy={:.5f}".format(userCount,i, j, loss[counter], acc[counter])
                print(s)
                logging.info(s)
                counter+=1

            if i % 10 == 0:
                predictions = sess.run(_pred, feed_dict={x: batch_x, y: batch_y})

                print(metrics.classification_report(batch_y,predictions))  # Labels are class ids, not one-hot
                saver.save(sess,"./3_Data/saves/model.ckpt")

# NOTE: this runs when the cell defining TrainModel is executed, not after training.
print('Training completed')

Training completed


<h3 style="background-color:green;color:white">4. ...And action!</h3>

In [29]:
# Model setup
loadFromSave = False
user_iteration = 3  # How many iterations of user selection
period_Iterations = 2  # How many iterations of random periods selection
batchRows = 100   # How many periods to select in each user_iteration
batch_Iterations = 5
n_steps = 672         # How many time steps (i.e. depth) to have
learning_rate = 0.001
n_hidden = 250 # hidden layer num of features
n_classes = 2  # 2 for one-hot
cellType = "BasicLSTMCell"  # Choose: TimeFreqLSTMCell BasicLSTMCell
fieldList="UserID, t, HrsFrom5pm, isSun,isMon,isTue,isWed,isThu,isFri,isSat, t1,t2,t3,t4,t5,t10,t12hrs,t23_5hrs,t24hrs,t24_5hrs,t1wk,t2wks,t3wks,t4wks"
tblName='tblTimeSeriesData'


#ResetModel()
#initializeModel()
TrainModel()
print ('Ok')

08/12/17 13:15:15 User Iteration 0 Random user 78 (Total plays [2617])
User 0 Mini-batch 0 Iteration 0 Loss=0.299888, Training Accuracy=0.92000
User 0 Mini-batch 0 Iteration 1 Loss=0.322693, Training Accuracy=0.94000
User 0 Mini-batch 0 Iteration 2 Loss=0.347542, Training Accuracy=0.81000
User 0 Mini-batch 0 Iteration 3 Loss=0.267634, Training Accuracy=0.93000
User 0 Mini-batch 0 Iteration 4 Loss=0.241045, Training Accuracy=0.92000
             precision    recall  f1-score   support

        0.0       0.92      1.00      0.96        92
        1.0       0.00      0.00      0.00         8

avg / total       0.85      0.92      0.88       100

User 0 Mini-batch 1 Iteration 0 Loss=0.239867, Training Accuracy=0.92000
User 0 Mini-batch 1 Iteration 1 Loss=0.218134, Training Accuracy=0.92000
User 0 Mini-batch 1 Iteration 2 Loss=0.174601, Training Accuracy=0.92000
User 0 Mini-batch 1 Iteration 3 Loss=0.136069, Training Accuracy=0.90000
User 0 Mini-batch 1 Iteration 4 Loss=0.138935, Training A

<div style="background-color:white; color:#008000; font-family: 'Courier New', Monospace; font-weight: bold">Test</div>

In [32]:
def TestModel():
    """Evaluate the model on random batches from every held-out test user.

    For each test user, 10 random batches are drawn from all of that user's
    data; the average loss and accuracy per user are printed, followed by an
    overall classification report. The model is then checkpointed to
    './3_Data/saves/model.ckpt'.

    Reads the notebook-level settings dbPath, tblName, fieldList,
    periodGranularity and batchRows, plus the session `sess` and the graph
    handles x, y, _pred, _cost and _accuracy.
    """
    batchesPerUser = 10  # Select a random batch this many times per user

    print('Testing held out users')
    labelChunks = []
    predChunks = []
    users = cc.getUsers(dbPath,testUserEquals = 1)  # Get all test users

    for usrCount, usr in enumerate(users.itertuples(), start=1):  # For each test user
        # Select all data for each user
        xTest, yTest = cc.SelectUserData_All(dbPath,tblName,fieldList,userIDs=[usr.userID],oneHot=False,periodGranularity=periodGranularity)

        avLoss = 0
        avAcc = 0
        if xTest is not None:
            for _ in range(batchesPerUser):
                batch_x, batch_y = randomSelectFromData(xTest, yTest, _batchRows=batchRows)

                # One forward pass yields predictions, loss and accuracy together.
                # (A leftover `p = batch_x['t1']` used to overwrite the
                # predictions here and fails on a numpy array.)
                p, loss, acc = sess.run([_pred, _cost, _accuracy], feed_dict={x: batch_x, y: batch_y})
                avLoss += loss
                avAcc += acc

                labelChunks.append(batch_y)
                predChunks.append(p)
        print('User {} of {} Av loss {} Av acc {}'.format(usrCount , len(users), np.round(avLoss/batchesPerUser,3), np.round(avAcc/batchesPerUser,3)))

    print('Overall results:')
    if labelChunks:
        totalLabels = np.concatenate(labelChunks)
        totalPred = np.concatenate(predChunks)
        print(metrics.classification_report(totalLabels,totalPred))  # Labels are class ids, not one-hot
    else:
        print('No test data available for any user')
    saver = tf.train.Saver()
    saver.save(sess,"./3_Data/saves/model.ckpt")
    print('Testing complete')

Testing held out users
User 1 of 10 Av loss 0.321 Av acc 0.898
User 2 of 10 Av loss 0.692 Av acc 0.886
User 3 of 10 Av loss 0.437 Av acc 0.914
User 4 of 10 Av loss 1.681 Av acc 0.759
User 5 of 10 Av loss 0.391 Av acc 0.889
User 6 of 10 Av loss 0.542 Av acc 0.882
User 7 of 10 Av loss 0.672 Av acc 0.832
User 8 of 10 Av loss 0.212 Av acc 0.939
User 9 of 10 Av loss 4.043 Av acc 0.549
User 10 of 10 Av loss 0.78 Av acc 0.87
             precision    recall  f1-score   support

        0.0       0.87      0.96      0.91      8726
        1.0       0.11      0.03      0.05      1274

avg / total       0.77      0.84      0.80     10000

             precision    recall  f1-score   support

        0.0       0.87      0.96      0.91      8726
        1.0       0.11      0.03      0.05      1274

avg / total       0.77      0.84      0.80     10000

Testing complete


In [78]:
def BaselineTest():
    """Print a classification report for a single-feature baseline.

    For every held-out test user, the value in one feature column is used
    directly as the prediction and compared with the true labels.

    Reads the notebook-level settings dbPath, tblName, fieldList and
    periodGranularity.
    """
    # NOTE(review): column 8 is presumably the 't1' feature (9th entry of the
    # default fieldList once UserID and t are dropped) - confirm against
    # cc.SelectUserData_All.
    baselineCol = 8

    print('Testing held out users')
    labelChunks = []
    predChunks = []
    users = cc.getUsers(dbPath,testUserEquals = 1)  # Get all test users

    for usr in users.itertuples():  # For each test user
        # Select all data for each user
        xTest, yTest = cc.SelectUserData_All(dbPath,tblName,fieldList,userIDs=[usr.userID],oneHot=False,periodGranularity=periodGranularity)
        if xTest is None:
            # Same guard as the other evaluation functions: user has no data.
            continue
        # Collect per-user chunks and join once at the end (np.append in a
        # loop re-copies everything gathered so far on every user).
        labelChunks.append(np.asarray(np.ravel(yTest), dtype=float))
        predChunks.append(np.asarray(xTest[:, baselineCol], dtype=float))

    if labelChunks:
        totalLabels = np.concatenate(labelChunks)
        totalPred = np.concatenate(predChunks)
        print(metrics.classification_report(totalLabels,totalPred))  # Labels are class ids, not one-hot
    else:
        print('No test data available for any user')
    print('Testing complete')
BaselineTest()

Testing held out users
             precision    recall  f1-score   support

        0.0       0.97      0.97      0.97    390773
        1.0       0.77      0.77      0.77     43696

avg / total       0.95      0.95      0.95    434469

Testing complete
