<h1 align="center" style="background-color:#616161;color:white">RNN Model</h1>

Adapted from: https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/recurrent_network.py

<h3 style="background-color:#616161;color:white">0. Setup</h3>

<div style="background-color:white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Input Parameters</div>

In [1]:
# Root path
#root = "C:/DS/Github/MusicRecommendation"  # BA, Windows
root = "/home/badrul/git/EventPrediction" # BA, Linux

<div style="background-color:white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Common Libraries</div>

In [2]:
# Core
import numpy as np
import pandas as pd
from IPython.core.debugger import Tracer    # Used for debugging
import logging
from random import *

# File and database management
import csv
import os
import sys
import json
import sqlite3
from pathlib import Path

# Date/Time
import datetime
import time
#from datetime import timedelta # Deprecated

# Visualization
import matplotlib.pyplot as plt             # Quick
%matplotlib inline

# Misc
import random
import warnings
warnings.filterwarnings('ignore')

#-------------- Custom Libs -----------------#
os.chdir(root)

# Import the codebase module
fPath = root + "/1_codemodule"
if fPath not in sys.path: sys.path.append(fPath)

# Custom Libs
import coreCode as cc
import lastfmCode as fm

<div style="background-color:white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Page Specific Libraries</div>

In [3]:
# Data science (comment out if not needed)
#from sklearn.manifold import TSNE
import tensorflow as tf
from tensorflow.contrib import rnn
from tensorflow.python.framework import ops
ops.reset_default_graph()
from sklearn import metrics
from sklearn import preprocessing

<div style="background-color:#white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Declare Functions</div>

In [4]:
def padRows(X,Y,batch_size):
    rowsToPad=0
    if np.mod(len(X),batch_size) != 0:
        # tf requires consistent inputs so need to pad
        rowsToPad=batch_size-np.mod(len(X),batch_size)
        p=np.zeros([rowsToPad,X.shape[1]])
        X = np.append(X,p,axis=0)

        p=np.zeros([rowsToPad,Y.shape[1]])
        Y = np.append(Y,p,axis=0)
    return (X,Y,rowsToPad)

<div style="background-color:#white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Load settings</div>

In [5]:
settingsDict =  cc.loadSettings()
dbPath = root + settingsDict['mainDbPath']
fmSimilarDbPath = root + settingsDict['fmSimilarDbPath']
fmTagsDbPath = root + settingsDict['fmTagsDbPath']
trackMetaDbPath = root + settingsDict['trackmetadata']
periodGranularity = int(settingsDict['periodGranularity'])

In [6]:
def getUsers(dbPath):
    con = sqlite3.connect(dbPath)
    c = con.cursor()

    # Get list of UserIDs 
    users = pd.read_sql_query("Select UserID from tblUsers Where tblUsers.TestUser = 0",con)
    con.close()
    return users
    
def getTrainAndTestData_Df(fieldList, userIDs = None):
    con = sqlite3.connect(dbPath)
    c = con.cursor()

    users = pd.read_sql_query("Select UserID from tblUsers Where tblUsers.TestUser = 0",con)
    
    if userIDs is None: userIDs = users.userID.values
    trainDf=pd.DataFrame(columns=[fieldList])  # Create an emmpty df
    testDf=pd.DataFrame(columns=[fieldList])  # Create an emmpty df
    periodsInAMonth=int(60/periodGranularity)*24*7*4

    totalRows=0
    
    for u in userIDs:
        
        # Get training dataset
        SqlStr="SELECT {} from tblTimeSeriesData where UserID = {}".format(fieldList,u)
        df = pd.read_sql_query(SqlStr, con)
        
        if len(df)>int(periodsInAMonth*3):  # user must have at least 3 months worth of data
            totalRows += len(df)
    
            # Cut-off 1
            k = random.randint(periodsInAMonth, len(df))

            testDf = testDf.append(df.iloc[k:k+periodsInAMonth])[df.columns.tolist()]

            tmp = df.drop(df.index[k:k+periodsInAMonth])

            # Cut-off 2
            k = random.randint(periodsInAMonth, len(tmp))
            testDf = testDf.append(tmp.iloc[k:k+periodsInAMonth])[df.columns.tolist()]
            trainDf = trainDf.append(tmp.drop(tmp.index[k:k+periodsInAMonth]))[df.columns.tolist()]
        else:
            print('Skipping user {} as not enough periods ({})'.format(u,len(df)))
    
    #if len(trainDf)+len(testDf) == totalRows:
    #    print('Ok')
    #else:
    #    print("Incorrect. Total Rows = {}. TestDf+TrainDf rows = {}+{}={}".format(totalRows,len(testDf),len(trainDf),len(testDf)+len(trainDf)))
        
    return trainDf, testDf

def getHiddenTestUsers_Df(fieldList="",firstNPerc=1.0):
    con = sqlite3.connect(dbPath)
    c = con.cursor()

    # Get list of UserIDs 
    users = pd.read_sql_query("Select UserID from tblUsers Where tblUsers.TestUser = 1",con)

    #fieldList="t, PeriodID, UserID, HrsFrom6pm, isSun,isMon,isTue,isWed,isThu,isFri,isSat,t1,t2,t3,t4,t5,t10,t12hrs,t24hrs,t1wk,t2wks,t3wks,t4wks"
    testDf=pd.DataFrame(columns=[fieldList])  # Create an emmpty df
    periodsInAMonth=int(60/periodGranularity)*24*7*4

    totalRows=0
    
    for user in users.itertuples():
        # Get training dataset
        
        SqlStr="SELECT {} from tblTimeSeriesData where UserID = {}".format(fieldList + ",PeriodID",user.userID)    
        df = pd.read_sql_query(SqlStr, con)
        df["PeriodID"] = df["PeriodID"].astype(int)
        df.sort_values(['PeriodID'])
        totalRows += len(df)
        # Caluclate period cutt-off
        cutoff = int(len(df)*firstNPerc)
        testDf = testDf.append(df.iloc[0:cutoff])[df.columns.tolist()]
    
    testDf["PeriodID"] =  testDf["PeriodID"].astype(int)
    testDf["UserID"] =  testDf["UserID"].astype(int)
    testDf.sort_values(['UserID','PeriodID'], inplace=True)
    return testDf

def getTrainData(fieldList,oneHot,userIDs=None):
    trainDf,_ = cc.getTrainTestData(dbPath,fieldList,userIDs, periodGranularity)
    if trainDf.shape[0] == 0:
        # No rows
        return None,None
    
    xTrain = trainDf.drop(['t','UserID'], 1).values

    if oneHot:
        # One-Hot version
        yTrain_onehot = pd.get_dummies(trainDf['t']).values.astype(float) # One-Hot version
        return xTrain, yTrain_onehot
    else:
        yTrain = trainDf['t'].values.astype(int)
        yTrain=yTrain.reshape(len(yTrain),1)
        return xTrain, yTrain

def getTestData(fieldList,oneHot):
    testDf2 = getHiddenTestUsers_Df(fieldList,firstNPerc=0.5)  # Get the first half of everyones history
    _,testDf = cc.getTrainTestData(dbPath,fieldList, periodGranularity = periodGranularity)
    # Test data
    xTest= testDf.drop(['t','UserID'], 1).values
    yTest = testDf['t'].values.astype(int)
    #yTest = np.array([1 if y==1 else -1 for y in y_vals_test])
    yTest=yTest.reshape(len(yTest),1)

    # Get hidden users data
    xTest2= testDf2.drop(['t','UserID','PeriodID'], 1).values
    yTest2 = testDf2['t'].values.astype(int)
    yTest2=yTest2.reshape(len(yTest2),1)
   
    if oneHot:
        # One-Hot version
        yTest_onehot = pd.get_dummies(testDf['t']).values.astype(float)
        yTest2_onehot = pd.get_dummies(testDf2['t']).values.astype(float)
        return xTest, yTest_onehot, xTest2, yTest2_onehot
    else:
        return xTest, yTest, xTest2, yTest2

<h3 style="background-color:#616161;color:white">1. RNN Model</h3>

<div style="background-color:#white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Define the model</div>

In [7]:
def RNN(x, weights, biases,n_steps):
    # Current data input shape: (batch_size, n_steps, n_input)
    # Required shape: 'n_steps' tensors list of shape (batch_size, n_input)
    
    # Unstack to get a list of 'n_steps' tensors of shape (batch_size, n_input)
    x = tf.unstack(x, n_steps, 1)  # See https://stackoverflow.com/questions/45278276/tensorflow-lstm-dropout-implementation-shape-problems/45279243#45279243

    # Define a lstm cell with tensorflow
    #lstm_cell = rnn.BasicLSTMCell(n_hidden, forget_bias=1.0)
    if cellType == "BasicLSTMCell":
        lstm_cell = rnn.BasicLSTMCell(n_hidden, forget_bias=1.0)
        outputs, states = rnn.static_rnn(lstm_cell, x, dtype=tf.float32)
    elif cellType == "TimeFreqLSTMCell":
        lstm_cell =rnn.TimeFreqLSTMCell(n_hidden, use_peepholes=True, feature_size= 22, forget_bias=1.0)
        outputs, states = rnn.static_rnn(lstm_cell, x, dtype=tf.float32)
    elif cellType == "GridLSTMCell":
        lstm_cell =rnn.GridLSTMCell(n_hidden, forget_bias=1.0)
        outputs, states = rnn.static_rnn(lstm_cell, x, dtype=tf.float32)        
    else:
        print("Did not recognize {}".format(cellType))
    # Get lstm cell output
    

    # Linear activation, using rnn inner loop last output
    return tf.matmul(outputs[-1], weights['out']) + biases['out']

def buildGraph(n_steps,n_input):
    global x, y, pred, cost, optimizer,accuracy
    
    tf.reset_default_graph()
    # tf Graph input
    
    x = tf.placeholder("float", [None, n_steps, n_input])
    y = tf.placeholder("float", [None, n_classes])

    # Define weights
    weights = {
        'out': tf.Variable(tf.random_normal([n_hidden, n_classes]))
    }
    biases = {
        'out': tf.Variable(tf.random_normal([n_classes]))
    }

    pred = RNN(x, weights, biases,n_steps)
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
    # Evaluate model
    correct_pred = tf.equal(tf.argmax(pred,1), tf.argmax(y,1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

<div style="background-color:#white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Train the model</div>

In [8]:
# Launch the graph
def trainModel(X, Y, sess):
    # Initializing the variables
    init = tf.global_variables_initializer()
    sess.run(init)

    # Training cycle
    for iteration in range(training_iteration):
        avg_cost = 0.
        total_batch = int(len(xTrain)/batch_size)

        # Loop over all batches
        for i in range(total_batch):
            batch_x = xTrain[i*batch_size:(i*batch_size)+batch_size]
            batch_y = yTrain_onehot[i*batch_size:(i*batch_size)+batch_size]                        
            if np.mod(len(batch_x),batch_size) == 0:
                batch_x, batch_y, _ = padRows(batch_x, batch_y, batch_size)
            
            batch_x = batch_x.reshape((batch_size, n_steps, n_input)) 
            
            sess.run(optimizer, feed_dict={x: batch_x, y: batch_y})

        #if iteration % display_step == 0:
        # Calculate train accuracy
        #    i = randint(1, total_batch)  # Randomly select a batch
        #    batch_x = xTrain[i*batch_size:(i*batch_size)+batch_size]
        #    batch_y = yTrain_onehot[i*batch_size:(i*batch_size)+batch_size]                        
        #    batch_x = batch_x.reshape((batch_size, n_steps, n_input))        

        #    acc = sess.run(accuracy, feed_dict={x: batch_x, y: batch_y})
        #    # Calculate loss
        #    loss = sess.run(cost, feed_dict={x: batch_x, y: batch_y})
            #print ("Iter {}. Minibatch Loss={:.6f}".format(iteration, loss) + ", Training Accuracy= " + \
            #          "{:.5f}".format(acc))
    #print ("Optimization Finished!")
#print ('Ok')

In [10]:
learning_rate = 0.001
training_iteration = 7
n_steps = 1 # timesteps
n_hidden = 120 # hidden layer num of features
n_classes = 2
display_step = 5
batch_size = 2688
cellType = "BasicLSTMCell"  # Choose: TimeFreqLSTMCell BasicLSTMCell
#fieldList="UserID, t, HrsFrom5pm, isSun,isMon,isTue,isWed,isThu,isFri,isSat, t1,t2,t3,t4,t5,t10,t12hrs,t23_5hrs,t24hrs,t24_5hrs,t1wk,t2wks,t3wks,t4wks"
#n_input = 22  # Hardcoded for now
fieldList="UserID, t, HrsFrom5pm, isSun,isMon,isTue,isWed,isThu,isFri,isSat, t10,t12hrs,t23_5hrs,t24hrs,t24_5hrs,t1wk,t2wks,t3wks,t4wks"
n_input = 17


# Build graph
buildGraph(n_steps,n_input = n_input)

# Train the model

sess = tf.Session()
users=getUsers(dbPath)
for u in users.itertuples():
    xTrain, yTrain_onehot = getTrainData(fieldList,oneHot=True,userIDs=[u.userID])
    if xTrain is not None:
        #print ('Training user {}. {} rows'.format(u.userID, np.shape(xTrain)[0]))
        if np.shape(yTrain_onehot)[1] !=1:
            trainModel(xTrain, yTrain_onehot, sess)

xTest, yTest_onehot, xTest2, yTest2_onehot = getTestData(fieldList,oneHot= True)

Skipping user 73 as not enough periods (1882)
Skipping user 97 as not enough periods (0)
Skipping user 73 as not enough periods (1882)
Skipping user 97 as not enough periods (0)


In [11]:
def getTestPredictions(X,Y):
    predictions=[]
    
    # Testing cycle
    total_batch = int(len(X)/batch_size)
     
    if np.mod(len(X),batch_size) != 0: 
        X, Y, rowsToPad = padRows(X, Y, batch_size)
        total_batch += 1
    else:
        rowsToPad=0

    # Loop over all batches
    for i in range(total_batch):
        
        batch_x = X[i*batch_size:(i*batch_size)+batch_size]
        batch_y = Y[i*batch_size:(i*batch_size)+batch_size]                
        
        test_len = np.shape(batch_x)[0]
        
        #try:
        batch_x = batch_x[:test_len].reshape((-1, n_steps, n_input))
        batch_y = batch_y[:test_len]
        #except ValueError:
        #    print('Rows to pad', rowsToPad)
        #    Tracer()()

        p= 1*sess.run(pred, feed_dict={x: batch_x, y: batch_y})
        if predictions == []:
            predictions = p
        else:
            predictions= np.append(predictions,p,axis=0)
        
    
    print(np.shape(predictions))
    n=len(predictions)-rowsToPad
    predictions=predictions[0:n]
    predictions = np.argmax(predictions,1)
    print(np.shape(predictions),rowsToPad)
    return predictions

<div style="background-color:#white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Test hidden periods</div>

In [12]:
def TestModel():
    print ("Cell type= {}, learning_rate = {}, Iterations = {}, batch size = {}, Steps = {}, Hidden Layers = {}, Classes = {}".format(cellType,learning_rate,training_iteration,batch_size, n_steps ,n_hidden,n_classes))
    print('Hidden Periods\n\n')
    predictions = getTestPredictions(xTest,yTest_onehot)
    print(metrics.classification_report(yTest_onehot[:,1],predictions))  # Need to feed it yTest not yTest_OneHot here

    print('\nHidden Users')
    predictions = getTestPredictions(xTest2,yTest2_onehot)
    print(metrics.classification_report(yTest2_onehot[:,1],predictions))  # Need to feed it yTest not yTest_OneHot here

TestModel()

Cell type= BasicLSTMCell, learning_rate = 0.001, Iterations = 7, batch size = 2688, Steps = 1, Hidden Layers = 120, Classes = 2
Hidden Periods


(225792, 2)
(223325,) 2467
             precision    recall  f1-score   support

        0.0       0.92      1.00      0.95    204260
        1.0       0.33      0.01      0.02     19065

avg / total       0.87      0.91      0.88    223325


Hidden Users
(241920, 2)
(240862,) 1058
             precision    recall  f1-score   support

        0.0       0.92      1.00      0.96    222473
        1.0       0.35      0.02      0.03     18389

avg / total       0.88      0.92      0.89    240862



<h3 style="background-color:#616161;color:white">Results</h3>

fieldList1, learning_rate = 0.001 Iterations = 7 Steps = 1 Hidden Layers = 168 Classes = 2, batch size 20

Hidden Periods


             precision    recall  f1-score   support

        0.0       0.97      0.98      0.98    204415
        1.0       0.77      0.76      0.76     21367

Hidden Users
             precision    recall  f1-score   support

        0.0       0.98      0.98      0.98    222473
        1.0       0.73      0.71      0.72     18389


learning_rate = 0.001 Iterations = 7 batch size = 10 Steps = 1 Hidden Layers = 168 Classes = 2
Hidden Periods


             precision    recall  f1-score   support

        0.0       0.98      0.98      0.98    204170
        1.0       0.79      0.77      0.78     20609

avg / total       0.96      0.96      0.96    224779

Hidden Users
             precision    recall  f1-score   support

        0.0       0.98      0.98      0.98    222473
        1.0       0.73      0.71      0.72     18389

avg / total       0.96      0.96      0.96    240862


Cell type= BasicLSTMCell, learning_rate = 0.001, Iterations = 7, batch size = 1344, Steps = 1, Hidden Layers = 168, Classes = 2
Hidden Periods


             precision    recall  f1-score   support

        0.0       0.98      0.98      0.98    203219
        1.0       0.76      0.76      0.76     20209

avg / total       0.96      0.96      0.96    223428


Hidden Users
             precision    recall  f1-score   support

        0.0       0.98      0.98      0.98    222473
        1.0       0.72      0.72      0.72     18389

avg / total       0.96      0.96      0.96    240862

#### Cell type= BasicLSTMCell, learning_rate = 0.001, Iterations = 7, batch size = 1334, Steps = 1, Hidden Layers = 168, Classes = 2
Hidden Periods


(225792, 2)
(225030,) 762
             precision    recall  f1-score   support

        0.0       0.97      0.98      0.98    205194
        1.0       0.78      0.73      0.75     19836

avg / total       0.96      0.96      0.96    225030


Hidden Users
(241920, 2)
(240862,) 1058
             precision    recall  f1-score   support

        0.0       0.97      0.98      0.98    222473
        1.0       0.73      0.68      0.70     18389

avg / total       0.95      0.96      0.96    240862


Cell type= BasicLSTMCell, learning_rate = 0.001, Iterations = 7, batch size = 336, Steps = 1, Hidden Layers = 168, Classes = 2
Hidden Periods


(223440, 2)
(223379,) 61
             precision    recall  f1-score   support

        0.0       0.97      0.98      0.98    203454
        1.0       0.78      0.68      0.73     19925

avg / total       0.95      0.95      0.95    223379


Hidden Users
(240912, 2)
(240862,) 50
             precision    recall  f1-score   support

        0.0       0.97      0.98      0.98    222473
        1.0       0.73      0.63      0.68     18389

avg / total       0.95      0.95      0.95    240862



### Fieldlist without t1-t5

Cell type= BasicLSTMCell, learning_rate = 0.001, Iterations = 7, batch size = 1344, Steps = 1, Hidden Layers = 168, Classes = 2
Hidden Periods


(227136, 2)
(226327,) 809
             precision    recall  f1-score   support

        0.0       0.91      1.00      0.96    206650
        1.0       0.71      0.02      0.04     19677

avg / total       0.90      0.91      0.88    226327


Hidden Users
(241920, 2)
(240862,) 1058
             precision    recall  f1-score   support

        0.0       0.92      1.00      0.96    222473
        1.0       0.66      0.01      0.03     18389

avg / total       0.90      0.92      0.89    240862

Cell type= BasicLSTMCell, learning_rate = 0.001, Iterations = 7, batch size = 1344, Steps = 1, Hidden Layers = 100, Classes = 2
Hidden Periods


(225792, 2)
(225391,) 401
             precision    recall  f1-score   support

        0.0       0.91      1.00      0.95    204962
        1.0       0.66      0.05      0.09     20429

avg / total       0.89      0.91      0.88    225391


Hidden Users
(241920, 2)
(240862,) 1058
             precision    recall  f1-score   support

        0.0       0.93      1.00      0.96    222473
        1.0       0.60      0.03      0.06     18389

avg / total       0.90      0.92      0.89    240862


Cell type= BasicLSTMCell, learning_rate = 0.001, Iterations = 20, batch size = 1344, Steps = 1, Hidden Layers = 120, Classes = 2
Hidden Periods


(223104, 2)
(222384,) 720
             precision    recall  f1-score   support

        0.0       0.92      1.00      0.96    203475
        1.0       0.58      0.01      0.01     18909

avg / total       0.89      0.92      0.88    222384


Hidden Users
(241920, 2)
(240862,) 1058
             precision    recall  f1-score   support

        0.0       0.92      1.00      0.96    222473
        1.0       0.65      0.01      0.02     18389

avg / total       0.90      0.92      0.89    240862s

ell type= BasicLSTMCell, learning_rate = 0.001, Iterations = 7, batch size = 2688, Steps = 1, Hidden Layers = 120, Classes = 2
Hidden Periods


(225792, 2)
(223325,) 2467
             precision    recall  f1-score   support

        0.0       0.92      1.00      0.95    204260
        1.0       0.33      0.01      0.02     19065

avg / total       0.87      0.91      0.88    223325


Hidden Users
(241920, 2)
(240862,) 1058
             precision    recall  f1-score   support

        0.0       0.92      1.00      0.96    222473
        1.0       0.35      0.02      0.03     18389

avg / total       0.88      0.92      0.89    240862