<h1 align="center" style="background-color:#616161;color:white">RNN Model</h1>

Adapted from: https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/recurrent_network.py

<h3 style="background-color:#616161;color:white">0. Setup</h3>

<div style="background-color:white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Input Parameters</div>

In [1]:
# Root path
#root = "C:/DS/Github/MusicRecommendation"  # BA, Windows
root = "/home/badrul/git/EventPrediction" # BA, Linux

<div style="background-color:white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Common Libraries</div>

In [2]:
# Core
import numpy as np
import pandas as pd
from IPython.core.debugger import Tracer    # Used for debugging
import logging
from random import *

# File and database management
import csv
import os
import sys
import json
import sqlite3
from pathlib import Path

# Date/Time
import datetime
import time
#from datetime import timedelta # Deprecated

# Visualization
import matplotlib.pyplot as plt             # Quick
%matplotlib inline

# Misc
import random
import importlib
import warnings
warnings.filterwarnings('ignore')
import logging
logging.basicConfig(filename='RNN.log',level=logging.DEBUG)

#-------------- Custom Libs -----------------#
os.chdir(root)

# Import the codebase module
fPath = root + "/1_codemodule"
if fPath not in sys.path: sys.path.append(fPath)

# Custom Libs
import coreCode as cc
import lastfmCode as fm
print ('Ok')

Ok


<div style="background-color:white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Page Specific Libraries</div>

In [3]:
# Data science (comment out if not needed)
#from sklearn.manifold import TSNE
import tensorflow as tf
from tensorflow.contrib import rnn
from tensorflow.python.framework import ops
ops.reset_default_graph()
from sklearn import metrics
from sklearn import preprocessing
print ('Ok')

Ok


<div style="background-color:#white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Load settings</div>

In [4]:
settingsDict =  cc.loadSettings()
dbPath = root + settingsDict['mainDbPath_sml']
fmSimilarDbPath = root + settingsDict['fmSimilarDbPath']
fmTagsDbPath = root + settingsDict['fmTagsDbPath']
trackMetaDbPath = root + settingsDict['trackmetadata']
periodGranularity = int(settingsDict['periodGranularity'])
print ('Ok')

Ok


<h3 style="background-color:#616161;color:white">1. Build Model</h3>

In [5]:
def RNN(x, weights, biases,n_steps):
    # Current data input shape: (batchRows, n_steps, n_input)
    # Required shape: 'n_steps' tensors list of shape (batchRows, n_input)
    
    # Unstack to get a list of 'n_steps' tensors of shape (batchRows, n_input)
    x = tf.unstack(x, n_steps, 1)  # See https://stackoverflow.com/questions/45278276/tensorflow-lstm-dropout-implementation-shape-problems/45279243#45279243
    
    # Define a lstm cell with tensorflow
    if cellType == "LSTMCell":
        stacked_rnn=[]
        for i in range(n_layers):
            stacked_rnn.append(tf.nn.rnn_cell.LSTMCell(num_units=n_hidden, forget_bias=1., state_is_tuple=True))
        multiRNNCell = tf.nn.rnn_cell.MultiRNNCell(cells=stacked_rnn, state_is_tuple=True)
        outputs, states = rnn.static_rnn(multiRNNCell, x, dtype=tf.float32)
        
                
        #lstm_cell = rnn.BasicLSTMCell(num_units=n_hidden, forget_bias=1.0)
        #outputs, states = rnn.static_rnn(lstm_cell, x, dtype=tf.float32)
    
    elif cellType == "TimeFreqLSTMCell":
        lstm_cell =rnn.TimeFreqLSTMCell(n_hidden, use_peepholes=True, feature_size= 22, forget_bias=1.0)
        outputs, states = rnn.static_rnn(lstm_cell, x, dtype=tf.float32)
    elif cellType == "GridLSTMCell":
        lstm_cell =rnn.GridLSTMCell(n_hidden, forget_bias=1.0)
        outputs, states = rnn.static_rnn(lstm_cell, x, dtype=tf.float32)        
    else:
        print("Did not recognize {}".format(cellType))
    # Get lstm cell output
    

    # Linear activation, using rnn inner loop last output
    return tf.matmul(outputs[-1], weights['out']) + biases['out']

In [6]:
def _buildGraph(n_steps,n_input, n_weighting):
    global x, y, _pred, _predProb, _logits, _cost, optimizer, _accuracy,_correct_pred
    
    tf.reset_default_graph()
    # tf Graph input
    x = tf.placeholder("float", [None, n_steps, n_input])
    y = tf.placeholder("int64", [None])
    
    # Define weights
    weights = {'out': tf.Variable(tf.random_normal([n_hidden, n_classes]))}
    biases = {'out': tf.Variable(tf.random_normal([n_classes]))}

    # Evaluate model
    _logits = RNN(x, weights, biases,n_steps)
    lossW = tf.add(1,tf.multiply(tf.cast(tf.equal(y,1),'int32'),n_weighting))
    _cost = tf.reduce_mean(tf.losses.sparse_softmax_cross_entropy(logits=_logits, labels=y,weights=lossW))
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(_cost)
    
    _predProb =tf.nn.softmax(_logits)  # Convert to proper probs
    _pred =tf.argmax(_predProb,1)  # Take the highest prob
    _correct_pred = tf.equal(_pred, y)
    _accuracy = tf.reduce_mean(tf.cast(_correct_pred, tf.float32))
print('Ok')

Ok


In [7]:
def ResetModel():
    try:
        sess.close()
        tf.reset_default_graph()
    except NameError:
        return
    
def initializeModel(n_steps,n_input,n_weighting,loadFromSave):
    global sess

    # Build graph
    _buildGraph(n_steps,n_input,n_weighting)

    # Initializing the variables

    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    sess = tf.Session() # Has tome come after init
    if loadFromSave:
        saver.restore(sess,'./3_Data/saves/model.ckpt')
    else:
        sess.run(init)
    print('Model initialized')

<h3 style="background-color:#616161;color:white">2. Model Training Functions</h3>

In [8]:
def randomSelectFromData(_X, _Y,_batchRows = 10, numOfSamples =1):
    
    # Num of periods = batch size
    
    # Training cycle
    
    XCols=np.shape(_X)[1]
    YCols=np.shape(_Y)[1]
    depth = n_steps
    
    totalRows=np.shape(_X)[0]
    # Select random periods (ones where we will always get enough history to go with it)
    periodsList = random.sample(range(batchRows+depth, totalRows), _batchRows)
    
    # Debugging...
    #for i in range(_batchRows -1):
    #    periodsList[i+1]=periodsList[i]-1
    
    # Pre-Initialize batch arrays
    batch_x=np.zeros([_batchRows,depth,XCols])
    batch_y=np.zeros([_batchRows])

    batch_row =0
    
    for periodPos in periodsList:            
        # Log every so often 
        if (periodPos % 1) == 0: 
            timeNow =str(datetime.datetime.now())
            #print("{} Now adding random period {} into batch_row {}. ({}%)".format(timeNow,idx1,batch_row, round((batch_row/_batchRows)*100,2)))
            #logging.info("{} Now adding random period {} into batch_row ({}%)".format(timeNow, idx1,batch_row, round((batch_row/_batchRows)*100,2)))

        batch_x[batch_row] = _X[periodPos-depth:periodPos].reshape(1,depth,XCols)
        batch_y[batch_row] = _Y[periodPos]
        batch_row +=1
        
    return batch_x, batch_y
print ('Ok')

Ok


In [9]:
##############################################
###           MODEL 1    Train             ###
##############################################
def trainModel_1(batch_x, batch_y):        
    sess.run(optimizer, feed_dict={x: batch_x, y: batch_y})

    # Calculate loss & accuracy
    loss = sess.run(_cost,feed_dict={x: batch_x, y: batch_y})
    acc = sess.run(_accuracy, feed_dict={x: batch_x, y: batch_y})
    
    return loss, acc
print('Ok')

Ok


<h3 style="background-color:#616161;color:white">3. Model Testing Functions</h3>

In [10]:
# Launch the graph
def TestPredictions1(_X, _Y, sess, _batchRows = 10, testPeriods = None):
    # Training cycle
    totalRows=np.shape(_X)[0]
    XCols=np.shape(_X)[1]
    YCols=_Y
    depth = n_steps
    
    # If no test periods were provided generate your own
    if testPeriods is None:
        # Select periods where we will always get enough history to go with it
        
        testPeriods = random.sample(range(batchRows+depth, totalRows), _batchRows)
    else:
        testPeriods = testPeriods + batchRows+depth-1
        _batchRows = len(testPeriods)
     
    # Pre-Initialize batch arrays
    batch_x=np.zeros([_batchRows,depth,XCols])
    batch_y=np.zeros([_batchRows])

    batch_row =0
    for idx1 in testPeriods:            
        if (idx1 % 1) == 0: 
            timeNow =str(datetime.datetime.now())
            #print("{} Now testing on period {} ({}%)".format(timeNow,idx1,round((batch_row/_batchRows)*100,2)))
            logging.info("{} Now testing period {} ({}%)".format(timeNow, idx1,round((batch_row/_batchRows)*100,2)))

        batch_x[batch_row] = _X[idx1-depth:idx1].reshape(1,depth,XCols)
        batch_y[batch_row] = _Y[idx1]
        batch_row +=1
    
    print ("Processed {}".format(_batchRows))
    # Predict for this period
    prob = sess.run(_predProb, feed_dict={x: batch_x, y: batch_y})
    predictions = sess.run(_pred, feed_dict={x: batch_x, y: batch_y})
    
    return predictions, batch_y



<div style="background-color:#white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Test hidden periods</div>

In [11]:
def TestHiddenPeriods(hiddenTestPeriods=50, useTestData = False):

    print('{} Hidden Periods\n'.format(hiddenTestPeriods))
    print ("Cell type= {}, learning_rate = {}, Iterations = {}, batch size = {}, Steps = {}, Hidden Layers = {}, Classes = {}\n".format(cellType,learning_rate,samplesPerUser,batch_size, n_steps ,n_hidden,n_classes))

    if useTestData == False:
        predictions,labels = TestPredictions1(xTrain,yTrain,sess,numOfPeriods=hiddenTestPeriods)
    else:
        predictions,labels = TestPredictions1(xTest,yTest,sess,numOfPeriods=hiddenTestPeriods)
    
    print(np.shape(labels),np.shape(predictions))    
    print(metrics.classification_report(labels,predictions))  # Need to feed it yTest not yTest_OneHot here
    return labels, predictions

In [16]:
def TrainModel(printOnBatchIteration=True,printOnSampleIteration=1,enableLogging=False, _saveModel=False):
    
    counter =0 # Used to keep track of every iteration. (Does not loop back to 0).
    batch_y=[]
    batch_x=[]
    
    loss=np.zeros([user_iteration*samplesPerUser*batch_Iterations])
    acc=np.zeros([user_iteration*samplesPerUser*batch_Iterations])
    
    
    trainUsers=(cc.getUsers(dbPath,testUserFlag=0))  # Get list of users
    np.random.shuffle(trainUsers)

    for userCount in range(user_iteration):  # Iterate through user selection
        # Get user data
        userID = int(trainUsers[userCount]) # Randomly select 1 user
        xTrain, yTrain, xTest, yTest = cc.SelectUserData_TrainTest(dbPath,tblName,fieldList,userIDs=[userID],oneHot=False,periodGranularity=periodGranularity)

        if xTrain is not None: # Make sure we have data
            # Print message
            timeNow =datetime.datetime.now().strftime('%D %H:%M:%S')
            print('{} User {} UserID {} (Total plays {})'.format(timeNow, userCount,userID,sum(yTrain)))
            if enableLogging: logging.info('{} Now processing random user {}'.format(timeNow, userID))

            for i in range(samplesPerUser):  # Num of mini-batches
                
                # Randomly select from the train data
                batch_x, batch_y = randomSelectFromData(xTrain, yTrain, _batchRows=batchRows)
                
                for j in range(batch_Iterations): # Num of times to iterate over the batch
                    loss[counter],acc[counter] = trainModel_1(batch_x,batch_y)
                    
                    if printOnBatchIteration:
                        timeNow =datetime.datetime.now().strftime('%D %H:%M:%S')
                        print("  User {} Mini-batch {} Iteration {} Loss={:.6f}, Training Accuracy={:.5f}".format(userCount,i, j, loss[counter], acc[counter]))
                        if enableLogging: logging.info(s)
                    counter+=1

                
                if i % printOnSampleIteration == 0 or i==samplesPerUser:
                    # Sample results
                    predictions = sess.run(_pred, feed_dict={x: batch_x, y: batch_y})
                    logits= sess.run(_logits, feed_dict={x: batch_x, y: batch_y})
                    prob= sess.run(_predProb, feed_dict={x: batch_x, y: batch_y})                    
                    
                    prec,rec, _, _ = metrics.precision_recall_fscore_support(batch_y,predictions, average='binary')
                    timeNow =datetime.datetime.now().strftime('%D %H:%M:%S')
                    s='{} Sample {} of {} Precision {} Recall {}\n'.format(timeNow, i, samplesPerUser,np.round(prec,3),np.round(rec,3))
                    print(s)
                    if enableLogging: logging.info(s)
                    
            # End of user training - Perform test
            xTest, yTest = randomSelectFromData(xTest, yTest, _batchRows=batchRows, numOfSamples=2)
            predictions = sess.run(_pred, feed_dict={x: xTest, y: yTest})
            logits= sess.run(_logits, feed_dict={x: xTest, y: yTest})
            prob= sess.run(_predProb, feed_dict={x: xTest, y: yTest})                    

            prec,rec, _, _ = metrics.precision_recall_fscore_support(yTest,predictions, average='binary')
            
            timeNow =datetime.datetime.now().strftime('%D %H:%M:%S')
            s='{} UserID {} Test: Precision {} Recall {}\n'.format(timeNow, userID,np.round(prec,3),np.round(rec,3))
            print(s)
            if enableLogging: logging.info(s)
            print(metrics.classification_report(yTest,predictions))  # Need to feed it yTest not yTest_OneHot here
            
            if _saveModel:
                saver = tf.train.Saver()
                saver.save(sess,"./3_Data/saves/model.ckpt")

print('Training completed')

Training completed


In [13]:
### Code needs tidying up

def TestModel():
    print('Testing held out users')
    userLabels=[]
    userPred=[]
    totalLabels = []
    totalPred = []
    avLoss =0
    avAcc =0
    
    
    users=cc.getUsers(dbPath,testUserFlag = 1)  # Get all test users

    prec= np.zeros(len(users))
    rec = np.zeros(len(users))
    
    usrCount = 0
    for usr in users: # For each test user    
        # Select all data for each user
        xTest, yTest = cc.SelectTestUserData(dbPath,tblName,fieldList,userIDs=usr,oneHot=False,periodGranularity=periodGranularity)
        
        if xTest is not None:
            for i in range(10):  # Select 10 random batches
                batch_x, batch_y = randomSelectFromData(xTest, yTest, _batchRows=batchRows)

                p = sess.run(_pred, feed_dict={x: batch_x, y: batch_y})
                
                loss = sess.run(_cost,feed_dict={x: batch_x, y: batch_y})
                acc = sess.run(_accuracy, feed_dict={x: batch_x, y: batch_y})
                avLoss+=loss
                avAcc+=acc

                if userLabels == []:
                    userLabels= batch_y
                    userPred = p
                else:
                    userLabels = np.append(userLabels,batch_y)
                    userPred = np.append(userPred,p)
        
        prec[usrCount],rec[usrCount], _, _ = metrics.precision_recall_fscore_support(userLabels,userPred, average='binary')
        print('User {} of {} Av loss {} Av acc {}'.format(usrCount , len(users), np.round(avLoss/10,3), np.round(avAcc/10,3)))
        print('User {} of {} Precision {} Recall {}'.format(usrCount , len(users), np.round(prec[usrCount],3),np.round(rec[usrCount],3)))
        avLoss=0
        avAcc = 0
        if totalLabels ==[]:
            totalLabels = userLabels
            totalPred = userPred
        else:
            totalLabels = np.append(totalLabels,userLabels)
            totalPred = np.append(totalPred,userPred)
        usrCount+=1                

    print('Overall results:')
    print(metrics.classification_report(totalLabels,totalPred))  # Need to feed it yTest not yTest_OneHot here
    pMn=round(prec.mean(),3)
    pSd=round(prec.std(),3)
    rMn=round(rec.mean(),3)
    rSd=round(rec.std(),3)
    print ("Av. precision {} +/- {}, Av. recall {}+/{},".format(pMn,pSd,rMn,rSd))
    
    
    saver = tf.train.Saver()
    saver.save(sess,"./3_Data/saves/model.ckpt")
    print('Testing complete')


In [14]:
def SelectTestUsers(newUsers=10):
    newUsers = 10   # Num of randomly selected users to separate out
    con = sqlite3.connect(dbPath)

    # First reset back to 0
    con.execute("Update tblUsers Set TestUser = 0")
    con.commit()

    # Select random users
    sqlStr= "SELECT UserID FROM tblUsers Group by UserID ORDER BY RANDOM() LIMIT {}".format(newUsers)

    newUsersList = pd.read_sql_query(sqlStr, con)
    for row in newUsersList.itertuples():
        sqlStr = "Update tblUsers Set TestUser = 1 where UserID = {}".format(row[1])
        con.execute(sqlStr)
    con.commit()
    con.close()

    np.array(newUsersList).reshape(1,10)

<h3 style="background-color:green;color:white">4. ...And action!</h3>

In [18]:
###### Model setup
loadFromSave = False
user_iteration = 10      # How many iterations of user selection
samplesPerUser = 20      # How many times to randomly sample from each user
batchRows = 50           # How many periods to select sample (batch size)
batch_Iterations = 5    # How many iterations to perform on one batch

n_steps = 672         # How many time steps (i.e. depth) to have
learning_rate = 0.002
n_hidden = 250 # hidden layer num of features
n_layers = 4
class1Weighting = 4
n_classes = 2  # 2 for one-hot
cellType = "LSTMCell"  # Choose: TimeFreqLSTMCell BasicLSTMCell
#fieldList="UserID, t, HrsFrom5pm, isSun,isMon,isTue,isWed,isThu,isFri,isSat, t1,t2, t23_5hrs,t24hrs,t24_5hrs"
fieldList="UserID, t, t1"
featureLen = len(fieldList.split(","))-2 # -2 as we drop UserID and t
tblName='tblTimeSeriesData'

ResetModel()
initializeModel(n_steps,featureLen,class1Weighting,loadFromSave)
TrainModel(printOnBatchIteration=False,printOnSampleIteration=1)
TestModel()

print('Features len: {} User Iteration {} Samples per user: {} Batch iterations: {} Batch size {} N_steps {} Units {} Layers {} Weighting {}'.format(featureLen,user_iteration, samplesPerUser, batch_Iterations, batchRows, n_Steps, n_hidden, n_layers, class1Weighting))

Model initialized
Skipping user 8 as not enough periods (4862)
08/22/17 20:15:49 User 1 UserID 16 (Total plays [3886])
08/22/17 20:19:09 Sample 0 of 50 Precision 0.0 Recall 0.0

08/22/17 20:24:16 Sample 1 of 50 Precision 1.0 Recall 0.667

08/22/17 20:27:30 Sample 2 of 50 Precision 0.0 Recall 0.0

08/22/17 20:30:37 Sample 3 of 50 Precision 0.5 Recall 0.5

08/22/17 20:35:52 Sample 4 of 50 Precision 0.333 Recall 0.667

08/22/17 20:39:07 Sample 5 of 50 Precision 0.0 Recall 0.0

08/22/17 20:42:24 Sample 6 of 50 Precision 1.0 Recall 0.5

08/22/17 20:45:38 Sample 7 of 50 Precision 0.5 Recall 0.667

08/22/17 20:48:49 Sample 8 of 50 Precision 0.667 Recall 1.0

08/22/17 20:52:07 Sample 9 of 50 Precision 0.5 Recall 0.5

08/22/17 20:55:21 Sample 10 of 50 Precision 0.857 Recall 0.857

08/22/17 20:58:34 Sample 11 of 50 Precision 0.667 Recall 1.0

08/22/17 21:01:47 Sample 12 of 50 Precision 0.0 Recall 0.0

08/22/17 21:05:02 Sample 13 of 50 Precision 0.5 Recall 0.667

08/22/17 21:08:19 Sample 14 of 50

08/23/17 03:04:48 Sample 23 of 50 Precision 1.0 Recall 0.889

08/23/17 03:08:06 Sample 24 of 50 Precision 0.8 Recall 1.0

08/23/17 03:11:22 Sample 25 of 50 Precision 0.8 Recall 0.8

08/23/17 03:14:45 Sample 26 of 50 Precision 0.8 Recall 1.0

08/23/17 03:18:10 Sample 27 of 50 Precision 1.0 Recall 0.5

08/23/17 03:21:28 Sample 28 of 50 Precision 0.455 Recall 0.833

08/23/17 03:24:45 Sample 29 of 50 Precision 1.0 Recall 1.0

08/23/17 03:27:57 Sample 30 of 50 Precision 0.0 Recall 0.0

08/23/17 03:31:14 Sample 31 of 50 Precision 0.667 Recall 0.667

08/23/17 03:34:32 Sample 32 of 50 Precision 0.875 Recall 0.875

08/23/17 03:37:48 Sample 33 of 50 Precision 0.714 Recall 0.833

08/23/17 03:41:06 Sample 34 of 50 Precision 1.0 Recall 1.0

08/23/17 03:44:25 Sample 35 of 50 Precision 1.0 Recall 0.667

08/23/17 03:47:39 Sample 36 of 50 Precision 0.5 Recall 0.5

08/23/17 03:50:54 Sample 37 of 50 Precision 0.75 Recall 0.6

08/23/17 03:54:14 Sample 38 of 50 Precision 0.889 Recall 1.0

08/23/17 03:57:30

08/23/17 09:49:05 Sample 46 of 50 Precision 0.6 Recall 0.75

08/23/17 09:52:19 Sample 47 of 50 Precision 0.375 Recall 0.75

08/23/17 09:55:30 Sample 48 of 50 Precision 0.667 Recall 0.75

08/23/17 09:58:44 Sample 49 of 50 Precision 0.714 Recall 0.714

08/23/17 09:58:54 UserID 52 Test: Precision 0.444 Recall 0.571

             precision    recall  f1-score   support

        0.0       0.93      0.88      0.90        43
        1.0       0.44      0.57      0.50         7

avg / total       0.86      0.84      0.85        50

08/23/17 09:58:54 User 6 UserID 96 (Total plays [2336])
08/23/17 10:02:08 Sample 0 of 50 Precision 0.5 Recall 0.5

08/23/17 10:05:23 Sample 1 of 50 Precision 0.75 Recall 0.75

08/23/17 10:08:36 Sample 2 of 50 Precision 1.0 Recall 0.667

08/23/17 10:11:48 Sample 3 of 50 Precision 0.4 Recall 0.5

08/23/17 10:15:01 Sample 4 of 50 Precision 0.0 Recall 0.0

08/23/17 10:18:14 Sample 5 of 50 Precision 0.0 Recall 0.0

08/23/17 10:21:31 Sample 6 of 50 Precision 0.4 Recall 0.

08/23/17 16:16:34 Sample 13 of 50 Precision 0.333 Recall 0.25

08/23/17 16:19:36 Sample 14 of 50 Precision 1.0 Recall 0.8

08/23/17 16:22:52 Sample 15 of 50 Precision 0.333 Recall 1.0

08/23/17 16:25:56 Sample 16 of 50 Precision 0.625 Recall 0.714

08/23/17 16:28:57 Sample 17 of 50 Precision 0.5 Recall 1.0

08/23/17 16:32:07 Sample 18 of 50 Precision 0.667 Recall 0.857

08/23/17 16:35:07 Sample 19 of 50 Precision 0.333 Recall 0.667

08/23/17 16:38:07 Sample 20 of 50 Precision 0.5 Recall 0.6

08/23/17 16:41:12 Sample 21 of 50 Precision 0.5 Recall 0.5

08/23/17 16:44:11 Sample 22 of 50 Precision 0.4 Recall 0.667

08/23/17 16:47:13 Sample 23 of 50 Precision 1.0 Recall 0.75

08/23/17 16:50:19 Sample 24 of 50 Precision 0.75 Recall 0.75

08/23/17 16:53:29 Sample 25 of 50 Precision 0.636 Recall 0.875

08/23/17 16:56:34 Sample 26 of 50 Precision 0.4 Recall 0.5

08/23/17 16:59:40 Sample 27 of 50 Precision 0.5 Recall 0.333

08/23/17 17:02:38 Sample 28 of 50 Precision 0.8 Recall 0.571

08/23/17 1

08/23/17 23:28:46 Sample 38 of 50 Precision 0.0 Recall 0.0

08/23/17 23:31:59 Sample 39 of 50 Precision 0.0 Recall 0.0

08/23/17 23:35:15 Sample 40 of 50 Precision 0.0 Recall 0.0

08/23/17 23:38:29 Sample 41 of 50 Precision 0.0 Recall 0.0

08/23/17 23:41:42 Sample 42 of 50 Precision 0.0 Recall 0.0

08/23/17 23:44:52 Sample 43 of 50 Precision 0.0 Recall 0.0

08/23/17 23:48:10 Sample 44 of 50 Precision 0.0 Recall 0.0

08/23/17 23:51:11 Sample 45 of 50 Precision 0.0 Recall 0.0

08/23/17 23:55:48 Sample 46 of 50 Precision 0.0 Recall 0.0

08/23/17 23:59:38 Sample 47 of 50 Precision 0.0 Recall 0.0

08/24/17 00:02:49 Sample 48 of 50 Precision 0.0 Recall 0.0

08/24/17 00:06:05 Sample 49 of 50 Precision 0.0 Recall 0.0

08/24/17 00:06:17 UserID 32 Test: Precision 0.0 Recall 0.0

             precision    recall  f1-score   support

        0.0       0.96      1.00      0.98        48
        1.0       0.00      0.00      0.00         2

avg / total       0.92      0.96      0.94        50

08/24

KeyboardInterrupt: 

<h3 style="background-color:#616161;color:white">6. Other Methods</h3>

In [None]:
from sklearn import svm
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
from sklearn.linear_model import LogisticRegression
from datetime import datetime

#from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score

def getData(fieldList,tblName):
    con = sqlite3.connect(dbPath)
    c = con.cursor()
    # Get list of UserIDs
    _df = pd.read_sql_query("Select {} from {}".format(fieldList,tblName),con)
    _x = _df.drop(['t'], 1).values
    _y = _df['t'].values.astype(int)
    con.close()
    return _x, _y

def getSample(_x,_y, _sampleSize):
    idx = np.random.choice(np.arange(len(_x)), _sampleSize, replace=False)
    _xSample = x[idx]
    _ySample = y[idx]
    return _xSample,_ySample

In [None]:
sampleSize = 100000
fieldList="t, t1,t2, t3,t4,t5 t23_5hrs,t24hrs,t24_5hrs, HrsFrom5pm, isSun,isMon,isTue,isWed,isThu,isFri,isSat"
x,y = getData(fieldList,'tblTimeSeriesData')
x,y = getSample(x,y,sampleSize)

# Only keep first plays
y=(x[:,0]==0)*(x[:,1]==0)*(x[:,2]==0)*(x[:,3]==0)*y
# Run baseline Model
#Baseline2(x,y)

#LinearKernel2(x,y,False)
#LinearKernel2(x,y,True)

#LogisticModel2(x,y,False)
#LogisticModel2(x,y,True)

RBFKernel2(x,y,False)
RBFKernel2(x,y,True)


Av. precision 0.802 +/- 0.033, Av. recall 0.695+/0.025

In [None]:
def BaseLineModel(x,y):
    # Get predictions
    print('\n--------------------------------------------------------')
    print('1. Baseline model')
    print('--------------------------------------------------------')

    print(metrics.precision_score(y,x[:,8]))
    print(metrics.recall_score(y,x[:,8]))
    print(metrics.classification_report(y,x[:,8])) 

In [None]:
def LinearKernel(x,y):

    print('\n--------------------------------------------------------')
    print('3. Linear Kernel')
    print('--------------------------------------------------------')
    
    startTime = datetime.now()
    print('Start time {}'.format(startTime))

    clf = svm.SVC(kernel='linear', C=1, random_state=0)
    # Increase weight where t-1 is 0 and t is 1
    sampleWeights =  1+(y[:] == 1) * (x[:,1] ==0)
    scoring = ['precision_macro', 'recall_macro']
    scores = cross_validate(clf, x, y, scoring=scoring,
    cv=5, return_train_score=False, n_jobs=-1,fit_params={'sample_weight': sampleWeights})
    #cv=5, return_train_score=False, n_jobs=-1)
    

    pMn=round(scores['test_precision_macro'].mean(),3)
    pSd=round(scores['test_precision_macro'].std(),3)
    rMn=round(scores['test_recall_macro'].mean(),3)
    rSd=round(scores['test_recall_macro'].std(),3)
    print ("Av. precision {} +/- {}, Av. recall {}+/{},".format(pMn,pSd,rMn,rSd))

    timeElapsed=datetime.now()-startTime
    print('Time elpased (hh:mm:ss.ms) {}'.format(timeElapsed))

    #predicted = cross_val_predict(clf, x, y, cv=5, n_jobs=-1)
    #print(metrics.roc_auc_score(y,predicted))  
    #print(metrics.classification_report(y,predicted))  # Need to feed it yTest not yTest_OneHot here
    return scores

In [None]:
def RBFKernel(x,y):

    print('\n--------------------------------------------------------')
    print('3. SVM- RBF Kernel')
    print('--------------------------------------------------------')
    
    startTime = datetime.now()
    print('Start time {}'.format(startTime))

    clf = svm.SVC(kernel='rbf', C=1, random_state=0)
    # Increase weight where t-1 is 0 and t is 1
    sampleWeights =  1+(y[:] == 1) * (x[:,1] ==0)
    scoring = ['precision_macro', 'recall_macro']
    scores = cross_validate(clf, x, y, scoring=scoring,
    cv=5, return_train_score=False, n_jobs=-1,fit_params={'sample_weight': sampleWeights})
    #cv=5, return_train_score=False, n_jobs=-1)
    

    pMn=round(scores['test_precision_macro'].mean(),3)
    pSd=round(scores['test_precision_macro'].std(),3)
    rMn=round(scores['test_recall_macro'].mean(),3)
    rSd=round(scores['test_recall_macro'].std(),3)
    print ("Av. precision {} +/- {}, Av. recall {}+/{},".format(pMn,pSd,rMn,rSd))

    timeElapsed=datetime.now()-startTime
    print('Time elpased (hh:mm:ss.ms) {}'.format(timeElapsed))

    #predicted = cross_val_predict(clf, x, y, cv=5, n_jobs=-1)
    #print(metrics.roc_auc_score(y,predicted))  
    #print(metrics.classification_report(y,predicted))  # Need to feed it yTest not yTest_OneHot here
    return scores

In [None]:
def LogisticModel(x,y):

    print('\n--------------------------------------------------------')
    print('4. Logistic Model')
    print('--------------------------------------------------------')
    
    startTime = datetime.now()
    print('Start time {}'.format(startTime))

    clf = LogisticRegression(C=1,class_weight ='balanced')
    # Increase weight where t-1 is 0 and t is 1
    sampleWeights =  1+(y[:] == 1) * (x[:,1] ==0)
    scoring = ['precision_macro', 'recall_macro']
    scores = cross_validate(clf, x, y, scoring=scoring,cv=5, return_train_score=False, n_jobs=-1)
    
    pMn=round(scores['test_precision_macro'].mean(),3)
    pSd=round(scores['test_precision_macro'].std(),3)
    rMn=round(scores['test_recall_macro'].mean(),3)
    rSd=round(scores['test_recall_macro'].std(),3)
    print ("Av. precision {} +/- {}, Av. recall {}+/{},".format(pMn,pSd,rMn,rSd))

    timeElapsed=datetime.now()-startTime
    print('Time elpased (hh:mm:ss.ms) {}'.format(timeElapsed))

    return scores

In [None]:
def Baseline2(x,y):

    print('\n--------------------------------------------------------')
    print('3. Baseline')
    print('--------------------------------------------------------')
    
    startTime = datetime.now()
    print('Start time {}'.format(startTime))
    prec= np.zeros(5)
    rec = np.zeros(5)
    
    i=0
    
    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(x):
        xTrain, xTest = x[train_index], x[test_index]
        yTrain, yTest = y[train_index], y[test_index]
        
        pred = xTest[:,8]
        prec[i],rec[i], _, _ = metrics.precision_recall_fscore_support(yTest,pred, average='binary') 
        
        #print (metrics.classification_report(yTest,pred))
        i+=1
    
    pMn=round(prec.mean(),3)
    pSd=round(prec.std(),3)
    rMn=round(rec.mean(),3)
    rSd=round(rec.std(),3)
    print ("Av. precision {} +/- {}, Av. recall {}+/{},".format(pMn,pSd,rMn,rSd))

    timeElapsed=datetime.now()-startTime
    print('Time elpased (hh:mm:ss.ms) {}'.format(timeElapsed))

    return

In [None]:
def LinearKernel2(x,y,weighted):

    print('\n--------------------------------------------------------')
    print('3. Linear Kernel (weighted = {})'.format(weighted))
    print('--------------------------------------------------------')
    
    startTime = datetime.now()
    print('Start time {}'.format(startTime))

    clf = svm.SVC(kernel='linear', C=1, random_state=0)
    # Increase weight where t-1 is 0 and t is 1
    
    prec= np.zeros(5)
    rec = np.zeros(5)
    
    i=0
    
    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(x):
        xTrain, xTest = x[train_index], x[test_index]
        yTrain, yTest = y[train_index], y[test_index]
        
        clf.fit(xTrain,yTrain)
        pred = clf.predict(xTest)
        
        if weighted:
            sampleWeights =  1+(yTest[:] == 1) * (xTest[:,1] ==0)
            prec[i],rec[i], _, _ = metrics.precision_recall_fscore_support(yTest,pred, average='binary',sample_weight=sampleWeights) 
        else:
            prec[i],rec[i], _, _ = metrics.precision_recall_fscore_support(yTest,pred, average='binary') 
        
        #print (metrics.classification_report(yTest,pred))
        i+=1
    
    pMn=round(prec.mean(),3)
    pSd=round(prec.std(),3)
    rMn=round(rec.mean(),3)
    rSd=round(rec.std(),3)
    print ("Av. precision {} +/- {}, Av. recall {}+/{},".format(pMn,pSd,rMn,rSd))

    coeffs = np.reshape(np.round(clf.coef_,5),(-1,1))
    coeffs=np.concatenate((np.reshape(fieldList.split(',')[1:],(-1,1)),coeffs),axis=1)
    print(pd.DataFrame(coeffs,columns=['Field','Coeff']))
    
    timeElapsed=datetime.now()-startTime
    print('Time elpased (hh:mm:ss.ms) {}'.format(timeElapsed))

    return

In [None]:
def RBFKernel2(x,y,weighted):

    print('\n--------------------------------------------------------')
    print('4. RBF Kernel (weighted = {})'.format(weighted))
    print('--------------------------------------------------------')
    
    startTime = datetime.now()
    print('Start time {}'.format(startTime))

    clf = svm.SVC(kernel='rbf', C=1, random_state=0)
    # Increase weight where t-1 is 0 and t is 1
    
    prec= np.zeros(5)
    rec = np.zeros(5)
    
    i=0
    
    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(x):
        xTrain, xTest = x[train_index], x[test_index]
        yTrain, yTest = y[train_index], y[test_index]
        
        clf.fit(xTrain,yTrain)
        pred = clf.predict(xTest)
        
        if weighted:
            sampleWeights =  1+(yTest[:] == 1) * (xTest[:,1] ==0)
            prec[i],rec[i], _, _ = metrics.precision_recall_fscore_support(yTest,pred, average='binary',sample_weight=sampleWeights) 
        else:
            prec[i],rec[i], _, _ = metrics.precision_recall_fscore_support(yTest,pred, average='binary') 
        
        #print (metrics.classification_report(yTest,pred))
        i+=1
    
    pMn=round(prec.mean(),3)
    pSd=round(prec.std(),3)
    rMn=round(rec.mean(),3)
    rSd=round(rec.std(),3)
    print ("Av. precision {} +/- {}, Av. recall {}+/{},".format(pMn,pSd,rMn,rSd))
    
    timeElapsed=datetime.now()-startTime
    print('Time elpased (hh:mm:ss.ms) {}'.format(timeElapsed))

    return

In [None]:
def LogisticModel2(x,y,weighted):

    print('\n--------------------------------------------------------')
    print('4. Logistic Model (weighted = {})'.format(weighted))
    print('--------------------------------------------------------')
    
    startTime = datetime.now()
    print('Start time {}'.format(startTime))

    clf = LogisticRegression(C=1,class_weight ='balanced')
    # Increase weight where t-1 is 0 and t is 1
    sampleWeights =  1+(y[:] == 1) * (x[:,1] ==0)
    k=5
    prec= np.zeros(k)
    rec = np.zeros(k)
    i=0
    
    kf = KFold(n_splits=k)
    for train_index, test_index in kf.split(x):
        xTrain, xTest = x[train_index], x[test_index]
        yTrain, yTest = y[train_index], y[test_index]
        clf.fit(xTrain,yTrain)
        pred = clf.predict(xTest)
        
        if weighted:
            sampleWeights =  1+(yTest[:] == 1) * (xTest[:,1] ==0)
            prec[i],rec[i], _, _ = metrics.precision_recall_fscore_support(yTest,pred, average='binary',sample_weight=sampleWeights)
            #print (metrics.classification_report(yTest,pred))
        else:
            prec[i],rec[i], _, _ = metrics.precision_recall_fscore_support(yTest,pred, average='binary')
            #print (metrics.classification_report(yTest,pred))
        
        i+=1
    
    pMn=round(prec.mean(),3)
    pSd=round(prec.std(),3)
    rMn=round(rec.mean(),3)
    rSd=round(rec.std(),3)
    print ("Av. precision {} +/- {}, Av. recall {}+/{},".format(pMn,pSd,rMn,rSd))

    coeffs = np.reshape(np.round(clf.coef_,5),(-1,1))
    coeffs=np.concatenate((np.reshape(fieldList.split(',')[1:],(-1,1)),coeffs),axis=1)
    print(pd.DataFrame(coeffs,columns=['Field','Coeff']))
    
    timeElapsed=datetime.now()-startTime
    print('Time elpased (hh:mm:ss.ms) {}'.format(timeElapsed))

    return