<h1 align="center" style="background-color:#616161;color:white">Linear SVM (Support Vector Regression) Example in TensorFlow</h1>

Adapted from: https://github.com/nfmcclure/tensorflow_cookbook/tree/master/04_Support_Vector_Machines/03_Reduction_to_Linear_Regression


<h3 style="background-color:#616161;color:white">0. Setup</h3>

<div style="background-color:white; color:#008000; font-family: 'Courier New', Monospace; font-weight: bold">Input Parameters</div>

In [34]:
# Granularity of one time period, e.g. 15, 30 or 60
PeriodGranularity = 30

# --- Train / Test split ---
# Number of randomly selected users to separate out of eval 2
newUsers = 10
# Number of random periods to select from each user
rndPeriods = 3
# How long the random test period should cover, expressed in periods:
# (periods per 60) * 24 * 7 * 4
rndPeriodsLength = (60 // PeriodGranularity) * 24 * 7 * 4

# --- Root path ---
#root = "C:/DS/Github/MusicRecommendation"  # BA, Windows
root = "/home/badrul/Documents/git/MusicRecommendation" # BA, Linux

<div style="background-color:white; color:#008000; font-family: 'Courier New', Monospace; font-weight: bold">Common Libraries</div>

In [35]:
# Core
import numpy as np
import pandas as pd
from IPython.core.debugger import Tracer    # Used for debugging
import logging

# File and database management
import csv
import os
import sys
import json
import sqlite3
from pathlib import Path

# Date/Time
import datetime
import time
#from datetime import timedelta # Deprecated

# Visualization
import matplotlib.pyplot as plt             # Quick
%matplotlib inline

# Misc
import random

#-------------- Custom Libs -----------------#
# Make the project root the working directory
os.chdir(root)

# Put the codebase module folder on the import path (added only once)
fPath = f"{root}/1_codemodule"
if fPath not in sys.path:
    sys.path.append(fPath)

# Custom Libs
import coreCode as cc
import lastfmCode as fm

<div style="background-color:white; color:#008000; font-family: 'Courier New', Monospace; font-weight: bold">Page Specific Libraries</div>

In [36]:
# Data science (comment out if not needed)
#from sklearn.manifold import TSNE
import tensorflow as tf
from tensorflow.python.framework import ops
# Reset TensorFlow's default graph so that re-running the notebook
# does not keep adding nodes to a graph left over from an earlier run
ops.reset_default_graph()

<div style="background-color:white; color:#008000; font-family: 'Courier New', Monospace; font-weight: bold">Declare Functions</div>

<div style="background-color:white; color:#008000; font-family: 'Courier New', Monospace; font-weight: bold">Load settings</div>

In [37]:
# Load the project settings, then prefix each configured path with the project root
settingsDict = cc.loadSettings()
dbPath, fmSimilarDbPath, fmTagsDbPath, trackMetaDbPath = (
    root + settingsDict[key]
    for key in ('mainDbPath', 'fmSimilarDbPath', 'fmTagsDbPath', 'trackmetadata')
)

Create a graph session

<h3 style="background-color:#616161;color:white">1. Load data</h3>

In [52]:
def getTrainAndTestData():
    """Split the time series of every non-test user into train and test sets.

    For each user in tblUsers with TestUser = 0, the user's rows are read from
    tblTimeSeriesData. Two randomly positioned windows of at most
    `periodsInAMonth` consecutive rows are moved to the test set; whatever is
    left becomes training data. A window that starts close to the end of the
    user's data is truncated and may even be empty.

    Reads the module-level names `dbPath` (SQLite file) and
    `PeriodGranularity`. Prints 'Ok' when every loaded row ended up in exactly
    one of the two sets, otherwise prints the mismatching counts.

    Returns:
        tuple: (trainDf, testDf), both with the columns listed in `fieldList`.

    Raises:
        ValueError: from `random.randint` when a user has fewer rows than
            `periodsInAMonth`.
    """
    fieldList="t, UserID, HrsFrom6pm, isSun,isMon,isTue,isWed,isThu,isFri,isSat,t1,t2,t3,t4,t5,t10,t12hrs,t24hrs,t1wk,t2wks,t3wks,t4wks"
    # Individual column names, used for the empty-result fallback below
    columnNames = [name.strip() for name in fieldList.split(",")]
    periodsInAMonth=int(60/PeriodGranularity)*24*7*4

    # Collect the per-user pieces and concatenate once at the end; growing a
    # DataFrame inside the loop is quadratic and DataFrame.append no longer
    # exists in pandas 2.x
    trainParts = []
    testParts = []
    totalRows=0

    con = sqlite3.connect(dbPath)
    try:
        # Get list of UserIDs
        trainUsers = pd.read_sql_query("Select UserID from tblUsers Where tblUsers.TestUser = 0",con)

        # The query returns a single column; read it by position so the loop
        # does not depend on the exact case of the column name SQLite reports
        for userId in trainUsers.iloc[:, 0]:
            # Get training dataset
            SqlStr="SELECT {} from tblTimeSeriesData where UserID = {}".format(fieldList,userId)
            df = pd.read_sql_query(SqlStr, con)
            totalRows += len(df)

            # Cut-off 1: the window always starts after the first month of data
            k = random.randint(periodsInAMonth, len(df))
            testParts.append(df.iloc[k:k+periodsInAMonth])
            tmp = df.drop(df.index[k:k+periodsInAMonth])

            # Cut-off 2: taken from what is left after the first window
            k = random.randint(periodsInAMonth, len(tmp))
            testParts.append(tmp.iloc[k:k+periodsInAMonth])
            trainParts.append(tmp.drop(tmp.index[k:k+periodsInAMonth]))
    finally:
        # Release the database handle even if a query or the sampling fails
        con.close()

    def _combine(parts):
        # Empty windows carry no rows; skip them so they cannot affect dtypes
        parts = [part for part in parts if len(part)]
        return pd.concat(parts) if parts else pd.DataFrame(columns=columnNames)

    trainDf = _combine(trainParts)
    testDf = _combine(testParts)

    if len(trainDf)+len(testDf) == totalRows:
        print('Ok')
    else:
        print("Incorrect. Total Rows = {}. TestDf+TrainDf rows = {}+{}={}".format(totalRows,len(testDf),len(trainDf),len(testDf)+len(trainDf)))

    return trainDf, testDf

trainDf, testDf = getTrainAndTestData()

# Features: every column except the label 't' and the user identifier.
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
x_vals = trainDf.drop(columns=['t', 'UserID']).values
y_vals = trainDf['t'].values.astype(int)
# Change the 0's to -1 (anything that is not 1 becomes -1), vectorised
y_vals = np.where(y_vals == 1, 1, -1)

Ok


<div style="background-color:white; color:#008000; font-family: 'Courier New', Monospace; font-weight: bold">Confirm dimensions</div>

In [55]:
# Number of feature columns = second dimension of the training matrix
numOfFeatures = x_vals.shape[1]
x_vals.shape, y_vals.shape

((937054, 20), (937054,))

In [99]:
# Test data
# Features: every column except the label 't' and the user identifier.
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
x_vals_test = testDf.drop(columns=['t', 'UserID']).values
y_vals_test = testDf['t'].values.astype(int)
# Recode the labels to {-1, +1}; the result stays one-dimensional, shape (n,)
y_vals_test = np.where(y_vals_test == 1, 1, -1)
np.shape(x_vals_test), np.shape(y_vals_test)

((55769, 20), (55769,))

<h3 style="background-color:#616161;color:white">2. Define Model</h3>

<div style="background-color:white; color:#008000; font-family: 'Courier New', Monospace; font-weight: bold">Model Parameters</div>

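We now declare the batch size, the placeholders for the features and the target, and the model variables: a weight vector `A` and a bias `b` for the linear model `x·A + b`. The model is trained with the epsilon-insensitive loss of support vector regression, and the loss on the full training and test sets is recorded after every step.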

In [119]:
# SVM Regression
#----------------------------------
#
# This cell shows how to use TensorFlow to
# solve support vector regression. We are going
# to find the line that has the maximum margin
# which INCLUDES as many points as possible
#
from tensorflow.python.framework import ops
ops.reset_default_graph()

# Create graph
sess = tf.Session()

# Declare batch size
batch_size = 50

# Initialize placeholders
x_data = tf.placeholder(shape=[None, numOfFeatures], dtype=tf.float32)
y_target = tf.placeholder(shape=[None, 1], dtype=tf.float32)

# Create variables for linear regression
A = tf.Variable(tf.random_normal(shape=[numOfFeatures,1]))  # Weight vector
b = tf.Variable(tf.random_normal(shape=[1,1]))              # Constant

# Declare model operations
model_output = tf.add(tf.matmul(x_data, A), b)
# NOTE(review): the class is the sign of the output relative to the mean of
# the batch being evaluated, so a sample's prediction depends on the other
# samples fed with it. Confirm this is intended rather than tf.sign(model_output).
prediction = tf.sign(model_output-tf.reduce_mean(model_output))

# Declare loss function
# = max(0, abs(target - predicted) - epsilon)
# 1/2 margin width parameter = epsilon
epsilon = tf.constant([0.5])

# Margin term in loss
loss = tf.reduce_mean(tf.maximum(0., tf.subtract(tf.abs(tf.subtract(model_output, y_target)), epsilon)))
# Declare optimizer
my_opt = tf.train.GradientDescentOptimizer(0.075)
train_step = my_opt.minimize(loss)

# Initialize variables
init = tf.global_variables_initializer()
sess.run(init)

# Loss history, one entry per training step
train_loss = []
test_loss = []

# The evaluation feeds are the same on every step, so build them once
# instead of re-creating the column vectors inside the loop
train_feed = {x_data: x_vals, y_target: y_vals.reshape(-1, 1)}
test_feed = {x_data: x_vals_test, y_target: y_vals_test.reshape(-1, 1)}

# Train
for i in range(500):
    # Select a batch of train data (sampled with replacement) and train
    rand_index = np.random.choice(len(x_vals), size=batch_size)
    rand_x = x_vals[rand_index]
    rand_y = y_vals[rand_index].reshape(-1, 1)
    sess.run(train_step, feed_dict={x_data: rand_x, y_target: rand_y})

    # Monitor the loss on the full train and test data
    temp_train_loss = sess.run(loss, feed_dict=train_feed)
    train_loss.append(temp_train_loss)

    temp_test_loss = sess.run(loss, feed_dict=test_feed)
    test_loss.append(temp_test_loss)
    if (i+1)%50==0:
        print('-----------')
        print('Generation: ' + str(i+1))
        #print('A = ' + str(sess.run(A)) + ' b = ' + str(sess.run(b)))
        print('Train Loss = ' + str(temp_train_loss))
        print('Test Loss = ' + str(temp_test_loss))



-----------
Generation: 50
Train Loss = 2.10086
Test Loss = 2.09793
-----------
Generation: 100
Train Loss = 1.77656
Test Loss = 1.7721
-----------
Generation: 150
Train Loss = 0.922745
Test Loss = 0.917819
-----------
Generation: 200
Train Loss = 2.05771
Test Loss = 2.05174
-----------
Generation: 250
Train Loss = 1.50157
Test Loss = 1.49339
-----------
Generation: 300
Train Loss = 2.21936
Test Loss = 2.21246
-----------
Generation: 350
Train Loss = 1.37115
Test Loss = 1.36452
-----------
Generation: 400
Train Loss = 0.515412
Test Loss = 0.509333
-----------
Generation: 450
Train Loss = 1.49067
Test Loss = 1.48444
-----------
Generation: 500
Train Loss = 1.16954
Test Loss = 1.16232


In [122]:
# Evaluate: run the trained graph on the held-out features
from sklearn import metrics

test_predictions = sess.run(prediction, feed_dict={x_data: x_vals_test})

# Per-class report followed by the confusion matrix
print(
    metrics.classification_report(y_vals_test, test_predictions),
    metrics.confusion_matrix(y_vals_test, test_predictions),
    sep="\n",
)
# Reminder of how to read the two headline metrics
print(
    "* Precision = labelled as x / how many were actually x in the ones that were labelled",
    "* Recall = labelled as x / how many were actually x in the dataset",
    sep="\n",
)

             precision    recall  f1-score   support

         -1       0.95      0.49      0.64     50520
          1       0.13      0.74      0.22      5249

avg / total       0.87      0.51      0.60     55769

[[24565 25955]
 [ 1366  3883]]
* Precision = labelled as x / how many were actually x in the ones that were labelled
* Recall = labelled as x / how many were actually x in the dataset
