<h1 align="center" style="background-color:#616161;color:white">Linear Regression with SVM</h1>

Adapted from: https://github.com/nfmcclure/tensorflow_cookbook/tree/master/04_Support_Vector_Machines/03_Reduction_to_Linear_Regression


<h3 style="background-color:#616161;color:white">0. Setup</h3>

<div style="background-color:white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Input Parameters</div>

In [1]:
PeriodGranularity = 30 # E.g. 15, 30, 60
# Train / Test split
newUsers = 10   # Num of randomly selected users to separate out of eval 2
rndPeriods = 3 # Num of random periods from each use to select
rndPeriodsLength = int(60/PeriodGranularity) * 24 * 7 * 4     # How long the random test period should cover

# Root path
#root = "C:/DS/Github/MusicRecommendation"  # BA, Windows
root = "/home/badrul/Documents/git/MusicRecommendation" # BA, Linux

<div style="background-color:white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Common Libraries</div>

In [2]:
# Core
import numpy as np
import pandas as pd
from IPython.core.debugger import Tracer    # Used for debugging
import logging

# File and database management
import csv
import os
import sys
import json
import sqlite3
from pathlib import Path

# Date/Time
import datetime
import time
#from datetime import timedelta # Deprecated

# Visualization
import matplotlib.pyplot as plt             # Quick
%matplotlib inline

# Misc
import random

#-------------- Custom Libs -----------------#
os.chdir(root)

# Import the codebase module
fPath = root + "/1_codemodule"
if fPath not in sys.path: sys.path.append(fPath)

# Custom Libs
import coreCode as cc
import lastfmCode as fm

<div style="background-color:white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Page Specific Libraries</div>

In [3]:
# Data science (comment out if not needed)
#from sklearn.manifold import TSNE
import tensorflow as tf
from tensorflow.python.framework import ops
ops.reset_default_graph()

<div style="background-color:#white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Declare Functions</div>

<div style="background-color:#white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Load settings</div>

In [4]:
settingsDict =  cc.loadSettings()
dbPath = root + settingsDict['mainDbPath']
fmSimilarDbPath = root + settingsDict['fmSimilarDbPath']
fmTagsDbPath = root + settingsDict['fmTagsDbPath']
trackMetaDbPath = root + settingsDict['trackmetadata']

Create a graph session

<h3 style="background-color:#616161;color:white">1. Load data</h3>

In [99]:
def getTrainAndTestData():
    con = sqlite3.connect(dbPath)
    c = con.cursor()

    # Get list of UserIDs 
    trainUsers = pd.read_sql_query("Select UserID from tblUsers Where tblUsers.TestUser = 0",con)

    fieldList="t, UserID, HrsFrom6pm, isSun,isMon,isTue,isWed,isThu,isFri,isSat,t1,t2,t3,t4,t5,t10,t12hrs,t24hrs,t1wk,t2wks,t3wks,t4wks"
    trainDf=pd.DataFrame(columns=[fieldList])  # Create an emmpty df
    testDf=pd.DataFrame(columns=[fieldList])  # Create an emmpty df
    periodsInAMonth=int(60/PeriodGranularity)*24*7*4

    totalRows=0
    
    for user in trainUsers.itertuples():
        # Get training dataset
        SqlStr="SELECT {} from tblTimeSeriesData where UserID = {}".format(fieldList,user.userID)
        df = pd.read_sql_query(SqlStr, con)
        totalRows += len(df)
    
        # Cut-off 1
        k = random.randint(periodsInAMonth, len(df))
        #Tracer()()  -- for debugging purposes
        testDf = testDf.append(df.iloc[k:k+periodsInAMonth])[df.columns.tolist()]

        tmp = df.drop(df.index[k:k+periodsInAMonth])

        # Cut-off 2
        k = random.randint(periodsInAMonth, len(tmp))
        testDf = testDf.append(tmp.iloc[k:k+periodsInAMonth])[df.columns.tolist()]
        trainDf = trainDf.append(tmp.drop(tmp.index[k:k+periodsInAMonth]))[df.columns.tolist()]

    if len(trainDf)+len(testDf) == totalRows:
        print('Ok')
    else:
        print("Incorrect. Total Rows = {}. TestDf+TrainDf rows = {}+{}={}".format(totalRows,len(testDf),len(trainDf),len(testDf)+len(trainDf)))
        
    return trainDf, testDf

trainDf,testDf = getTrainAndTestData()

x_vals = trainDf.drop(['t','UserID'], 1).values
y_vals = trainDf['t'].values.astype(int)


# Change the 0's to -1
y_vals = np.array([1 if y==1 else -1 for y in y_vals])
y_vals =y_vals.reshape(len(y_vals),1)



Ok


<div style="background-color:#white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Confirm dimensions</div>

In [100]:
numOfFeatures = np.shape(x_vals)[1]
np.shape(x_vals),np.shape(y_vals)

((937962, 20), (937962, 1))

In [101]:
# Test data
x_vals_test= testDf.drop(['t','UserID'], 1).values
y_vals_test = testDf['t'].values.astype(int)
y_vals_test = np.array([1 if y==1 else -1 for y in y_vals_test])
y_vals_test=y_vals_test.reshape(len(y_vals_test),1)
# Change the 0's to -1

np.shape(x_vals_test), np.shape( y_vals_test)

((54861, 20), (54861, 1))

<h3 style="background-color:#616161;color:white">2. Define Model</h3>

<div style="background-color:#white; color:#008000; font-family: 'Courier New, Monospace;font-weight: bold">Model Parameters</div>

We now declare our batch size, placeholders, and the fitted b-value for the SVM kernel.  Note that we will create a separate placeholder to feed in the prediction grid for plotting.

In [102]:
# SVM Regression
#----------------------------------
#
# This function shows how to use TensorFlow to
# solve support vector regression. We are going
# to find the line that has the maximum margin
# which INCLUDES as many points as possible
#
from tensorflow.python.framework import ops
ops.reset_default_graph()

# Create graph
sess = tf.Session()

# Declare batch size
batch_size = 50

# Initialize placeholders
x_data = tf.placeholder(shape=[None, numOfFeatures], dtype=tf.float32)
y_target = tf.placeholder(shape=[None, 1], dtype=tf.float32)

# Create variables for linear regression
A = tf.Variable(tf.random_normal(shape=[numOfFeatures,1]))  # Weight vector
b = tf.Variable(tf.random_normal(shape=[1,1]))              # Constant

# Declare model operations
model_output = tf.add(tf.matmul(x_data, A), b)
prediction = tf.sign(model_output)

# Declare loss function
# = max(0, abs(target - predicted) + epsilon)
# 1/2 margin width parameter = epsilon
epsilon = tf.constant([0.1])

# Margin term in loss - only anything a greater error than epsilon should count towards the loss: http://cs.adelaide.edu.au/~chhshen/teaching/ML_SVR.pdf
loss = tf.reduce_mean(tf.maximum(0., tf.subtract(tf.abs(tf.subtract(model_output, y_target)), epsilon)))

# Declare optimizer
my_opt = tf.train.GradientDescentOptimizer(0.075)
train_step = my_opt.minimize(loss)

# Initialize variables
init = tf.global_variables_initializer()
sess.run(init)

# Training loop
train_loss = []
test_loss = []

# Train
for i in range(500):
    # Select a batch of train data and train
    rand_index = np.random.choice(len(x_vals), size=batch_size)  
    rand_x = x_vals[rand_index]
    rand_y = y_vals[rand_index]
    sess.run(train_step, feed_dict={x_data: rand_x, y_target: rand_y})
    
    # Monitor the loss on the test data
    temp_train_loss = sess.run(loss, feed_dict={x_data: x_vals, y_target: y_vals})
    train_loss.append(temp_train_loss)
    
    temp_test_loss = sess.run(loss, feed_dict={x_data: x_vals_test, y_target: y_vals_test})
    test_loss.append(temp_test_loss)
    if (i+1)%50==0:
        print('-----------')
        print('Generation: ' + str(i+1))
        #print('A = ' + str(sess.run(A)) + ' b = ' + str(sess.run(b)))
        print('Train Loss = ' + str(temp_train_loss))
        print('Test Loss = ' + str(temp_test_loss))



-----------
Generation: 50
Train Loss = 1.07231
Test Loss = 1.0634
-----------
Generation: 100
Train Loss = 1.87533
Test Loss = 1.86903
-----------
Generation: 150
Train Loss = 2.95895
Test Loss = 2.95311
-----------
Generation: 200
Train Loss = 2.74361
Test Loss = 2.73823
-----------
Generation: 250
Train Loss = 1.4689
Test Loss = 1.46393
-----------
Generation: 300
Train Loss = 2.49283
Test Loss = 2.49298
-----------
Generation: 350
Train Loss = 2.54775
Test Loss = 2.54815
-----------
Generation: 400
Train Loss = 0.459842
Test Loss = 0.45371
-----------
Generation: 450
Train Loss = 2.55184
Test Loss = 2.55243
-----------
Generation: 500
Train Loss = 1.87979
Test Loss = 1.87604


In [103]:
# Evaluate
output=sess.run(model_output, feed_dict={x_data: x_vals})
test_predictions = sess.run(prediction, feed_dict={x_data: x_vals})

from sklearn import metrics
print(metrics.classification_report(y_vals,test_predictions))
print(metrics.confusion_matrix(y_vals,test_predictions))
print("* Precision = labelled as x / how many were actually x in the ones that were labelled")
print("* Recall = labelled as x / how many were actually x in the dataset")

             precision    recall  f1-score   support

         -1       0.92      0.99      0.96    856358
          1       0.71      0.13      0.22     81604

avg / total       0.90      0.92      0.89    937962

[[851850   4508]
 [ 70738  10866]]
* Precision = labelled as x / how many were actually x in the ones that were labelled
* Recall = labelled as x / how many were actually x in the dataset


In [104]:
print(test_predictions-y_vals)

[[ 0.]
 [ 0.]
 [ 0.]
 ..., 
 [-2.]
 [-2.]
 [ 0.]]
