Benchmarking on One Machine 
=========


**Typical User Experience**

Laptop Specs:
    
    Intel Core i7
    16gb RAM
    NVIDIA GeForce GTX 965M 2GB GDDR5 memory 
    Microsoft Windows 10
    Running Jupyter Notebooks and multiple programs in the background

In [1]:
from IPython.display import HTML

In [2]:
HTML('''<script>
  function code_toggle() {
    if (code_shown){
      $('div.input').hide('500');
      $('#toggleButton').val('Show Code')
    } else {
      $('div.input').show('500');
      $('#toggleButton').val('Hide Code')
    }
    code_shown = !code_shown
  }

  $( document ).ready(function(){
    code_shown=false;
    $('div.input').hide()
  });
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>''')

**Import Packages**

In [124]:
import warnings
warnings.simplefilter(action='ignore')

import pandas as pd
import numpy as np
import re
import os

import time #cpu time
import psutil #memory usage
#tensorflow
import tensorflow as tf

#Scikitlearn
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_svmlight_file
from scipy.sparse import coo_matrix,csr_matrix,lil_matrix
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# For Visualization
import matplotlib.pyplot as plt
#displays better in jupyter notebooks
%matplotlib inline

In [125]:
#Download data 
#!wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a.t

***Class and Functions***

In [126]:
#Data importing function
def get_data(file):
    data = load_svmlight_file(file)#avazu-app.tr.bz2
    return data[0], data[1]

**For Importing Raw Individual File**

(single file for raw sparse dataset)

In [127]:
mem_baseline=psutil.virtual_memory() #  physical memory usage
print('Here is the memory baseline prior to importing data:\n',mem_baseline)

Here is the memory baseline prior to importing data:
 svmem(total=17101512704, available=10098614272, percent=40.9, used=7002898432, free=10098614272)


In [144]:
file_location=os.getcwd()+'\\tensorflow-models\\data\\libsvm_data\\a1a.t'

In [145]:
X, y =get_data(file_location) #import raw data
mem_InData=psutil.virtual_memory()

In [146]:
print('Here is the memory usage after importing data:\n',mem_InData) #  physical memory usage
print('\nThe time took to import the raw data:')
exec_time1 = %%timeit -o X, y =get_data(file_location) #import raw data

Here is the memory usage after importing data:
 svmem(total=17101512704, available=10531102720, percent=38.4, used=6570409984, free=10531102720)

The time took to import the raw data:
511 ms ± 70.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [147]:
print('Here is the sparse matrix format for the data:')
X

Here is the sparse matrix format for the data:


<30956x123 sparse matrix of type '<class 'numpy.float64'>'
	with 429343 stored elements in Compressed Sparse Row format>

In [148]:
#May not be appropriate for time dependant data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

** For Importing Previously Seperated Training and Testing File**


In [154]:
train_file_location=os.getcwd()+'\\tensorflow-models\\data\\libsvm_data\\a1a'
test_file_location=os.getcwd()+'\\tensorflow-models\\data\\libsvm_data\\a1a.t'

In [155]:
mem_baseline=psutil.virtual_memory() #  physical memory usage
print('Here is the memory baseline prior to importing data:\n\n',mem_baseline)

Here is the memory baseline prior to importing data:

 svmem(total=17101512704, available=10182381568, percent=40.5, used=6919131136, free=10182381568)


In [156]:
X_train, y_train=get_data(train_file_location)
X_test, y_test=get_data(test_file_location)

In [157]:
mem_baseline=psutil.virtual_memory() #  physical memory usage
print('Here is the memory usage afer importing the data:\n\n',mem_baseline)

Here is the memory usage afer importing the data:

 svmem(total=17101512704, available=10163965952, percent=40.6, used=6937546752, free=10163965952)


Execute Logistic Regression in SciKit Learn
-------

The parameters for the logistic regression (Gradient Descent) are:
    
    Regularization: L2
    
    Regularization Threshold (C): 1.0
    
    Tolerance: 0.001
    
    Fit Intercpt: Yes (True)
    
    Processors (n_jobs): 1
    
    Max Number of Iterations: 100

In [158]:
# Instantiate a logistic regression model, and fit with X and y

model = linear_model.LogisticRegression(penalty='l2',\
                                        C=1.0,\
                                        tol=0.001,\
                                        fit_intercept=True,\
                                        n_jobs=1,\
                                        max_iter=100)

model = model.fit(X_train, y_train.ravel())#

In [159]:
exec_time2 = %%timeit -o model.fit(X_train, y_train.ravel())
print('\nThe average time taken to execute the logistic regression:',exec_time2,'seconds')

9.5 ms ± 66.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

The average time taken to execute the logistic regression: 9.5 ms ± 66.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) seconds


In [162]:
#Setup variables for evaluation metrics
y_pred = model.predict(X_test)
y_obs = y_test
y_score = y_pred

ValueError: X has 123 features per sample; expecting 119

**Model Metrics**

In [153]:
r2=metrics.r2_score(y_obs, y_pred)
accuracy=model.score(X_train, y_train)
prec=metrics.precision_score(y_obs, y_pred, labels=None, pos_label=1)
recall=metrics.recall_score(y_obs, y_pred, labels=None, pos_label=1)
f1 = metrics.f1_score(y_obs,y_pred)
ROC_AUC = metrics.roc_auc_score(y_obs, y_score)
print('The correlation coefficient:',r2,\
      '\nThe accuracy of the model:',accuracy,\
      '\nThe precision (tp / (tp + fp)):',prec,\
      '\nThe recall (tp / (tp + fn)):',recall,\
      '\nThe f1 score is:',f1,\
      '\nThe Area Under the Curve score is:',ROC_AUC)

The correlation coefficient: 0.15665340805070027 
The accuracy of the model: 0.850771456123433 
The precision (tp / (tp + fp)): 0.7170191339375629 
The recall (tp / (tp + fn)): 0.5886730053741215 
The f1 score is: 0.646538024971623 
The Area Under the Curve score is: 0.7582970003143532


Execute Logistic Regression in Tensorflow
-------------

Here the GPU is used for the task but restricted to using only 75% of GPU resources in order to prevent crashing.

In [265]:
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.75)

In [298]:
learning_rate =  0.001
max_iter = 100
batch_size = 30

train_file = os.getcwd()+'\\tensorflow-models\\data\\libsvm_data\\a2a'
test_file = os.getcwd()+'\\tensorflow-models\\data\\libsvm_data\\a2a.t'

***Class and Functions***

In [299]:
class DataSet(object):
    def __init__(self):
        self.iter = 0
        self.epoch_pass = 0

    def load(self, file):
        '''
        '''
        self.ins_num = 0 #<set at zero
        f = open(file, "r")
        self.y = []
        self.feature_ids = []
        self.feature_values = []
        self.ins_feature_interval = []
        self.max_ins_feature_interval = []
        self.ins_feature_interval.append(0)#makes zero the starting value in ins_feature_interval
        self.max_token=[]
        for line in f.readlines():#iterating through open file
            regexp = re.compile(r':')#<---If feature has a colon then do
            tokens = line.split(" ")#split lines in the file
            #tokens.remove('\n')
            #print(tokens[0])
            self.y.append(float(tokens[0]))#append to y the first value in tokens (+1,-1,1)
            try:
                tokens[-1] = tokens[-1].strip()#<----remove '\n'
                tokens.remove('') #<---remove '' empty in list
            except:
                pass
            
            #                         last value in list is that value + (line splits -1)<--maybe adjusting for return (\n) or y value 
            #print(self.ins_feature_interval[-1]+ len(tokens)-1)#<----stacks the batch sizes
            self.ins_feature_interval.append(self.ins_feature_interval[-1]+ len(tokens)-1)
            #print(len(self.ins_feature_interval))
            for feature in tokens:#(len(tokens)~16
              #  print(feature)
                if regexp.search(feature):#if there is a colon in feature
                    self.max_token.append(feature)#check on size
                    feature_id, feature_value = feature.split(":") #split on colon
                    if feature_id:
                        self.feature_ids.append(int(feature_id))#append to feature ids
                        self.feature_values.append(float(feature_value)) #append feature values
            self.ins_num += 1 #set ins_num to 1
        self.feature_num=max(self.feature_ids)#modify feature_num to max of ids (maximum # of features)
        #self.max_ins_feature_interval=max(self.ins_feature_interval)
        print('the max number of features:',self.feature_num)
    

    def mini_batch(self, batch_size):
        begin = self.iter #begins as 0 as defined above
        end = self.iter #starts with 0 as defined above
        if self.iter + batch_size > self.ins_num: #if 0 + batchsize(10) > ins_num(1) defined in def load
            end = self.ins_num #set end to be ins_num(1) 
            self.iter = 0 #set iter to 0
            self.epoch_pass += 1 #add +1 to epoch_pass 
        else:
            end += batch_size#add batch size to end, which should be equal to batch size
            #print('end:',end)
            #print('batch_size:',batch_size)
            self.iter = end#set self.iter to batch size
            #(begin, end)setting bounds moving across batch length in data
            #print((begin, end))#slicing action of data (0, 15) (15, 30) (30, 45) (60,...
        return self.slice(begin, end)
#Error
    def slice(self, begin, end):
            sparse_index = []
            sparse_ids = []
            sparse_values = []
            sparse_shape = []
            max_feature_num = 0
            for i in range(begin, end):#within range begin, end
            #              15,461,906,1351                  0,446,891,1338 =~15 supposed to be length of token
            #          (token length + range number +1) - (token length + range number)
                feature_num = self.ins_feature_interval[i + 1] - self.ins_feature_interval[i]
                if feature_num > max_feature_num:
                    max_feature_num = feature_num
                                    #       0,446,891,1338                  15,461,906,1351
                #(token length + range number) ,       (token length + range number +1)
                #self.max_ins_feature_interval-len(self.feature_ids)<-------------------
                #print(self.ins_feature_interval[i],self.ins_feature_interval[i + 1])
                #print(self.feature_ids[i])
                for j in range(self.ins_feature_interval[i], self.ins_feature_interval[i + 1]):
                #print(j,(len(self.feature_ids)))
                #print(self.ins_feature_interval[i + 1])
                #print(range(self.ins_feature_interval[i], self.ins_feature_interval[i + 1]))
                #15 vals:                  0  , 0-14,446-460,891-905
                    sparse_index.append([i - begin, j - self.ins_feature_interval[i]]) # index must be accent
                    #[0, 0]-[0, 14][0, 0]-[0, 12]
                    #print([i - begin, j - self.ins_feature_interval[i]])
                    sparse_ids.append(self.feature_ids[j])
                    sparse_values.append(self.feature_values[j])
            sparse_shape.append(end - begin)
            #print(end - begin)#<-----30
            sparse_shape.append(max_feature_num)
            #print(max_feature_num)#<-----15
            #       Creates array shape of 30,1 of y values  (30, 1)
            y = np.array(self.y[begin:end]).reshape((end - begin, 1))
            #begin:0,30,60,90,120,150,180,210 intervals of 30
            #end: 30,60,90,120,150,180,210,240
            #            0            0                       0                 30            30
            #print(len(sparse_index), len(sparse_ids), len(sparse_values), len(sparse_shape), len(y))
            return (sparse_index, sparse_ids, sparse_values, sparse_shape, y)

In [300]:
class BinaryLogisticRegression(object):
    def __init__(self, feature_num):
        self.feature_num = feature_num
        self.sparse_index = tf.placeholder(tf.int64)
        self.sparse_ids = tf.placeholder(tf.int64)
        self.sparse_values = tf.placeholder(tf.float32)
        self.sparse_shape = tf.placeholder(tf.int64)
        self.w = tf.Variable(tf.random_normal([self.feature_num, 1], stddev=0.1))
        self.y = tf.placeholder("float", [None, 1])

    def forward(self):
        return tf.nn.embedding_lookup_sparse(self.w,
                                             tf.SparseTensor(self.sparse_index, self.sparse_ids, self.sparse_shape),
                                             tf.SparseTensor(self.sparse_index, self.sparse_values, self.sparse_shape),
                                             combiner="sum")

In [301]:
mem_baseline=psutil.virtual_memory() #  physical memory usage
print('Here is the memory baseline prior to importing data:\n\n',mem_baseline)

Here is the memory baseline prior to importing data:

 svmem(total=17101512704, available=10327105536, percent=39.6, used=6774407168, free=10327105536)


In [302]:
train_set = DataSet()
train_set.load(train_file)
test_set = DataSet()
test_set.load(test_file)
feature_num=test_set.feature_num

the max number of features: 119
the max number of features: 123


In [303]:
mem_baseline=psutil.virtual_memory() #  physical memory usage
print('Here is the memory usage after importing data:\n',mem_baseline)

Here is the memory usage after importing data:
 svmem(total=17101512704, available=10324746240, percent=39.6, used=6776766464, free=10324746240)


In [304]:
model = BinaryLogisticRegression(feature_num)

In [305]:
y = model.forward()

In [306]:
loss = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=model.y))

In [307]:
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

In [308]:
probability_output = tf.nn.sigmoid(y)

In [309]:
session = tf.Session()
init_all_variable = tf.global_variables_initializer()
init_local_variable = tf.local_variables_initializer()
session.run([init_all_variable, init_local_variable])

[None, None]

In [310]:
num_passes=5 #number of passes in for loop

In [317]:
start = time.time()
end_list=[]
for i in range(0,num_passes):
    while train_set.epoch_pass < max_iter:
        sparse_index, sparse_ids, sparse_values, sparse_shape, mb_y = train_set.mini_batch(batch_size)
        _, loss_, prob_out = session.run([optimizer, loss, probability_output],
                                         feed_dict={model.sparse_index: sparse_index,
                                                    model.sparse_ids: sparse_ids,
                                                    model.sparse_values: sparse_values,
                                                    model.sparse_shape: sparse_shape,
                                                    model.y: mb_y})
    end = time.time()
    exec_time=(end - start)
    end_list.append(exec_time) 
    
    try:
        auc = roc_auc_score(mb_y, prob_out)
        print("epoch: ", train_set.epoch_pass, " ROC AUC score is: ", auc)

    except:
        print('\nValueError: Only one class present in y_true. ROC AUC score is not defined in that case.\n')
        print(mb_y.T)
        print(prob_out.T,'\n')

print('\nThe average time taken to execute logistic regression for '+str(num_passes)+' passes',np.array(end_list).mean(),'seconds with a standard deviation of +- '+str(np.array(end_list).std()))

epoch:  100  ROC AUC score is:  0.5
epoch:  100  ROC AUC score is:  0.5
epoch:  100  ROC AUC score is:  0.5
epoch:  100  ROC AUC score is:  0.5
epoch:  100  ROC AUC score is:  0.5

The average time taken to execute logistic regression for 5 passes 0.004765605926513672 seconds with a standard deviation of +- 0.0035704615412940513
