Benchmarking on One Machine 
=========


**Typical User Experience**

Laptop Specs:
    
    Intel Core i7
    16gb RAM
    NVIDIA GeForce GTX 965M 2GB GDDR5 memory 
    Microsoft Windows 10
    Running Jupyter Notebooks and multiple programs in the background

In [1]:
from IPython.display import HTML

In [2]:
HTML('''<script>
  function code_toggle() {
    if (code_shown){
      $('div.input').hide('500');
      $('#toggleButton').val('Show Code')
    } else {
      $('div.input').show('500');
      $('#toggleButton').val('Hide Code')
    }
    code_shown = !code_shown
  }

  $( document ).ready(function(){
    code_shown=false;
    $('div.input').hide()
  });
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>''')

**Import Packages**

In [49]:
import warnings
warnings.simplefilter(action='ignore')

import pandas as pd
import numpy as np
import re
import os

import time #cpu time
import psutil #memory usage
#tensorflow
import tensorflow as tf

#Scikitlearn
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_svmlight_file
from scipy.sparse import coo_matrix,csr_matrix,lil_matrix
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# For Visualization
import matplotlib.pyplot as plt
#displays better in jupyter notebooks
%matplotlib inline

In [4]:
#Download data 
#!wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a.t

In [5]:
def get_data(file):
    data = load_svmlight_file(file)#avazu-app.tr.bz2
    return data[0], data[1]

In [6]:
mem_baseline=psutil.virtual_memory() #  physical memory usage
print('Here is the memory baseline before importing data:\n',mem_baseline)

Here is the memory baseline before importing data:
 svmem(total=17101512704, available=9795440640, percent=42.7, used=7306072064, free=9795440640)


In [7]:
X, y =get_data('a1a.t') #import raw data
mem_InData=psutil.virtual_memory()

In [8]:
print('Here is the memory usage after importing data:\n',mem_InData) #  physical memory usage
print('\nThe time took to import the raw data:')
exec_time1 = %%timeit -o X, y =get_data('a1a.t') #import raw data

Here is the memory usage after importing data:
 svmem(total=17101512704, available=9743892480, percent=43.0, used=7357620224, free=9743892480)

The time took to import the raw data:
550 ms ± 107 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
#%%timeit
#X, y =get_data('a1a.t.bz2') #import binary data

In [10]:
print('Here is the sparse matrix format for the data:') 
X

Here is the sparse matrix format for the data:


<30956x123 sparse matrix of type '<class 'numpy.float64'>'
	with 429343 stored elements in Compressed Sparse Row format>

In [11]:
#May not be appropriate for time dependant data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

Execute Logistic Regression in SciKit Learn
-------

In [12]:
# instantiate a logistic regression model, and fit with X and y
# Here are the 
model = linear_model.LogisticRegression(penalty='l2',\
                                        C=1.0,\
                                        tol=0.0001,\
                                        fit_intercept=True,\
                                        n_jobs=1,\
                                        max_iter=100)
start = time.time()
model = model.fit(X, y.ravel())
end = time.time()

In [13]:
exec_time2=(end - start)
print('The time taken to execute the logistic regression:',exec_time2,'seconds')

The time taken to execute the logistic regression: 1.3263437747955322 seconds


In [14]:
y_pred = model.predict(X_test)
y_obs = y_test
y_score = y_pred

**Model Metrics**

In [15]:
r2=metrics.r2_score(y_obs, y_pred)
accuracy=model.score(X_train, y_train)
prec=metrics.precision_score(y_obs, y_pred, labels=None, pos_label=1)
recall=metrics.recall_score(y_obs, y_pred, labels=None, pos_label=1)
f1 = metrics.f1_score(y_obs,y_pred)
ROC_AUC = metrics.roc_auc_score(y_obs, y_score)
print('The correlation coefficient:',r2,\
      '\nThe accuracy of the model:',accuracy,\
      '\nThe precision (tp / (tp + fp)):',prec,\
      '\nThe recall (tp / (tp + fn)):',recall,\
      '\nThe f1 score is:',f1,\
      '\nThe Area Under the Curve score is:',ROC_AUC)

The correlation coefficient: 0.1588200017358622 
The accuracy of the model: 0.8501928640308583 
The precision (tp / (tp + fp)): 0.7191295546558705 
The recall (tp / (tp + fn)): 0.5874328234807772 
The f1 score is: 0.6466439135381115 
The Area Under the Curve score is: 0.7581257999666293


Execute Logistic Regression in Tensorflow
-------------

Here the GPU is used for the task but restricted to using only 75% of GPU resources in order to prevent crashing.

In [833]:
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.75)

In [834]:
learning_rate =  0.01
max_iter = 10
batch_size = 30

train_file = os.getcwd()+'\\tensorflow-models\\data\\libsvm_data\\a1a'
test_file = os.getcwd()+'\\tensorflow-models\\data\\libsvm_data\\a1a.t'

In [848]:
class DataSet(object):
    feature_num=1
    
    def __init__(self):
        self.iter = 0
        self.epoch_pass = 0
        self.new_num = DataSet.feature_num    
        
    def update_num(new_num):
        while DataSet.feature_num<new_num:
            DataSet.feature_num=new_num
            return DataSet.feature_num
    
    def load(self, file):
        self.ins_num = 0
        f = open(file, "r")
        self.y = []
        self.feature_ids = []
        self.feature_values = []
        self.ins_feature_interval = []
        self.ins_feature_interval.append(0)

        for line in f.readlines():
            tokens = line.split(" ")
            self.y.append(float(tokens[0]))
            self.ins_feature_interval.append(self.ins_feature_interval[-1] + len(tokens) - 1)
            #for feature in tokens[1:]:
            regexp = re.compile(r':')
            for feature in tokens:
                if regexp.search(feature):
                    feature_id, feature_value = feature.split(":")#feature number should be max of the list, match to feature
                    if int(feature_id)>=0:
                        self.feature_ids.append(int(feature_id))
                        self.feature_values.append(float(feature_value)) 
                        #print(feature_id)
            self.ins_num += 1 
        return DataSet.update_num(max(self.feature_ids))

    def mini_batch(self, batch_size):
        begin = self.iter
        end = self.iter
        if self.iter + batch_size > self.ins_num:
            end = self.ins_num
            self.iter = 0
            self.epoch_pass += 1
        else:
            end += batch_size
            self.iter = end
            #print(begin,end)
            #print(self.slice(begin, end))
        return self.slice(begin, end)

    def slice(self, begin, end):
        sparse_index = []
        sparse_ids = []
        sparse_values = []
        sparse_shape = []
        max_feature_num = 0
        
        for i in range(begin, end):
            #print([begin,end])
            DataSet.feature_num = self.ins_feature_interval[i + 1] - self.ins_feature_interval[i]
            #print(self.ins_feature_interval[i + 1] - self.ins_feature_interval[i])
            if DataSet.feature_num > max_feature_num:
                max_feature_num = DataSet.feature_num
                #print(max_feature_num)
            for j in range(self.ins_feature_interval[i], self.ins_feature_interval[i + 1]): 
                #print(range(self.ins_feature_interval[i], self.ins_feature_interval[i + 1]))
                sparse_index.append([i - begin, j - self.ins_feature_interval[i]]) # index must be accent
                #print([i - begin, j - self.ins_feature_interval[i]])
                sparse_ids.append((self.feature_ids[j]))#<----
                #print(self.feature_ids[j])
                sparse_values.append(self.feature_values[j])
                #print(self.feature_values[j])
                
            sparse_shape.append(end - begin)
            #print(sparse_shape)
            sparse_shape.append(max_feature_num)
            #print(sparse_shape)
            y = np.array(self.y[begin:end]).reshape((end - begin, 1))
            #print(len(sparse_index), len(sparse_ids), len(sparse_values), len(sparse_shape), len(y))
            #print(sparse_index)
            #print(sparse_ids)
            #print(sparse_values)
            #print(sparse_shape)
            #print(y)
            return (sparse_index, sparse_ids, sparse_values, sparse_shape, y)

In [849]:
#train_set.mini_batch(batch_size)

In [850]:
DataSet.feature_num

1

In [851]:
class BinaryLogisticRegression(object):
    def __init__(self, feature_num):
        self.feature_num = feature_num
        self.sparse_index = tf.placeholder(tf.int64)
        self.sparse_ids = tf.placeholder(tf.int64)
        self.sparse_values = tf.placeholder(tf.float32)
        self.sparse_shape = tf.placeholder(tf.int64)
        self.w = tf.Variable(tf.random_normal([self.feature_num, 1], stddev=0.1))
        self.y = tf.placeholder("float", [None, 1])

    def forward(self):
        return tf.nn.embedding_lookup_sparse(self.w,
                                             tf.SparseTensor(self.sparse_index, self.sparse_ids, self.sparse_shape),
                                             tf.SparseTensor(self.sparse_index, self.sparse_values, self.sparse_shape),
                                             combiner="sum")

In [852]:
train_set = DataSet()
train_set.load(train_file)

119

In [853]:
test_set = DataSet()
test_set.load(test_file)

123

In [854]:
model = BinaryLogisticRegression(DataSet.feature_num)

In [855]:
y = model.forward()

In [856]:
loss = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=model.y))

In [857]:
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

In [858]:
probability_output = tf.nn.sigmoid(y)

In [859]:
session = tf.Session()
init_all_variable = tf.global_variables_initializer()
init_local_variable = tf.local_variables_initializer()
session.run([init_all_variable, init_local_variable])

[None, None]

In [860]:
while train_set.epoch_pass < max_iter:
    sparse_index, sparse_ids, sparse_values, sparse_shape, mb_y = train_set.mini_batch(batch_size)

    _, loss_, prob_out = session.run([optimizer, loss, probability_output],
                                     feed_dict={model.sparse_index: sparse_index,
                                                model.sparse_ids: sparse_ids,
                                                model.sparse_values: sparse_values,
                                                model.sparse_shape: sparse_shape,
                                                model.y: mb_y})

auc = roc_auc_score(mb_y, prob_out)
print("epoch: ", train_set.epoch_pass, " auc: ", auc)

IndexError: list index out of range

#for loop to create our own timeit