Benchmarking on One Machine 
=========


**Typical User Experience**

Laptop Specs:
    
    Intel Core i7
    16gb RAM
    NVIDIA GeForce GTX 965M 2GB GDDR5 memory 
    Microsoft Windows 10
    Running Jupyter Notebooks and multiple programs in the background

In [50]:
HTML('''<script>
  function code_toggle() {
    if (code_shown){
      $('div.input').hide('500');
      $('#toggleButton').val('Show Code')
    } else {
      $('div.input').show('500');
      $('#toggleButton').val('Hide Code')
    }
    code_shown = !code_shown
  }

  $( document ).ready(function(){
    code_shown=false;
    $('div.input').hide()
  });
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>''')

**Import Packages**

In [10]:
from IPython.display import HTML
import warnings
warnings.simplefilter(action='ignore')

import pandas as pd
import numpy as np
import os

import time #cpu time
import psutil #memory usage
#tensorflow
import tensorflow as tf

#Scikitlearn
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_svmlight_file
from scipy.sparse import coo_matrix,csr_matrix,lil_matrix
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import roc_curve

# For Visualization
import matplotlib.pyplot as plt
#displays better in jupyter notebooks
%matplotlib inline

In [11]:
#Download data 
#!wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a.t

In [12]:
def get_data(file):
    data = load_svmlight_file(file)#avazu-app.tr.bz2
    return data[0], data[1]

In [13]:
mem_baseline=psutil.virtual_memory() #  physical memory usage
print('Here is the memory baseline before importing data:\n',mem_baseline)

Here is the memory baseline before importing data:
 svmem(total=17101512704, available=9936486400, percent=41.9, used=7165026304, free=9936486400)


In [14]:
X, y =get_data('a1a.t') #import raw data
mem_InData=psutil.virtual_memory()

In [15]:
print('Here is the memory usage after importing data:\n',mem_InData) #  physical memory usage
print('\nThe time took to import the raw data:')
exec_time1 = %%timeit -o X, y =get_data('a1a.t') #import raw data

Here is the memory usage after importing data:
 svmem(total=17101512704, available=9936084992, percent=41.9, used=7165427712, free=9936084992)

The time took to import the raw data:
515 ms ± 6.44 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
#%%timeit
#X, y =get_data('a1a.t.bz2') #import binary data

In [17]:
print('Here is the sparse matrix format for the data:') 
X

Here is the sparse matrix format for the data:


<30956x123 sparse matrix of type '<class 'numpy.float64'>'
	with 429343 stored elements in Compressed Sparse Row format>

In [18]:
#May not be appropriate for time dependant data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

Execute Logistic Regression in SciKit Learn
-------

In [47]:
# instantiate a logistic regression model, and fit with X and y
# Here are the 
model = linear_model.LogisticRegression(penalty='l2',\
                                        C=1.0,\
                                        tol=0.0001,\
                                        fit_intercept=True,\
                                        n_jobs=1,\
                                        max_iter=100)
start = time.time()
model = model.fit(X, y.ravel())
end = time.time()

In [48]:
exec_time2=(end - start)
print('The time taken to execute the logistic regression:',exec_time2,'seconds')

The time taken to execute the logistic regression: 1.0180144309997559 seconds


In [49]:
y_pred = model.predict(X_test)
y_obs = y_test
y_score = y_pred

**Model Metrics**

In [50]:
r2=metrics.r2_score(y_obs, y_pred)
accuracy=model.score(X_train, y_train)
prec=metrics.precision_score(y_obs, y_pred, labels=None, pos_label=1)
recall=metrics.recall_score(y_obs, y_pred, labels=None, pos_label=1)
f1 = metrics.f1_score(y_obs,y_pred)
ROC_AUC = metrics.roc_auc_score(y_obs, y_score)
print('The correlation coefficient:',r2,\
      '\nThe accuracy of the model:',accuracy,\
      '\nThe precision (tp / (tp + fp)):',prec,\
      '\nThe recall (tp / (tp + fn)):',recall,\
      '\nThe f1 score is:',f1,\
      '\nThe Area Under the Curve score is:',ROC_AUC)

The correlation coefficient: 0.1588200017358622 
The accuracy of the model: 0.8501928640308583 
The precision (tp / (tp + fp)): 0.7191295546558705 
The recall (tp / (tp + fn)): 0.5874328234807772 
The f1 score is: 0.6466439135381115 
The Area Under the Curve score is: 0.7581257999666293


Execute Logistic Regression in Tensorflow
-------------

In [19]:
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.75)

Reshape Scikit-learn CSR sparse matrix to Tensorflow sparse tensor matrix

In [20]:
def convert_sparse_matrix_to_sparse_tensor(X):
    coo = X.tocoo()
    indices = np.mat([coo.row, coo.col]).transpose()
    return tf.SparseTensor(indices, coo.data, coo.shape)

In [21]:
tf_train_matrix=convert_sparse_matrix_to_sparse_tensor(X_train)
print('SparseTensor shape:',tf_train_matrix.get_shape())

SparseTensor shape: (20740, 123)


In [64]:
def load(file, feature_num):
    feature_num = feature_num
    ins_num = 0
    f = open(file, "r")
    ins_feature_interval.append(0)
    for line in f.readlines():
        tokens = line.split(" ")
        y.append(float(tokens[0]))
        ins_feature_interval.append(ins_feature_interval[-1] + len(tokens) - 1)
        for feature in tokens[1:]:
            try:
                feature_id, feature_value = feature.split(":")
                feature_ids.append(int(feature_id))
                feature_values.append(float(feature_value))
            except:
                continue
        ins_num += 1

In [92]:
sparse_index = []
sparse_ids = []
sparse_values = []
sparse_shape = []

def slice(begin, end):
    max_feature_num = 0
    for i in range(begin, end):
        feature_num = ins_feature_interval[i + 1] - ins_feature_interval[i]
        if feature_num > max_feature_num:
            max_feature_num = feature_num
        for j in range(ins_feature_interval[i], ins_feature_interval[i + 1]):
            sparse_index.append([i - begin, j - ins_feature_interval[i]]) # index must be accent
            sparse_ids.append(feature_ids[j])
            sparse_values.append(feature_values[j])
    sparse_shape.append(end - begin)
    sparse_shape.append(max_feature_num)
    y = np.array(y[begin:end]).reshape((end - begin, 1))
    return (sparse_index, sparse_ids, sparse_values, sparse_shape, y)

iters=0
epoch_pass=0

def mini_batch(batch_size):
    begin = iters
    end = iters
    if iters + batch_size > ins_num:
        end = ins_num
        iters = 0
        epoch_pass += 1
    else:
        end += batch_size
        iters = end
    return slice(begin, end)

In [93]:
class BinaryLogisticRegression(object):
    def __init__(self, feature_num):
        self.feature_num = feature_num
        self.sparse_index = tf.placeholder(tf.int64)
        self.sparse_ids = tf.placeholder(tf.int64)
        self.sparse_values = tf.placeholder(tf.float32)
        self.sparse_shape = tf.placeholder(tf.int64)
        self.w = tf.Variable(tf.random_normal([self.feature_num, 1], stddev=0.1))
        self.y = tf.placeholder("float", [None, 1])
    def forward(self):
            return tf.nn.embedding_lookup_sparse(self.w,
                                                 tf.SparseTensor(self.sparse_index, self.sparse_ids, self.sparse_shape),
                                                 tf.SparseTensor(self.sparse_index, self.sparse_values, self.sparse_shape),
                                                 combiner="sum")

In [94]:
y = []
feature_ids = []
feature_values = []
ins_feature_interval = []

learning_rate =  0.001
max_iter = 100
batch_size = 100
feature_num = 123
dense = False

load('a1a.t', 123)

In [96]:
sparse_index = tf.placeholder(tf.int64)
sparse_ids = tf.placeholder(tf.int64)
sparse_values = tf.placeholder(tf.float32)
sparse_shape = tf.placeholder(tf.int64)

In [97]:
w = tf.Variable(tf.random_normal([feature_num, 1], stddev=0.1))

In [98]:
y = tf.placeholder("float", [None, 1])

In [99]:
model = BinaryLogisticRegression(123)

In [100]:
y = model.forward()

In [101]:
loss = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=model.y))

In [102]:
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

In [103]:
probability_output = tf.nn.sigmoid(y)

In [104]:
session = tf.Session()
init_all_variable = tf.global_variables_initializer()
init_local_variable = tf.initialize_local_variables()
session.run([init_all_variable, init_local_variable])

[None, None]

In [105]:
while epoch_pass < max_iter:
    sparse_index, sparse_ids, sparse_values, sparse_shape, mb_y = train_set.mini_batch(batch_size)

    _, loss_, prob_out = session.run([optimizer, loss, probability_output],\
                                     feed_dict={model.sparse_index: sparse_index,\
                                                model.sparse_ids: sparse_ids,\
                                                model.sparse_values: sparse_values,\
                                                model.sparse_shape: sparse_shape,\
                                                model.y: mb_y})
auc = roc_auc_score(mb_y, prob_out)
print("epoch: ", epoch_pass, " auc: ", auc)

NameError: name 'train_set' is not defined