# CSE847 Homework 4 - Part 1

In [1]:
# LIBRARIES
import numpy as np

In [2]:
# -- FROM HW DESCRIPTION
# %
# % code to train a logistic regression classifier
# %
# % INPUTS:
# % data = n * (d+1) matrix withn samples and d features, where
# %    column d+1 is all ones (corresponding to the intercept term)
# % labels = n * 1 vector of class labels (taking values 0 or 1)
# % epsilon = optional argument specifying the convergence
# %    criterion - if the change in the absolute difference in
# %    predictions, from one iteration to the next, averaged across
# %    input features, is less than epsilon, then halt
# %    (if unspecified, use a default value of 1e-5)
# % maxiter = optional argument that specifies the maximum number of
# %    iterations to execute (useful when debugging in case your
# %    code is not converging correctly!)
# %    (if unspecified can be set to 1000)
# %
# % OUTPUT:
# % weights = (d+1) * 1 vector of weights where the weights correspond to
# %    the columns of "data"
# %

def logistic_train(data, labels, epsilon=1e-5, maxiter=1000):
    ones = np.ones(data.shape[0])[...,None]
    X = np.append(ones, data, 1)  # add the intitial column of ones
    
    w = np.zeros(X.shape[1])  # initialize weights to zero
    eta = 1
    
    for i in range(maxiter):
        p_old = logistic_predict(data,w)
        gradient = getGradient(X,labels,w)   # compute the gradient
        w = w + eta*(-1)*gradient            # update weights
        p_new = logistic_predict(data,w)
        if (np.absolute(p_old-p_new).mean()<epsilon): break
    print(" Training Complete:",i,"iterations")
    return(w)

def logistic_predict(X,w,returnProb=False):
    ones = np.ones(X.shape[0])[...,None]
    X = np.append(ones, X, 1)  # add the intitial column of ones
    
    s = np.matmul(X,w)
    p = sigmoid(s) # probability that y = +1 
    if returnProb == False: # map probabilities to predictions
        # arr[arr > 255] = x
        p[p>=0.5] = 1
        p[p<0.5] = -1
        
    return(p)

def getGradient(X,y,w):
    n = y.shape[0]
    s = np.matmul(X,w)
    s = np.multiply(-y,s)
    s = sigmoid(s)
    g = np.multiply(-y,s)
    g = np.matmul(g,X)
    g = (1/n)*g
    return(g)

def sigmoid(s):
    e = np.exp(-s)
    return(1/(1+e))

def getAccuracy(actual, pred):
    accuracy = (actual == pred).sum() / pred.shape[0]
    return(accuracy)

In [3]:
# TEST WITH BASIC DATA
X = np.array([[1, 2], [3, 4], [2, 1], [4,3]])
y = np.array([1,1,-1,-1])
w = np.array([1,1])
ones = np.ones(X.shape[0])[...,None]
np.append(ones, X, 1)
w = logistic_train(X,y,maxiter=1000)
print('w:',w)
logistic_predict(X,w)

 Training Complete: 1 iterations
w: [ 0.         -0.46891175  0.46891175]


array([ 1.,  1., -1., -1.])

In [4]:
# GET SPAM DATASET
X_file = 'data/spam/data.txt'
y_file = 'data/spam/labels.txt'

X_spam = np.loadtxt(X_file)
y_spam = np.loadtxt(y_file)
y_spam[y_spam==0] = -1        # adjust data labels to be [-1,1], not [0,1]
X_spam.shape

(4601, 57)

In [5]:
# TEST WITH DIFFERENT SIZE TRAINING DATASETS
split_options = [200, 500, 800, 1000, 1500, 2000]
X = X_spam
y = y_spam


for split in split_options:
    print('----- N = '+str(split)+' ------------------------------------------')
    # split the data
    X_train = X[split:,:]
    y_train = y[split:]
    X_test = X[:split,:]
    y_test = y[:split]
    
    print(' Train Dataset Size: '+str(y_test.shape[0]))
    
    w = logistic_train(X_train,y_train)
    pred_train = logistic_predict(X_train,w)
    acc_train = getAccuracy(y_train, pred_train)
    print(' Train Accuracy: '+str(acc_train))
          
    pred_test = logistic_predict(X_test,w)
    acc_test = getAccuracy(y_test, pred_test)
    print(' Test Accuracy: '+str(acc_test))

----- N = 200 ------------------------------------------
 Train Dataset Size: 200
 Training Complete: 20 iterations
 Train Accuracy: 0.924789820495342
 Test Accuracy: 0.915
----- N = 500 ------------------------------------------
 Train Dataset Size: 500
 Training Complete: 14 iterations
 Train Accuracy: 0.9229456230187759
 Test Accuracy: 0.908
----- N = 800 ------------------------------------------
 Train Dataset Size: 800
 Training Complete: 14 iterations
 Train Accuracy: 0.9234411996842936
 Test Accuracy: 0.9225
----- N = 1000 ------------------------------------------
 Train Dataset Size: 1000
 Training Complete: 23 iterations
 Train Accuracy: 0.9255762288253263
 Test Accuracy: 0.927
----- N = 1500 ------------------------------------------
 Train Dataset Size: 1500
 Training Complete: 19 iterations
 Train Accuracy: 0.927765237020316
 Test Accuracy: 0.9193333333333333
----- N = 2000 ------------------------------------------
 Train Dataset Size: 2000
 Training Complete: 9 iteratio