# Lab 7: Logistic Regression
In this lab, I implement a basic logistic regression classifier trained with gradient-based updates. The training and testing error rates are reported below.

In [1]:
import numpy as np

# Step size (learning rate): solve_w scales each weight update by this value (10^-5).
ALPHA = np.power(10.0, -5)

def read_data(filename):
    """Read a comma-separated text file into a list of string rows.

    Whitespace-only lines are skipped. Newline characters are removed from
    each remaining line, which is then split on ``','``; no numeric
    conversion is performed here.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of rows, each row being a list of field strings.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    # The context manager guarantees the handle is closed even if reading
    # fails part-way; the previous version leaked the open file object.
    with open(filename, 'r') as f:
        return [
            line.replace('\n', '').split(',')
            for line in f
            if not line.isspace()
        ]

def normalize(arr):
    """Standardize every column of ``arr`` to zero mean and unit variance.

    The mean and (population) standard deviation are computed per column
    (``axis=0``) from ``arr`` itself.

    Args:
        arr: Array whose columns are features and whose rows are samples.

    Returns:
        An array of the same shape holding ``(arr - mean) / std`` per column.
        A column with zero standard deviation (constant feature) is mapped
        to all zeros instead of NaN.
    """
    arr = np.asarray(arr)
    mean = np.mean(arr, axis=0)
    std = np.std(arr, axis=0)
    # A constant column has std == 0; dividing would yield 0/0 = NaN and
    # contaminate every later dot product. Dividing by 1 leaves the centred
    # (all-zero) column untouched.
    safe_std = np.where(std == 0, 1.0, std)
    # Broadcasting applies the per-column statistics to every row at once.
    return (arr - mean) / safe_std

def accuracy(act, pred):
    """Return the misclassification rate of thresholded predictions.

    Despite its name, this function returns the *error* rate, i.e.
    ``1 - (fraction correct)``. A prediction counts as correct when the
    boolean ``pred >= 0.5`` compares equal to the corresponding label.

    Args:
        act: Sequence of true labels.
        pred: Sequence of predicted scores, same length as ``act``.

    Returns:
        The fraction of entries whose thresholded prediction does not match
        the label.
    """
    assert len(pred) == len(act)
    # Count the positions where the 0.5-thresholded score agrees with the label.
    hits = sum(
        1
        for score, label in zip(pred, act)
        if (score >= 0.5) == label
    )
    return 1 - (hits / len(act))

def sigmoid(x):
    """Evaluate the logistic function ``1 / (1 + exp(-x))`` element-wise.

    Uses a numerically stable formulation: only ``exp(-|x|)`` is ever
    computed, so large-magnitude inputs cannot overflow (the direct formula
    overflows in ``exp(-x)`` for very negative ``x``).

    Args:
        x: Scalar or array of real values.

    Returns:
        The logistic function of ``x``; a NumPy scalar for scalar input,
        otherwise an array with the same shape as ``x``.
    """
    x = np.asarray(x)
    # exp(-|x|) lies in (0, 1], so it can underflow to 0 but never overflow.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- algebraically equal.
    out = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a scalar and is a no-op
    # for arrays with one or more dimensions.
    return out[()]

def solve_w(w, X, y):
    """Perform a single weight update and return the new weight vector.

    The update adds ``ALPHA * X^T (y - sigmoid(X w))`` to ``w``. For labels
    in {0, 1} this is one gradient step on the logistic log-likelihood
    (assumes ``y`` holds 0/1 labels -- confirm against the caller).

    Args:
        w: Current weight vector.
        X: Design matrix (rows are samples).
        y: Target vector aligned with the rows of ``X``.

    Returns:
        The updated weight vector; ``w`` itself is not modified.
    """
    residual = y - sigmoid(X.dot(w))
    step_direction = X.T.dot(residual)
    return w + (ALPHA * step_direction)

def optimize_w(w_init, X, y, tol=0.00001):
    """Repeat ``solve_w`` updates until the weights stop moving.

    Iteration continues while the Euclidean norm of the most recent weight
    change exceeds ``tol``. The change is seeded with 1, so no update is
    performed at all when ``tol >= 1``.

    Args:
        w_init: Starting weight vector.
        X: Design matrix passed through to ``solve_w``.
        y: Target vector passed through to ``solve_w``.
        tol: Threshold on the norm of the per-iteration weight change.

    Returns:
        The weight vector after the final update.
    """
    current = w_init
    step_norm = 1
    while step_norm > tol:
        updated = solve_w(current, X, y)
        step_norm = np.linalg.norm(updated - current)
        current = updated
    return current

## Problem 1: Logistic regression

In [2]:
def problem1(train, test):
    """Fit logistic regression on ``train`` and report train/test error.

    The last column of each input array is used as the label and the
    remaining columns as features. Features are standardized with the
    *training* mean and standard deviation, a constant bias column is
    appended, weights are fitted with ``optimize_w`` from a random normal
    start, and the error rates are printed as percentages.

    Args:
        train: 2-D float array of training rows (features..., label).
        test: 2-D float array of testing rows with the same column layout.

    Returns:
        The predicted probabilities (sigmoid of the linear score) for the
        test rows.
    """
    # split features from the label stored in the last column
    X_train, y_train = train[:, :-1], train[:, -1]
    X_test, y_test = test[:, :-1], test[:, -1]

    # Standardization statistics come from the training set only. The test
    # set must be mapped with the same transform, otherwise the fitted
    # weights are applied to features living in a different space.
    mean = np.mean(X_train, axis=0)
    std = np.std(X_train, axis=0)
    # guard constant features against 0/0 = NaN
    std = np.where(std == 0, 1.0, std)

    def prepare(features):
        # standardize, then append a column of ones acting as the bias term
        scaled = (features - mean) / std
        return np.hstack((scaled, np.ones((len(scaled), 1))))

    X_train = prepare(X_train)
    X_test = prepare(X_test)

    # fit the weights starting from a random initial guess
    w_init = np.random.normal(size=X_train.shape[1])
    w = optimize_w(w_init, X_train, y_train, tol=np.power(10.0, -5))

    # prediction: accuracy() thresholds at 0.5, which is only meaningful for
    # probabilities, so the linear scores are passed through the sigmoid.
    pred_train = sigmoid(X_train.dot(w))
    pred_test = sigmoid(X_test.dot(w))

    # evaluate prediction (accuracy() returns the error rate)
    err = (accuracy(y_train, pred_train), accuracy(y_test, pred_test))
    print('>====== Problem 1 ======<')
    print('Training error: {0:.2f}%\nTesting error: {1:.2f}%\n'.format(err[0]*100,err[1]*100))
    return pred_test

In [3]:
def main():
    """Load the spambase train/test files and run Problem 1 on them."""

    def load(path):
        # parse the comma-separated file and convert every field to float
        return np.array(read_data(path)).astype('float')

    # import training data, then testing data
    train = load('spambase/spam-train')
    test = load('spambase/spam-test')

    # run predictions
    problem1(train, test)


main()

FileNotFoundError: [Errno 2] No such file or directory: 'spambase/spam-train'