In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import itertools
import matplotlib.pyplot as plt
#import helpers

In [2]:
def load_data(path_dataset,sub_sample=True, add_outlier=False):
    """Load data and convert it to the metric system."""
    data = np.genfromtxt(
        path_dataset, delimiter=",", dtype=str,  skip_header=1)
    labels = data[:,1]
    labels[labels=='s']=0
    labels[labels=='b']=1
    labels = np.asarray(labels, dtype=float)
    data = np.delete(data, 1, 1)
    data = np.asarray(data, dtype=float)
    return data, labels

Prediction of :
- type s is assigned value 0
- type b is assigned value 1

In [3]:
# Load data
train_data, labels = load_data('train.csv')

In [4]:
# remove the columns containing a -999 value
valid_cols = np.all(train_data!=-999, axis=0)
train_data = train_data[:,valid_cols]

In [5]:
# Standardize the data
def standardize(x):
     """Standardize the original data set."""
     mean_x = np.mean(x, axis=0)
     x = x - mean_x
     std_x = np.std(x, axis=0)
     x = x / std_x
     return x

new_data = standardize(train_data)

In [6]:
print(new_data)

[[-1.73204388  0.06833197  0.40768027 ...  0.38684673  1.04440205
   0.4125105 ]
 [-1.73203002  0.55250482  0.54013641 ... -0.35771893  0.02130497
  -0.27381996]
 [-1.73201617  3.19515553  1.09655998 ...  0.40013535  0.02130497
  -0.29396985]
 ...
 [ 1.73201617  0.31931645 -0.13086367 ... -0.08608887  0.02130497
  -0.31701723]
 [ 1.73203002 -0.84532397 -0.30297338 ... -0.76742886 -1.00179211
  -0.74543941]
 [ 1.73204388  0.66533608 -0.25352276 ... -0.87267059 -1.00179211
  -0.74543941]]


## Logistic regression

- data (np.array): Dataset of shape (N, D) 
- labels (np.array): Labels of shape (N, ) 
- w (np.array): Weights of logistic regression model of shape (D, ) 

In [8]:
def sigmoid(t):
    """apply sigmoid function on t."""
    return 1.0 / (1 + np.exp(-t))

Recall that the cross entropy loss is defined as:
$$
R(w\ = -\sum_i (y_i \log(\hat{y}(\xb_i)) + (1-y_i)\log(1-\hat{y}(\xb_i))) $$

Let's code it using NumPy. If you do it correctly, it can be written in one line!

In [9]:
def calculate_loss(y, tx, w):
    """compute the cost by negative log likelihood."""
    pred = sigmoid(tx.dot(w))
    loss = y.T.dot(np.log(pred)) + (1 - y).T.dot(np.log(1 - pred))
    return np.squeeze(- loss)

In [10]:
def calculate_gradient(y, tx, w):
    """compute the gradient of loss."""
    pred = sigmoid(tx.dot(w))
    grad = tx.T.dot(pred - y)
    return grad

In [11]:
def learning_by_gradient_descent(y, tx, w, gamma):
    """
    Do one step of gradient descent using logistic regression.
    Return the loss and the updated w.
    """
    loss = calculate_loss(y, tx, w)
    grad = calculate_gradient(y, tx, w)
    w -= gamma * grad
    return loss, w

In [12]:
def logistic_regression(y, tx, initial_w, max_iters, gamma):
    """ Training function for binary class logistic regression. 
    
    Args:
        y (np.array): Labels of shape (N, ).
        tx (np.array): Dataset of shape (N, D).
        initial_w (np.array): Initial weights of shape (D,)
        max_iters (integer): Maximum number of iterations.
        gamma (integer): Step size
    Returns:
        np.array: weights of shape(D, )
    """  
    
    threshold = 1e-8
    losses = []
    
    w = initial_w.copy()
    print(w.shape)
    print(tx.shape)
    print(y.shape)
    for it in range(max_iters):
        loss, w = learning_by_gradient_descent(y, tx, w, gamma)
        # log info
        if it % 100 == 0:
            print("Current iteration={i}, loss={l}".format(i=it, l=loss))
        # converge criterion
        losses.append(loss)
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break
    print("loss={l}".format(l=calculate_loss(y, tx, w)))
    return (w, loss)

In [14]:
# Initialize weights
tx = np.c_[np.ones((labels.shape[0], 1)), new_data]
initial_w = np.ones((tx.shape[1]))
trained_weights, train_loss = logistic_regression(labels, tx, initial_w, max_iters=10000, gamma=0.01)

(21,)
(250000, 21)
(250000,)
Current iteration=0, loss=nan


  loss = y.T.dot(np.log(pred)) + (1 - y).T.dot(np.log(1 - pred))
  return 1.0 / (1 + np.exp(-t))


Current iteration=100, loss=nan
Current iteration=200, loss=nan
Current iteration=300, loss=nan
Current iteration=400, loss=nan
Current iteration=500, loss=nan
Current iteration=600, loss=nan
Current iteration=700, loss=nan


KeyboardInterrupt: 