In [None]:
# Useful starting lines
%matplotlib inline
import numpy as np
import itertools
import matplotlib.pyplot as plt
#import helpers

In [2]:
def load_data(path_dataset,sub_sample=True, add_outlier=False):
    """Load data and convert it to the metric system."""
    data = np.genfromtxt(
        path_dataset, delimiter=",", dtype=str,  skip_header=1)
    labels = data[:,1]
    labels[labels=='s']=0
    labels[labels=='b']=1
    labels = np.asarray(labels, dtype=float)
    data = np.delete(data, 1, 1)
    data = np.asarray(data, dtype=float)
    return data, labels

Prediction of :
- type s is assigned value 0
- type b is assigned value 1

In [3]:
# Load data
train_data, labels = load_data('train.csv')

In [4]:
# remove the columns containing a -999 value
valid_cols = np.all(train_data!=-999, axis=0)
train_data = train_data[:,valid_cols]

In [None]:
# Standardize the data
def standardize(x):
     """Standardize the original data set."""
     mean_x = np.mean(x, axis=0)
     x = x - mean_x
     std_x = np.std(x, axis=0)
     x = x / std_x
     return x

new_data = standardize(train_data)

In [None]:
print(new_data)

## Logistic regression

- data (np.array): Dataset of shape (N, D) 
- labels (np.array): Labels of shape (N, ) 
- w (np.array): Weights of logistic regression model of shape (D, ) 

In [7]:
def sigmoid(t):
    """apply sigmoid function on t."""
    return 1.0 / (1 + np.exp(-t))

Recall that the cross entropy loss is defined as:
$$
R(\wb) = -\sum_i (y_i \log(\hat{y}(\xb_i)) + (1-y_i)\log(1-\hat{y}(\xb_i))) $$

Let's code it using NumPy. If you do it correctly, it can be written in one line!

In [8]:
def calculate_loss(y, tx, w):
    """compute the cost by negative log likelihood."""
    pred = sigmoid(tx.dot(w))
    loss = y.T.dot(np.log(pred)) + (1 - y).T.dot(np.log(1 - pred))
    return np.squeeze(- loss)

In [9]:
def calculate_gradient(y, tx, w):
    """compute the gradient of loss."""
    pred = sigmoid(tx.dot(w))
    grad = tx.T.dot(pred - y)
    return grad

In [None]:
def learning_by_gradient_descent(y, tx, w, gamma):
    """
    Do one step of gradient descent using logistic regression.
    Return the loss and the updated w.
    """
    loss = calculate_loss(y, tx, w)
    grad = calculate_gradient(y, tx, w)
    w -= gamma * grad
    return loss, w

In [10]:
def logistic_regression_classify(tx, w):
    """ Classification function for binary class logistic regression. 
    
    Args:
        tx (np.array): Dataset of shape (N, D).
        w (np.array): Weights of logistic regression model of shape (D, )
    Returns:
        np.array: Label assignments of data of shape (N, )
    """
    #### write your code here: find predictions and threshold.
    predictions = sigmoid(tx @ w)
    predictions[predictions<0.5]=0
    predictions[predictions>=0.5]=1        
    return predictions

In [11]:
def accuracy(labels_gt, labels_pred):
    """ Computes accuracy.
    
    Args:
        labels_gt (np.array): GT labels of shape (N, ).
        labels_pred (np.array): Predicted labels of shape (N, ).
        
    Returns:
        float: Accuracy, in range [0, 1].
    """
    
    return np.sum(labels_gt == labels_pred) / labels_gt.shape[0]

In [12]:
def logistic_regression(y, tx, initial_w, max_iters, gamma):
    """ Training function for binary class logistic regression. 
    
    Args:
        y (np.array): Labels of shape (N, ).
        tx (np.array): Dataset of shape (N, D).
        initial_w (np.array): Initial weights of shape (D,)
        max_iters (integer): Maximum number of iterations.
        gamma (integer): Step size
    Returns:
        np.array: weights of shape(D, )
    """  
    
    threshold = 1e-8
    losses = []
    
    w = initial_w.copy()
    print(w.shape)
    print(tx.shape)
    print(y.shape)
    for it in range(max_iters):
        loss, w = learning_by_gradient_descent(y, tx, w, gamma)
        # log info
        if iter % 100 == 0:
            print("Current iteration={i}, loss={l}".format(i=iter, l=loss))
        # converge criterion
        losses.append(loss)
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break
    print("loss={l}".format(l=calculate_loss(y, tx, w)))
    return (w, loss)

In [13]:
# Initialize weights
tx = np.c_[np.ones((labels.shape[0], 1)), new_data]
initial_w = np.zeros((tx.shape[1], 1))
trained_weights, train_loss = logistic_regression(labels, tx, initial_w, max_iters=1000, gamma=1e-3)

(21,)
(250000, 21)
(250000,)
loss at iteration 0 : nan
loss at iteration 1 : 61154621.75546359
loss at iteration 2 : nan
loss at iteration 3 : nan
loss at iteration 4 : nan
loss at iteration 5 : nan
loss at iteration 6 : nan
loss at iteration 7 : nan
loss at iteration 8 : nan


  loss = y.T.dot(np.log(pred)) + (1 - y).T.dot(np.log(1 - pred))
  return np.sum(labels_gt == labels_pred) / labels_gt.shape[0]
  return 1.0 / (1 + np.exp(-t))


loss at iteration 9 : nan
loss at iteration 10 : nan
loss at iteration 11 : nan
loss at iteration 12 : nan
loss at iteration 13 : nan
loss at iteration 14 : nan
loss at iteration 15 : nan
loss at iteration 16 : nan
loss at iteration 17 : 53237890.48214967
loss at iteration 18 : nan
loss at iteration 19 : nan
loss at iteration 20 : nan
loss at iteration 21 : nan
loss at iteration 22 : nan
loss at iteration 23 : nan
loss at iteration 24 : nan
loss at iteration 25 : nan
loss at iteration 26 : nan
loss at iteration 27 : nan
loss at iteration 28 : nan
loss at iteration 29 : nan
loss at iteration 30 : nan
loss at iteration 31 : nan
loss at iteration 32 : nan
loss at iteration 33 : nan
loss at iteration 34 : 35595872.98086763
loss at iteration 35 : nan
loss at iteration 36 : nan
loss at iteration 37 : nan
loss at iteration 38 : nan
loss at iteration 39 : nan
loss at iteration 40 : nan
loss at iteration 41 : nan
loss at iteration 42 : 50021763.13945133
loss at iteration 43 : nan
loss at iterat