In [3]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
#import helpers

In [55]:
def load_data(path_dataset,sub_sample=True, add_outlier=False):
    """Load data and convert it to the metric system."""
    data = np.genfromtxt(
        path_dataset, delimiter=",", dtype=str,  skip_header=1)
    labels = data[:,1]
    labels[labels=='s']=0
    labels[labels=='b']=1
    labels = np.asarray(labels, dtype=float)
    data = np.delete(data, 1, 1)
    data = np.asarray(data, dtype=float)
    #data[:,:][data[:,:]==-999]=None
    return data, labels

Prediction of :
- type s is assigned value 0
- type b is assigned value 1

In [83]:
# Load data
train_data, labels = load_data('train.csv')

In [84]:
# remove the columns containing a -999 value
valid_cols = np.all(train_data!=-999, axis=0)
train_data = train_data[:,valid_cols]

In [97]:
# Normalize the data
new_data = (train_data - train_data.min(axis=0)) / train_data.max(axis=0)

In [99]:
print(new_data)
print(np.max(new_data, axis=0))

[[0.00000000e+00 7.48541825e-02 6.78088948e-02 ... 1.22284399e-01
  6.66666667e-01 6.94837193e-02]
 [2.85715102e-06 9.96529363e-02 7.18167475e-02 ... 7.52843347e-02
  3.33333333e-01 2.82999058e-02]
 [5.71430204e-06 2.35006340e-01 8.86529895e-02 ... 1.23123231e-01
  3.33333333e-01 2.70907959e-02]
 ...
 [7.14279184e-01 8.77093070e-02 5.15136536e-02 ... 9.24307477e-02
  3.33333333e-01 2.57078191e-02]
 [7.14282041e-01 2.80578198e-02 4.63059649e-02 ... 4.94217496e-02
  0.00000000e+00 0.00000000e+00]
 [7.14284898e-01 1.05432018e-01 4.78022397e-02 ... 4.27784564e-02
  0.00000000e+00 0.00000000e+00]]
[0.7142849  1.         0.9953096  1.         0.96340605 1.
 0.97511204 0.99762302 2.         0.97383596 2.00080096 2.
 0.95359389 2.00079904 2.         0.99996166 2.         0.99317457
 1.         1.        ]


## Logistic regression

- data (np.array): Dataset of shape (N, D) 
- labels (np.array): Labels of shape (N, ) 
- w (np.array): Weights of logistic regression model of shape (D, ) 

In [106]:
def loss_logistic(data, labels, w): 
    """ Logistic regression loss function for binary classes
    
    Args:
        data (np.array): Dataset of shape (N, D).
        labels (np.array): Labels of shape (N, ).
        w (np.array): Weights of logistic regression model of shape (D, )
    Returns:
        int: Loss of logistic regression.
    """    
    return -np.sum(labels * np.log(sigmoid(data @ w)) + (1 - labels) * np.log(1 - sigmoid(data @ w)))

In [143]:
def sigmoid(t):
    return 1/(1+np.exp(-t))  

In [144]:
def logistic_regression(y, tx, initial_w, max_iters, gamma):
    """ Training function for binary class logistic regression. 
    
    Args:
        y (np.array): Dataset of shape (N, D).
        tx (np.array): Labels of shape (N, ).
        initial_w (np.array): Initial weights of shape (D,)
        max_iters (integer): Maximum number of iterations.
        gamma (integer): Step size
    Returns:
        np.array: weights of shape(D, )
    """  
    w = initial_w.copy()
    for it in range(max_iters):
        # Compute gradient
        gradient = y.T.dot(sigmoid(y.dot(w)) - tx)
        w = w - gamma * gradient
        # Classify the predictions
        predictions = sigmoid(y @ w)
        print(predictions)
        predictions[predictions<0.5]=0
        predictions[predictions>=0.5]=1
        # Compute loss
        loss = loss_logistic(y, tx, w)
        print('loss at iteration', it)
        # Compute accuracy
        accuracy = np.sum(tx == predictions) / tx.shape[0];
        print(accuracy)
        # If accurate then break
        if accuracy == 1:
            break
    return (w, loss)

In [145]:
# Initialize weights
initial_w = np.random.normal(0., 0.1, [new_data.shape[1],])
logistic_regression(new_data, labels, initial_w, max_iters=100, gamma=0.1)

[1. 1. 1. ... 1. 1. 1.]
loss at iteration 0
0.657332
[0. 0. 0. ... 0. 0. 0.]
loss at iteration 1
0.342668
[1. 1. 1. ... 1. 1. 1.]
loss at iteration 2
0.657332
[1. 1. 1. ... 1. 1. 1.]
loss at iteration 3
0.657068
[0. 0. 0. ... 0. 0. 0.]
loss at iteration 4
0.342668
[1. 1. 1. ... 1. 1. 1.]
loss at iteration 5
0.657332
[1. 0. 0. ... 1. 1. 1.]
loss at iteration 6
0.660044
[0. 0. 0. ... 0. 0. 1.]
loss at iteration 7
0.499924
[1. 1. 1. ... 1. 1. 1.]


  return -np.sum(labels * np.log(sigmoid(data @ w)) + (1 - labels) * np.log(1 - sigmoid(data @ w)))
  return -np.sum(labels * np.log(sigmoid(data @ w)) + (1 - labels) * np.log(1 - sigmoid(data @ w)))
  return 1/(1+np.exp(-t))


loss at iteration 8
0.657332
[0. 0. 0. ... 0. 0. 1.]
loss at iteration 9
0.55682
[1. 1. 1. ... 1. 1. 1.]
loss at iteration 10
0.657332
[0. 0. 0. ... 0. 0. 0.]
loss at iteration 11
0.342668
[1. 1. 1. ... 1. 1. 1.]
loss at iteration 12
0.657332
[1. 1. 1. ... 1. 1. 1.]
loss at iteration 13
0.656784
[0. 0. 0. ... 0. 0. 0.]
loss at iteration 14
0.342668
[1. 1. 1. ... 1. 1. 1.]
loss at iteration 15
0.657316
[0. 0. 0. ... 0. 0. 1.]
loss at iteration 16
0.661004
[0. 0. 0. ... 0. 0. 1.]
loss at iteration 17
0.632956
[1. 1. 1. ... 1. 1. 1.]
loss at iteration 18
0.656988
[0. 0. 0. ... 0. 0. 0.]
loss at iteration 19
0.342668
[1. 1. 1. ... 1. 1. 1.]
loss at iteration 20
0.657332
[1.00000000e+00 0.00000000e+00 0.00000000e+00 ... 3.03943721e-32
 0.00000000e+00 1.00000000e+00]
loss at iteration 21
0.661084
[0. 0. 0. ... 0. 0. 1.]
loss at iteration 22
0.57948
[1. 1. 1. ... 1. 1. 1.]
loss at iteration 23
0.657328
[0. 0. 0. ... 0. 0. 0.]
loss at iteration 24
0.342668
[1. 1. 1. ... 1. 1. 1.]
loss at itera

(array([  4705.2104046 ,  16703.07283191,    942.35780362,  -1264.55135283,
         -1722.98168734,    694.90569956,  -2596.65588867,  10690.12027033,
        -20295.0237282 ,  -6305.08141304,    117.4222028 ,    735.07061205,
          2696.52694333,   -382.14153526,   -395.20387057,    585.65849812,
         -2367.15214072,   -353.08558476,    558.56921426,   -917.80731127]),
 nan)