In [57]:
# Useful starting lines
%matplotlib inline
import numpy as np
import itertools
import matplotlib.pyplot as plt
import csv
#import helpers

In [58]:
def load_data_train(path_dataset,sub_sample=True, add_outlier=False):
    """Load data and convert it to the metric system."""
    data = np.genfromtxt(
        path_dataset, delimiter=",", dtype=str,  skip_header=1)
    ids = data[:,0]
    labels = data[:,1]
    labels[labels=='s']=0
    labels[labels=='b']=1
    labels = np.asarray(labels, dtype=float)
    data = np.delete(data, [0,1], 1)
    data = np.asarray(data, dtype=float)
    return data, labels, ids

In [59]:
def load_data_test(path_dataset,sub_sample=True, add_outlier=False):
    """Load data and convert it to the metric system."""
    data = np.genfromtxt(
        path_dataset, delimiter=",", dtype=str,  skip_header=1)
    ids = data[:,0]
    labels = data[:,1]
    data = np.delete(data, [0,1], 1)
    data = np.asarray(data, dtype=float)
    return data, labels, ids

Prediction of :
- type s is assigned value 0
- type b is assigned value 1

In [60]:
# Load data
train_data, train_labels, ids = load_data_train('train.csv')

In [61]:
def clean_data(data):
    # Remove columns with more than 50% of -999
    dirty_cols = np.where(np.sum(data == -999, axis=0)/data.shape[0] < 0.5, True, False)
    data = data[:, dirty_cols]
    # Replace -999 by nan
    data = np.where(data == -999, np.nan, data)
    # Compute the columns means without nan values 
    means = np.nanmean(data, axis=0)
    #Find indices that you need to replace
    inds = np.where(np.isnan(data))
    #Place column means in the indices. Align the arrays using take
    data[inds] = np.take(means, inds[1])
    return data

In [62]:
def clean_data_old(data):
    # remove the columns containing a -999 value
    valid_cols = np.all(data!=-999, axis=0)
    return data[:,valid_cols]

In [63]:
# Standardize the data
def standardize(x):
    #Standardize the original data set.
    mean_x = np.mean(x, axis=0)
    x = x - mean_x
    std_x = np.std(x, axis=0)
    x = x / std_x
    return x

In [64]:
new_data = standardize(clean_data(train_data))

In [65]:
print(new_data)

[[ 3.14910656e-01  6.83319669e-02  4.07680272e-01 ...  1.55729751e+00
   3.24824359e-01  4.12510497e-01]
 [ 7.40827026e-01  5.52504823e-01  5.40136414e-01 ...  5.26704866e-01
   8.32993155e-01 -2.73819964e-01]
 [ 0.00000000e+00  3.19515553e+00  1.09655998e+00 ...  1.48714489e+00
  -1.43454996e+00 -2.93969845e-01]
 ...
 [-3.10930673e-01  3.19316447e-01 -1.30863670e-01 ...  1.30416949e+00
  -1.09325452e-01 -3.17017229e-01]
 [-5.10097335e-01 -8.45323970e-01 -3.02973380e-01 ...  1.00367341e-17
   1.11117522e-17 -7.45439413e-01]
 [ 0.00000000e+00  6.65336083e-01 -2.53522760e-01 ...  1.00367341e-17
   1.11117522e-17 -7.45439413e-01]]


## Logistic regression

- data (np.array): Dataset of shape (N, D) 
- labels (np.array): Labels of shape (N, ) 
- w (np.array): Weights of logistic regression model of shape (D, ) 

Recall that the cross entropy loss is defined as:
$$
L(w) = -\sum_i (y_i \log(\hat{y}(x_i)) + (1-y_i)\log(1-\hat{y}(x_i)))
$$
and vectorized :
$$
L(w) = -y^T\log(\hat{y}(x)-(1-y)^T\log(1-\hat{y}(x))$$

$$
\delta L(w) = \sum (\hat{y}(x_i) - y_i)x_i    ???
$$

$$
\delta L(w) = x^T(\hat{y}(x)-y)
$$

In [136]:
def calculate_loss_reg(labels, data, w, lambda_):
    """compute the cost by negative log likelihood."""
    loss = np.sum(np.logaddexp(0, data @ w) + labels * data.dot(w)) + lambda_*np.linalg.norm(w)**2
    return loss

In [67]:
def sigmoid(t):
    return 1.0 / (1 + np.exp(-t))

In [68]:
def calculate_loss_old(labels, data, w):
    """compute the cost by negative log likelihood."""
    loss = -np.sum(labels * np.log(sigmoid(data @ w)) + (1 - labels) * np.log(1 - sigmoid(data @ w)))
    return loss

In [69]:
def calculate_loss(labels, data, w):
    """compute the cost by negative log likelihood."""
    loss = np.sum(np.logaddexp(0, data @ w) + labels * data.dot(w))
    return loss

In [70]:
def logistic_regression(y, tx, initial_w, max_iters, gamma):
    """ Training function for binary class logistic regression. 
    
    Args:
        y (np.array): Labels of shape (N, ).
        tx (np.array): Dataset of shape (N, D).
        initial_w (np.array): Initial weights of shape (D,)
        max_iters (integer): Maximum number of iterations.
        gamma (integer): Step size
    Returns:
        np.array: weights of shape(D, )
    """  
    def sigmoid(t):
        """apply sigmoid function on t."""
        return 1.0 / (1 + np.exp(-t))

    threshold = 1e-8
    losses = []
    
    w = initial_w.copy()
    for it in range(max_iters):
        #loss = np.sum(np.logaddexp(0, tx.dot(w)) + y * tx.dot(w))
        grad = tx.T.dot(sigmoid(tx.dot(w)) - y)
        w -= gamma * grad
        # log info
        if it % 100 == 0:
            print(f"Current iteration={it}")
        #    print("Current iteration={i}, loss={l}".format(i=it, l=loss))
        # converge criterion
        #losses.append(loss)
        #if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            #break
    loss = calculate_loss(y, tx, w)
    print("loss={l}".format(l=loss))
    return (w, loss)

In [140]:
def reg_logistic_regression(y, tx, lambda_, initial_w, max_iters, gamma):
    """ Training function for binary class logistic regression. 
    
    Args:
        y (np.array): Labels of shape (N, ).
        tx (np.array): Dataset of shape (N, D).
        lambda_ (integer): Regularization factor
        initial_w (np.array): Initial weights of shape (D,)
        max_iters (integer): Maximum number of iterations.
        gamma (integer): Step size
    Returns:
        np.array: weights of shape(D, )
    """  
    def sigmoid(t):
        """apply sigmoid function on t."""
        return 1.0 / (1 + np.exp(-t))

    threshold = 1e-8
    losses = []
    
    w = initial_w.copy()
    for it in range(max_iters):
        #loss = -np.sum(y * np.log(sigmoid(tx @ w)) + (1 - y) * np.log(1 - sigmoid(tx @ w)))
        grad = tx.T.dot(sigmoid(tx.dot(w)) - y) + 2*lambda_*w
        w -= gamma * grad
        # log info
        if it % 100 == 0:
            #print("Current iteration={i}, loss={l}".format(i=it, l=loss))
            print(f"Current iteration={it}")
        # converge criterion
        #losses.append(loss)
        #if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
        #    break
    loss = calculate_loss_reg(y, tx, w, lambda_)
    print("loss={l}".format(l=loss))
    return (w, loss)

In [None]:
# Add bias to data
tx = np.c_[np.ones((train_labels.shape[0], 1)), new_data]
#initialize the weights randomly according to a Gaussian distribution
initial_w = np.random.normal(0., 0.1, [tx.shape[1],])
#trained_weights, train_loss = logistic_regression(train_labels, tx, initial_w, max_iters=4000, gamma=0.01)
trained_weights, train_loss = reg_logistic_regression(train_labels, tx, 1, initial_w, max_iters=4000, gamma=0.01)

Current iteration=0


  return 1.0 / (1 + np.exp(-t))


Current iteration=100
Current iteration=200
Current iteration=300
Current iteration=400
Current iteration=500
Current iteration=600
Current iteration=700
Current iteration=800
Current iteration=900
Current iteration=1000
Current iteration=1100
Current iteration=1200
Current iteration=1300
Current iteration=1400
Current iteration=1500
Current iteration=1600
Current iteration=1700
Current iteration=1800
Current iteration=1900
Current iteration=2000
Current iteration=2100
Current iteration=2200
Current iteration=2300
Current iteration=2400
Current iteration=2500
Current iteration=2600
Current iteration=2700
Current iteration=2800
Current iteration=2900
Current iteration=3000
Current iteration=3100
Current iteration=3200


In [149]:
def predict_logistic(tx, w):
    def sigmoid(t):
        return 1.0 / (1 + np.exp(-t))
    y = sigmoid(tx @ w)
    # s = 1 , b = -1
    y[y < 0.5] = 1
    y[y >= 0.5] = -1
    return y

In [150]:
print(trained_weights)

[  823.02512847   -79.17503719   566.57877195  1178.9373725
 -1308.13841605  -562.7990073     46.29895111  -141.42865866
   724.74861699  -469.00635392  -816.3652398      3.34907989
    -7.43860355 -1206.85660937   -11.79690743    -3.12230748
  -476.48465348    -6.88608745  -330.97660713  -825.02958601
  -264.4015371     -3.10177048     2.17331447   291.38871187]


In [145]:
tx_validation = np.c_[np.ones((train_labels.shape[0], 1)), new_data]
validation_predict = predict_logistic(tx_validation, trained_weights)
validation_labels = train_labels
# s = 1, b = -1
validation_labels = np.where(validation_labels > 0.5, 1, -1)
print(validation_labels)
print(validation_predict)

[-1  1  1 ... -1  1  1]
[-1. -1. -1. ... -1. -1. -1.]


  return 1.0 / (1 + np.exp(-t))


In [146]:
def accuracy(a, b):
    return np.sum(a == b)/a.shape[0]

In [147]:
print(accuracy(predict_validation, validation_labels))

0.686224


In [131]:
test_data, labels, ids = load_data_test('test.csv')
test_data = standardize(clean_data(test_data))

In [132]:
tx_test = np.c_[np.ones((test_data.shape[0], 1)), test_data]
print(tx_test.shape)
print(trained_weights.shape)
predicted_labels = predict_logistic(tx_test, trained_weights)
print(predicted_labels)

(568238, 24)
(24,)
[-1. -1. -1. ... -1. -1. -1.]


  return 1.0 / (1 + np.exp(-t))


In [134]:
def csv_submission(ids, y_pred, name):

    with open (name, 'w') as csvfile:
        fd = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fd)
        writer.writeheader()
        for r1, r2 in zip (ids, y_pred):
            writer.writerow({'Id' : int(r1), 'Prediction': str(r2)})

In [135]:
csv_submission(ids, predicted_labels, 'Predictions_Logistics_5.csv')