In [9]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data 
into feature matrix X, class labels y, and event ids:

In [39]:
from proj1_helpers import *
DATA_TRAIN_PATH = 'train.csv' # TODO: download train data and supply path here 
y, X, ids = load_csv_data(DATA_TRAIN_PATH)

## Data cleaning
Here we deal with problematic values.

In [40]:
from helpers import *

def standardize_badFeatures(X):
    
    # Function that calculate the mean and std of bad features without elements equal to -999
    # Then, it remplaces -999 values by zeros, zeros won't influence the train of the model... 
    mean_x = np.zeros((X.shape[1],))
    std_x = np.zeros((X.shape[1],))
    for d in range(X.shape[1]):
        idx = np.where(X[:,d] == -999)[0]
        mean_x[d] = np.mean(np.delete(X[:,d], (idx)))
        std_x[d] = np.std(np.delete(X[:,d], (idx)))
        X[:,d] = (X[:,d]-mean_x[d])/std_x[d]
        X[idx,d] = 0
    return X, mean_x, std_x


def clean_data(X):

    # find indices of features that have at least one value -999, we call them "bad" features
    idx_badFeatures = []
    for d in range(X.shape[1]):
        if sum(X[:,d] == -999) > 0:
            idx_badFeatures.append(d)

    # separate "good" and "bad" features
    X_badFeatures = X[:,idx_badFeatures]
    X_goodFeatures = np.delete(X,(idx_badFeatures), axis=1)

    # Standardize it differently (see : standardize_badFeatures(X))
    tX, mean_x, std_x = standardize(X_goodFeatures)
    tX2, mean_x2, std_x2 = standardize_badFeatures(X_badFeatures)

    # comment the 3 next lines if you want to work only with "good" features
    tX = np.hstack((tX, tX2))
    mean_x = np.hstack((mean_x, mean_x2))
    std_x = np.hstack((std_x, std_x2))
    
    return tX, mean_x, std_x

# Now tX already has ones in the first column...
tX, mean_x, std_x = clean_data(X)

In [37]:
idx_red = np.where(y == 1)[0]
idx_blue = np.where(y == -1)[0]

#for d in range(1,X.shape[1]):
#    f, axarr = plt.subplots(2, sharex=True)
#    axarr[0].hist(X[idx_red,d], 50, normed=1, facecolor='red', alpha=0.5)
#    axarr[1].hist(X[idx_blue,d], 50, normed=1, facecolor='blue', alpha=0.5)
#    plt.show()

# look for outliers (> 40*std)
#outliers = np.where(tX > 40)
# here we see that our single outlier is in fact a higgs boson event...
# should we keep it ???
#print(outliers[0], y[outliers[0]])
#plt.boxplot(tX)
#np.sum(y == 1)/len(y), np.sum(y == -1)/len(y)


#Is there samples that are identical ?
#ncols = tX.shape[1]
#dtype = tX.dtype.descr * ncols
#struct = tX.view(dtype)

#uniq, idx = np.unique(struct, return_index=True)
#tX = uniq.view(tX.dtype).reshape(-1, ncols)

#print(tX.shape)

# Ridge regression

In [3]:
import myFunctions as my
from plots import bias_variance_decomposition_visualization

def cross_validation(y, x, k_indices, k, lamb):
    """return the loss of ridge regression."""
    
    # get k'th subgroup in test, others in train: TODO
    tx_tr = x[np.delete(k_indices, (k), axis=0).flatten()]
    y_tr = y[np.delete(k_indices, (k), axis=0).flatten()]
    tx_te = x[k_indices[k]]
    y_te = y[k_indices[k]]
    
    # regression/classification method
    #w = my.least_squares(y_tr, tx_tr)
    w = my.ridge_regression(y_tr, tx_tr, lamb)
    
    # calculate the loss for train and test data: TODO
    #loss_tr = my.compute_loss(y_tr, tx_tr, w)
    #loss_te = my.compute_loss(y_te, tx_te, w)
    
    loss_tr, loss_te = my.compute_classerror(w, tx_tr, tx_te, y_tr, y_te)
       
    return loss_tr, loss_te

def cross_validation_demo(y, X):
    # parameters
    seed = 74
    k_fold = 10
    
    # hyperparameters
    degrees = [1, 5, 9, 13, 15]
    lambdas = np.logspace(-5, 2, 15)
    
    # split data in k fold
    k_indices = my.build_k_indices(y, k_fold, seed)
    
    
    # define lists to store the loss of training data and test data
    rmse_tr = np.zeros((len(degrees),k_fold))
    rmse_te = np.zeros((len(degrees),k_fold))
    best_lambda = np.zeros((len(degrees),))
    
    # Hyperparameter 1
    for d, degree in enumerate(degrees):
        print(degree)
        # Build polynomial function
        tX = my.build_poly(X, degree)
        
        loss_tr = np.zeros((len(lambdas),k_fold))
        loss_te = np.zeros((len(lambdas),k_fold))
        
        for l, lamb in enumerate(lambdas):
        
            # Cross-validation
            for k in range(k_fold):
                loss_tr[l,k], loss_te[l,k] = cross_validation(y, tX, k_indices, k, lamb)
                
        best_idx = np.argmin(np.mean(loss_te,axis=0))
        best_lambda[d] = lambdas[best_idx]
        
        rmse_tr[d,:] = loss_tr[best_idx,:]
        rmse_te[d,:] = loss_te[best_idx,:]
        
    bias_variance_decomposition_visualization(degrees, rmse_tr.T, rmse_te.T)   
    print(best_lambda)
    return rmse_tr, rmse_te, degrees, best_lambda

rmse_tr, rmse_te, degrees, lambdas = cross_validation_demo(y, tX)

1
5
9
13
15
[ 0.1         0.1         0.1         0.1         0.00031623]


In [7]:
print(degrees, np.mean(rmse_tr.T, axis=0), np.mean(rmse_te.T, axis=0))

[1, 5, 9, 13, 15] [ 0.26394267  0.21328978  0.20573022  0.2023      0.25617067] [ 0.264     0.213532  0.206148  0.202756  0.257792]


## Logistic

In [43]:
def logistic_regression_newton_method_demo(y, tx):
    
    y[np.where(y == -1)[0]] = 0
    # init parameters
    max_iter = 10000
    gamma = 0.01
    threshold = 1e-8
    #lambdas = np.logspace(-5,2,10)
    losses = []

    # build tx
    w = np.zeros((tx.shape[1], 1))

    # start the logistic regression
    for iter in range(max_iter):
        # get loss and update w.
        loss, w = my.learning_by_newton_method(y, tx, w, gamma)
        # log info
        if iter % 500 == 0:
            print("Current iteration={i}, the loss={l}".format(i=iter, l=loss))
        # converge criteria
        losses.append(loss)
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break
    # visualization
    print("The loss={l}".format(l=my.calculate_logic_loss(y, tx, w)))

logistic_regression_newton_method_demo(y, tX)

MemoryError: 

In [115]:
degree = 2
lamb = 1
final_X = my.build_poly(tX, degree)
w = my.ridge_regression(y, final_X, lamb)
y_pred = predict_labels(w, final_X)
loss = len(np.nonzero(y_pred-y)[0])/len(y)
loss

0.259052

## Generate predictions and save ouput in csv format for submission:

In [116]:
DATA_TEST_PATH = 'test.csv' # TODO: download train data and supply path here 
_, X_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [119]:
tX_test, mean_xtest, std_xtest = clean_data(X_test)
final_X_test = my.build_poly(tX_test, degree)

In [120]:
OUTPUT_PATH = 'results.csv' # TODO: fill in desired name of output file for submission
y_pred = predict_labels(w, final_X_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)