# Higgs Boson Challenge - Final Predictions

In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from regression import ridge_regression
from least_squares import least_squares
from proj1_helpers import *
from helpers import standardize
%load_ext autoreload
%autoreload 2

## Preprocessing

In [2]:
# Path of the training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = '../data/train.csv'
# load_csv_data is not imported by name; presumably it is supplied by
# `from proj1_helpers import *`. Judging by the names it returns the labels,
# the raw feature matrix and the sample ids -- TODO confirm against the helper.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [24]:
def prepare(x, missing_value=-999):
    """
    Impute missing entries with their column mean, then standardize.

    Entries equal to `missing_value` (and entries that are already NaN) are
    treated as missing. Each one is replaced by the mean of the valid entries
    of its column, so that the sentinel does not distort the column statistics
    used when the matrix is handed to `standardize`.

    Parameters
    ----------
    x : array_like, 2-D
        Raw feature matrix, one sample per row and one feature per column.
        It is not modified; all work is done on a float copy.
    missing_value : scalar, optional
        Sentinel that marks a missing measurement (default -999).

    Returns
    -------
    The first element of the tuple returned by `standardize` for the imputed
    matrix. The mean and std that `standardize` also returns are discarded.

    Notes
    -----
    A column without a single valid entry has no mean to impute with; it is
    filled with 0.0 instead of being left as NaN, so that NaN does not
    propagate into the regression.
    """
    # Float copy: leaves the caller's array intact and lets integer input
    # hold NaN during imputation.
    filled = np.array(x, dtype=float)

    # Iterating over the transpose yields column views, so the assignments
    # below write straight into `filled`.
    for column in filled.T:
        missing = (column == missing_value) | np.isnan(column)
        if not missing.any():
            continue
        if missing.all():
            # Nothing to average over.
            column[:] = 0.0
            continue
        # Mask first, then average, so the sentinel never enters the mean.
        column[missing] = np.nan
        column[missing] = np.nanmean(column)

    tx, _mean, _std = standardize(filled)

    return tx

In [25]:
# Impute the -999 placeholders and standardize the training features.
tx = prepare(tX)

## Training phase 

In [26]:
def build_poly(x, degree = 5):
    """
    Build the augmented feature matrix used for the regression.

    The following blocks, each with the shape of `x`, are stacked side by
    side with `np.hstack`, in this order:

    1. ``x ** 0``                       (all ones)
    2. ``log2(|x|)``
    3. ``log10(x ** 2)``
    4. ``x ** 1, x ** 2, ..., x ** degree``

    Parameters
    ----------
    x : ndarray
        Feature matrix (samples in rows).
    degree : int, optional
        Highest power of `x` to include (default 5). With `degree <= 0` no
        power block is added.

    Returns
    -------
    ndarray
        For 2-D `x` of shape (N, D), an array of shape (N, D * (3 + degree)).

    Notes
    -----
    The arguments of both logarithms are floored at the smallest positive
    normal float, so an entry equal to 0 produces a large negative but
    finite feature instead of ``-inf`` (which would turn the subsequent
    linear solve into NaN). All other entries are left unchanged.
    """
    # Smallest positive normal double; used only to keep log() finite at 0.
    tiny = np.finfo(float).tiny

    blocks = [
        np.power(x, 0),
        np.log2(np.maximum(np.abs(x), tiny)),
        # Floor after squaring: squaring a tiny value would underflow to 0.
        np.log10(np.maximum(np.square(x), tiny)),
    ]
    blocks.extend(np.power(x, d) for d in range(1, degree + 1))

    # Single concatenation instead of re-copying the growing matrix each step.
    return np.hstack(blocks)

In [27]:
# Expand the prepared training features with build_poly's default degree (5).
tx_poly = build_poly(tx)

In [28]:
# Regularisation strength passed to ridge_regression. Keep it in one place:
# the call below must use this variable so that changing it actually changes
# the fit.
lamb = 1e-10
weights, loss = ridge_regression(y, tx_poly, lamb)

In [29]:
# Show the loss and the weight vector returned by ridge_regression above.
print(loss)
print(weights)

0.291414239863
[  1.07515899e+00  -9.87449866e-02  -1.56104408e-01  -3.08276424e-02
  -7.06175533e-03  -2.30834167e-02   2.04894800e-02  -2.70037688e-02
  -6.99165033e-02   3.74785135e-02   8.04391370e-02  -2.33356425e-02
  -6.06667814e-03  -1.52590304e-01  -1.15196626e-01  -1.59690367e-01
  -1.03057590e-01  -2.53404351e-01  -2.36360130e-01  -7.73366852e-02
  -5.15956222e-02  -1.60062962e-01  -1.18891025e-01  -6.51324245e-02
  -6.48083452e-02  -2.44825695e-03   3.66033505e-03  -1.40124754e-01
  -6.72536902e-02  -5.92511781e-02  -1.96671661e-02   0.00000000e+00
   5.49026782e-01  -4.25778420e-02  -8.92855890e-02   9.49148787e-02
  -4.08419867e-01  -3.29382781e-01   3.54901297e+00   7.56092249e-02
   5.73337142e-02  -1.11006048e-01   1.55026178e-01  -1.97089318e-02
  -9.78476784e-01  -1.60921827e-02   1.88474611e-01   2.19020869e-02
   7.13437259e-02  -4.29290619e-02   7.95226643e-02   1.10391102e-01
  -6.30415932e-02   1.69496255e-01  -5.66214315e-02   2.11254560e-01
  -9.93192393e-01  

## Prediction for the test data

In [9]:
# Path of the test CSV, relative to the notebook's working directory.
DATA_TEST_PATH = '../data/test.csv'
# The first returned value is ignored here; only the features and ids are kept.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [10]:
# Same preprocessing as for the training features.
# NOTE(review): prepare() calls standardize() on whatever matrix it is given
# and discards the returned mean/std, so the test set is scaled with its own
# statistics rather than the training ones -- confirm this is intended.
tx_test = prepare(tX_test)

In [11]:
# Same feature expansion (default degree) as used for training, so the
# columns line up with `weights`.
tx_test_poly = build_poly(tx_test)

In [12]:
# Destination of the submission file, relative to the notebook.
OUTPUT_PATH = '../output/out.csv'
# predict_labels / create_csv_submission are not imported by name; presumably
# they come from `from proj1_helpers import *` -- verify.
y_pred = predict_labels(weights, tx_test_poly)

# Write the test ids together with their predicted labels to OUTPUT_PATH.
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)