In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload

%autoreload 2
from implementations import *



# Load the data and clean the data

In [146]:
from proj1_helpers import *
path = '../data/train.csv'

y, tx,ids = load_csv_data(path)
tx

array([[ 138.47 ,   51.655,   97.827, ...,    1.24 ,   -2.475,  113.497],
       [ 160.937,   68.768,  103.235, ..., -999.   , -999.   ,   46.226],
       [-999.   ,  162.172,  125.953, ..., -999.   , -999.   ,   44.251],
       ...,
       [ 105.457,   60.526,   75.839, ..., -999.   , -999.   ,   41.992],
       [  94.951,   19.362,   68.812, ..., -999.   , -999.   ,    0.   ],
       [-999.   ,   72.756,   70.831, ..., -999.   , -999.   ,    0.   ]])

In [148]:
# As some column as -999 values, we have to replace them 
# first try : use the mean of each column to replace them : 0.741 with ridge lambda = 2
# second try : use the median of each column to replace them 0.742 with ridge lambda = 2
# adapted code from :https://stackoverflow.com/questions/18689235/numpy-array-replace-nan-values-with-average-of-columns
tx[tx==-999]=np.nan
col_median = np.nanmedian(tx,axis = 0) #change median by mean if you want mean
indices = np.where(np.isnan(tx))
tx[indices]=np.take(col_median,indices[1])

In [149]:
tx

array([[ 1.38470e+02,  5.16550e+01,  9.78270e+01, ...,  1.24000e+00,
        -2.47500e+00,  1.13497e+02],
       [ 1.60937e+02,  6.87680e+01,  1.03235e+02, ..., -1.00000e-02,
        -2.00000e-03,  4.62260e+01],
       [ 1.12406e+02,  1.62172e+02,  1.25953e+02, ..., -1.00000e-02,
        -2.00000e-03,  4.42510e+01],
       ...,
       [ 1.05457e+02,  6.05260e+01,  7.58390e+01, ..., -1.00000e-02,
        -2.00000e-03,  4.19920e+01],
       [ 9.49510e+01,  1.93620e+01,  6.88120e+01, ..., -1.00000e-02,
        -2.00000e-03,  0.00000e+00],
       [ 1.12406e+02,  7.27560e+01,  7.08310e+01, ..., -1.00000e-02,
        -2.00000e-03,  0.00000e+00]])

# train the models 

In [218]:
# definitions of the hyperparameters
max_iters = 200
gamma = 0.000006
initial_w = np.zeros(tx.shape[1])
lambda_ = 0.1

In [155]:
#train model
w_ls, loss_ls = least_squares(y,tx)

In [156]:
w_lsGD,loss_lsGD = least_squares_GD(y, tx, initial_w, max_iters, gamma)

In [157]:
w_lsSGD,loss_lsSGD = least_squares_SGD(y, tx, initial_w, max_iters, gamma)

In [219]:
w_ridge,loss_ridge = ridge_regression(y,tx,lambda_)

# grid search

In [162]:
def split_data(x, y, ratio, seed=1):
    """
    split the dataset based on the split ratio. If ratio is 0.8 
    you will have 80% of your data set dedicated to training 
    and the rest dedicated to testing
    """
    # set seed random
    np.random.seed(seed)
    
    num_row = len(y)
    indices = np.random.permutation(num_row)
    index_split = int(np.floor(ratio * num_row))
    index_tr = indices[: index_split]
    index_te = indices[index_split:]
    # create split
    x_tr = x[index_tr]
    x_te = x[index_te]
    y_tr = y[index_tr]
    y_te = y[index_te]
    return x_tr, x_te, y_tr, y_te

lambdas = [25,10,4,2,1.0,0.8,0.5,0.3,0.2,0.1,0.05,0.02,0.01,0.001]
ratio = 0.8
x_tr, x_te, y_tr, y_te=split_data(tx, y, ratio, seed=6)
def grid_search(x_tr, x_te, y_tr, y_te,lambdas,ratio):
    
    rmse_tr = []
    rmse_te = []
    for ind, lambda_ in enumerate(lambdas):
        weight,_ = ridge_regression(y_tr, x_tr, lambda_)
        rmse_tr.append(compute_mse(y_tr, x_tr, weight))
        rmse_te.append(compute_mse(y_te, x_te, weight))
        print("proportion={p}, lambda={l:.3f}, Training RMSE={tr:.15f}, Testing RMSE={te:.15f}".format(
               p=ratio, l=lambda_, tr=rmse_tr[ind], te=rmse_te[ind]))

In [163]:
grid_search(x_tr, x_te, y_tr, y_te,lambdas,ratio)

proportion=0.8, lambda=25.000, Training RMSE=0.000001718932766, Testing RMSE=0.000006899107575
proportion=0.8, lambda=10.000, Training RMSE=0.000001718932550, Testing RMSE=0.000006899086932
proportion=0.8, lambda=4.000, Training RMSE=0.000001718932466, Testing RMSE=0.000006899078938
proportion=0.8, lambda=2.000, Training RMSE=0.000001718932374, Testing RMSE=0.000006899076452
proportion=0.8, lambda=1.000, Training RMSE=0.000001718932201, Testing RMSE=0.000006899075507
proportion=0.8, lambda=0.800, Training RMSE=0.000001718932116, Testing RMSE=0.000006899075436
proportion=0.8, lambda=0.500, Training RMSE=0.000001718931868, Testing RMSE=0.000006899075621
proportion=0.8, lambda=0.300, Training RMSE=0.000001718931446, Testing RMSE=0.000006899076391
proportion=0.8, lambda=0.200, Training RMSE=0.000001718930952, Testing RMSE=0.000006899077541
proportion=0.8, lambda=0.100, Training RMSE=0.000001718929659, Testing RMSE=0.000006899081160
proportion=0.8, lambda=0.050, Training RMSE=0.000001718927

## Generate predictions and save ouput in csv format for submission:

In [220]:
DATA_TEST_PATH = '../data/test.csv' # TODO: download train data and supply path here 
y_test, tx_test, ids_test = load_csv_data(DATA_TEST_PATH)
# Have to apply the same data cleaning as for train set
tx_test[tx_test==-999]=np.nan
col_median = np.nanmedian(tx_test,axis = 0)#change median by mean if you want mean
indices = np.where(np.isnan(tx_test))
tx_test[indices]=np.take(col_median,indices[1])

In [245]:
OUTPUT_PATH = '../data/result.csv' # TODO: fill in desired name of output file for submission
y_pred = predict_labels(w_lsGD, tx_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [242]:
from sklearn.metrics import accuracy_score,f1_score

In [246]:
1-accuracy_score(y_test,y_pred)

0.8351729380998807

In [247]:
f1_score(y_test,y_pred)

0.28300692401710836