# Test Set Prediction

In [348]:
%matplotlib inline
import numpy as np
from cross_validation import cross_validation
from polynomial import build_poly
import matplotlib.pyplot as plt
from implementations import least_squares
from proj1_helpers import load_csv_data
from proj1_helpers import predict_labels
from proj1_helpers import create_csv_submission

In [349]:
# Read the test CSV through the project helper and unpack the four objects it returns.
test_set = load_csv_data('../test.csv')
y_test, X_test, ids, columns = test_set

# One print call, newline-separated: the text produced is the same as four separate prints.
print(
    "Test_X shape: {}".format(X_test.shape),
    "\nTest_Y shape: {}".format(y_test.shape),
    "\nColumn names: {}".format(columns),
    "\nColumn shape: {}".format(columns.shape),
    sep="\n",
)

# Number of feature columns in the raw test matrix.
n_features = len(columns)

Test_X shape: (568238, 30)

Test_Y shape: (568238,)

Column names: ['DER_mass_MMC' 'DER_mass_transverse_met_lep' 'DER_mass_vis' 'DER_pt_h'
 'DER_deltaeta_jet_jet' 'DER_mass_jet_jet' 'DER_prodeta_jet_jet'
 'DER_deltar_tau_lep' 'DER_pt_tot' 'DER_sum_pt' 'DER_pt_ratio_lep_tau'
 'DER_met_phi_centrality' 'DER_lep_eta_centrality' 'PRI_tau_pt'
 'PRI_tau_eta' 'PRI_tau_phi' 'PRI_lep_pt' 'PRI_lep_eta' 'PRI_lep_phi'
 'PRI_met' 'PRI_met_phi' 'PRI_met_sumet' 'PRI_jet_num'
 'PRI_jet_leading_pt' 'PRI_jet_leading_eta' 'PRI_jet_leading_phi'
 'PRI_jet_subleading_pt' 'PRI_jet_subleading_eta' 'PRI_jet_subleading_phi'
 'PRI_jet_all_pt']

Column shape: (30,)


In [350]:
# Indices (into the 30 raw columns) of the features kept for the model,
# assembled from two groups and stored in ascending order.
selected_features = np.array(sorted([1,3,9,10,11,13,21,22,23] + [0,4,5,6,12]))
selected_features

array([ 0,  1,  3,  4,  5,  6,  9, 10, 11, 12, 13, 21, 22, 23])

## 1. Data Transformation 

In [351]:
# Raw-column indices that receive a log(1 + x) transform
# (DER_pt_h, DER_sum_pt, DER_pt_ratio_lep_tau, PRI_tau_pt, PRI_met_sumet).
log_transformed_columns = [3,9,10,13,21]

for i in log_transformed_columns:
    # -999 is the missing-value sentinel: transform only the observed entries.
    observed = X_test[:, i] != -999
    X_test[observed, i] = np.log(X_test[observed, i] + 1)
    # Entries that were skipped still hold -999, so nothing has to be written back.
    # The previous `X_test[rows] = -999` had no column index and therefore
    # overwrote *every* feature of any row whose column i was missing.

X_test.shape

(568238, 30)

## 2. Feature Selection 

In [352]:
# Keep only the columns listed in selected_features. This rebinds X_test to the
# reduced matrix, so the cell is not safe to run a second time on its own output.
X_test = X_test[:, selected_features]
X_test.shape

(568238, 14)

## 3. Data Standardization

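Each feature of the test set is standardized with the mean and standard deviation of the corresponding training-set column; missing values (-999) are excluded from the statistics and left unchanged.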

In [353]:
# Load the saved training feature matrix from disk; the standardization step
# that follows takes its per-column mean and standard deviation from it.
X_train = np.load('X_train_not_normalized.npy')
X_train.shape

(250000, 14)

In [354]:
# Scale every column of X_test with statistics taken from the matching X_train
# column. Position 12 (PRI_jet_num in X_test after feature selection) is skipped.
for i in range(X_train.shape[1]):
    if i == 12:
        continue
    # Training statistics are computed over observed values only (-999 = missing).
    col_val = X_train[np.where(X_train[:, i] != -999), i]
    train_mean = np.mean(col_val)
    train_std = np.std(col_val)
    # Rescale the observed test entries; missing ones keep the -999 sentinel.
    present = np.where(X_test[:, i] != -999)
    X_test[present, i] = (X_test[present, i] - train_mean) / train_std

print(X_test.shape, X_test.min(), X_test.max(), sep="\n")

(568238, 14)
-999.0
31.892941944610822


In [355]:
# Show the standardized test matrix; missing entries still hold the -999 sentinel here.
X_test

array([[-9.99000000e+02,  8.58660055e-01, -1.47857345e+00, ...,
        -1.03674699e+00,  0.00000000e+00, -9.99000000e+02],
       [-2.69826566e-01,  5.16346762e-01,  3.89240111e-01, ...,
        -2.47999365e-02,  1.00000000e+00, -6.14009744e-01],
       [-7.09366267e-02,  1.97657867e-01, -1.30094557e+00, ...,
        -8.22238938e-01,  0.00000000e+00, -9.99000000e+02],
       ...,
       [-2.33193538e-01, -1.11481187e+00, -3.37540180e-01, ...,
         9.37941259e-02,  0.00000000e+00, -9.99000000e+02],
       [-4.38889995e-01, -8.27103476e-01,  1.70253195e-02, ...,
         3.02151950e-01,  1.00000000e+00, -8.89503128e-01],
       [-5.14599416e-01,  8.73372255e-01, -1.32321134e+00, ...,
        -1.05761046e+00,  0.00000000e+00, -9.99000000e+02]])

## 4. Data Imputation

We impute the missing values of DER_mass_MMC (encoded as -999) with the median of its observed values.

In [356]:
# Column 0 is DER_mass_MMC: replace its -999 placeholders with the median
# of the entries that are actually observed.
mass_missing = X_test[:, 0] == -999
X_test[mass_missing, 0] = np.median(X_test[~mass_missing, 0])
print(X_test[:, 0].min(), X_test[:, 0].max(), sep="\n")

-1.9677211447574205
31.892941944610822


# Splitting into 3 test sets

We split the dataset into three subsets according to the PRI_jet_num column: 0 jets, 1 jet, and 2 or 3 jets.

In [357]:
# Row positions of each jet-count group; the second-to-last column is PRI_jet_num.
# These positions are used later to put the per-group predictions back in order.
jet_num = X_test[:, -2]
ids_pri_0 = np.flatnonzero(jet_num == 0)
ids_pri_1 = np.flatnonzero(jet_num == 1)
ids_pri_23 = np.flatnonzero((jet_num == 2) | (jet_num == 3))

In [358]:
# Slice the test matrix into one sub-matrix per jet-count group
# (the second-to-last column is PRI_jet_num).
jet_num = X_test[:, -2]
X_pri_0 = X_test[jet_num == 0]
X_pri_1 = X_test[jet_num == 1]
X_pri_23 = X_test[(jet_num == 2) | (jet_num == 3)]

print(
    "PRI_0: {}".format(X_pri_0.shape),
    "PRI_1: {}".format(X_pri_1.shape),
    "PRI_23: {}".format(X_pri_23.shape),
    sep="\n",
)

PRI_0: (227458, 14)
PRI_1: (175338, 14)
PRI_23: (165442, 14)


In [359]:
# Drop column 12 (the jet-count column used for the split) from every subset.
X_pri_0, X_pri_1, X_pri_23 = (
    np.delete(subset, 12, axis=1) for subset in (X_pri_0, X_pri_1, X_pri_23)
)

print(
    "PRI_0: {}".format(X_pri_0.shape),
    "PRI_1: {}".format(X_pri_1.shape),
    "PRI_23: {}".format(X_pri_23.shape),
    sep="\n",
)

PRI_0: (227458, 13)
PRI_1: (175338, 13)
PRI_23: (165442, 13)


In [360]:
# Raw-column indices that remain once column 22 (PRI_jet_num) is left out,
# in ascending order.
selected_features = np.array(sorted([1,3,9,10,11,13,21,23] + [0,4,5,6,12]))
print(selected_features)

[ 0  1  3  4  5  6  9 10 11 12 13 21 23]


In [361]:
# Positions of the columns in the 0-jet subset that contain at least one -999 entry.
delete_columns_0 = [
    col for col in range(X_pri_0.shape[1])
    if (X_pri_0[:, col] == -999).any()
]

delete_columns_0

[3, 4, 5, 9, 12]

In [362]:
# Positions of the columns in the 1-jet subset that contain at least one -999 entry.
delete_columns_1 = [
    col for col in range(X_pri_1.shape[1])
    if (X_pri_1[:, col] == -999).any()
]

delete_columns_1

[3, 4, 5, 9]

In [363]:
# Remove the columns flagged in delete_columns_0 from the 0-jet subset.
X_pri_0 = np.delete(X_pri_0, delete_columns_0, axis=1)
X_pri_0.shape

(227458, 8)

In [364]:
# Remove the columns flagged in delete_columns_1 from the 1-jet subset.
X_pri_1 = np.delete(X_pri_1, delete_columns_1, axis=1)
X_pri_1.shape

(175338, 9)

In [365]:
# Final shape of each subset after column removal.
print(
    "PRI_0: {}".format(X_pri_0.shape),
    "PRI_1: {}".format(X_pri_1.shape),
    "PRI_23: {}".format(X_pri_23.shape),
    sep="\n",
)

PRI_0: (227458, 8)
PRI_1: (175338, 9)
PRI_23: (165442, 13)


# Predictions

In [366]:
# Zero-filled buffer with one slot per test row; the per-group predictions are written into it below.
predictions = np.zeros(y_test.shape[0])

In [367]:
# Load the stored weight vector for each jet-count group (0, 1, and 2-3).
w0, w1, w23 = (np.load(path) for path in ('w0.npy', 'w1.npy', 'w23.npy'))

In [368]:
# Second argument passed to build_poly for each group
# (presumably the polynomial degree each weight vector was fitted with - confirm against training).
degree_0, degree_1, degree_23 = 12, 12, 11

# Expand each subset with build_poly and label it with the matching weights.
pri_0_y = predict_labels(w0, build_poly(X_pri_0, degree_0))
pri_1_y = predict_labels(w1, build_poly(X_pri_1, degree_1))
pri_23_y = predict_labels(w23, build_poly(X_pri_23, degree_23))

In [369]:
# Write each group's labels back at the row positions recorded before the split,
# so predictions ends up in the original test-set row order.
for row_ids, labels in (
    (ids_pri_0, pri_0_y),
    (ids_pri_1, pri_1_y),
    (ids_pri_23, pri_23_y),
):
    predictions[row_ids] = labels

In [370]:
# Hand the row ids and the assembled predictions to the project helper with the
# target name 'output.csv' (presumably written as the submission file - see proj1_helpers).
create_csv_submission(ids, predictions, 'output.csv')