In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('./../src/')
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from implementations import *
from utils import *
from pipeline import * 
from proj1_helpers import * 

In [3]:
# PATHS
DATA_TRAIN_PATH = "./../data/train.csv"
DATA_TEST_PATH = "./../data/test.csv" 

In [4]:
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [5]:
tX.shape

(250000, 30)

In [6]:
col_names = ['DER_mass_MMC', 'DER_mass_transverse_met_lep',
       'DER_mass_vis', 'DER_pt_h', 'DER_deltaeta_jet_jet', 'DER_mass_jet_jet',
       'DER_prodeta_jet_jet', 'DER_deltar_tau_lep', 'DER_pt_tot', 'DER_sum_pt',
       'DER_pt_ratio_lep_tau', 'DER_met_phi_centrality',
       'DER_lep_eta_centrality', 'PRI_tau_pt', 'PRI_tau_eta', 'PRI_tau_phi',
       'PRI_lep_pt', 'PRI_lep_eta', 'PRI_lep_phi', 'PRI_met', 'PRI_met_phi',
       'PRI_met_sumet', 'PRI_jet_num', 'PRI_jet_leading_pt',
       'PRI_jet_leading_eta', 'PRI_jet_leading_phi', 'PRI_jet_subleading_pt',
       'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi', 'PRI_jet_all_pt']

In [7]:
values = list(range(len(col_names)))

In [8]:
col_dict = dict(zip(col_names, values))
inverse_col_dict = dict(zip(values, col_names))

In [9]:
tX[:, col_dict["PRI_jet_num"]]

array([2., 1., 1., ..., 1., 0., 0.])

In [10]:
np.unique(tX[:, col_dict["PRI_jet_num"]], return_counts=True)

(array([0., 1., 2., 3.]), array([99913, 77544, 50379, 22164]))

In [11]:
def split_data(y, tX, ids, jet_num):
    """
    Splits the given data set such that only the data points with a certain 
    jet number remains, where jet number is a discrete valued feature. In
    other words, filters the data set using the jet number.
    Args: 
        y: a numpy array representing the given labels
        tx: a numpy array representing the given features
        ids: a numpy array representing the ids of the data points
        jet_num: the certain value of the discrete feature jet number
    Returns:
        y_masked: numpy array of labels of data points having the specified jet number
        tx_masked: numpy array of features of data points having the specified jet number
        ids_masked: numpy array of ids of data points having the specified jet number
    """
    mask = tX[:, 22] == jet_num
    return y[mask], tX[mask], ids[mask]

# Jet 0 processing

## Filter useless columns

In [12]:
y0, tX0, ids0 = split_data(y, tX, ids, 0)

In [13]:
tX0.shape

(99913, 30)

In [14]:
dummy_cols = []
for i in range(tX0.shape[1]):
    col = tX0[:, i]
    if len(np.unique(col)) == 1 and col[0] == -999.:
        dummy_cols.append(i)

In [15]:
dummy_cols

[4, 5, 6, 12, 23, 24, 25, 26, 27, 28]

In [16]:
for i in range(len(dummy_cols)):
    l = tX0[:, dummy_cols[i]]
    print(len(l) - len(l[l==-999.]))

0
0
0
0
0
0
0
0
0
0


In [17]:
tX0 = np.delete(tX0, dummy_cols, 1)

In [18]:
tX0.shape

(99913, 20)

## Fillna

In [19]:
np.isnan(tX0).sum(axis = 1).sum()

0

In [20]:
col1 = tX0[:, 0]

In [21]:
np.where(tX0[:, 0] == -999.0)

(array([    3,     5,     8, ..., 99909, 99910, 99912]),)

In [22]:
tX0[:6, 0]

array([ 143.905,  175.864,  105.594, -999.   ,   82.488, -999.   ])

In [23]:
col1[np.where(tX0[:, 0] == -999.0)] = np.nan

In [24]:
col1[:6]

array([143.905, 175.864, 105.594,     nan,  82.488,     nan])

In [25]:
def transform(tX, col_mean, col_std):
    # Standardize
    tX = (tX - col_mean)/col_std

    # Replaces missings with mean
    inds = np.where(np.isnan(tX))
    tX[inds] = np.take(col_mean, inds[1])

    # Creates dummies for imputed values
    tX_imputed = np.zeros((tX.shape[0],tX.shape[1]))
    array_one = np.ones(tX.shape[1])
    tX_imputed[inds] = np.take(array_one, inds[1])

    # Concatenates imputed dummies with variables
    #tX_clean = np.hstack((tX, tX_imputed))
    return tX


def preprocessing(tX):
    '''
    Fit and transform both datasets
    '''
    #missing data
    tX = np.where(tX == -999, np.NaN, tX)

    # Our own transformer. fit
    col_mean = np.nanmean(tX, axis=0)
    col_std = np.nanstd(tX, axis=0)

    tX_clean = transform(tX, col_mean, col_std)

    return tX_clean

In [33]:
tX0_clean = preprocessing(tX0)

  This is separate from the ipykernel package so we can avoid doing imports until


In [34]:
tX0_clean

array([[ 4.49033352e-01,  7.07136352e-01, -2.43749845e-02, ...,
        -7.49699713e-01,  0.00000000e+00,  0.00000000e+00],
       [ 1.06660202e+00, -1.30833757e+00,  1.39142614e+00, ...,
        -1.37002885e+00,  0.00000000e+00,  0.00000000e+00],
       [-2.91279963e-01, -2.57073972e-01,  5.02548436e-01, ...,
         7.42788213e-02,  0.00000000e+00,  0.00000000e+00],
       ...,
       [ 1.20667654e+02, -1.89742027e-02, -3.62409265e-01, ...,
        -8.56205465e-01,  0.00000000e+00,  0.00000000e+00],
       [-4.96942947e-01, -1.23187692e+00, -3.43246979e-01, ...,
        -2.47574254e-01,  0.00000000e+00,  0.00000000e+00],
       [ 1.20667654e+02,  4.36508780e-01, -2.90176123e-01, ...,
        -4.98354425e-01,  0.00000000e+00,  0.00000000e+00]])

In [35]:
for i in range(tX0_clean.shape[1])[:2]:
    col = tX0_clean[:, i]
    col_mean = np.mean(col)
    col_std = np.std(col)
    
    upper_lim = col_mean + 2 * col_std
    lower_lim = col_mean - 2 * col_std
    
    big_outliers = np.where(col > upper_lim)
    small_outliers = np.where(col < lower_lim)

    col[big_outliers] = upper_lim
    col[small_outliers] = lower_lim

In [36]:
tX0_clean

array([[ 4.49033352e-01,  7.07136352e-01, -2.43749845e-02, ...,
        -7.49699713e-01,  0.00000000e+00,  0.00000000e+00],
       [ 1.06660202e+00, -1.30833757e+00,  1.39142614e+00, ...,
        -1.37002885e+00,  0.00000000e+00,  0.00000000e+00],
       [-2.91279963e-01, -2.57073972e-01,  5.02548436e-01, ...,
         7.42788213e-02,  0.00000000e+00,  0.00000000e+00],
       ...,
       [ 1.20667654e+02, -1.89742027e-02, -3.62409265e-01, ...,
        -8.56205465e-01,  0.00000000e+00,  0.00000000e+00],
       [-4.96942947e-01, -1.23187692e+00, -3.43246979e-01, ...,
        -2.47574254e-01,  0.00000000e+00,  0.00000000e+00],
       [ 1.20667654e+02,  4.36508780e-01, -2.90176123e-01, ...,
        -4.98354425e-01,  0.00000000e+00,  0.00000000e+00]])