In [41]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 
import datetime

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:
#input train data from csv
from proj1_helpers import *
DATA_TRAIN_PATH = 'D:\\Jupyter Notebook\Machine Learning\project1\data\\train.csv'  
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [43]:
#check the shape of y,tX,ids
y.shape,tX.shape,ids.shape

((250000,), (250000, 30), (250000,))

### 1 Feature Engineering

#### 1.1 import some functions and make some settings

In [21]:
def p_x(p_t,phi):
    px = p_t*np.cos(phi)
    return px

def p_y(p_t,phi):
    py = p_t*np.sin(phi)
    return py

def p_z(p_t,eta):
    pz = p_t*np.sinh(eta)
    return pz

#mass are neglected, E = p
def particle_energy(px,py,pz):
    energy = np.sqrt(px**2+py**2+pz**2)
    return energy

def cross_product(px_1,py_1,pz_1,px_2,py_2,pz_2):
    N = len(px_1)
    cp_x = np.zeros(N)
    cp_y = np.zeros(N)
    cp_z = np.zeros(N)
    for t in range(N):
        temp_cross=np.cross(np.array([px_1[t],py_1[t],pz_1[t]]),np.array([px_2[t],py_2[t],pz_2[t]]))
        cp_x[t] = temp_cross[0]
        cp_y[t] = temp_cross[1]
        cp_z[t] = temp_cross[2]
    return cp_x,cp_y,cp_z

def dot_product(px_1,py_1,pz_1,px_2,py_2,pz_2):
    N = len(px_1)
    dp = np.zeros(N)
    for t in range(N):
        dp[t] = np.inner(np.array([px_1[t],py_1[t],pz_1[t]]),np.array([px_2[t],py_2[t],pz_2[t]]))
    return dp

def cosine_similarity(px_1,py_1,pz_1,px_2,py_2,pz_2):
    cp = dot_product(px_1,py_1,pz_1,px_2,py_2,pz_2)/(np.sqrt(px_1**2+py_1**2+pz_1**2)*np.sqrt(px_2**2+py_2**2+pz_2**2))
    
    return cp

def determinant_vector(px_1,py_1,pz_1,px_2,py_2,pz_2,px_3,py_3,pz_3):
    N = len(px_1)
    dv = np.zeros(N)
    for t in range(N):
        temp_vector = np.array([[px_1[t],py_1[t],pz_1[t]],[px_2[t],py_2[t],pz_2[t]],[px_3[t],py_3[t],pz_3[t]]])
        dv[t] = np.linalg.det(temp_vector)

    return dv

def sum_p_xyz(px,py,pz):
    sp = px+py+pz
    return sp


In [None]:
#import the name of each feature
# DER_mass_MMC = tX[:,0]
# DER_mass_transverse_met_lep=tX[:,1]
# DER_mass_vis = tX[:,2]
# DER_pt_h = tX[:,3]
# DER_deltaeta_jet_jet = tX[:,4]
# DER_mass_jet_jet = tX[:,5]
# DER_prodeta_jet_jet = tX[:,6]
# DER_deltar_tau_lep = tX[:,7]
# DER_pt_tot = tX[:,8]
# DER_sum_pt = tX[:,9]
# DER_pt_ratio_lep_tau = tX[:,10]
# DER_met_phi_centrality = tX[:,11]
# DER_lep_eta_centrality = tX[:,12]
# PRI_tau_pt = tX[:,13]
# PRI_tau_eta = tX[:,14]
# PRI_tau_phi = tX[:,15]
# PRI_lep_pt = tX[:,16]
# PRI_lep_eta =tX[:,17]
# PRI_lep_phi = tX[:,18]
# PRI_met=tX[:,19]
# PRI_met_phi = tX[:,20]
# PRI_met_sumet = tX[:,21]
# PRI_jet_num=tX[:,22]
# PRI_jet_leading_pt=tX[:,23]
# PRI_jet_leading_eta = tX[:,24]
# PRI_jet_leading_phi = tX[:,25]
# PRI_jet_subleading_pt=tX[:,26]
# PRI_jet_subleading_eta = tX[:,27]
# PRI_jet_subleading_phi = tX[:,28]
# PRI_jet_all_pt = tX[:,29]

#### 1.2 generate some features

In [22]:
#generate px,py,pz and append to the tX
#tau
p_tau_x = p_x(PRI_tau_pt,PRI_tau_phi)
p_tau_y = p_y(PRI_tau_pt,PRI_tau_phi)
p_tau_z = p_z(PRI_tau_pt,PRI_tau_eta)
#lep
p_lep_x = p_x(PRI_lep_pt,PRI_lep_phi)
p_lep_y = p_y(PRI_lep_pt,PRI_lep_phi)
p_lep_z = p_z(PRI_lep_pt,PRI_lep_eta)
#met
p_met_x = p_x(PRI_met,PRI_met_phi)
p_met_y = p_y(PRI_met,PRI_met_phi)

#append features
tX=np.append(tX,np.array([p_tau_x,p_tau_y,p_tau_z,p_lep_x,p_lep_y,p_lep_z,p_met_x,p_met_y]).T,axis=1)

In [23]:
#generate energy of particle
p_tau_energy = particle_energy(p_tau_x,p_tau_y,p_tau_z)
p_lep_energy = particle_energy(p_lep_x,p_lep_y,p_lep_z)

#append features
tX=np.append(tX,np.array([p_tau_energy,p_lep_energy]).T,axis=1)

In [24]:
N = len(y)
#vector product
#tau-lep

tau_lep_dot = dot_product(p_tau_x,p_tau_y,p_tau_z,p_lep_x,p_lep_y,p_lep_z)
tau_lep_cross_x,tau_lep_cross_y,tau_lep_cross_z = cross_product(p_tau_x,p_tau_y,p_tau_z,p_lep_x,p_lep_y,p_lep_z)
tau_lep_cosine_similarity = cosine_similarity(p_tau_x,p_tau_y,p_tau_z,p_lep_x,p_lep_y,p_lep_z)

#tau-met
tau_met_dot = dot_product(p_tau_x,p_tau_y,p_tau_z,p_met_x,p_met_y,np.zeros(N))
tau_met_cross_x,tau_met_cross_y,tau_met_cross_z = cross_product(p_tau_x,p_tau_y,p_tau_z,p_met_x,p_met_y,np.zeros(N))
tau_met_cosine_similarity =cosine_similarity(p_tau_x,p_tau_y,p_tau_z,p_met_x,p_met_y,np.zeros(N))
#lep-met
lep_met_dot = dot_product(p_lep_x,p_lep_y,p_lep_z,p_met_x,p_met_y,np.zeros(N))
lep_met_cross_x,lep_met_cross_y,lep_met_cross_z = cross_product(p_lep_x,p_lep_y,p_lep_z,p_met_x,p_met_y,np.zeros(N))
lep_met_cosine_similarity =cosine_similarity(p_lep_x,p_lep_y,p_lep_z,p_met_x,p_met_y,np.zeros(N))

#append features
tX=np.append(tX,np.array([tau_lep_dot,tau_lep_cross_x,tau_lep_cross_y,tau_lep_cross_z,tau_lep_cosine_similarity]).T,axis=1)
tX=np.append(tX,np.array([tau_met_dot,tau_met_cross_x,tau_met_cross_y,tau_met_cross_z,tau_met_cosine_similarity]).T,axis=1)
tX=np.append(tX,np.array([lep_met_dot,lep_met_cross_x,lep_met_cross_y,lep_met_cross_z,lep_met_cosine_similarity]).T,axis=1)

In [25]:
#generate determinant vector
d_vector = determinant_vector(p_tau_x,p_tau_y,p_tau_z,p_lep_x,p_lep_y,p_lep_z,p_met_x,p_met_y,np.zeros(N))

#append features
tX=np.append(tX,np.array([d_vector]).T,axis=1)

#### 1.3 cross-validation function and indice function

In [44]:
def cross_validation(y,tX,k_indices,k):
    
    tX_test_indice = k_indices[k]
    tX_train_indice = k_indices[~(np.arange(k_indices.shape[0]) == k)]
    tX_train_indice = tX_train_indice.reshape(-1)
    y_test = y[tX_test_indice]
    y_train = y[tX_train_indice]
    tX_test = tX[tX_test_indice]
    tX_train = tX[tX_train_indice]
    
    #split train and test into four groups
    y_train_0,y_train_1,y_train_2,y_train_3,y_test_0,y_test_1,y_test_2,y_test_3,tX_train_0,tX_train_1,tX_train_2,tX_train_3,tX_test_0,tX_test_1,tX_test_2,tX_test_3=split_groups(y_train,y_test,tX_train,tX_test)
    
    
    weight_0 = ridge_regression(y_train_0,tX_train_0,0.01)
    weight_1 = ridge_regression(y_train_1,tX_train_1,0.01)
    weight_2 = ridge_regression(y_train_2,tX_train_2,0.01)
    weight_3 = ridge_regression(y_train_3,tX_train_3,0.01)
    
    y_pred_0 = predict_labels(weight_0, tX_test_0)
    y_pred_1 = predict_labels(weight_1, tX_test_1)
    y_pred_2 = predict_labels(weight_2, tX_test_2)
    y_pred_3 = predict_labels(weight_3, tX_test_3)
    
    num_0 = np.count_nonzero(y_pred_0==y_test_0)
    num_1 = np.count_nonzero(y_pred_1==y_test_1)
    num_2 = np.count_nonzero(y_pred_2==y_test_2)
    num_3 = np.count_nonzero(y_pred_3==y_test_3)
    
    num = len(y_test)
    accuracy = (num_0+num_1+num_2+num_3)/num
    
    return accuracy

def split_groups(y_train,y_test,tX_train,tX_test):
    tX_train_jet_num = tX_train[:,22]
    tX_test_jet_num = tX_test[:,22]
    
    y_train_0 = np.copy(y_train[tX_train_jet_num==0])
    y_train_1 = np.copy(y_train[tX_train_jet_num==1])
    y_train_2 = np.copy(y_train[tX_train_jet_num==2])
    y_train_3 = np.copy(y_train[tX_train_jet_num==3])
    
    y_test_0 = np.copy(y_test[tX_test_jet_num==0])
    y_test_1 = np.copy(y_test[tX_test_jet_num==1])
    y_test_2 = np.copy(y_test[tX_test_jet_num==2])
    y_test_3 = np.copy(y_test[tX_test_jet_num==3])
    
    tX_train_0 = np.copy(tX_train[tX_train_jet_num==0])
    tX_train_1 = np.copy(tX_train[tX_train_jet_num==1])
    tX_train_2 = np.copy(tX_train[tX_train_jet_num==2])
    tX_train_3 = np.copy(tX_train[tX_train_jet_num==3])
    
    tX_test_0 = np.copy(tX_test[tX_test_jet_num==0])
    tX_test_1 = np.copy(tX_test[tX_test_jet_num==1])
    tX_test_2 = np.copy(tX_test[tX_test_jet_num==2])
    tX_test_3 = np.copy(tX_test[tX_test_jet_num==3])
    
    return y_train_0,y_train_1,y_train_2,y_train_3,y_test_0,y_test_1,y_test_2,y_test_3,tX_train_0,tX_train_1,tX_train_2,tX_train_3,tX_test_0,tX_test_1,tX_test_2,tX_test_3

def build_k_indices(y,k_fold,seed):
    num_row = y.shape[0]
    interval = int(num_row/k_fold)
    np.random.seed(seed)
    indices=np.random.permutation(num_row)
    k_indices = [indices[k*interval:(k+1)*interval] for k in range (k_fold)]
    return np.array(k_indices)

In [45]:
def ridge_regression(y, tx, lambda_):
    """implement ridge regression."""
    N = len(y)
    a = tx.T.dot(tx)+lambda_*(2*N)*np.identity(tx.shape[1])
    b = tx.T.dot(y)
    weight = np.linalg.solve(a,b)
    
    return weight

#### 1.4 demo

In [46]:
def demo():
    k_fold = 4
    seed = 12
    k_indices = build_k_indices(y, k_fold, seed)
    total = 0
    for k in range(k_fold):
        accuracy = cross_validation(y,tX,k_indices,k)
        total  = total+accuracy
        print(f'{k}:{accuracy} ')
    
    average=total/k_fold
    print(f'average accuracy:{accuracy}')

In [47]:
demo()

0:0.75872 
1:0.758608 
2:0.757616 
3:0.755632 
average accuracy0.755632
