In [124]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 
import datetime
from imps_zda import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [125]:
#input train data from csv
from proj1_helpers import *
# Written as a raw string so that every backslash of the Windows path is taken
# literally instead of being parsed as the start of an escape sequence.
DATA_TRAIN_PATH = r'D:\Jupyter Notebook\Machine Learning\project1\data\train.csv'
# The loader's result is unpacked into three arrays; the shape check in the
# next cell reports (250000,), (250000, 30) and (250000,) for them.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [126]:
# Sanity check: display the shapes of y, tX and ids as one tuple.
tuple(loaded.shape for loaded in (y, tX, ids))

((250000,), (250000, 30), (250000,))

In [127]:
# Start the optimiser from an all-zero weight vector with one entry per
# feature column of tX.
dim = tX.shape[1]
initial_w = np.zeros(dim)
    
# Least squares SGD on tX exactly as loaded (no cleaning has been applied yet).
# The two trailing positional arguments are 1000 and 1e-8; stacking_train
# passes max_iters and gamma in those same positions.
# NOTE: the recorded output shows that the call returns a 2-tuple
# (weight array, scalar), so `weight_SGD` holds the whole pair here, not only
# the weights; stacking_train unpacks the same call into two names.
weight_SGD = least_squares_SGD(y, tX, initial_w, 1000, 1e-8)
print(weight_SGD)

(array([ 7.45542426e-04, -3.57912369e-05,  4.45005565e-05,  1.02384759e-04,
        1.65709638e-05, -8.27618041e-05,  2.12646571e-05,  1.23279777e-06,
       -1.67086922e-05,  1.88924878e-04,  7.68269459e-07, -8.89718836e-07,
        1.79526102e-05,  4.40138515e-05, -9.20492221e-07,  7.31776350e-06,
        2.41253959e-05,  3.66168425e-06, -7.58781738e-06,  5.45773518e-05,
       -1.68543763e-06,  2.28216786e-04,  7.28386025e-08,  4.72923720e-04,
        3.16097994e-04,  3.14143716e-04,  3.67198696e-05,  1.91842242e-05,
        1.03967182e-05,  1.20785453e-04]), 0.5718113686029341)


In [128]:
def data_cleaning(tX):
    """Impute, clip and standardise every feature column of ``tX``.

    Each column is processed independently:

    1. A column whose standard deviation is below ``1e-4`` is treated as
       constant and left out of the result.
    2. Entries equal to the sentinel ``-999`` are replaced by the median of the
       column's other entries.
    3. If the column is (near-)constant once the sentinels are filled, it is
       left out as well. Standardising it would divide by a zero standard
       deviation and fill the column with NaN.
    4. Values are clipped to ``mean +/- 2 * std`` of the imputed column.
    5. The clipped column is centred and scaled with its own mean and std.

    The caller's array is never modified: all work happens on a float copy.

    Parameters
    ----------
    tX : numpy.ndarray
        2-D array with one row per sample and one column per feature.

    Returns
    -------
    numpy.ndarray
        New float array with the same number of rows and with the
        (near-)constant columns removed.
    """
    min_std = 1e-4  # below this spread a column is considered constant

    # np.array copies, so the in-place edits below never reach the caller's
    # data; dtype=float keeps standardised values from being truncated if an
    # integer array is passed in.
    cleaned = np.array(tX, dtype=float)
    dropped = []

    for d in range(cleaned.shape[1]):
        column = cleaned[:, d]  # view: writes go straight into `cleaned`

        if np.std(column) < min_std:
            dropped.append(d)
            continue

        # Fill the sentinel with the median of the observed values. The
        # observed set is non-empty here: an all-sentinel column has zero std
        # and was dropped by the check above.
        missing = column == -999
        column[missing] = np.median(column[~missing])

        mean = np.mean(column)
        std = np.std(column)
        if std < min_std:
            # Only the sentinel made this column look non-constant.
            dropped.append(d)
            continue

        # Limit the influence of outliers before standardising.
        np.clip(column, mean - 2 * std, mean + 2 * std, out=column)

        # Mean and std are recomputed because clipping changed them.
        cleaned[:, d] = (column - np.mean(column)) / np.std(column)

    # An explicit integer dtype keeps np.delete happy when nothing is dropped.
    return np.delete(cleaned, np.array(dropped, dtype=np.intp), axis=1)

In [129]:
def split_groups(y_train,y_test,tX_train,tX_test,jet_col=22):
    """Split train and test data into four groups by the value in ``jet_col``.

    Rows are assigned to a group when their entry in column ``jet_col`` of the
    corresponding feature matrix equals 0, 1, 2 or 3. Rows with any other
    value end up in no group.

    Parameters
    ----------
    y_train, y_test : numpy.ndarray
        Target arrays, indexed row-wise like the matching feature matrix.
    tX_train, tX_test : numpy.ndarray
        2-D feature matrices that contain the grouping column.
    jet_col : int, optional
        Index of the grouping column. Defaults to 22, the value that used to
        be hard-coded.

    Returns
    -------
    tuple
        Sixteen arrays in this order: ``y_train`` groups 0-3, ``y_test``
        groups 0-3, ``tX_train`` groups 0-3, ``tX_test`` groups 0-3. Every
        array is independent of the inputs.
    """
    group_values = (0, 1, 2, 3)
    train_key = tX_train[:, jet_col]
    test_key = tX_test[:, jet_col]

    def _per_group(values, key):
        # Boolean-mask indexing already yields a fresh array, so no extra
        # copy is needed to decouple the result from the input.
        return [values[key == group] for group in group_values]

    return tuple(
        _per_group(y_train, train_key)
        + _per_group(y_test, test_key)
        + _per_group(tX_train, train_key)
        + _per_group(tX_test, test_key)
    )


In [130]:
def build_poly(tX,degree):
    """Append the powers 2..degree of the first ten columns to ``tX``.

    The columns with index 0 to 9 are visited in order. For each one a block
    of ``degree - 1`` new columns (exponents 2, 3, ..., degree) is appended to
    the right of the matrix. Rows whose source value is the sentinel ``-999``
    receive ``-999`` in every new column instead of a power.

    Returns the widened matrix; the array passed in is not modified.
    """
    n_rows = tX.shape[0]
    n_source_columns = 10

    for col in range(n_source_columns):
        observed = tX[:, col] != -999
        # Start from the sentinel everywhere, then overwrite the observed rows.
        block = np.full((n_rows, degree - 1), -999.0)
        for offset, exponent in enumerate(range(2, degree + 1)):
            block[observed, offset] = np.power(tX[observed, col], exponent)
        # Appending inside the loop keeps column lookups identical to
        # operating on the progressively widened matrix.
        tX = np.concatenate((tX, block), axis=1)

    return tX

In [131]:
def build_feature(tX,degree):
    """Return ``tX`` extended with polynomial features.

    Thin wrapper that forwards both arguments to ``build_poly``.
    """
    return build_poly(tX, degree)

In [132]:
def build_k_indices(y,k_fold,seed):
    """Return shuffled row indices arranged as ``k_fold`` equally sized folds.

    NumPy's global random generator is seeded with ``seed`` before a
    permutation of ``range(len(y))`` is drawn. The first
    ``k_fold * (len(y) // k_fold)`` shuffled indices are laid out row by row
    in an array of shape ``(k_fold, fold_size)``; leftover indices are
    discarded.
    """
    n_samples = y.shape[0]
    fold_size = int(n_samples / k_fold)

    np.random.seed(seed)
    shuffled = np.random.permutation(n_samples)

    # Row k holds the indices shuffled[k * fold_size:(k + 1) * fold_size].
    return shuffled[:fold_size * k_fold].reshape(k_fold, fold_size)

In [143]:
def cross_validation(y,tX,k_indices,k,lambda_,gamma,max_iters):
    """Evaluate the stacked model on fold ``k`` and return its accuracy.

    Fold ``k`` of ``k_indices`` is held out as the test set and all other
    folds form the training set. Both sets are split into four groups with
    ``split_groups``. Each group is cleaned with ``data_cleaning``, a model is
    fitted with ``stacking_train`` and scored with ``stacking_test``. The
    per-group accuracies are printed.

    Parameters
    ----------
    y : numpy.ndarray
        Targets for all samples.
    tX : numpy.ndarray
        Feature matrix for all samples.
    k_indices : numpy.ndarray
        2-D array of row indices, one row per fold.
    k : int
        Index of the held-out fold.
    lambda_, gamma, max_iters
        Forwarded unchanged to ``stacking_train``.

    Returns
    -------
    float
        Correct predictions summed over the four groups, divided by the size
        of the whole test fold.

    Notes
    -----
    * Train and test groups are cleaned independently, so each test group is
      imputed and standardised with its own statistics. Its set of dropped
      columns is likewise decided on the test data alone.
    * A group without test rows is skipped and reported as ``nan`` instead of
      raising ``ZeroDivisionError``.
    """
    n_groups = 4

    test_rows = k_indices[k]
    train_rows = k_indices[np.arange(k_indices.shape[0]) != k].reshape(-1)
    y_test, y_train = y[test_rows], y[train_rows]
    tX_test, tX_train = tX[test_rows], tX[train_rows]

    # split_groups returns 16 arrays: y_train x4, y_test x4, tX_train x4,
    # tX_test x4, each ordered by group.
    parts = split_groups(y_train, y_test, tX_train, tX_test)
    y_train_groups = parts[0:n_groups]
    y_test_groups = parts[n_groups:2 * n_groups]
    tX_train_groups = parts[2 * n_groups:3 * n_groups]
    tX_test_groups = parts[3 * n_groups:4 * n_groups]

    correct_counts = []
    accuracies = []
    for group in range(n_groups):
        if len(y_test_groups[group]) == 0:
            # Nothing to score for this group in this fold.
            correct_counts.append(0)
            accuracies.append(float('nan'))
            continue

        tX_train_group = data_cleaning(tX_train_groups[group])
        tX_test_group = data_cleaning(tX_test_groups[group])

        weight_SGD, weight_LR, weight = stacking_train(
            y_train_groups[group], tX_train_group, lambda_, gamma, max_iters)
        y_pred = stacking_test(tX_test_group, weight_SGD, weight_LR, weight)

        n_correct = np.count_nonzero(y_pred == y_test_groups[group])
        correct_counts.append(n_correct)
        accuracies.append(n_correct / len(y_pred))

    for group, group_accuracy in enumerate(accuracies):
        print(f'group {group} accuracy: {group_accuracy}')

    # The denominator is the full test fold, as in the per-fold summary line
    # printed by the caller.
    return sum(correct_counts) / len(y_test)




In [144]:
def stacking_train(y,tX,lambda_,gamma,max_iters):
    """Fit the three chained models of the stacking pipeline.

    Stage 1 fits least squares SGD on ``tX``; its predicted labels are
    appended to the features as an extra column. Stage 2 fits regularised
    logistic regression on the widened features; its predicted labels are
    appended as a further column. Stage 3 fits ridge regression on the result.

    Returns the weights of the three stages in order:
    ``(weight_SGD, weight_LR, weight)``. The cost values returned by the
    fitting routines are discarded.
    """
    # Stage 1: least squares SGD, started from zeros.
    start_w = np.zeros(tX.shape[1])
    weight_SGD, _ = least_squares_SGD(y, tX, start_w, max_iters, gamma)
    sgd_labels = predict_labels(weight_SGD, tX)
    features_stage2 = np.column_stack((tX, sgd_labels))

    # Stage 2: logistic regression. Its starting vector is the stage-1 start
    # vector with one more zero for the new column.
    start_w = np.append(start_w, 0)
    weight_LR, _ = reg_logistic_regression(
        y, features_stage2, lambda_, start_w, max_iters, gamma)
    lr_labels = predict_labels(weight_LR, features_stage2)
    features_stage3 = np.column_stack((features_stage2, lr_labels))

    # Stage 3: ridge regression on the features plus both prediction columns.
    weight, _ = ridge_regression(y, features_stage3, lambda_)

    return weight_SGD, weight_LR, weight

In [145]:
def stacking_test(tX,weight_SGD,weight_LR,weight):
    """Predict labels for ``tX`` with the three stacked weight vectors.

    The features are widened in the same way as during training: the labels
    predicted with ``weight_SGD`` become an extra column, then the labels
    predicted with ``weight_LR`` on the widened features become another.
    The final labels are predicted with ``weight`` and returned.
    """
    sgd_labels = predict_labels(weight_SGD, tX)
    features_stage2 = np.column_stack((tX, sgd_labels))

    lr_labels = predict_labels(weight_LR, features_stage2)
    features_stage3 = np.column_stack((features_stage2, lr_labels))

    return predict_labels(weight, features_stage3)

In [146]:
def demo():
    """Run 4-fold cross-validation on the loaded data and print the results.

    Uses the notebook-level ``y`` and ``tX``. The accuracy of every fold is
    printed as it becomes available, followed by the mean over all folds.
    """
    n_folds, rng_seed = 4, 12
    lambda_, gamma, max_iters = 1e-7, 1e-7, 10000

    folds = build_k_indices(y, n_folds, rng_seed)

    accuracy_sum = 0
    for fold in range(n_folds):
        fold_accuracy = cross_validation(
            y, tX, folds, fold, lambda_, gamma, max_iters)
        accuracy_sum += fold_accuracy
        print(f'{fold}:{fold_accuracy} ')

    print(f'average accuracy:{accuracy_sum / n_folds}')

In [147]:
# Run the cross-validation; the accuracies are printed below.
demo()

group 0 accuracy: 0.7378392421759463
group 1 accuracy: 0.6854163456105203
group 2 accuracy: 0.7373907465319541
group 3 accuracy: 0.6602922605087498
0:0.714544 
group 0 accuracy: 0.7368736035748484
group 1 accuracy: 0.6803485793843139
group 2 accuracy: 0.739326375711575
group 3 accuracy: 0.6543095458758109
1:0.712704 
group 0 accuracy: 0.7299158990788946
group 1 accuracy: 0.6855777223607648
group 2 accuracy: 0.7444215856939389
group 3 accuracy: 0.641566265060241
2:0.711216 
group 0 accuracy: 0.7315768302493966
group 1 accuracy: 0.6853776497221651
group 2 accuracy: 0.7346696244652194
group 3 accuracy: 0.6519168756718022
3:0.71072 
average accuracy:0.712296
