In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from implementations import *
DATA_TRAIN_PATH = 'train.csv' 
y_orig, tX_raw, ids = load_csv_data(DATA_TRAIN_PATH) #Standardized data ready to use

## Standardization

In [3]:
# DO NOT RUN FOR 4 WAY SPLIT ALGORITHM
# Run to standardize data, mode = 0, 1, 2
# mode 0: basic standardization with standard deviation
# mode 1: quantile standardization
# mode 2: scaled standardization
tX_raw = standardize_data(tX_raw, mode = 0)

## Feature Expansion

In [4]:
degree=1 #degree atleast 1
flags =[1,1,0] #Log,sin,cos flags 
tX_orig = feature_expand(tX_raw,degree,flags)
print(tX_orig.shape)

(250000, 121)


## Visualizing Data

In [None]:
i1 = 10 # Fix one index to analyze 
tXi1= [row[i1] for row in tX_orig]
print(len(tXi1),len(y_orig))
plt.figure()
plt.plot(tXi1,y_orig,'bo')
plt.xlabel(i1)
plt.ylabel("y")
plt.show()

#Plotting all graphs across various indices versus chosen index to see dependence
for i2 in range(0,31):
        plt.figure()
        tXi2= [row[i2] for row in tX_orig]
        plt.plot(tXi1, tXi2,'ro')
        plt.xlabel(i1)
        plt.ylabel(i2)
        plt.show()

## Cross Validation 

In [6]:
#Split Training Data into 2 parts one for training and other for cross-validating with a fixed seed

ratio=0.97 #percentage of data to be trained and what to be used for cros-validating
seed=13

tX,tX_cross,y,y_cross = split_data(tX_raw,y_orig,ratio,seed)

print(y.shape,tX.shape,y_cross.shape,tX_cross.shape)

(242500,) (242500, 30) (7500,) (7500, 30)


## Do your thing crazy machine learning thing here :) ...

In [None]:
print([row[23] for row in tX[0:10]])  #23rd column is the quarternary one PRI_jet_num

## Gradient Descent

In [None]:
# Define the parameters of the algorithm.
max_iters = 200
gamma = 0.1


#tX=np.c_[np.ones(len(y)), tX[:, 14:31 ] ]  # to comment this line for only primitive values

# Initialization
w_initial = np.zeros(len(tX[0]))

# Start gradient descent.
gradient_losses, gradient_ws = gradient_descent(y, tX, w_initial, max_iters, gamma)

weights = gradient_ws[-1]
loss=gradient_losses[-1]
print(weights)
#weights= np.array([weights[0],0,0,0,0,0,0,0,0,0,0,0,0,0]  +list(weights[1:])) # to comment this line for only primitive values
#print(weights)

## Stochastic Gradient Descent

In [None]:
# Define the parameters of the algorithm.

gamma = 0.1
batch_size = 1
max_iters = int(len(y)/batch_size)

# Initialization
w_initial = np.zeros(len(tX[0]))

# Start gradient descent.
stoch_gradient_losses, stoch_gradient_ws = stochastic_gradient_descent(y, tX, w_initial, batch_size, max_iters, gamma)

weights = stoch_gradient_ws[-1]
loss=stoch_gradient_losses[-1]
print(weights)

## Least Squares

In [7]:
loss,weights = least_squares(y,tX)
print(weights)
print(loss)

[ 2.94192259e-02 -2.52518124e-01 -2.54701272e-01 -3.04953456e-02
 -1.41582139e+00  2.94531395e-01 -1.07763775e+01  2.68141936e-01
 -2.80309591e-03 -3.27450781e+02 -1.82440637e-01  1.13314114e-01
  2.05970765e+01  6.36184284e+01  5.60971571e-05 -1.79410494e-03
  6.27314892e+01 -6.32998485e-04  1.65431667e-03  1.21838797e-01
  7.25046202e-04 -6.36012656e-02 -2.04984811e-01 -1.03064688e-01
  2.17324634e-02  2.31403894e-01 -4.30612487e-02 -3.05927893e+00
 -5.36893579e+00  2.77312025e+02]
0.38915286643055325


## Ridge Regression

In [None]:
#Find best lambda value for ridge regression

#selecting values of lambda to test
n_lambdas = 50
lambdas = np.logspace(-7, 0, n_lambdas)

#using polynomial of degree to determine basis functions to fit nonlinear data

txtrn = tX #comment this line and uncomment above for polynomial fitting

rmse_lst=[]
rmse_min=1e10

for ind, lambda_ in enumerate(lambdas):

    rmse, weights_trn = ridge_regression(y, txtrn, lambda_)
    rmse_lst.append(rmse)
    
    if rmse < rmse_min:
        rmse_min = rmse
        lambda_opt = lambda_
        weights = weights_trn
        
plt.semilogx(lambdas, rmse_lst, color='r', marker='*', label="RMSE")
plt.xlabel("lambda")
plt.ylabel("RMSE")
plt.title("Ridge regression")
leg = plt.legend(loc=1, shadow=True)
leg.draw_frame(False)

print("The best lambda value for ridge regression is ",lambda_opt)
print("Least mse is ", mse_min) 
print("Weights ", weights)

## Logistic Regression

In [None]:
# Define the parameters of the algorithm.
max_iters = 200
gamma = 0.000002
# Initialization
w_initial = np.zeros(len(tX[0]))
y[np.where(y==-1)]=0 # Since y is array with -1 and +1 , we need to make it to 0's and 1's

# Start Logistic gradient descent.
logistic_losses, logistic_ws = logistic_gradient_descent(y, tX, w_initial, max_iters, gamma)

weights = logistic_ws[-1]
loss=logistic_losses[-1]
print(weights)
print(loss)

## Regularized Logistic Regression

In [None]:
# Define the parameters of the algorithm.
max_iters = 200
gamma = 0.000001

# Initialization
w_initial = np.zeros(len(tX[0]))
y[np.where(y==-1)]=0 # Since y is array with -1 and +1 , we need to make it to 0's and 1's


'''Loop for Lambda and find best value  '''
lambdas = np.logspace(-4, 0, 10)
loss=1e10


for lambda_ in lambdas:
    # Start Regularized gradient descent.
    reg_logistic_losses_temp, reg_logistic_ws_temp = reg_logistic_gradient_descent(y, tX, w_initial, max_iters, gamma,lambda_)
    
    print(lambda_, reg_logistic_losses_temp[-1])
    if loss>reg_logistic_losses_temp[-1]:
        loss = reg_logistic_losses_temp[-1]
        weights= reg_logistic_ws_temp[-1]
        lambda_opt = lambda_

print("The best lambda value for regularized logistic regression is ",lambda_opt)
print(weights)
print(loss)

## Newton Descent

In [None]:
# Define the parameters of the algorithm.
max_iters = 300
gamma = 0.0001

# Initialization
w_initial = np.zeros(len(tX[0]))
y[np.where(y==-1)]=0 # Since y is array with -1 and +1 , we need to make it to 0's and 1's

''' TODO Hessian computation is too large''' 
''' Maybe combine with SGD'''
''' DONT RUN ''''


# Start Newton logistic gradient descent.
newt_logistic_losses, newt_logistic_ws = newt_logistic_gradient_descent(y, tX, w_initial, max_iters, gamma)

weights = logistic_ws[-1]
loss=logistic_losses[-1]
print(weights)

## Stochastic Newton Descent

In [None]:
# Define the parameters of the algorithm.

gamma = 0.0001
batch_size = 1000
max_iters = int(len(y)/batch_size)

# Initialization
w_initial = np.zeros(len(tX[0]))
y[np.where(y==-1)]=0 # Since y is array with -1 and +1 , we need to make it to 0's and 1's

# Start Newton Stochastic gradient descent
newt_stochastic_losses, newt_stochastic_ws = newt_stochastic_gradient_descent(y, tX, w_initial, max_iters, gamma, batch_size)

weights = newt_stochastic_ws[-1]
loss=newt_stochastic_losses[-1]
print(weights)

## Fancy Idea No.1 : Split Data into 4 parts and run GD

In [None]:
#NOTE: ONLY RUN FIRST TWO CELLS BEFORE THIS, ONLY LOAD DATA
# DONT RUN FEATURE EXPANSION OR CROSS VALIDATION, DOING IT HERE
tX23=[row[22] for row in X_raw[0:100]] #23rd column which is JET_PRI_NUM and has exactly 4 distinct values
values=(list(set(tX23))) #get all 4 unique values
indices=[]

weights_new = []
loss_new   = []

#Split each of the 4 subsets of Training Data into 2 parts one- for training and other for cross-validating with a fixed seed

#Cross Validation Parameters
ratio=0.97 #percentage of data to be trained and what to be used for cros-validating
seed=13 #random seed

#Feature Expansion Parameters
degree=1 #degree atleast 1

#Logistic Regression Parameters
gamma=[0.000002,0.0000055,0.0000054,0.0000056]
flags = [[1,1,1],[0,1,1],[0,1,1],[0,1,1] ]
max_iters = 1000 

cross_val= [0,0,0,0] #cross validation score for each group
shapes=[]

# Replace every -999. in the first column by the median of the said column to avoid having it deleted

first_col = np.array(X_raw[:,0])
print(first_col)
first_col[first_col == -999.] = 0
first_col[first_col == 0.] = np.median(first_col)
X_raw[:,0] = first_col

col_to_del = []

#X_reduced, y_reduced = clean_columns_rows(X_raw, y_orig, 0, 1, [1])
#tX_reduced = standardize_data(X_reduced, mode = 0)

for i in range(0,4): #iterate over each of the 4 possibilites of pri num
    curr_indices = []
    curr_indices=np.where( [row[22] for row in X_raw] ==values[i])[0] 
    indices.append(curr_indices) #keeps track of indices

    X_reduced, y_reduced, del_col = clean_columns_rows(X_raw[curr_indices,:], y_orig[curr_indices], 0, 1, flags[i])

    #Standardize all the remaining columns
    X_reduced = standardize_data(X_reduced, mode = 1)

    tX_4way_temp = feature_expand(X_reduced,degree,flags[i]) #Feature Expansion
    y_4way_temp = y_reduced

    #Cross validation split    
    tX_4way_train,tX_4way_cross,y_4way_train,y_4way_cross = split_data(tX_4way_temp,y_4way_temp,ratio,seed)
    
    shapes.append(y_4way_train.shape[0])

    #Running Logistic Regression ***********************


    # Initialization
    w_initial = np.zeros(len(tX_4way_train[0]))
    y_4way_train[np.where(y_4way_train==-1)]=0 # Since y is array with -1 and +1 , we need to make it to 0's and 1's


    # Start Logistic gradient descent
    logistic_losses, logistic_ws = logistic_gradient_descent(y_4way_train, tX_4way_train, w_initial, max_iters, gamma[i])

    weights_new.append(logistic_ws[-1])
    loss_new.append(logistic_losses[-1])
    
    cross_val[i] = cross_validation(logistic_ws[-1],tX_4way_cross,y_4way_cross)

    col_to_del.append(del_col)


In [None]:
DATA_TEST_PATH = 'test.csv' 
_, X_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
OUTPUT_PATH = 'output.csv' 

y_pred = predict_labels_new(weights_new, X_test,degree,flags, col_to_del)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

## Cross Validation Results

In [8]:
#Split original data used above into two parts and cross validate here to get expected accuracy
print("Cross Validation Score : ",cross_validation(weights,tX_cross,y_cross))


Cross Validation Score :  0.7209333333333333


## Generate predictions and save ouput in csv format for submission:

In [9]:
DATA_TEST_PATH = 'test.csv' 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [10]:
OUTPUT_PATH = 'output.csv' 

'''NOTE: If Logistic, then the predict labels should be at 0.5 and not 0 since it is a 0-1 problem'''

tX_pred=feature_expand(tX_test,degree,flags)
y_pred = predict_labels(weights, tX_pred)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

  tx=np.c_[tx,np.log(np.abs(x))]


ValueError: shapes (568238,121) and (30,) not aligned: 121 (dim 1) != 30 (dim 0)