# Logistic Regression Models

In [1]:
%matplotlib inline
import numpy as np
from cross_validation import cross_validation
from polynomial import build_poly
import matplotlib.pyplot as plt
from implementations import logistic_regression
from proj1_helpers import predict_labels
from cross_validation import accuracy

In [2]:
# Read the two training arrays from .npy files in the working directory.
X_train, y_train = np.load("X_train.npy"), np.load("y_train_10.npy")

# Show both shapes, one per line, so the row counts can be compared by eye.
print(X_train.shape, y_train.shape, sep="\n")

(250000, 13)
(250000,)


# Splitting into 4 classifiers

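We split the dataset into four subsets on the `PRI_jet_num` column (the last column of `X_train`, with values 0–3) and drop that column from each subset's features.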

In [3]:
# One boolean row mask per value (0, 1, 2, 3) found in the last column of X_train.
jet_masks = [X_train[:, -1] == jet_value for jet_value in range(4)]

# Feature subsets: the rows picked by each mask, with the last column dropped.
X_pri_0, X_pri_1, X_pri_2, X_pri_3 = (X_train[mask, :-1] for mask in jet_masks)
# Target subsets: the same row selection applied to y_train.
y_pri_0, y_pri_1, y_pri_2, y_pri_3 = (y_train[mask] for mask in jet_masks)

# Report the shape of every subset: the four feature arrays, then the four targets.
for template, subset in (
    ("PRI_0: {}", X_pri_0),
    ("PRI_1: {}", X_pri_1),
    ("PRI_2: {}", X_pri_2),
    ("PRI_3: {}", X_pri_3),
    ("y_PRI_0: {}", y_pri_0),
    ("y_PRI_1: {}", y_pri_1),
    ("y_PRI_2: {}", y_pri_2),
    ("y_PRI_3: {}", y_pri_3),
):
    print(template.format(subset.shape))

PRI_0: (99913, 12)
PRI_1: (77544, 12)
PRI_2: (50379, 12)
PRI_3: (22164, 12)
y_PRI_0: (99913,)
y_PRI_1: (77544,)
y_PRI_2: (50379,)
y_PRI_3: (22164,)


In [4]:
# Two groups of column indices, merged into a single ascending index array.
base_features = [1, 3, 9, 10, 11, 13, 21, 22]
extra_features = [0, 4, 5, 6, 12]
selected_features = np.sort(np.concatenate((base_features, extra_features)))
print(selected_features)

[ 0  1  3  4  5  6  9 10 11 12 13 21 22]


In [5]:
# Indices of the columns of X_pri_0 that hold the value -999 in at least one row.
# np.any asks "does any row match?" directly; it replaces the roundabout
# np.isin(True, mask) membership test and yields the same list of indices.
delete_columns = [
    col
    for col in range(X_pri_0.shape[1])
    if np.any(X_pri_0[:, col] == -999)
]

delete_columns

[3, 4, 5, 9]

In [6]:
# Drop the columns listed in delete_columns from X_pri_0 (axis=1 selects columns).
# NOTE: X_pri_0 is rebound here, so running this cell a second time would delete
# from the already-reduced array.
X_pri_0 = np.delete(X_pri_0, delete_columns, axis=1)
X_pri_0.shape

(99913, 8)

In [7]:
# Drop the same column indices from X_pri_1 (axis=1 selects columns).
# NOTE(review): delete_columns was derived from X_pri_0 only; confirm that these
# are also the right columns to remove for X_pri_1.
X_pri_1 = np.delete(X_pri_1, delete_columns, axis=1)
X_pri_1.shape

(77544, 8)

In [8]:
# Print the current shape of each of the four feature subsets.
for template, subset in (
    ("PRI_0: {}", X_pri_0),
    ("PRI_1: {}", X_pri_1),
    ("PRI_2: {}", X_pri_2),
    ("PRI_3: {}", X_pri_3),
):
    print(template.format(subset.shape))

PRI_0: (99913, 8)
PRI_1: (77544, 8)
PRI_2: (50379, 12)
PRI_3: (22164, 12)


# Logistic Regression Models with Gradient Descent for the 4 subsets

In [10]:
# Candidate values for each hyper-parameter; the grid is later passed to
# cross_validation through its h_pars argument.
degrees = np.arange(2, 7)                # 2, 3, 4, 5, 6
max_iter = np.arange(2000, 7000, 2000)   # 2000, 4000, 6000
gamma = np.logspace(-5, -4, num=2)       # 1e-5, 1e-4

pars = {
    'degrees': degrees,
    'max_iter': max_iter,
    'gamma': gamma,
}

# Model training

### Training for PRI_0 subset

In [12]:
# Run cross_validation on the PRI_0 subset with the `pars` grid and model='log'.
# The two results are presumably train/test accuracies; confirm against the
# cross_validation module.
(pri0_tr_acc, pri0_te_acc) = cross_validation(
    y_pri_0,
    X_pri_0,
    5,
    h_pars=pars,
    model='log',
)

KeyboardInterrupt: 