In [5]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

# Seed the random number generator:
np.random.seed(1)

def load_data(filename, skiprows = 1):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.
    
    Inputs:
        filename: given as a string.
        
    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, skiprows=skiprows, delimiter=',')

In [8]:
# Load all the data
all = load_data('train_2008.csv')

In [196]:
# Split into features and labels
Y = all[:, -1]
X = all[:, :-1]

# Delete all the columns which are identical among all data points, these only hurt our model 
shouldDelete = np.all(X == X[0,:], axis = 0)
for i in range(X.shape[1]):
    if (shouldDelete[i] == True):
        # delete the i-th column
        X = np.delete(X, i, 1)
print(X.shape[1])

# Split into train and validation
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, random_state=42)


367


In [197]:
def normalize(rows_train, rows_test):
    columns_train  = np.transpose(rows_train)
    columns_test = np.transpose(rows_test)
    for i in range(columns_train.shape[0]):
        col = columns_train[i]
        mean = np.mean(col)
        stddev = np.std(col)
        if (stddev == 0):
            print(i)
            stddev = 1
        columns_train[i] = (col - mean) / stddev
        columns_test[i] = (columns_test[i] - mean) / stddev
    return np.transpose(columns_train), np.transpose(columns_test)

In [198]:
import keras
from keras.datasets import mnist
from sklearn.decomposition import TruncatedSVD

Y_train_1d = Y_train.copy()
Y_test_1d = Y_test.copy()

# First convert to real labels
Y_train = keras.utils.np_utils.to_categorical(Y_train)
Y_test = keras.utils.np_utils.to_categorical(Y_test)


X_train, X_test = normalize(X_train.copy(), X_test.copy())

svd = TruncatedSVD(n_components=324, n_iter=20, random_state=42)
svd.fit(X_train)
X_train = svd.transform(X_train)
X_test = svd.transform(X_test)


1
10
13
42
52
122
123
124
128
129
241
245


In [203]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import Conv2D, MaxPooling2D, Flatten, BatchNormalization
from keras import regularizers


model = Sequential()
model.add(Dense(256, input_shape=(X_train.shape[1],), kernel_regularizer=regularizers.l2(0.000)))
model.add(Activation('relu'))
model.add(Dropout(.3))
model.add(Dense(256, kernel_regularizer=regularizers.l2(0.000)))
model.add(Activation('relu'))
model.add(Dropout(.3))
model.add(Dense(256, kernel_regularizer=regularizers.l2(0.000)))
model.add(Activation('relu'))
model.add(Dropout(.3))
model.add(Dense(256, input_shape=(X_train.shape[1],), kernel_regularizer=regularizers.l2(0.000)))
model.add(Activation('relu'))
model.add(Dropout(.3))
model.add(Dense(256, kernel_regularizer=regularizers.l2(0.000)))
model.add(Activation('relu'))
model.add(Dropout(.3))
model.add(BatchNormalization())
model.add(Dense(2))
model.add(Activation('softmax'))
models.append(model)

In [205]:
model.compile(optimizer='adam', 
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model, iterating on the data in batches of 32 samples
history = model.fit(X_train, Y_train, epochs=100, batch_size=64,
                    validation_data=(X_test, Y_test))


Train on 54966 samples, validate on 9701 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100


Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [170]:
# This 4-liner is currently the best classifier with ~.78 accuracy
from sklearn.ensemble import AdaBoostClassifier
clf1 = AdaBoostClassifier(n_estimators=100)
clf1.fit(X_train, Y_train_1d)

clf1.score(X_test, Y_test_1d)

0.76198330069065046