In [14]:
import numpy as np 
import tensorflow as tf 
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout
from keras.layers import BatchNormalization
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from keras.utils.np_utils import to_categorical

from sklearn.gaussian_process import GaussianProcess
import random


# Seed the random number generator:
np.random.seed(1)

def load_data(filename, skiprows = 1):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.
    
    Inputs:
        filename: given as a string.
        
    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, skiprows=skiprows, delimiter=' ')

In [15]:
X = load_data('training_data.txt')
y = X[:, 0]
X = X[:, 1:]


In [22]:
X_short = X[:10]
y_short = y[:10]

In [23]:
print(np.shape(X_short), np.shape(y_short))
print(np.shape(X), np.shape(y))

(10, 1000) (10,)
(20000, 1000) (20000,)


Using grid to optimize hyper-params

In [44]:

def make_tuning_model(X_train, y_train, X_test, y_test, h_params):
    '''
    Architecture stays constant except for toggling batch normalization
    
    h_params: {'layer_sizes' : [size1, size2, size3], 
                'dropouts' : [d1, d2]
                }
    '''
    
    ## Create your own model here given the constraints in the problem
    model = Sequential() # Use np.reshape instead of this in hw
        
    model.add(Dense(h_params['layer_sizes'][0], input_dim = 1000))
    model.add(Activation('relu'))
    model.add(Dropout(h_params['dropouts'][0]))
    
    model.add(Dense(h_params['layer_sizes'][1]))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(h_params['dropouts'][1]))
    
    model.add(Dense(h_params['layer_sizes'][2]))
    model.add(Activation('relu'))

    model.add(Dense(2))
    model.add(Activation('softmax'))

    ## Printing a summary of the layers and weights in your model
    #model.summary()
    
    ## In the line below we have specified the loss function as 'mse' (Mean Squared Error) because in the above code we did not one-hot encode the labels.
    ## In your implementation, since you are one-hot encoding the labels, you should use 'categorical_crossentropy' as your loss.
    ## You will likely have the best results with RMS prop or Adam as your optimizer.  In the line below we use Adadelta
    model.compile(loss='categorical_crossentropy',optimizer='Adam', metrics=['accuracy'])
    
    fit = model.fit(X_train, y_train, batch_size=128, epochs=2,
        verbose=0)

    ## Printing the accuracy of our model, according to the loss function specified in model.compile above
    score = model.evaluate(X_test, y_test, verbose=0)
#     print('Test score:', score[0])
#     print('Test accuracy:', score[1])
    
    return score[1], model

In [45]:
def kfold(X, y, h_params):
    
#     print(np.shape(X))
#     print(np.shape(y))
    
    kf = KFold(n_splits = 8, shuffle=True)
    scores = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        y_train = to_categorical(y_train)
        y_test = to_categorical(y_test)

        scores.append(make_tuning_model(X_train, y_train, X_test, y_test, h_params)[0])

#     print('validation acc: ' + str(np.mean(scores)))
    return np.mean(scores)

In [48]:


layer_sizes = [(1500, 300, 200), (2000, 500, 100)]
dropouts = [(.4, .3), (.45, .33)]

validation_accs = {}

iters = len(layer_sizes) * len(dropouts)

for layer_size in layer_sizes:
    for dropout in dropouts:
        print('Iterations left: ' + str(iters))
        iters -= 1
          
        h_params = {'layer_sizes' : layer_size, 'dropouts' : dropout}
        
        val_acc = kfold(X, y, h_params)

        validation_accs[val_acc] = h_params
    
scoress = list(validation_accs.keys())
scoress.sort(reverse=True)

print('hyper params           :               Validation accuracy')
for s in scoress:
    print(str(validation_accs[s]) + ': ' + str(s))
        

Iterations left: 4
Iterations left: 3
Iterations left: 2
Iterations left: 1
hyper params           :               Validation accuracy
{'layer_sizes': (1500, 300, 200), 'dropouts': (0.45, 0.33)}: 0.84755
{'layer_sizes': (2000, 500, 100), 'dropouts': (0.45, 0.33)}: 0.8472500000000001
{'layer_sizes': (1500, 300, 200), 'dropouts': (0.4, 0.3)}: 0.847
{'layer_sizes': (2000, 500, 100), 'dropouts': (0.4, 0.3)}: 0.842


In [47]:
print(np.shape(X))
print(np.shape(y))

(20000, 1000)
(20000,)


In [6]:
X_test = load_data('test_data.txt')
X = load_data('training_data.txt')
y = X[:, 0]
y = to_categorical(y)
X = X[:, 1:]
m = make_model(X, y, X, y)[1]
pred = m.predict(X_test)

Epoch 1/2
Epoch 2/2
Test score: 0.1895963114976883
Test accuracy: 0.92885


In [7]:
def format_pred(pred):
    result = [1 if i[0] == 0 else 0 for i in np.round(pred)]
    with open("result.txt", "w") as f:
        f.write("Id,Prediction\n") 
        for i in range(1, len(result) + 1):
            f.write(str(i) + "," + str(result[i-1]) + "\n")

In [8]:
format_pred(pred)