In [21]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

# Seed the random number generator:
np.random.seed(1)

def load_data(filename, skiprows = 1):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.
    
    Inputs:
        filename: given as a string.
        
    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, skiprows=skiprows, delimiter=',')

In [None]:
# Load all the data
all = load_data('train_2008.csv')

In [None]:
####### FEATURE ENGINEERING HERE #######
# Below, we transform the real valued 
# data from the dataset to categorical 
# data if it should be. This should 
# vastly improve model performance.
########################################
import csv
import keras
from keras.datasets import mnist
from sklearn.decomposition import TruncatedSVD

# Split into features and labels
Y = all[:, -1]
X = all[:, :-1]

featureNames = None
with open('train_2008.csv', 'r') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',')
    for row in spamreader:
        featureNames = row
        break

# Delete all the columns which are identical among all data points, these only hurt our model 
shouldDelArr = ["id", "QSTNUM"]
shouldDelete = np.all(X == X[0,:], axis = 0)
for i in range(X.shape[1]):
    ib = X.shape[1] - i - 1
    if ((shouldDelete[ib] == True) or (featureNames[ib] in shouldDelArr)):
        # delete the i-th columnif 
        X = np.delete(X, ib, 1)
        featureNames = np.delete(featureNames, ib)

newX = np.empty((X.shape[0],1))

# These are the features which are NOT categorical; the rest I deemed were.
real_valued_feats = ["HWHHWGT", "GTCBSA", "GTCO", "PEAGE", "PEHRUSL1", "PEHRUSL2", "PUHROFF2", "PUHROT2", "PEHRACT2", 
                     "PELAYDUR", "PELKDUR", "PRUNEDUR", "PEERNHRO", "PEERNWKP", "PRNMCHLD", "PEHGCOMP", "PECYC", "PWCMPWGT"]


for i in range(X.shape[1]):
    feature = np.reshape(X[:, i], (X.shape[0], 1))
    
    num_distinct_negs = 0
    min_neg = 0
    distinct_negs = []
    for val in feature:
        if val < 0:
            if val not in distinct_negs:
                distinct_negs.append(val)
                num_distinct_negs += 1
            if val < min_neg:
                min_neg = val
    
    #print(feature.shape)
    if featureNames[i] in real_valued_feats:
        newX = np.append(newX, feature, axis = 1)
    else:
        feature -= min_neg
        cat = keras.utils.np_utils.to_categorical(X[:, 0])
        
        # delete the zero columns
        zero_cols = np.any(cat, axis=0)
        for j in range(zero_cols.shape[0]):
            jb = zero_cols.shape[0] - j - 1
            if zero_cols[jb] == False:
                np.delete(cat, jb, axis=1)
                
        # Delete the negative cols
        np.delete(cat, np.s_[0:num_distinct_negs], 1) 
        
        # print(test.shape, newX.shape)
        newX = np.append(newX, cat, axis = 1)
        
        
# Delete the placeholder column and transpose
newX = np.delete(newX, 0, 1)
print(newX.shape)

In [5]:
# Split into train and validation
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, random_state=42)

In [6]:
def normalize(rows_train, rows_test):
    columns_train  = np.transpose(rows_train)
    columns_test = np.transpose(rows_test)
    for i in range(columns_train.shape[0]):
        col = columns_train[i]
        mean = np.mean(col)
        stddev = np.std(col)
        if (stddev == 0):
            stddev = 1
        columns_train[i] = (col - mean) / stddev
        columns_test[i] = (columns_test[i] - mean) / stddev
    return np.transpose(columns_train), np.transpose(columns_test)

In [7]:
Y_train_1d = Y_train.copy()
Y_test_1d = Y_test.copy()

# First convert to real labels
Y_train = keras.utils.np_utils.to_categorical(Y_train)
Y_test = keras.utils.np_utils.to_categorical(Y_test)


X_train, X_test = normalize(X_train.copy(), X_test.copy())

# svd = TruncatedSVD(n_components=324, n_iter=20, random_state=42)
# svd.fit(X_train)
# X_train = svd.transform(X_train)
# X_test = svd.transform(X_test)


In [9]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import Conv2D, MaxPooling2D, Flatten, BatchNormalization, LocallyConnected1D
from keras import regularizers
from sklearn.model_selection import KFold

n_bags = 5
kf = KFold(n_splits=n_bags)
models = []

for i in range(n_bags):
    model = Sequential()
    model.add(Dense(128, input_shape=(X_train.shape[1],), kernel_regularizer=regularizers.l2(0.000)))
    model.add(Activation('relu'))
    model.add(Dropout(.3))
    model.add(Dense(128, kernel_regularizer=regularizers.l2(0.000)))
    model.add(Activation('relu'))
    model.add(Dropout(.3))
    model.add(Dense(128, kernel_regularizer=regularizers.l2(0.000)))
    model.add(Activation('relu'))
    model.add(Dense(2))
    model.add(Activation('softmax'))
    models.append(model)

In [13]:
i =0
for train_index, test_index in kf.split(X_train):
    X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
    Y_train_fold, Y_test_fold = Y_train[train_index], Y_train[test_index]
    models[i].compile(optimizer='adam', 
              loss='categorical_crossentropy',
              metrics=['accuracy'])

    # Train the model, iterating on the data in batches of 64 samples
    history = models[i].fit(X_train_fold, Y_train_fold, epochs=10, batch_size=64,
                    validation_data=(X_test_fold, Y_test_fold))
    i += 1


Train on 43972 samples, validate on 10994 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 43973 samples, validate on 10993 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 43973 samples, validate on 10993 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 43973 samples, validate on 10993 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 43973 samples, validate on 10993 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
probs = np.empty((5,X_test.shape[0], 2))
i = 0
for model in models:
    probs[i] = model.predict_proba(X_test)
    i += 1
    
probs_mean = np.mean(probs, axis = 0)
rounded = np.round(probs_mean)
errors = 0
print(rounded[0][0])
for i in range(rounded.shape[0]):
    if (rounded[i][0] != Y_test[i][0]):
        errors += 1

acc = 1 - (errors / Y_test.shape[0])
print(acc)

0.7746624059375322


In [20]:
# Ensemble of Random Forests, getting great test accuracy to train time ratio
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

clf = RandomForestClassifier(n_estimators=100, max_depth=5)
#clf.fit(X_train, Y_train)

print("Starting ADA")
# Now, use this as the base classifier for the adaboost classifier
clf1 = AdaBoostClassifier(base_estimator=clf, n_estimators=100)
clf1.fit(X_train, Y_train_1d)

clf1.score(X_test, Y_test_1d)

Starting ADA


0.78517678589836104

In [11]:
# TODO: Make a Adaboosted MLP classifier. Should be dank af.
from sklearn.neural_network import MLPClassifier

clfMLP = MLPClassifier(hidden_layer_sizes=(128,128,128), early_stopping=True, learning_rate='adaptive')
clfMLP.fit(X_train, Y_train)
clfMLP.score(X_test, Y_test)

# clfDANK = AdaBoostClassifier(base_estimator=clf, n_estimators=10)
# clfDANK.fit(X_train, Y_train_1d)
# clfDANK.score(X_test, Y_test_1d)

0.74487166271518401

In [None]:
# SVM Classifier. This might be interesting to ADABoost. Takes a long time to train.
from sklearn.svm import SVC

clfSVC = SVC()
clfSVC.fit(X_test, Y_test_1d) # Train with smaller dataset to cuts down time.
clfSVC.score(X_train, Y_train_1d)