In [2]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

# Seed the random number generator:
np.random.seed(1)

def load_data(filename, skiprows = 1):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.
    
    Inputs:
        filename: given as a string.
        
    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, skiprows=skiprows, delimiter=',')

In [79]:
# Load all the data
all = load_data('train_2008.csv')
all2 = load_data('test_2008.csv')

In [82]:
####### FEATURE ENGINEERING HERE #######
# Below, we transform the real valued 
# data from the dataset to categorical 
# data if it should be. This should 
# vastly improve model performance.
########################################
import csv
import keras
from keras.datasets import mnist
from sklearn.decomposition import TruncatedSVD

# Split into features and labels
Y = all[:, -1]
X = all[:, :-1]
comp1X = all2[:, :]
comp1X = np.append(comp1X, [all2[1, :]], axis = 0)

featureNames = None
with open('train_2008.csv', 'r') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',')
    for row in spamreader:
        featureNames = row
        break

# Delete all the columns which are identical among all data points, these only hurt our model 
shouldDelArr = ["id", "QSTNUM", "PEIO1OCD", "PEIO2OCD", "PEHRACTT", "PEHRUSLT"]
shouldDelete = np.all(X == X[0,:], axis = 0)
for i in range(X.shape[1]):
    ib = X.shape[1] - i - 1
    if ((shouldDelete[ib] == True) or (featureNames[ib] in shouldDelArr)):
        # delete the i-th columnif 
        X = np.delete(X, ib, 1)
        comp1X = np.delete(comp1X, ib, 1)
        featureNames = np.delete(featureNames, ib)

newX = np.empty((X.shape[0],1))
newc1X = np.empty((comp1X.shape[0],1))

# These are the features which are NOT categorical; the rest I deemed were.
real_valued_feats = ["HWHHWGT", "GTCBSA", "GTCO", "PEAGE", "PEHRUSL1", "PEHRUSL2", "PUHROFF2", "PUHROT2", "PEHRACT2", 
                     "PELAYDUR", "PELKDUR", "PRUNEDUR", "PEERNHRO", "PEERNWKP", "PRNMCHLD", "PEHGCOMP", "PECYC", "PWCMPWGT", 
                     "PWFMWGT", "PWLGWGT", "PWORWGT", "PWSSWGT", "PWVETWGT", "PRERNWA", "PRERNHLY", "PEERNH1O", "PEERNH2", 
                     "PEERN", "PEHRACT1"]


for i in range(X.shape[1]):
    feature = np.reshape(X[:, i], (X.shape[0], 1))
    featurec1 = np.reshape(comp1X[:, i], (comp1X.shape[0], 1))
    if featureNames[i] in ["PENATVTY", "PEMNTVTY", "PEFNTVTY"]:
        feature[feature == 55] = 1 # US
        feature[feature == 66] = 2 # Guam
        feature[feature == 73] = 3 # Puerto Rico
        feature[feature == 78] = 4 # US Virgin Islands
        feature[feature == 96] = 5 # Other US Island Area
        feature[feature > 5] = 6
        
        featurec1[featurec1 == 55] = 1 # US
        featurec1[featurec1 == 66] = 2 # Guam
        featurec1[featurec1 == 73] = 3 # Puerto Rico
        featurec1[featurec1 == 78] = 4 # US Virgin Islands
        featurec1[featurec1 == 96] = 5 # Other US Island Area
        featurec1[featurec1 > 5] = 6 
            
    
    #print(feature.shape)
    if featureNames[i] in real_valued_feats:
        newX = np.append(newX, feature, axis = 1)
        newc1X = np.append(newc1X, featurec1, axis = 1)
    else:
        distinct_negs = []
        distinct_nums = []
        for val in feature:
            if val[0] < 0:
                if val[0] not in distinct_negs:
                    distinct_negs.append(val[0])
            if val[0] not in distinct_nums:
                distinct_nums.append(val[0])
        
        distinct_nums.sort()
        if (len(distinct_nums)> 25):
            print(len(distinct_nums), featureNames[i])
            
        for i in range(feature.shape[0]):
            feature[i][0] = distinct_nums.index(feature[i][0])
        
        for i in range(featurec1.shape[0]):
            if (featurec1[i][0] not in distinct_nums):
                featurec1[i][0] = feature[i][0]
            else:
                featurec1[i][0] = distinct_nums.index(featurec1[i][0])
                
            if (i == (featurec1.shape[0] - 1)):
                featurec1[i][0]= len(distinct_nums) - 1
        
        feature = keras.utils.np_utils.to_categorical(feature)
        featurec1 = keras.utils.np_utils.to_categorical(featurec1)
   
        # Delete the negative cols
        feature = np.delete(feature, np.s_[0:len(distinct_negs)], 1)
        featurec1 = np.delete(featurec1, np.s_[0:len(distinct_negs)], 1)
        
        # print(test.shape, newX.shape)
        newX = np.append(newX, feature, axis = 1)
        newc1X = np.append(newc1X, featurec1, axis = 1)
        
        
# Delete the placeholder column and transpose
newX = np.delete(newX, 0, 1)
newc1X = np.delete(newc1X, 0, 1)
print(newX.shape)
print(newc1X.shape)

30 HRHHID2
51 GESTCEN
51 GESTFIPS
27 GTCSA
32 PRABSREA
53 PRDTIND1
41 PRDTIND2
262 PEIO1ICD
123 PEIO2ICD
(64667, 2419)
(16001, 2419)


In [91]:
# Split into train and validation
X_train, X_test, Y_train, Y_test = train_test_split(newX, Y, test_size=0.15, random_state=42)

In [92]:
def normalize(rows_train, rows_test):
    columns_train  = np.transpose(rows_train)
    columns_test = np.transpose(rows_test)
    for i in range(columns_train.shape[0]):
        col = columns_train[i]
        mean = np.mean(col)
        stddev = np.std(col)
        if (stddev == 0):
            stddev = 1
        columns_train[i] = (col - mean) / stddev
        columns_test[i] = (columns_test[i] - mean) / stddev
    return np.transpose(columns_train), np.transpose(columns_test)

In [93]:
Y_train_1d = Y_train.copy()
Y_test_1d = Y_test.copy()

# First convert to real labels
Y_train = keras.utils.np_utils.to_categorical(Y_train)
Y_test = keras.utils.np_utils.to_categorical(Y_test)

c1X_train, c1X_test = normalize(X_train.copy(), newc1X.copy())
X_train, X_test = normalize(X_train.copy(), X_test.copy())

# svd = TruncatedSVD(n_components=324, n_iter=20, random_state=42)
# svd.fit(X_train)
# X_train = svd.transform(X_train)
# X_test = svd.transform(X_test)


In [88]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import Conv2D, MaxPooling2D, Flatten, BatchNormalization, LocallyConnected1D
from keras import regularizers
from sklearn.model_selection import KFold

n_bags = 2
kf = KFold(n_splits=n_bags)
models = []

for i in range(n_bags):
    model = Sequential()
    model.add(Dense(2048, input_shape=(X_train.shape[1],), kernel_regularizer=regularizers.l2(0.0005)))
    model.add(Activation('relu'))
    model.add(Dropout(.4))
    model.add(Dense(1024, kernel_regularizer=regularizers.l2(0.0005)))
    model.add(Activation('relu'))
    model.add(Dropout(.4))
    model.add(Dense(512, kernel_regularizer=regularizers.l2(0.0005)))
    model.add(Activation('relu'))
    model.add(Dropout(.4))
    model.add(Dense(128, kernel_regularizer=regularizers.l2(0.0005)))
    model.add(Activation('relu'))
    model.add(Dense(2))
    model.add(Activation('softmax'))
    models.append(model)

In [89]:
i=0
for train_index, test_index in kf.split(X_train):
    X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
    Y_train_fold, Y_test_fold = Y_train[train_index], Y_train[test_index]
    models[i].compile(optimizer='adam', 
              loss='categorical_crossentropy',
              metrics=['accuracy'])

    # Train the model, iterating on the data in batches of 64 samples
    history = models[i].fit(X_train_fold, Y_train_fold, epochs=25, batch_size=64,
                    validation_data=(X_test_fold, Y_test_fold))
    i += 1


Train on 27483 samples, validate on 27483 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Train on 27483 samples, validate on 27483 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [94]:
probs = np.empty((n_bags, c1X_test.shape[0], 2))
i = 0
for model in models:
    probs[i] = model.predict_proba(c1X_test)
    i+=1

probs_meanc1 = np.mean(probs, axis = 0)

probs = np.empty((5,X_test.shape[0], 2))
i = 0
for model in models:
    probs[i] = model.predict_proba(X_test)
    i += 1
    
probs_mean = np.mean(probs, axis = 0)
rounded = np.round(probs_mean)
errors = 0
print(rounded[0][0])
for iprobs_meanc1 in range(rounded.shape[0]):
    if (rounded[i][0] != Y_test[i][0]):
        errors += 1

acc = 1 - (errors / Y_test.shape[0])
print(acc)

0.2395629316565303


In [96]:
# Ensemble of Random Forests, getting great test accuracy to train time ratio
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

clf = RandomForestClassifier(n_estimators=100, max_depth=6)
#clf.fit(X_train, Y_train)

print("Starting ADA")
# Now, use this as the base classifier for the adaboost classifier
clf1 = AdaBoostClassifier(base_estimator=clf, n_estimators=100)
clf1.fit(X_train, Y_train_1d)

clf1.score(X_test, Y_test_1d)

Starting ADA


0.78909390784455213

In [97]:
clf1rf = clf1.predict_log_proba(c1X_test)

In [105]:
clf1rf[:, 0]
import pandas as pd
pd.DataFrame(clf1rf[:, 0]).to_csv("ag_c2.csv")

array([[-0.68288326, -0.70351755],
       [-0.68075991, -0.70568982],
       [-0.68275776, -0.70364567],
       ..., 
       [-0.67401557, -0.71265196],
       [-0.68345534, -0.70293387],
       [-0.66086798, -0.72650318]])

In [39]:
# TODO: Make a Adaboosted MLP classifier. Should be dank af.
from sklearn.neural_network import MLPClassifier


clfMLP = MLPClassifier(hidden_layer_sizes=(512,256,128), early_stopping=True, learning_rate='adaptive')
clfMLP.fit(X_train, Y_train)
clfMLP.score(X_test, Y_test)

Halfway


ValueError: MLPClassifier doesn't support sample_weight.

In [None]:
# SVM Classifier. This might be interesting to ADABoost. Takes a long time to train.
from sklearn.svm import SVC

clfSVC = SVC()
clfSVC.fit(X_test, Y_test_1d) # Train with smaller dataset to cuts down time.
clfSVC.score(X_train, Y_train_1d)

In [45]:
from sklearn.ensemble import AdaBoostClassifier

clfD = AdaBoostClassifier(base_estimator=clfMLP, n_estimators=100)
clfD.fit(X_train, Y_train_1d)
clfD.score(X_test, Y_test_1d)



ValueError: MLPClassifier doesn't support sample_weight.

In [95]:
import pandas as pd
pd.DataFrame(probs_meanc1[:, 0]).to_csv("ag_c1.csv")