In [12]:
# Import everything we need

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn import svm
from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, NuSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier
from catboost import Pool, cv, CatBoostClassifier, CatBoostRegressor
import lightgbm as lgb
import random
import torch
import torch.nn as nn
import torch.nn.functional as F

In [74]:
# Helper functions

# binary error definition
def bin_classification_err(real_y, y):
    len_data = y.size
    num_diff = 0.0
    for i in range(len_data):
        if (y[i] != real_y[i]):
            num_diff += 1.0
    return (num_diff / len_data)


# Handle missing values in data
def missing_values(original_data, method='omit', 
                   supply_data=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
                                15,16,17,18,19,20,21,22,23,24,25,26]):
    """
    Replace missing values in original data according to given rules.
    Parameters
    ----------
    original_data : numpy array
        The data set containing NaN.
    method : str, optional
        'omit' : remove rows containing NaN. Default.
        'mean' : replace NaN by the mean of its column.
        'median' : replace NaN by the median of its column.
        'zeros' : replace NaN by 0.
        'change_and_add_flags' : replace NaN by the values specified in 
         supply_data at each corresponding columns. Then add new columns 
         with 0 = not NaN and 1 = is NaN.
    supply_data : list of floats, optional
        values to replace NaN in each column. The default is 
        [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26].
        'imputation' : fill in missing values by simple machine learning
    Returns
    -------
    new_data : numpy array of size (row_original_data, 
               column_original_data + n_column_containing_missing)
        The processed data array.
    """
    if method == 'omit':
        new_data = original_data[~np.isnan(original_data).any(axis=1)]
        
    elif method == 'mean':
        non_nan_data = original_data[~np.isnan(original_data).any(axis=1)]
        mean_row = np.mean(non_nan_data, axis=0)
        for i_column in range(len(mean_row)):
            original_data[:,i_column] = np.nan_to_num(original_data[:,i_column], 
                                                      nan=mean_row[i_column])
            new_data = original_data
            
    elif method == 'median':
        non_nan_data = original_data[~np.isnan(original_data).any(axis=1)]
        median_row = np.median(non_nan_data, axis=0)
        for i_column in range(len(median_row)):
            original_data[:,i_column] = np.nan_to_num(original_data[:,i_column], 
                                                      nan=median_row[i_column])
            new_data = original_data
            
    elif method == 'zeros':
        new_data = np.nan_to_num(original_data, nan=0.0)
        
    elif method == 'change_and_add_flags':
        import numpy.ma as ma
        for i_column in range(27): # 27 columns in total, not including y
            new_column = np.zeros(len(original_data[:,i_column]))
            mask = np.ma.masked_invalid(original_data[:,i_column]).mask
            new_column[mask] = 1
            if np.sum(new_column) != 0:
                new_column = np.expand_dims(new_column, axis=0)
                new_column = new_column.transpose()
                original_data = np.insert(original_data, [-1], new_column, axis=1)
                original_data[:,i_column] = np.nan_to_num(original_data[:,i_column], 
                                                      nan=supply_data[i_column])
                new_data = original_data
                
    elif method == 'imputation':
        # to do
        pass
    
    else: 
        print('Invalid option for treating missing data.')
    
    return new_data

# Transform data into something (hopefully) more useful
def get_trans_data(data):
    tdata = []
    for i in range(len(data)):
        pt = []
        for idx in range(2, 6):
            pt.append(data[i][idx])
        for idx in range(16, len(data[i])):
            pt.append(data[i][idx])
        # add last price / med price
        div_price = data[i][0] / data[i][1]
        pt.append(div_price)
        # add ask spread
        ask_spread = data[i][11] - data[i][15]
        pt.append(ask_spread)
        # add ask fractions
        for j in range(12, 16):
            frac = data[i][j] / data[i][11]
            pt.append(frac)
        # add bid spread
        bid_spread = data[i][6] - data[i][10]
        pt.append(bid_spread)
        # add bid fractions
        for j in range(7, 11):
            frac = data[i][j] / data[i][6]
            pt.append(frac)
        
        tdata.append(pt)
    return np.array(tdata)

# Transform data into something (hopefully) more useful as a tensor
def get_tensor_data(data, labels):
    tdata = []
    for i in range(len(data)):
        pt = []
        for idx in range(2, 3):#6):
            pt.append(data[i][idx])
        for idx in range(16, 17):#len(data[i])):
            pt.append(data[i][idx])
        # add last price / med price
        div_price = data[i][0] / data[i][1]
        pt.append(div_price)
        # add ask spread
        ask_spread = data[i][11] - data[i][15]
        pt.append(ask_spread)
        # add bid spread
        bid_spread = data[i][6] - data[i][10]
        pt.append(bid_spread)
        pt.append(labels[i])
        tdata.append(pt)
    return torch.tensor(tdata)

def restrain_out(out):
    for i in range(len(out)):
        if (out[i] > 1.0):
            out[i] = 1.0
        elif (out[i] < 0.0):
            out[i] = 0.0

In [75]:
# Get data

# Parse from csv files
test = pd.read_csv("data/test.csv")
df_train = pd.read_csv('data/train.csv', index_col=0)
df_test = pd.read_csv('data/test.csv', index_col=0)

# Convert data and deal with missing values
dtrain = df_train.values[1:]
dtest = df_test.values[:]
dtest = missing_values(dtest, method = "zeros")
dtrain = missing_values(dtrain, method = "zeros")

# Separate validation and training data
X_all, Y_all = dtrain[:, :-1], dtrain[:, -1]
X_val = X_all[0:50000]
X_train = X_all[50000:(len(X_all) - 1)]
Y_val = Y_all[0:50000]
Y_train = Y_all[50000:(len(Y_all) - 1)]
ran_train = list(zip(X_train, Y_train))
random.shuffle(ran_train)
X_train, Y_train = zip(*ran_train)
X_test = dtest

X_train_t = get_trans_data(X_train)
X_val_t = get_trans_data(X_val)
X_test_t = get_trans_data(X_test)
X_all_t = get_trans_data(X_all)
print(X_all_t[1000])

[ 1.          6.          1.00002603 -2.          1.00031235  1.00036441
  1.00046853  1.00052059  0.8         0.99994794  0.99989588  0.99984382
  0.99979175]


In [64]:
# Functions to make and train classifiers

# Make and train a RandomForestClassifier
def get_forest(n_estimators, depth, X, Y):
    clf = RandomForestClassifier(n_estimators = n_estimators, max_depth = depth, criterion = 'gini')
    clf.fit(X, Y)
    return clf

In [31]:
# Try out Voting Classifier

train_size = 200000

X = X_train_t[:train_size]
Y = Y_train[:train_size]
random_state = 54321

test_trains = True

clfs = []
wts = []

rf1 = RandomForestClassifier(n_estimators=500, criterion='entropy',  n_jobs = -1,  random_state=random_state, max_depth=8)
if test_trains:
    rf1.fit(X, Y)
    # Get validation error
    val_probs = rf1.predict_proba(X_val_t)[:,1]
    val_err = roc_auc_score(Y_val, val_probs)
    print("rf1 validation error:", val_err)
#clfs.append(('rf1', rf1))
#wts.append(1)

rf2 = RandomForestClassifier(n_estimators=500, criterion='gini',  n_jobs = -1, random_state=random_state, max_depth=8)
if test_trains:
    rf2.fit(X, Y)
    val_probs = rf2.predict_proba(X_val_t)[:,1]
    val_err = roc_auc_score(Y_val, val_probs)
    print("rf2 validation error:", val_err)
clfs.append(('rf2', rf2))
wts.append(1)

gbc = GradientBoostingClassifier(random_state=random_state)
if test_trains:
    gbc.fit(X, Y)
    val_probs = gbc.predict_proba(X_val_t)[:,1]
    val_err = roc_auc_score(Y_val, val_probs)
    print("gbc validation error:", val_err)
clfs.append(('gbc', gbc))
wts.append(1)

xgb = XGBClassifier(seed=random_state)
if test_trains:
    xgb.fit(X, Y)
    val_probs = xgb.predict_proba(X_val_t)[:,1]
    val_err = roc_auc_score(Y_val, val_probs)
    print("xgb validation error:", val_err)
clfs.append(('xgb',xgb))
wts.append(1)

XGBClassifier(random_state=random_state)


eclf = VotingClassifier(estimators=clfs, voting='soft')
eclf.fit(X, Y)
val_probs = eclf.predict_proba(X_val_t)[:,1]
val_err = roc_auc_score(Y_val, val_probs)
print("eclf validation error:", val_err)

# Make submission
#test_probs = eclf.predict_proba(X_test_t)[:,1]
#test["Predicted"] = test_probs
#test[["id","Predicted"]].to_csv("submission.csv",index=False)

rf1 validation error: 0.6356276897222657
rf2 validation error: 0.6352894756179772
gbc validation error: 0.6380958709934388
xgb validation error: 0.6377406386216694
eclf validation error: 0.6379139949158191


In [70]:
X = X_train_t[:200000]
Y = Y_train[:200000]
XV = X_train_t[200001:400000]
YV = Y_train[200001:400000]

random_state = 54321


model1 = GradientBoostingClassifier(random_state=random_state)
model2 = XGBClassifier(seed=random_state)
model3 = RandomForestClassifier(n_estimators=400, criterion='gini',  n_jobs = -1, random_state=random_state, max_depth=8)
model4 = AdaBoostClassifier(n_estimators=100)
model5 = LinearDiscriminantAnalysis()
model6 = DecisionTreeClassifier(max_depth=8)
model7 = RandomForestClassifier(n_estimators=250, criterion='entropy',  n_jobs = -1,  random_state=random_state)
model8 = GradientBoostingClassifier()

model1.fit(X, Y)
print("Done training model 1 (Gradient Boosting Classifier)")
model2.fit(X, Y)
print("Done training model 2 (XGB Classifier)")
model3.fit(X, Y)
print("Done training model 3 (Random Forest)")
model4.fit(X, Y)
print("Done training model 4 (AdaBoost)")
model5.fit(X, Y)
print("Done training model 5 (LinearDiscriminantAnalysis)")
model6.fit(X, Y)
print("Done training model 6 (DecisionTreeClassifier)")
model7.fit(X, Y)
print("Done training model 7 (Random Forest Entropy)")
model8.fit(X, Y)
print("Done training model 8 (QuadraticDiscriminantAnalysis)")


preds1 = model1.predict_proba(XV)[:,1]
preds2 = model2.predict_proba(XV)[:,1]
preds3 = model3.predict_proba(XV)[:,1]
preds4 = model4.predict_proba(XV)[:,1]
preds5 = model5.predict_proba(XV)[:,1]
preds6 = model6.predict_proba(XV)[:,1]
preds7 = model7.predict_proba(XV)[:,1]
preds8 = model8.predict_proba(XV)[:,1]

tpreds1 = model1.predict_proba(X_test_t)[:,1]
tpreds2 = model2.predict_proba(X_test_t)[:,1]
tpreds3 = model3.predict_proba(X_test_t)[:,1]
tpreds4 = model4.predict_proba(X_test_t)[:,1]
tpreds5 = model5.predict_proba(X_test_t)[:,1]
tpreds6 = model6.predict_proba(X_test_t)[:,1]
tpreds7 = model7.predict_proba(X_test_t)[:,1]
tpreds8 = model8.predict_proba(X_test_t)[:,1]

test_preds1 = model1.predict_proba(X_val_t)[:,1]
val1_err = roc_auc_score(Y_val, test_preds1)
print("Model 1 (Gradient Boosting Classifier) validation error:", val1_err)
test_preds2 = model2.predict_proba(X_val_t)[:,1]
val2_err = roc_auc_score(Y_val, test_preds2)
print("Model 2 (XGB Classifier) validation error:", val2_err)
test_preds3 = model3.predict_proba(X_val_t)[:,1]
val3_err = roc_auc_score(Y_val, test_preds3)
print("Model 3 (Random Forest) validation error:", val3_err)
test_preds4 = model4.predict_proba(X_val_t)[:,1]
val4_err = roc_auc_score(Y_val, test_preds4)
print("Model 4 (AdaBoost) validation error:", val4_err)
test_preds5 = model5.predict_proba(X_val_t)[:,1]
val5_err = roc_auc_score(Y_val, test_preds5)
print("Model 5 (LinearDiscriminantAnalysis) validation error:", val5_err)
test_preds6 = model6.predict_proba(X_val_t)[:,1]
val6_err = roc_auc_score(Y_val, test_preds6)
print("Model 6 (DecisionTreeClassifier) validation error:", val6_err)
test_preds7 = model7.predict_proba(X_val_t)[:,1]
val7_err = roc_auc_score(Y_val, test_preds7)
print("Model 7 (Random Forest Entropy) validation error:", val7_err)
test_preds8 = model8.predict_proba(X_val_t)[:,1]
val8_err = roc_auc_score(Y_val, test_preds8)
print("Model 8 (QuadraticDiscriminantAnalysis) validation error:", val8_err)

stacked_predictions = np.column_stack((preds1, preds2, preds3, preds4, preds5, preds6, preds7, preds8))
tstacked_predictions = np.column_stack((tpreds1, tpreds2, tpreds3, tpreds4, tpreds5, tpreds6, tpreds7, tpreds8))
stacked_test_predictions = np.column_stack((test_preds1, test_preds2, test_preds3, test_preds4,
                                            test_preds5, test_preds6, test_preds7, test_preds8))
meta_model = LinearRegression()
meta_model.fit(stacked_predictions, YV)
final_predictions = meta_model.predict(stacked_test_predictions)
restrain_out(final_predictions)
print(final_predictions)
val_err = roc_auc_score(Y_val, final_predictions)
print("Validation error:", val_err)
# Make submission
test_probs = meta_model.predict(tstacked_predictions)
restrain_out(test_probs)
test["Predicted"] = test_probs
test[["id","Predicted"]].to_csv("submission.csv",index=False)

Done training model 1 (Gradient Boosting Classifier)
Done training model 2 (XGB Classifier)
Done training model 3 (Random Forest)
Done training model 4 (AdaBoost)
Done training model 5 (LinearDiscriminantAnalysis)
Done training model 6 (DecisionTreeClassifier)
Done training model 7 (Random Forest Entropy)
Done training model 8 (QuadraticDiscriminantAnalysis)
Model 1 (Gradient Boosting Classifier) validation error: 0.6527557871173288
Model 2 (XGB Classifier) validation error: 0.6513079483623094
Model 3 (Random Forest) validation error: 0.6510053316740823
Model 4 (AdaBoost) validation error: 0.6471550642904701
Model 5 (LinearDiscriminantAnalysis) validation error: 0.63172621598906
Model 6 (DecisionTreeClassifier) validation error: 0.6424465068171215
Model 7 (Random Forest Entropy) validation error: 0.6376918180463393
Model 8 (QuadraticDiscriminantAnalysis) validation error: 0.6527321558594005
[0.50088651 0.42309182 0.55757966 ... 0.59930218 0.23379786 0.35182111]
Validation error: 0.6493

In [26]:
train_size = 500000

X = X_train_t[:train_size]
Y = Y_train[:train_size]
random_state = 54321

clf = XGBClassifier(random_state=random_state)
clf.fit(X, Y)
val_probs = clf.predict_proba(X_val_t)[:,1]
val_err = roc_auc_score(Y_val, val_probs)
print("clf validation error:", val_err)
# Make submission
#test_probs = clf.predict_proba(X_test_t)[:, 1]
#test["Predicted"] = test_probs
#test[["id","Predicted"]].to_csv("submission.csv",index=False)

clf validation error: 0.6385600713957855


In [76]:
# Best model so far

# Validation score: 0.639053452056516

train_size = 500000

X = X_train_t[:train_size]
Y = Y_train[:train_size]
random_state = 54321

gbc = GradientBoostingClassifier(random_state=random_state)
gbc.fit(X, Y)
val_probs = gbc.predict_proba(X_val_t)[:,1]
val_err = roc_auc_score(Y_val, val_probs)
print("gbc validation error:", val_err)
# Make submission
test_probs = gbc.predict_proba(X_test_t)[:, 1]
test["Predicted"] = test_probs
test[["id","Predicted"]].to_csv("submission.csv",index=False)

gbc validation error: 0.6103300372827178


In [27]:
# validation error: 0.6467546999617307 0.0.6535391961510051 0.653722038522001

X = X_train_t[:200000]
Y = Y_train[:200000]
XV = X_train_t[200001:400000]
YV = Y_train[200001:400000]

random_state = 54321

model1 = GradientBoostingClassifier(random_state=random_state)
model2 = XGBClassifier(seed=54321)
model3 = RandomForestClassifier(n_estimators=100, criterion='gini',  n_jobs = -1, random_state=random_state, max_depth=8)

model1.fit(X, Y)
print("Done training model 1")
model2.fit(X, Y)
print("Done training model 2")
model3.fit(X, Y)
print("Done training model 3")

preds1 = model1.predict_proba(XV)[:,1]
preds2 = model2.predict_proba(XV)[:,1]
preds3 = model3.predict_proba(XV)[:,1]

test_preds1 = model1.predict_proba(X_val_t)[:,1]
val1_err = roc_auc_score(Y_val, test_preds1)
print("Model1 validation error:", val1_err)
test_preds2 = model2.predict_proba(X_val_t)[:,1]
val2_err = roc_auc_score(Y_val, test_preds2)
print("Model2 validation error:", val2_err)
test_preds3 = model3.predict_proba(X_val_t)[:,1]
val3_err = roc_auc_score(Y_val, test_preds2)
print("Model3 validation error:", val3_err)

stacked_predictions = np.column_stack((preds1, preds2, preds3))
stacked_test_predictions = np.column_stack((test_preds1, test_preds2, test_preds3))

meta_model = LinearRegression()
meta_model.fit(stacked_predictions, YV)
final_predictions = meta_model.predict(stacked_test_predictions)
print(final_predictions)
val_err = roc_auc_score(Y_val, final_predictions)
print("Validation error:", val_err)

Done training model 1
Done training model 2
Done training model 3
Model1 validation error: 0.6380958709934388
Model2 validation error: 0.6377406386216694
Model3 validation error: 0.6377406386216694
[0.45821113 0.38814805 0.61930441 ... 0.59136322 0.36506096 0.41681025]
Validation error: 0.6379781914842635


In [25]:
# validation error: 0.6467546999617307

X = X_all_t[:250000]
Y = Y_all[:250000]
XV = X_all_t[250001:500000]
YV = Y_all[250001:500000]

random_state = 54321

model1 = GradientBoostingClassifier(random_state=random_state)
model2 = XGBClassifier(seed=random_state)
model3 = RandomForestClassifier(n_estimators=1000, criterion='gini',  n_jobs = -1, random_state=random_state, max_depth=8)

model1.fit(X, Y)
print("Done training model 1")
model2.fit(X, Y)
print("Done training model 2")
model3.fit(X, Y)
print("Done training model 3")

preds1 = model1.predict_proba(XV)[:,1]
preds2 = model2.predict_proba(XV)[:,1]
preds3 = model3.predict_proba(XV)[:,1]

test_preds1 = model1.predict_proba(X_test_t)[:,1]
#val1_err = roc_auc_score(Y_val, test_preds1)
#print("Model1 validation error:", val1_err)
test_preds2 = model2.predict_proba(X_test_t)[:,1]
#val2_err = roc_auc_score(Y_val, test_preds2)
#print("Model2 validation error:", val2_err)
test_preds3 = model3.predict_proba(X_test_t)[:,1]
#val3_err = roc_auc_score(Y_val, test_preds2)
#print("Model3 validation error:", val3_err)

stacked_predictions = np.column_stack((preds1, preds2, preds3))
stacked_test_predictions = np.column_stack((test_preds1, test_preds2, test_preds3))

meta_model = LinearRegression()
meta_model.fit(stacked_predictions, YV)
final_predictions = meta_model.predict(stacked_test_predictions)
restrain_out(final_predictions)
test["Predicted"] = final_predictions
test[["id","Predicted"]].to_csv("submission.csv",index=False)

Done training model 1
Done training model 2
Done training model 3


In [58]:
data = get_tensor_data(X_train, Y_train)
X_val_ten = torch.tensor(X_train_t)

dropout = 0.5
model = nn.Sequential(
    nn.Linear(17, 10),
    nn.Softmax(),
    nn.Dropout(dropout),
    nn.Linear(10, 5),
    nn.Softmax(),
    nn.Dropout(dropout),
    nn.Linear(5, 3),
    nn.Softmax(),
    nn.Dropout(dropout),
    nn.Linear(3, 1)
)

num_epochs = 5
batch_size = 128
step = len(data) / batch_size - 1
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

train_loader = torch.utils.data.DataLoader(get_tensor_data(X_train, Y_train), batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(get_tensor_data(X_val, Y_val), batch_size=batch_size, shuffle=True) 

model.train()

# Some layers, such as Dropout, behave differently during training
model.train()

for epoch in range(num_epochs):
    b = 0
    while b < len(data):
        batch = data[b:(b + batch_size), :-1]
        target = data[b:(b + batch_size), -1]
        # Erase accumulated gradients
        optimizer.zero_grad()

        # Forward pass
        output = model(batch)

        # Calculate loss
        loss = loss_fn(output, target)

        # Backward pass
        loss.backward()
        
        # Weight update
        optimizer.step()
        
        b += batch_size

    # Track loss each epoch
    print('Train Epoch: %d  Loss: %.4f' % (epoch + 1,  loss.item()))
    
    
model.eval()

# Turning off automatic differentiation
with torch.no_grad():
    val_pred = model(X_val_ten.float())
    print(val_pred)

val_err = roc_auc_score(Y_val, val_pred)
print("Validation error:", val_err)

RuntimeError: Assertion `x >= 0. && x <= 1.' failed. input value should be between 0~1, but got -0.139637 at /Users/distiller/project/conda/conda-bld/pytorch_1579022061893/work/aten/src/THNN/generic/BCECriterion.c:60

In [78]:
# Best model so far

# Validation score: 0.639053452056516

train_size = 500000

X = X_all_t
Y = Y_all
random_state = 54321

gbc = GradientBoostingClassifier(random_state=random_state)
gbc.fit(X, Y)
#val_probs = gbc.predict_proba(X_val_t)[:,1]
#val_err = roc_auc_score(Y_val, val_probs)
#print("gbc validation error:", val_err)
# Make submission
test_probs = gbc.predict_proba(X_test_t)[:, 1]
test["Predicted"] = test_probs
test[["id","Predicted"]].to_csv("submission.csv",index=False)
print("done!")

done!
