In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier

import tensorflow as tf
tf.random.set_seed(42)
import tensorflow.keras.backend as K
import tensorflow.keras.layers as layers
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

import warnings
warnings.filterwarnings('ignore')

In [None]:
FILE = "RfqData.xlsx"

# Encodings for the categorical columns.
label_map = {'MISSED': 0, "DONE": 1}
side_map = {"Bid": 1, "Offer": -1}


def _prepare_rfq(raw):
    """Return a model-ready copy of one RFQ sheet.

    * ``label``: ``Traded`` mapped through ``label_map``.
    * ``Side``:  replaced by its signed encoding from ``side_map``.
    * ``dist``:  ``MidPrice`` minus ``QuotedPrice``.

    The ``Traded`` and ``Counterparty`` columns are dropped. The input frame
    is left untouched, so the same preparation can be applied to any sheet
    without the train/test code paths drifting apart.
    """
    return raw.assign(
        label=raw['Traded'].map(label_map),
        Side=raw['Side'].map(side_map),
        dist=raw.MidPrice - raw.QuotedPrice,
    ).drop(columns=["Traded", "Counterparty"])


# Sheet 0 is used as the training set and sheet 1 as the test set.
data_train = _prepare_rfq(pd.read_excel(FILE, sheet_name=0))
data_test = _prepare_rfq(pd.read_excel(FILE, sheet_name=1))

columns = ['Bond', 'MidPrice', 'QuotedPrice', 'Side', 'Notional', 'Competitors', 'dist']

# One subset per bond, for the per-bond models trained further down.
BOND_NAMES = ("Bond_0", "Bond_1", "Bond_2")
data_train0, data_train1, data_train2 = (
    data_train[data_train.Bond == bond] for bond in BOND_NAMES
)
data_test0, data_test1, data_test2 = (
    data_test[data_test.Bond == bond] for bond in BOND_NAMES
)

data_train.head()


Unnamed: 0,Time,Bond,Side,Notional,MidPrice,QuotedPrice,Competitors,NextMidPrice,label,dist
0,25000,Bond_2,-1,10000000,124.01,124.25,1,124.24,0,-0.24
1,25001,Bond_0,1,1000,98.07,98.06,1,98.08,1,0.01
2,25002,Bond_1,-1,1000,170.3,170.4,1,170.64,0,-0.1
3,25003,Bond_0,1,20000,98.0,97.98,4,97.94,1,0.02
4,25004,Bond_1,-1,1000,171.12,171.16,2,171.46,1,-0.04


In [None]:
# Model inputs are the entries of `columns` from index 3 onward
# (Side, Notional, Competitors, dist); the target is the `label` column.
features = columns[3:]


def _split_xy(frame):
    """Return ``(X, y)`` for *frame*: its ``features`` columns and its ``label`` column."""
    return frame.loc[:, features], frame.label


# All bonds together.
X_train, y_train = _split_xy(data_train)
X_test, y_test = _split_xy(data_test)

# Rows filtered on Bond == "Bond_0".
X_train0, y_train0 = _split_xy(data_train0)
X_test0, y_test0 = _split_xy(data_test0)

# Rows filtered on Bond == "Bond_1".
X_train1, y_train1 = _split_xy(data_train1)
X_test1, y_test1 = _split_xy(data_test1)

# Rows filtered on Bond == "Bond_2".
X_train2, y_train2 = _split_xy(data_train2)
X_test2, y_test2 = _split_xy(data_test2)

In [None]:
# Print the best estimator selected by each random-forest search.
# NOTE(review): CV_rf and CV_rf0..CV_rf2 are not created in any visible cell --
# presumably a removed search cell defined them (pooled data, then one per
# bond). Confirm, otherwise this cell fails on a fresh kernel.
for rf_search in (CV_rf, CV_rf0, CV_rf1, CV_rf2):
    print(rf_search.best_estimator_)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, n_estimators=50)
RandomForestClassifier(max_depth=6, min_samples_leaf=5, min_samples_split=3,
                       n_estimators=50)
RandomForestClassifier(max_depth=7, min_samples_split=10, n_estimators=50)
RandomForestClassifier(max_depth=4, min_samples_leaf=8, min_samples_split=8,
                       n_estimators=75)


In [None]:
# Random forest for the pooled data (all bonds). The settings equal the first
# estimator printed for the RF searches: max_depth=6, min_samples_leaf=8,
# n_estimators=50.
rf = RandomForestClassifier(
    max_depth=6,
    min_samples_leaf=8,
    n_estimators=50,
)

In [None]:
# Hyper-parameter space sampled by the randomized search
# (4 * 4 * 3 * 6 * 3 * 3 * 4 = 10,368 combinations, of which 4,000 are drawn).
params = {
        'n_estimators': [30, 50, 100, 200],
        'learning_rate': [0.01, 0.02, 0.05, 0.1],
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5, 10],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5, 6]
        }

# `n_jobs=2` replaces `n_thread=2`, which is not a recognised XGBClassifier
# argument and therefore never limited the thread count; `verbosity=0`
# replaces the deprecated `silent=True`.
xgb = XGBClassifier(objective='binary:logistic', verbosity=0, n_jobs=2)


def _fit_xgb_search(X, y, estimator=xgb, space=params):
    """Fit and return a randomized 5-fold accuracy search on ``(X, y)``.

    ``estimator`` and ``space`` are bound at definition time on purpose: the
    name ``params`` is reused for a different dictionary later in the notebook.
    """
    search = RandomizedSearchCV(
        estimator,
        param_distributions=space,
        n_iter=4000,
        scoring='accuracy',
        n_jobs=2,
        cv=5,
        verbose=1,
        random_state=42,
    )
    return search.fit(X, y)


# One search on the pooled data and one per bond.
CV_xgb = _fit_xgb_search(X_train, y_train)
CV_xgb0 = _fit_xgb_search(X_train0, y_train0)
CV_xgb1 = _fit_xgb_search(X_train1, y_train1)
CV_xgb2 = _fit_xgb_search(X_train2, y_train2)

# Test accuracy of each refit best estimator on its matching test set.
print(CV_xgb.score(X_test, y_test))
print(CV_xgb0.score(X_test0, y_test0))
print(CV_xgb1.score(X_test1, y_test1))
print(CV_xgb2.score(X_test2, y_test2))

Fitting 5 folds for each of 4000 candidates, totalling 20000 fits
Fitting 5 folds for each of 4000 candidates, totalling 20000 fits
Fitting 5 folds for each of 4000 candidates, totalling 20000 fits
Fitting 5 folds for each of 4000 candidates, totalling 20000 fits
0.837
0.8954918032786885
0.8673469387755102
0.8502415458937198


In [None]:
# Print the best estimator found by each XGBoost search:
# the one fitted on X_train first, then those fitted on X_train0..X_train2.
for xgb_search in (CV_xgb, CV_xgb0, CV_xgb1, CV_xgb2):
    print(xgb_search.best_estimator_)

XGBClassifier(colsample_bytree=1.0, gamma=10, learning_rate=0.02, max_depth=6,
              n_estimators=30, n_thread=2, silent=True, subsample=1.0)
XGBClassifier(colsample_bytree=0.8, gamma=5, min_child_weight=5,
              n_estimators=50, n_thread=2, silent=True, subsample=1.0)
XGBClassifier(colsample_bytree=0.6, gamma=0.5, min_child_weight=10,
              n_estimators=200, n_thread=2, silent=True, subsample=1.0)
XGBClassifier(colsample_bytree=1.0, gamma=5, max_depth=4, min_child_weight=10,
              n_estimators=50, n_thread=2, silent=True, subsample=0.8)


In [None]:
def create_ae_mlp(num_columns, num_labels, hidden_units, dropout_rates, ls = 1e-2, lr = 1e-3):
    """Build and compile a three-output autoencoder + MLP Keras model.

    Parameters
    ----------
    num_columns : int
        Width of the input vector; also the width of the ``decoder`` output.
    num_labels : int
        Number of sigmoid units in the ``ae_action`` and ``action`` outputs.
    hidden_units : sequence of int
        ``[0]`` sizes the encoder layer, ``[1]`` the layer feeding
        ``ae_action``, and every entry from index 2 on adds one dense layer
        to the main branch.
    dropout_rates : sequence of float
        ``[0]`` is passed to ``GaussianNoise``; ``[1]``, ``[2]`` and ``[3]``
        are dropout rates for the decoder, the ``ae_action`` branch and the
        concatenated main input; main-branch layer ``i`` (``i >= 2``) uses
        ``dropout_rates[i + 2]``.
    ls : float
        Label smoothing for both binary cross-entropy losses.
    lr : float
        Adam learning rate.

    Returns
    -------
    tf.keras.models.Model
        Compiled model with outputs ``[decoder, ae_action, action]``.
    """
    kl = tf.keras.layers

    features_in = kl.Input(shape = (num_columns, ))
    normed = kl.BatchNormalization()(features_in)

    # Encoder: noise -> dense -> batch norm -> relu.
    encoded = kl.GaussianNoise(dropout_rates[0])(normed)
    encoded = kl.Dense(hidden_units[0])(encoded)
    encoded = kl.BatchNormalization()(encoded)
    encoded = kl.Activation('relu')(encoded)

    # Decoder: maps the encoding back to `num_columns` values.
    decoded = kl.Dropout(dropout_rates[1])(encoded)
    decoded = kl.Dense(num_columns, name = 'decoder')(decoded)

    # Auxiliary classifier fed by the decoder output.
    aux = kl.Dense(hidden_units[1])(decoded)
    aux = kl.BatchNormalization()(aux)
    aux = kl.Activation('swish')(aux)
    aux = kl.Dropout(dropout_rates[2])(aux)
    aux_out = kl.Dense(num_labels, activation = 'sigmoid', name = 'ae_action')(aux)

    # Main classifier: normalised inputs concatenated with the encoding.
    trunk = kl.Concatenate()([normed, encoded])
    trunk = kl.BatchNormalization()(trunk)
    trunk = kl.Dropout(dropout_rates[3])(trunk)

    for idx, units in enumerate(hidden_units[2:], start = 2):
        trunk = kl.Dense(units)(trunk)
        trunk = kl.BatchNormalization()(trunk)
        trunk = kl.Activation('relu')(trunk)
        trunk = kl.Dropout(dropout_rates[idx + 2])(trunk)

    main_out = kl.Dense(num_labels, activation = 'sigmoid', name = 'action')(trunk)

    model = tf.keras.models.Model(inputs = features_in, outputs = [decoded, aux_out, main_out])

    # Loss and metric dictionaries are keyed by output-layer name.
    optimizer = tf.keras.optimizers.Adam(learning_rate = lr)
    losses = {
        'decoder': tf.keras.losses.MeanSquaredError(),
        'ae_action': tf.keras.losses.BinaryCrossentropy(label_smoothing = ls),
        'action': tf.keras.losses.BinaryCrossentropy(label_smoothing = ls),
    }
    metrics = {
        'decoder': tf.keras.metrics.MeanAbsoluteError(name = 'MAE'),
        'ae_action': tf.keras.metrics.AUC(name = 'AUC'),
        'action': tf.keras.metrics.AUC(name = 'AUC'),
    }
    model.compile(optimizer = optimizer, loss = losses, metrics = metrics)

    return model

In [None]:
# Keyword arguments for create_ae_mlp (AutoMLP expands them with **params).
# Alternative values left commented out in the original cell:
#   hidden_units:  [96, 96, 896, 448, 448, 256]
#   dropout_rates: [0.03527936123679956, 0.038424974585075086, 0.42409238408801436,
#                   0.10431484318345882, 0.49230389137187497, 0.32024444956111164,
#                   0.2716856145683449, 0.4379233941604448]
params = dict(
    num_columns = len(features),
    num_labels = 1,
    hidden_units = [512, 512, 1024, 512, 512, 512],
    dropout_rates = [0.05, 0.05, 0.4, 0.2, 0.5, 0.3, 0.3, 0.4],
    ls = 0.01,
    lr = 1e-5,
)

In [None]:
def _stratified_holdout(X, y):
    """Split ``(X, y)`` 80/20 with seed 42, stratified on ``y``.

    Returns ``X_train, X_valid, y_train, y_valid`` in that order.
    """
    return train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# Pooled data first, then one split per bond subset.
X_train_, X_valid_, y_train_, y_valid_ = _stratified_holdout(X_train, y_train)
X_train0_, X_valid0_, y_train0_, y_valid0_ = _stratified_holdout(X_train0, y_train0)
X_train1_, X_valid1_, y_train1_, y_valid1_ = _stratified_holdout(X_train1, y_train1)
X_train2_, X_valid2_, y_train2_, y_valid2_ = _stratified_holdout(X_train2, y_train2)

In [None]:
def map_label(prob):
    """Convert per-row probabilities into hard 0/1 labels.

    Each element of *prob* is indexed with ``[0]``; a value strictly greater
    than 0.5 becomes 1 and anything else becomes 0.

    Returns a plain Python list with one label per element of *prob*.
    """
    return [1 if row[0] > 0.5 else 0 for row in prob]

In [None]:
def AutoMLP(X_train_, y_train_, X_valid_, y_valid_, X_test, y_test, params, batch_size=256):
    """Train an AE-MLP with early stopping, print its scores and return it.

    The model comes from ``create_ae_mlp(**params)`` and is fitted for at most
    100 epochs. Training stops once ``val_action_AUC`` has not improved by at
    least 1e-4 for 10 epochs, and the best weights are restored.

    Prints the best validation AUC of the ``action`` output followed by the
    test accuracy of that output thresholded through ``map_label``.

    Returns the fitted Keras model.
    """
    network = create_ae_mlp(**params)

    stopper = EarlyStopping(
        monitor = 'val_action_AUC',
        min_delta = 1e-4,
        patience = 10,
        mode = 'max',
        baseline = None,
        restore_best_weights = True,
        verbose = 0,
    )

    # One target per model output: inputs for `decoder`, labels for the two
    # classification heads.
    train_targets = [X_train_, y_train_, y_train_]
    valid_targets = [X_valid_, y_valid_, y_valid_]
    fit_log = network.fit(
        X_train_,
        train_targets,
        validation_data = (X_valid_, valid_targets),
        epochs = 100,
        batch_size = batch_size,
        callbacks = [stopper],
        verbose = 0,
    )

    best_val_auc = pd.DataFrame(fit_log.history)['val_action_AUC'].max()

    # Index 2 selects the `action` output of the three-output model.
    test_prob = network.predict(X_test)[2]
    test_labels = map_label(test_prob)
    print(best_val_auc, accuracy_score(y_test, test_labels))
    return network

In [None]:
# Mini-batch size shared by all four training runs.
bs = 128

# Pooled model: trained and evaluated on all bonds.
model = AutoMLP(X_train_, y_train_, X_valid_, y_valid_, X_test, y_test, params, batch_size=bs)

# Per-bond models are scored on their own bond's test rows, matching how the
# per-bond XGBoost searches are scored. They were previously scored on the
# pooled X_test / y_test, so any accuracies recorded for model0-2 before this
# change are not per-bond figures and need a re-run.
model0 = AutoMLP(X_train0_, y_train0_, X_valid0_, y_valid0_, X_test0, y_test0, params, batch_size=bs)
model1 = AutoMLP(X_train1_, y_train1_, X_valid1_, y_valid1_, X_test1, y_test1, params, batch_size=bs)
model2 = AutoMLP(X_train2_, y_train2_, X_valid2_, y_valid2_, X_test2, y_test2, params, batch_size=bs)

0.9054998755455017 0.82
0.9136048555374146 0.783
0.6010929346084595 0.517
0.9263545870780945 0.792
