In [15]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from xgboost import XGBClassifier

**read_data_small** reads the small dataset (about 30 MB).

In [16]:
def read_data_small():
    """Load the small dataset from the ``data_small`` directory.

    Returns:
        tuple: ``(X_train, X_test, y_train)``. The two feature tables are
        DataFrames parsed from ``X_train_small.csv`` and ``X_test_small.csv``;
        ``y_train`` is a NumPy array built from column 0 of the header-less
        ``y_train_small.csv``.
    """
    train_features = pd.read_csv("data_small/X_train_small.csv")
    test_features = pd.read_csv("data_small/X_test_small.csv")
    # The label file is read without a header row, so its column is addressed as 0.
    label_table = pd.read_csv("data_small/y_train_small.csv", header=None)
    train_labels = np.asarray(label_table[0])
    return train_features, test_features, train_labels

**read_data_big** reads the big dataset (about 100 MB).

In [17]:
def read_data_big():
    """Load the big dataset from the ``data_big`` directory.

    Returns:
        tuple: ``(X_train, X_test, y_train)``. The two feature tables are
        DataFrames parsed from ``X_train_big.csv`` and ``X_test_big.csv``;
        ``y_train`` is a NumPy array built from column 0 of the header-less
        ``y_train_big.csv``.
    """
    train_features = pd.read_csv("data_big/X_train_big.csv")
    test_features = pd.read_csv("data_big/X_test_big.csv")
    # The label file is read without a header row, so its column is addressed as 0.
    label_table = pd.read_csv("data_big/y_train_big.csv", header=None)
    train_labels = np.asarray(label_table[0])
    return train_features, test_features, train_labels

**read_data** reads the whole dataset (about 1.5 GB).

In [18]:
def read_data():
    """Load the whole dataset from the ``data`` directory.

    Returns:
        tuple: ``(X_train, X_test, y_train)``. The two feature tables are
        DataFrames parsed from ``X_train.csv`` and ``X_test.csv``; ``y_train``
        is a NumPy array built from column 0 of the header-less ``y_train.csv``.
    """
    train_features = pd.read_csv("data/X_train.csv")
    test_features = pd.read_csv("data/X_test.csv")
    # The label file is read without a header row, so its column is addressed as 0.
    label_table = pd.read_csv("data/y_train.csv", header=None)
    train_labels = np.asarray(label_table[0])
    return train_features, test_features, train_labels

# Insert Your Code Here

**detect_spoofying** is the function that trains the classifier and classifies the results.

Here we provide a simple example.

In [19]:
### code classifier here ###
def format_data(df):
    '''
    Takes raw df
    Returns enriched df

    Builds the numeric feature table used for modeling from raw order events.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw events. The following columns are read: ``timestamp``, ``obId``,
        ``member``, ``endUserRef``, ``orderId``, ``operation``, ``type``,
        ``source``, ``isBid``, ``isBuyer``, ``isAggressor``, ``price``,
        ``volume`` and the three book levels (``bestBid``/``bestAsk``,
        ``lv2Bid``/``lv2Ask``, ``lv3Bid``/``lv3Ask`` plus their ``*Volume``
        columns). ``timestamp * 1e6`` is interpreted as nanoseconds, i.e. the
        raw value is treated as epoch milliseconds.
        Rows are assumed to already be in event order: ``change_vol`` and
        ``timeSinceLastTrade`` are differences between consecutive rows of a
        group (TODO confirm with the data provider).

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` itself is not modified) carrying ``df``'s index and
        these columns, in this order: 12 engineered features, 6 encoded
        flag/time columns and 14 raw market columns. Features that cannot be
        computed (for instance ``r_10min`` on data spanning less than ten
        minutes) are NaN rather than missing.

    Notes
    -----
    ``r_f10s`` and ``r_f10min`` are forward-looking returns (they use prices
    after the event), and the 1-second mid price assigned to an event is the
    last mid observed within that same second.
    '''
    stock_cols = ['r_10s', 'r_f10s', 'r_10min', 'r_f10min', 'vol_10min']
    row_level_cols = ['spread', 'orderImbalanceBest', 'orderImbalance']

    features_cols = [
        'spread',
        'orderImbalanceBest',
        'orderImbalance',
        'r_10min',
        'r_10s',
        'r_f10min',
        'r_f10s',
        'vol_10min',
        'per_cancel',
        'change_vol',
        'possession_ordersize',
        'ord_vs_avg_vol']
    additional_cols = ['isBid', 'isBuyer', 'isAggressor', 'type', 'source', 'timeSinceLastTrade']
    market_cols = ["price", "volume", "bestBid", "bestAsk", 'bestBidVolume',
                   'bestAskVolume', 'lv2Bid', 'lv2BidVolume', 'lv2Ask',
                   'lv2AskVolume', 'lv3Bid', 'lv3BidVolume', 'lv3Ask',
                   'lv3AskVolume']
    features_cols = features_cols + additional_cols + market_cols

    # Nothing to engineer: return an empty table that still has the full schema.
    if len(df) == 0:
        return pd.DataFrame(columns=features_cols, index=df.index, dtype=float)

    def add_stock_features(events):
        '''Per-event spread / imbalance plus per-stock returns and volatility on a 1-second grid.'''
        events = events.copy()
        events['dt'] = pd.to_datetime(events['timestamp'] * 1000000)
        # Key used to look up the 1-second grid value for each event.
        events['dt_round'] = events['dt'].dt.floor('s')

        ## row-level features
        events['mid'] = (events['bestBid'] + events['bestAsk']) / 2
        events['spread'] = (events['bestAsk'] - events['bestBid']) / (events['bestAsk'] + events['bestBid']) * 2
        events['orderImbalanceBest'] = (
            (events['bestAskVolume'] - events['bestBidVolume'])
            / (events['bestAskVolume'] + events['bestBidVolume'])
        )
        ask_depth = events['bestAskVolume'] + events['lv2AskVolume'] + events['lv3AskVolume']
        bid_depth = events['bestBidVolume'] + events['lv2BidVolume'] + events['lv3BidVolume']
        # Unlike orderImbalanceBest this one is scaled by 2, as in the original feature definition.
        events['orderImbalance'] = (ask_depth - bid_depth) / (ask_depth + bid_depth) * 2

        ## per-stock mid price on a shared 1-second grid (one column per obId)
        pivot = pd.pivot_table(events[['obId', 'dt', 'mid']], index='dt', columns=['obId'], values='mid')

        per_stock = []
        if not pivot.empty:
            mid = pivot.resample('1s').last().ffill()
            r_10s = mid / mid.shift(10) - 1
            derived = {
                'r_10s': r_10s,
                'r_f10s': mid.shift(-10) / mid - 1,
                'r_10min': mid / mid.shift(600) - 1,
                'r_f10min': mid.shift(-600) / mid - 1,
                # volatility = rolling std of the (unfilled) 10-second returns
                'vol_10min': r_10s.rolling(window=600).std(),
            }
            for ob in mid.columns:
                table = pd.DataFrame({name: frame[ob] for name, frame in derived.items()})
                # Edge seconds have no look-back / look-ahead window; borrow the nearest value.
                table = table.ffill().bfill()
                table.insert(0, 'obId', ob)
                per_stock.append(table)

        if per_stock:
            lookup = pd.concat(per_stock).rename_axis('dt_round').reset_index()
            merged = pd.merge(events, lookup, on=['dt_round', 'obId'], how='left')
            # (obId, dt_round) is unique in the lookup, so the left merge keeps one row per event.
            merged.index = events.index
        else:
            merged = events.assign(**{name: np.nan for name in stock_cols})

        return merged[row_level_cols + stock_cols]

    def percentage_of_cancel(events):
        '''Share of CANCEL operations among all events of the same (member, obId).'''
        is_cancel = (events['operation'] == 'CANCEL').astype(float)
        return is_cancel.groupby([events['member'], events['obId']]).transform('mean')

    def change_in_volume_for_orderID(events):
        '''Volume change versus the previous row of the same orderId; INSERT rows get their full volume.'''
        volume_change = events.groupby('orderId')['volume'].diff()
        is_insert = events['operation'] == 'INSERT'
        return volume_change.where(~is_insert, events['volume'])

    def possession_ordersize(events):
        '''Running signed volume per (member, obId) in timestamp order, divided by the event volume.'''
        # Explicit comparisons so that a missing isBuyer contributes 0 instead of counting as a sell.
        side = (events['isBuyer'] == True).astype(int) - (events['isBuyer'] == False).astype(int)
        signed = events.assign(buy_volume=events['volume'] * side)
        # Stable sort keeps the input order of events that share a timestamp.
        signed = signed.sort_values('timestamp', kind='mergesort')
        possession = signed.groupby(['member', 'obId'])['buy_volume'].cumsum()
        return possession.reindex(events.index) / events['volume']

    def add_order_vs_avg_vol_df(events):
        '''Event volume relative to the mean volume of the same (obId, member).'''
        mean_volume = events.groupby(['obId', 'member'])['volume'].transform('mean')
        return events['volume'] / mean_volume

    # Work on a positional index so every helper result aligns row by row,
    # then hand the caller's index back at the end.
    original_index = df.index
    work = df.reset_index(drop=True)

    work = pd.concat([work, add_stock_features(work)], axis=1)
    work['per_cancel'] = percentage_of_cancel(work)
    work['change_vol'] = change_in_volume_for_orderID(work)
    # Must run before isBuyer is re-encoded below.
    work['possession_ordersize'] = possession_ordersize(work)
    work['ord_vs_avg_vol'] = add_order_vs_avg_vol_df(work)

    # Boolean flags -> 1 / 0, with -1 marking a missing value.
    for flag in ('isBid', 'isBuyer', 'isAggressor'):
        work[flag] = (work[flag] * 1).fillna(-1).astype('int64')
    work['type'] = (work['type'] == "ORDER").astype('int64')
    work['source'] = (work['source'] == "USER").astype('int64')
    # Despite the name, this is the gap to the previous row of the same endUserRef.
    work['timeSinceLastTrade'] = work.groupby('endUserRef')['timestamp'].diff()

    result = work[features_cols].copy()
    result.index = original_index
    return result
    
    
#######
def detect_spoofying(X_train, X_test, y_train):
    """
    Train an XGBoost classifier on class-balanced data and return 4-column
    class-probability matrices for the training and the test features.

    Steps:
      1. Classes 1 and 2 are each re-sampled with replacement (seed 123) to
         the number of class-0 rows, then stacked with the class-0 rows.
      2. A default ``XGBClassifier`` is fitted on the balanced set.
      3. ``predict_proba`` is evaluated on the original (un-balanced)
         ``X_train`` and on ``X_test``; a fourth, zero-filled column is
         appended for class 3.
      4. Rows whose class-0 probability lies strictly between 0.5 and 0.9
         have that probability moved from column 0 to column 3, so those
         rows are reported as class 3.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features. Not modified.
    X_test : pandas.DataFrame
        Test features with the same columns as ``X_train``.
    y_train : array-like
        Labels for ``X_train``, matched to its rows by position. Labels
        0, 1 and 2 must each occur at least once.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(y_train_prob_pred, y_test_prob_pred)``, each of shape
        ``(n_rows, 4)`` with columns ordered as classes 0, 1, 2, 3.

    Raises
    ------
    ValueError
        If any of the classes 0, 1, 2 is absent from ``y_train``.
    """
    # Exclusive band of class-0 probability that is re-labelled as class 3.
    type3_lower, type3_upper = 0.5, 0.9

    # Attach the labels to a copy: the caller's X_train must not gain a 'y' column.
    labelled = X_train.assign(y=np.asarray(y_train))

    rows_by_class = {label: labelled[labelled['y'] == label] for label in (0, 1, 2)}
    absent = [label for label, rows in rows_by_class.items() if len(rows) == 0]
    if absent:
        raise ValueError(
            "y_train must contain classes 0, 1 and 2; missing: {}".format(absent))

    ## rebalance data
    n = len(rows_by_class[0])
    upsampled = [
        resample(rows_by_class[label],
                 replace=True,      # sample with replacement
                 n_samples=n,       # match the class-0 row count
                 random_state=123)  # reproducible results
        for label in (1, 2)
    ]
    # Combine the class-0 rows with the upsampled classes 1 and 2.
    df_upsampled = pd.concat(upsampled + [rows_by_class[0]])

    X_train_upsampled = df_upsampled.drop(columns=['y'])
    y_train_upsampled = df_upsampled['y'].values

    # modeling
    model = XGBClassifier()
    model.fit(X_train_upsampled, y_train_upsampled)

    def type3_label(features):
        # Class probabilities for classes 0, 1, 2 plus an empty class-3 column.
        proba = model.predict_proba(features)
        proba = np.hstack((proba, np.zeros((len(proba), 1))))

        class0 = proba[:, 0]
        uncertain = (class0 > type3_lower) & (class0 < type3_upper)

        # Move the class-0 mass of the uncertain rows to class 3.
        proba[uncertain, 3] = proba[uncertain, 0]
        proba[uncertain, 0] = 0
        return proba

    y_train_prob_pred = type3_label(X_train)
    y_test_prob_pred = type3_label(X_test)

    return y_train_prob_pred, y_test_prob_pred

In [20]:
# X_train, X_test, y_train = read_data_small()

In [21]:
# X_test = format_data(X_test)

In [22]:
# X_test.head()

In [23]:
# X_train = format_data(X_train)

In [24]:
# X_train.head(10)

In [25]:
# %time y_train_prob_pred, y_test_prob_pred = detect_spoofying(X_train, X_test, y_train)

**score** is the function that we use to compare the results. An example is provided with scoring the predictions for the training dataset. True labels for the testing data set will be supplied to score the predictions for testing dataset.

Score is based on cohen's kappa measurement. https://en.wikipedia.org/wiki/Cohen%27s_kappa

In [26]:
from sklearn.metrics import cohen_kappa_score

def score(y_pred, y_true):
    """
    Cohen's kappa between the most probable predicted label and the true label.

    y_pred: a 2-D numpy array of probabilities, one row per point and one
            column per label; the column index of the row maximum is taken
            as the predicted label
    y_true: a numpy array of true labels
    """
    most_likely_label = np.argmax(y_pred, axis=1)
    kappa = cohen_kappa_score(most_likely_label, y_true)
    return kappa

**wrapper** is the main function to read in unzipped data and output a score for evaluation. In addition, the function returns the y probability matrix (both train and test) for grading. More details about submitting format are outlined below.

In [1]:
def wrapper():
    """
    Run the whole pipeline on the small dataset.

    Reads the data, builds features for the train and test tables, trains the
    classifier and scores the training predictions.

    Returns a 3-tuple: (score_train, y_train_prob_pred, y_test_prob_pred).
    No test score is computed here.
    """
    # Load the raw tables. With enough computational power, read_data_big()
    # can be called here instead of read_data_small().
    raw_train, raw_test, labels_train = read_data_small()

    # Feature engineering is applied to each table separately.
    features_train = format_data(raw_train)
    features_test = format_data(raw_test)

    # Train the classifier and get one probability matrix per table.
    proba_train, proba_test = detect_spoofying(features_train, features_test, labels_train)

    # Only the training predictions are scored in this function.
    kappa_train = score(proba_train, labels_train)

    return kappa_train, proba_train, proba_test

Call function wrapper:

In [28]:
# wrapper() returns exactly three values: the training score and the two
# probability matrices. It does not compute a test score, so unpacking into
# four names would raise ValueError.
score_train, y_train_prob_pred, y_test_prob_pred = wrapper()

Score for training data set is:

In [29]:
# Report the training-set score returned by wrapper().
print('Score train: ',score_train)

0.49521847311367384

### Submission Format

The classifier function you write should return a 2-D NumPy array with 4 columns. The columns correspond to the class labels 0, 1, 2 and 3. Please see the examples below.

In [30]:
# Display the probability matrix predicted for the training set.
y_train_prob_pred

array([[9.99962926e-01, 1.63862242e-05, 2.07247394e-05, 0.00000000e+00],
       [9.99934435e-01, 2.31882186e-05, 4.23788260e-05, 0.00000000e+00],
       [9.99934435e-01, 2.31882186e-05, 4.23788260e-05, 0.00000000e+00],
       ...,
       [9.99561965e-01, 1.54756723e-04, 2.83220666e-04, 0.00000000e+00],
       [9.99583900e-01, 3.09392868e-04, 1.06672764e-04, 0.00000000e+00],
       [9.98822868e-01, 1.05205399e-03, 1.25032646e-04, 0.00000000e+00]])

In [31]:
# Display the probability matrix predicted for the test set.
y_test_prob_pred

array([[9.94330168e-01, 4.92920121e-03, 7.40602787e-04, 0.00000000e+00],
       [9.98912096e-01, 9.35230986e-04, 1.52646709e-04, 0.00000000e+00],
       [9.96430099e-01, 3.49597912e-03, 7.39955794e-05, 0.00000000e+00],
       ...,
       [9.91438508e-01, 7.66117685e-03, 9.00339859e-04, 0.00000000e+00],
       [9.98670816e-01, 8.62737128e-04, 4.66433266e-04, 0.00000000e+00],
       [9.99481261e-01, 2.43613569e-04, 2.75098457e-04, 0.00000000e+00]])

### Write test results to csv files

Please rename your file to indicate which data set you are working with. 

- If you are using the small dataset: *y_train_prob_pred_small.csv* and *y_test_prob_pred_small.csv*
- If you are using the big dataset: *y_train_prob_pred_big.csv* and *y_test_prob_pred_big.csv*
- If you are using the original dataset: *y_train_prob_pred.csv* and *y_test_prob_pred.csv*

In [32]:
# wrapper() loads the small dataset (read_data_small), so the outputs use the
# "_small" file names requested in the submission notes. Change the suffix if
# the loader inside wrapper() is switched to another dataset.
# The default DataFrame index is written as the first CSV column.
pd.DataFrame(y_train_prob_pred).to_csv("y_train_prob_pred_small.csv")
pd.DataFrame(y_test_prob_pred).to_csv("y_test_prob_pred_small.csv")