In [4]:
import pandas as pd
import numpy as np
    
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import make_classification
from sklearn.multiclass import OneVsRestClassifier

from sklearn.preprocessing import MultiLabelBinarizer

**read_data_small** is the function to read in the small dataset about 30 MB

In [5]:
def read_data_small():
    X_train = pd.read_csv("data_small/X_train_small.csv")
    X_test = pd.read_csv("data_small/X_test_small.csv")
    y_train = np.asarray(pd.read_csv("data_small/y_train_small.csv", header=None)[0])
    return X_train, X_test, y_train

**read_data_big** is the function to read in the big dataset about 100 MB

In [6]:
def read_data_big():
    X_train = pd.read_csv("data_big/X_train_big.csv")
    X_test = pd.read_csv("data_big/X_test_big.csv")
    y_train = np.asarray(pd.read_csv("data_big/y_train_big.csv", header=None)[0])
    return X_train, X_test, y_train

**read_data** is the function to read in the whole dataset about 1.5 G

In [7]:
def read_data():
    X_train = pd.read_csv("data/X_train.csv")
    X_test = pd.read_csv("data/X_test.csv")
    y_train = np.asarray(pd.read_csv("data/y_train.csv", header=None)[0])
    return X_train, X_test, y_train

# Insert Your Code Here

**detect_spoofying** is the function for training the classifier and classify the results. 

Here we provide an simple example.

In [8]:
### import libraries here ###
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import scale
from sklearn.model_selection import cross_validate

### code classifier here ###
def format_data(df):
    
    # append numberical columns
    rst = df.loc[:,["price","volume","bestBid","bestAsk",'bestBidVolume',
                    'bestAskVolume','lv2Bid', 'lv2BidVolume','lv2Ask', 
                    'lv2AskVolume', 'lv3Bid', 'lv3BidVolume', 'lv3Ask',
                    'lv3AskVolume']]
    
    # encode the binaries
    rst["isBid"] = df.isBid*1
    rst["isBuyer"] = df.isBuyer*1
    rst["isAggressor"] = df.isAggressor*1
    rst["type"] = (df.type == "ORDER")*1
    rst["source"] = (df.source=="USER")*1
    
    # parse the order id data
    rst["orderId"] = df.orderId.str.split('-').str[-1]
    rst["tradeId"] = df.tradeId.str.split('-').str[-1]
    rst["bidOrderId"] = df.bidOrderId.str.split('-').str[-1]
    rst["askOrderId"] = df.askOrderId.str.split('-').str[-1]
    
    # encode the multiple lable data
    tmp_operation = pd.DataFrame(pd.get_dummies(df.operation), columns=df.operation.unique()[:-1])
    rst = pd.concat([rst, tmp_operation], axis=1)
    tmp_endUserRef = pd.DataFrame(pd.get_dummies(df.endUserRef), columns=df.endUserRef.unique()[:-1])
    rst = pd.concat([rst, tmp_endUserRef], axis=1)
    
    # also feel free to add more columns inferred from data
    # smartly engineered features can be very useful to improve the classification resutls
    rst["timeSinceLastTrade"] = X_train[["timestamp","endUserRef"]].groupby("endUserRef").diff()
    print('shape', rst)
    return rst

def detect_spoofying(X_train, X_test, y_train):
    
    # clean up the data
    X_clean = format_data(pd.concat([X_train, X_test]))
    X_clean = X_clean.fillna(-1)
    X_train_clean = X_clean.iloc[:X_train.shape[0],:]
    X_test_clean = X_clean.iloc[X_train.shape[0]:,:]
    X_train_clean_scaled = scale(X_train_clean)
    X_test_clean_scaled = scale(X_test_clean)

    # fit classifier
    clf = LogisticRegression(random_state=0, class_weight='balanced')
#     clf = OneVsRestClassifier(XGBClassifier(n_jobs=-1, max_depth=4))

#     # You may need to use MultiLabelBinarizer to encode your variables from arrays [[x, y, z]] to a multilabel 
#     # format before training.
#     mlb = MultiLabelBinarizer()
#     y_train = mlb.fit_transform(y_train)
    clf.fit(X_train_clean_scaled, y_train)
    y_train_prob_pred = clf.predict_proba(X_train_clean_scaled)
    y_test_prob_pred = clf.predict_proba(X_test_clean_scaled)
    
    return y_train_prob_pred, y_test_prob_pred

**score** is the function that we use to compare the results. An example is provided with scoring the predictions for the training dataset. True labels for the testing data set will be supplied to score the predictions for testing dataset.

Score is based on cohen's kappa measurement. https://en.wikipedia.org/wiki/Cohen%27s_kappa

In [9]:
from sklearn.metrics import cohen_kappa_score

def score(y_pred, y_true):
    """
    y_pred: a numpy 4d array of probabilities of point assigned to each label
    y_true: a numpy array of true labels
    """
    y_pred_label = np.argmax(y_pred, axis=1)
    return cohen_kappa_score(y_pred_label, y_true)

### Optional: k-fold cross validation

In [10]:
### optional: examples of k-fold cross validation ###
# k-fold cross validation can help you compare the classification models
from sklearn.model_selection import KFold
n = 5 # here we choose a 10 fold cross validation
kf = KFold(n_splits = n)
X_train, X_test, y_train = read_data_small()
kf.get_n_splits(X_train)
print(kf)
kf_scores = pd.DataFrame(np.zeros([n,2]), columns=["train score", "test score"])
rowindex = 0
for train_index, test_index in kf.split(X_train):
    print("TRAIN:", train_index, "TEST:", test_index)
    print(X_train.index)
    print(y_train)
    X_train_kf, X_test_kf = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_kf, y_test_kf = y_train[train_index], y_train[test_index]
    y_train_prob_pred_kf, y_test_prob_pred_kf = detect_spoofying(X_train_kf, X_test_kf, y_train_kf)
    score_train_kf = score(y_train_prob_pred_kf, y_train_kf)
    score_test_kf = score(y_test_prob_pred_kf, y_test_kf)
    kf_scores.iloc[rowindex, 0] = score_train_kf
    kf_scores.iloc[rowindex, 1] = score_test_kf
    rowindex += 1

KFold(n_splits=5, random_state=None, shuffle=False)
TRAIN: [ 28365  28366  28367 ... 141819 141820 141821] TEST: [    0     1     2 ... 28362 28363 28364]
RangeIndex(start=0, stop=141822, step=1)
[0 0 0 ... 0 0 0]
shape (141822, 497)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


TRAIN: [     0      1      2 ... 141819 141820 141821] TEST: [28365 28366 28367 ... 56727 56728 56729]
RangeIndex(start=0, stop=141822, step=1)
[0 0 0 ... 0 0 0]
shape (141822, 497)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


TRAIN: [     0      1      2 ... 141819 141820 141821] TEST: [56730 56731 56732 ... 85091 85092 85093]
RangeIndex(start=0, stop=141822, step=1)
[0 0 0 ... 0 0 0]
shape (141822, 497)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


TRAIN: [     0      1      2 ... 141819 141820 141821] TEST: [ 85094  85095  85096 ... 113455 113456 113457]
RangeIndex(start=0, stop=141822, step=1)
[0 0 0 ... 0 0 0]
shape (141822, 497)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


TRAIN: [     0      1      2 ... 113455 113456 113457] TEST: [113458 113459 113460 ... 141819 141820 141821]
RangeIndex(start=0, stop=141822, step=1)
[0 0 0 ... 0 0 0]
shape (141822, 497)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [11]:
kf_scores

Unnamed: 0,train score,test score
0,0.529404,0.289515
1,0.551094,0.316195
2,0.508271,0.324611
3,0.490446,0.293819
4,0.5012,0.390383


**wrapper** is the main function to read in unzipped data and output a score for evaluation. In addition, the function returns the y probability matrix (both train and test) for grading. More details about submitting format are outlined below.

In [12]:
def wrapper():
    # read in data
    X_train, X_test, y_train = read_data_small()
    # or if you have the computational power to work with the big data set, 
    # you can comment out the read_data_samll line and uncomment the following read_data_big
    # X_train, X_test, y_train = read_data_big()
    
    # process the data, train classifier and output probability matrix
    y_train_prob_pred, y_test_prob_pred = detect_spoofying(X_train, X_test, y_train)
    
    # score the predictions
    score_train = score(y_train_prob_pred, y_train)
    # score_test = score(y_test_prob_pred, y_test)
    
    # return the scores
    return score_train, y_train_prob_pred, y_test_prob_pred

Call function wrapper:

In [13]:
score_train, y_train_prob_pred, y_test_prob_pred = wrapper()

shape (202604, 497)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Score for training data set is:

In [14]:
score_train

0.5015044558194381

### Submission Format

The classifier function wrote should return a 4d nparray with 4 columns. The columns are corresponding to the class labels: 0, 1, 2, 3. Please see examples below.

In [15]:
y_train_prob_pred

array([[9.83035962e-01, 6.67557392e-09, 1.69640314e-02],
       [9.99987851e-01, 4.16102945e-11, 1.21493003e-05],
       [9.99987933e-01, 4.20093560e-11, 1.20665648e-05],
       ...,
       [9.99966630e-01, 3.27603891e-05, 6.09550203e-07],
       [9.99988727e-01, 1.06123127e-05, 6.60584615e-07],
       [9.93256130e-01, 5.90769821e-03, 8.36171624e-04]])

In [16]:
y_test_prob_pred

array([[9.99950738e-01, 4.91445027e-05, 1.17760348e-07],
       [9.99998736e-01, 2.22210303e-07, 1.04199108e-06],
       [9.99991185e-01, 8.67530840e-06, 1.39242389e-07],
       ...,
       [9.99984078e-01, 1.43797636e-05, 1.54195556e-06],
       [9.99994411e-01, 3.83492982e-06, 1.75368663e-06],
       [9.99997416e-01, 8.66882998e-07, 1.71682308e-06]])

### Write test results to csv files

Please rename your file to indicate which data set you are working with. 

- If you are using the small dataset: *y_train_prob_pred_small.csv* and *y_test_prob_pred_small.csv*
- If you are using the small dataset: *y_train_prob_pred_big.csv* and *y_test_prob_pred_big.csv*
- If you are using the original dataset: *y_train_prob_pred.csv* and *y_test_prob_pred.csv*

In [17]:
pd.DataFrame(y_train_prob_pred).to_csv("y_train_prob_pred.csv")
pd.DataFrame(y_test_prob_pred).to_csv("y_test_prob_pred.csv")