In [24]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import random
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

from imblearn.datasets import make_imbalance
from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline
from sklearn.utils import resample

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import MultiLabelBinarizer

MAX_ROWS = 100000
VISUALIZE = False
RANDOM_STATE = 42
DOWNSAMPLE=True
print('ready')


# https://machinelearningmastery.com/tactics-to-combat-imbalanced-classes-in-your-machine-learning-dataset/
# https://imbalanced-learn.org/stable/api.html

ready


In [25]:
def make_readable(label):
    return re.sub("([a-z])([A-Z])","\g<1> \g<2>",label)

In [26]:
# https://elitedatascience.com/imbalanced-classes
def downsample(X_train, y_train, ratio=1):
    if not DOWNSAMPLE:
        return X_train, y_train
    
    X_train['target'] = y_train
    df = X_train
    print('old shape', X_train.shape)
    # Separate majority and minority classes
    df_majority = df[df.target==0]
    df_minority = df[df.target!=0]

    # Downsample majority class
    df_majority_downsampled = resample(df_majority, 
                                     replace=False,    # sample without replacement
                                     n_samples=len(df_minority.index)*ratio,     # to match minority class
                                     random_state=123) # reproducible results

    # Combine minority class with downsampled majority class
    df_downsampled = pd.concat([df_majority_downsampled, df_minority])

    # Display new class counts
    print('downsampled\n', df_downsampled['target'].value_counts())
    
    y_train = df_downsampled['target']
    X_train = df_downsampled.drop(['target'], axis=1)
    return X_train, y_train
    

**read_data_small** is the function to read in the small dataset about 30 MB

In [27]:
def read_data_small(max_rows):
    X_train = pd.read_csv("data_small/X_train_small.csv")
    X_test = pd.read_csv("data_small/X_test_small.csv")
    y_train = np.asarray(pd.read_csv("data_small/y_train_small.csv", header=None)[0])
    if max_rows:
        # Take random subset of rows
        
        rows = np.random.choice(np.arange(len(X_train)), max_rows, False)
        print(X_train.shape, X_test.shape, len(y_train))
        print('random rows', rows)
        return X_train.iloc[rows], X_test, y_train[rows]
#         return X_train[:max_rows], X_test[:max_rows], y_train[:max_rows]
    return X_train, X_test, y_train

**read_data_big** is the function to read in the big dataset about 100 MB

In [28]:
def read_data_big(max_rows):
    X_train = pd.read_csv("data_big/X_train_big.csv")
    X_test = pd.read_csv("data_big/X_test_big.csv")
    y_train = np.asarray(pd.read_csv("data_big/y_train_big.csv", header=None)[0])
    return X_train, X_test, y_train

**read_data** is the function to read in the whole dataset about 1.5 G

In [29]:
def read_data(max_rows):
    X_train = pd.read_csv("data/X_train.csv")
    X_test = pd.read_csv("data/X_test.csv")
    y_train = np.asarray(pd.read_csv("data/y_train.csv", header=None)[0])
    return X_train, X_test, y_train

In [30]:
def read_data_wrapper(max_rows=None):
    # return one of: read_data, read_data_big, read_data_wrapper
    return read_data(max_rows)

# Insert Your Code Here

**detect_spoofying** is the function for training the classifier and classify the results. 

Here we provide an simple example.

In [31]:
X_train, X_test, y_train = read_data_wrapper(MAX_ROWS)
print(X_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1542706 entries, 0 to 1542705
Data columns (total 29 columns):
timestamp        1542706 non-null int64
type             1542706 non-null object
obId             1542706 non-null object
member           1542706 non-null object
user             1542706 non-null object
endUserRef       1542706 non-null object
price            1542706 non-null float64
volume           1542706 non-null float64
operation        1490138 non-null object
isBid            1490138 non-null object
orderId          1490138 non-null object
source           1490138 non-null object
tradeId          52568 non-null object
bidOrderId       52568 non-null object
askOrderId       52568 non-null object
isBuyer          52568 non-null object
isAggressor      52568 non-null object
bestBid          1542703 non-null float64
bestBidVolume    1542703 non-null float64
bestAsk          1542684 non-null float64
bestAskVolume    1542684 non-null float64
lv2Bid           1542678 non-nu

In [32]:
# X_train.info()
print(len(set(X_train['user'])))
# set(X_train['type'])
print(len(set(X_train['member'])))

323
77


In [33]:
NUMERIC_COLS = ["price","volume","bestBid","bestAsk",'bestBidVolume',
                    'bestAskVolume','lv2Bid', 'lv2BidVolume','lv2Ask', 
                    'lv2AskVolume', 'lv3Bid', 'lv3BidVolume', 'lv3Ask',
                    'lv3AskVolume']
ENCODED_COLS = [] # ['member', 'user']


In [34]:
### import libraries here ###
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import scale
from sklearn.model_selection import cross_validate

### code classifier here ###
# Feature engineering
def format_data(df):

    cols = []
    cols.extend(NUMERIC_COLS)
    cols.extend(ENCODED_COLS)
    
    # append numberical columns
    rst = df.loc[:, cols]
    
    # encode the binaries
    rst["isBid"] = df.isBid*1
    rst["isBuyer"] = df.isBuyer*1
    rst["isAggressor"] = df.isAggressor*1
    rst["type"] = (df.type == "ORDER")*1
    rst["source"] = (df.source=="USER")*1

    # parse the order id data
    rst["orderId"] = df.orderId.str.split('-').str[-1]
    rst["tradeId"] = df.tradeId.str.split('-').str[-1]
    rst["bidOrderId"] = df.bidOrderId.str.split('-').str[-1]
    rst["askOrderId"] = df.askOrderId.str.split('-').str[-1]
    
    
    # encode the multiple label data
    tmp_operation = pd.DataFrame(pd.get_dummies(df.operation, prefix="op"), columns=df.operation.unique()[:-1])
    rst = pd.concat([rst, tmp_operation], axis=1)
#     tmp_endUserRef = pd.DataFrame(pd.get_dummies(df.endUserRef, prefix="enduser"), columns=df.endUserRef.unique()[:-1])
#     rst = pd.concat([rst, tmp_endUserRef], axis=1)
    
    # also feel free to add more columns inferred from data
    # smartly engineered features can be very useful to improve the classification results
    rst["timeSinceLastTrade"] = X_train[["timestamp","endUserRef"]].groupby("endUserRef").diff()

    for col in ENCODED_COLS:
        # https://stackoverflow.com/questions/37292872/how-can-i-one-hot-encode-in-python
        # one hot encode
        # Get one hot encoding of columns B
        one_hot = pd.get_dummies(df[col], prefix=col)
        # Drop column B as it is now encoded
        rst = rst.drop(col, axis = 1)
        # Join the encoded df
        rst = rst.join(one_hot)
    print('data shape', rst.shape)
    print('cols', len(rst.columns.values))
    return rst

def get_scaled_data(X_train, X_test, y_train):
    
    X_train, y_train = downsample(X_train, y_train, 4)
        
    # clean up the data
#     df = df[~df.index.duplicated()]
    print('cleaning', X_train.shape, len(y_train))
    X_clean = format_data(pd.concat([X_train, X_test]))
#     for c in NUMERIC_COLS:
#         X_clean[c]= X_clean[c].fillna(X_clean[c].mean())
#     X_clean.fillna(method='ffill', inplace=True)
    X_clean = X_clean.fillna(-1)
    
    feature_columns = X_clean.columns.values
    print('done cleaning')
    X_train_clean = X_clean.iloc[:X_train.shape[0],:]
    X_test_clean = X_clean.iloc[X_train.shape[0]:,:]
    X_train_clean_scaled = scale(X_train_clean)
    X_test_clean_scaled = scale(X_test_clean)
    return X_train_clean_scaled, X_test_clean_scaled, y_train, feature_columns

# Classification algorithm
def detect_spoofying(X_train, X_test, y_train):
    print('detect_spoofing')
    X_train_clean_scaled, X_test_clean_scaled, y_train, feature_columns = get_scaled_data(X_train, X_test, y_train)
    
#     clf = ExtraTreesClassifier() # .89
#     clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15), random_state=1)
#     clf = LinearSVC(random_state=0, tol=1e-5, multi_class="crammer_singer")
#     clf = KNeighborsClassifier(n_neighbors=3)

#     clf = OneVsRestClassifier(XGBClassifier())
#     clf = OneVsRestClassifier(XGBClassifier(n_estimators=100, n_jobs=-1, max_depth=4))
    print('fitting', X_train_clean_scaled.shape, len(y_train))
    clf = OneVsRestClassifier(XGBClassifier())
    clf.fit(X_train_clean_scaled, y_train)
    print('classifying')
    # print('features', list(zip(feature_columns, clf.feature_importances_)))
    y_train_prob_pred = clf.predict_proba(X_train_clean_scaled)
    y_test_prob_pred = clf.predict_proba(X_test_clean_scaled)
    
    return y_train_prob_pred, y_test_prob_pred, y_train

In [35]:
if VISUALIZE:
    X_train, X_test, y_train = read_data_wrapper(MAX_ROWS)
    X_train_clean_scaled, X_test_clean_scaled, y_train, feature_columns = get_scaled_data(X_train, X_test, y_train)

    clf = BalancedRandomForestClassifier(max_depth=2, random_state=0)

    # fit classifier
    #     clf = LogisticRegression(random_state=0, class_weight='balanced')
    clf.fit(X_train_clean_scaled, y_train)
    print('features', list(zip(feature_columns, clf.feature_importances_)))


In [36]:
if VISUALIZE:
    names = feature_columns
    values = clf.feature_importances_
    zipped = zip(names, values)
    res = sorted(zipped, key = lambda x: x[1], reverse=True)

    n = 50
    names = [make_readable(i[0]) for i in res[:n]]
    values = [i[1] for i in res[:n]]

    plt.figure(figsize=(16, 9))
    sns.barplot(values, names)

    plt.title('Feature Importance')
    plt.ylabel('Importance')
    plt.xticks(rotation=90)
    plt.show()

In [37]:
X_num = X_train.loc[:, NUMERIC_COLS]

In [38]:
if VISUALIZE:
    X_num.describe()
    # set(y_train)
    y_categories = pd.get_dummies(pd.DataFrame(y_train).replace({0: '0', 1: '1', 2: '2'}),prefix='Category')
    # print(y_categories.loc[y_categories['0_Category 1'] == 1])
    corr_df = X_num.join(y_categories)
    corr = corr_df.corr()
    plt.figure(figsize = (16,9))
    ax = sns.heatmap(corr, annot=True, linewidths=.5, cmap="YlGnBu")

**score** is the function that we use to compare the results. An example is provided with scoring the predictions for the training dataset. True labels for the testing data set will be supplied to score the predictions for testing dataset.

Score is based on cohen's kappa measurement. https://en.wikipedia.org/wiki/Cohen%27s_kappa

In [39]:
from sklearn.metrics import cohen_kappa_score

def score(y_pred, y_true):
    """
    y_pred: a numpy 4d array of probabilities of point assigned to each label
    y_true: a numpy array of true labels
    """
    y_pred_label = np.argmax(y_pred, axis=1)
    return cohen_kappa_score(y_pred_label, y_true)

### Optional: k-fold cross validation

In [48]:
### optional: examples of k-fold cross validation ###
# k-fold cross validation can help you compare the classification models
if True:
    from sklearn.model_selection import KFold
    n = 5
    kf = KFold(n_splits = n)
    X_train, X_test, y_train = read_data_wrapper(MAX_ROWS)
    kf.get_n_splits(X_train)
    print(kf)
    kf_scores = pd.DataFrame(np.zeros([n,2]), columns=["train score", "test score"])
    rowindex = 0
    i = 0
    for train_index, test_index in kf.split(X_train):
        i+=1
        print('Step', i, "TRAIN:", train_index, "TEST:", test_index)
        print(X_train.index)
        print(y_train)
        X_train_kf, X_test_kf = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_kf, y_test_kf = y_train[train_index], y_train[test_index]
        y_train_prob_pred_kf, y_test_prob_pred_kf, new_y_train_kf = detect_spoofying(X_train_kf, X_test_kf, y_train_kf)
        score_train_kf = score(y_train_prob_pred_kf, new_y_train_kf)
        score_test_kf = score(y_test_prob_pred_kf, y_test_kf)
        kf_scores.iloc[rowindex, 0] = score_train_kf
        kf_scores.iloc[rowindex, 1] = score_test_kf
        rowindex += 1
    
    print('scores')
    print(kf_scores)

KFold(n_splits=5, random_state=None, shuffle=False)
Step 1 TRAIN: [ 308542  308543  308544 ... 1542703 1542704 1542705] TEST: [     0      1      2 ... 308539 308540 308541]
RangeIndex(start=0, stop=1542706, step=1)
[0 0 0 ... 0 0 0]
detect_spoofing
old shape (1234164, 30)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


downsampled
 0    11848
2     1931
1     1031
Name: target, dtype: int64
cleaning (14810, 29) 14810
data shape (323352, 27)
cols 27
done cleaning
fitting (14810, 27) 14810
classifying
Step 2 TRAIN: [      0       1       2 ... 1542703 1542704 1542705] TEST: [308542 308543 308544 ... 617080 617081 617082]
RangeIndex(start=0, stop=1542706, step=1)
[0 0 0 ... 0 0 0]
detect_spoofing
old shape (1234165, 30)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


downsampled
 0    10792
2     1750
1      948
Name: target, dtype: int64
cleaning (13490, 29) 13490
data shape (322031, 27)
cols 27
done cleaning
fitting (13490, 27) 13490
classifying
Step 3 TRAIN: [      0       1       2 ... 1542703 1542704 1542705] TEST: [617083 617084 617085 ... 925621 925622 925623]
RangeIndex(start=0, stop=1542706, step=1)
[0 0 0 ... 0 0 0]
detect_spoofing
old shape (1234165, 30)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


downsampled
 0    11168
2     1852
1      940
Name: target, dtype: int64
cleaning (13960, 29) 13960
data shape (322501, 27)
cols 27
done cleaning
fitting (13960, 27) 13960
classifying
Step 4 TRAIN: [      0       1       2 ... 1542703 1542704 1542705] TEST: [ 925624  925625  925626 ... 1234162 1234163 1234164]
RangeIndex(start=0, stop=1542706, step=1)
[0 0 0 ... 0 0 0]
detect_spoofing
old shape (1234165, 30)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


downsampled
 0    10668
2     1681
1      986
Name: target, dtype: int64
cleaning (13335, 29) 13335
data shape (321876, 27)
cols 27
done cleaning
fitting (13335, 27) 13335
classifying
Step 5 TRAIN: [      0       1       2 ... 1234162 1234163 1234164] TEST: [1234165 1234166 1234167 ... 1542703 1542704 1542705]
RangeIndex(start=0, stop=1542706, step=1)
[0 0 0 ... 0 0 0]
detect_spoofing
old shape (1234165, 30)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


downsampled
 0    10884
2     1750
1      971
Name: target, dtype: int64
cleaning (13605, 29) 13605
data shape (322146, 27)
cols 27
done cleaning
fitting (13605, 27) 13605
classifying
scores
   train score  test score
0     0.999601    0.047232
1     0.999562    0.004164
2     0.999152    0.059182
3     0.999557    0.026199
4     0.999566    0.004601


**wrapper** is the main function to read in unzipped data and output a score for evaluation. In addition, the function returns the y probability matrix (both train and test) for grading. More details about submitting format are outlined below.

In [41]:
def wrapper():
    # read in data
    X_train, X_test, y_train = read_data_wrapper(MAX_ROWS)
    # or if you have the computational power to work with the big data set, 
    # you can comment out the read_data_samll line and uncomment the following read_data_big
    # X_train, X_test, y_train = read_data_big()
    
    # process the data, train classifier and output probability matrix
    y_train_prob_pred, y_test_prob_pred, new_y_train = detect_spoofying(X_train, X_test, y_train)
    
    # score the predictions
    print(len(y_train_prob_pred), len(y_train))
    score_train = score(y_train_prob_pred, new_y_train)
    # score_test = score(y_test_prob_pred, y_test)
    
    # return the scores
    return score_train, y_train_prob_pred, y_test_prob_pred

Call function wrapper:

In [42]:
score_train, y_train_prob_pred, y_test_prob_pred = wrapper()

detect_spoofing
old shape (1542706, 30)
downsampled
 0    13840
2     2241
1     1219
Name: target, dtype: int64
cleaning (17300, 29) 17300
data shape (678460, 27)
cols 27
done cleaning
fitting (17300, 27) 17300
classifying
17300 1542706


Score for training data set is:

In [43]:
score_train

0.9979510173322319

### Submission Format

The classifier function wrote should return a 4d nparray with 4 columns. The columns are corresponding to the class labels: 0, 1, 2, 3. Please see examples below.

In [44]:
y_train_prob_pred

array([[9.9999321e-01, 2.2099839e-06, 4.5808033e-06],
       [9.9965787e-01, 1.2427585e-06, 3.4092696e-04],
       [9.9922603e-01, 2.0334456e-04, 5.7060196e-04],
       ...,
       [9.3629485e-04, 9.9890101e-01, 1.6275818e-04],
       [1.1729641e-02, 9.6441585e-01, 2.3854554e-02],
       [1.1027574e-03, 9.9788499e-01, 1.0122677e-03]], dtype=float32)

In [45]:
y_test_prob_pred

array([[6.8414968e-01, 1.8423975e-02, 2.9742631e-01],
       [7.2338915e-01, 1.8591100e-02, 2.5801978e-01],
       [5.3898287e-01, 3.8485039e-02, 4.2253205e-01],
       ...,
       [7.4453640e-01, 2.5522441e-01, 2.3917068e-04],
       [9.0312320e-01, 9.6601933e-02, 2.7481004e-04],
       [1.2526077e-01, 5.2235682e-02, 8.2250357e-01]], dtype=float32)

In [47]:
pd.DataFrame(y_train_prob_pred).to_csv("y_train_prob_pred.csv")
pd.DataFrame(y_test_prob_pred).to_csv("y_test_prob_pred.csv")
print('done')

done
