In [1]:
import pandas as pd
import plotly.graph_objects as go
import pywt
import numpy as np
import plotly.express as px
from sklearn.decomposition import KernelPCA

In [2]:
def add_wavelets_final(df):
    """Add wavelet-denoised ask/bid prices and their gradients to ``df``.

    Four columns are written, in this order: ``ask_dwt``, ``bid_dwt``
    (the output of ``DWT`` for each side) followed by ``ask_dwt_grad`` and
    ``bid_dwt_grad`` (``np.gradient`` of the denoised series, unit spacing).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``ask_price`` and ``bid_price`` columns that
        ``DWT`` reads.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the new columns are added in place.
    """
    n_rows = len(df)
    sides = ('ask', 'bid')

    for side in sides:
        # Keep only the first n_rows samples: the reconstruction may be longer
        # than the input (presumably one extra sample for odd-length input --
        # confirm against PyWavelets). Assigning a plain array places values
        # by position; wrapping in pd.Series would align on df's index labels
        # and yield NaN/misplaced rows for any non-default index.
        df[side + '_dwt'] = np.asarray(DWT(df, side), dtype='float64')[:n_rows]

    for side in sides:
        df[side + '_dwt_grad'] = np.gradient(df[side + '_dwt'].to_numpy(), 1)

    return df

def DWT(df, str_):
    """Denoise the ``<str_>_price`` column with a single-level ``sym2`` DWT.

    The series is decomposed with ``pywt.dwt`` (mode ``'smooth'``), detail
    coefficients whose magnitude is below a data-driven cutoff are set to
    zero, and the signal is rebuilt with ``pywt.idwt``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column ``str_ + '_price'``.
    str_ : str
        Column prefix, e.g. ``'ask'`` or ``'bid'``.

    Returns
    -------
    numpy.ndarray
        The reconstructed series as returned by ``pywt.idwt``.
        NOTE(review): its length may differ from the input length for
        odd-sized inputs -- confirm against the PyWavelets documentation.
    """
    prices = df[str_ + '_price']
    approx, detail = pywt.dwt(prices, 'sym2', 'smooth')

    # Scale estimate: median absolute detail coefficient divided by 0.6745.
    noise_sigma = np.median(np.abs(detail)) / 0.6745
    # Cutoff grows with sqrt(2 * ln(N)), N being the number of input samples.
    cutoff = noise_sigma * np.sqrt(2 * np.log(len(prices)))

    # Zero every detail coefficient strictly smaller (in magnitude) than the cutoff.
    below_cutoff = np.abs(detail) < cutoff
    detail[below_cutoff] = 0

    return pywt.idwt(approx, detail, 'sym2', 'smooth')

def plot_class_graph(col_name, class_partition, units, df):
    """Plot ``df[col_name]`` against observation number, one trace per class.

    Each distinct value of ``df[class_partition]`` gets its own Scatter trace.
    Within a trace, rows that belong to other classes are left empty, so the
    line is only drawn where the class applies.

    Parameters
    ----------
    col_name : str
        Column to plot on the y axis.
    class_partition : str
        Column holding the class label of every row; labels must be
        convertible with ``int()`` for the legend entry.
    units : str
        Y-axis title.
    df : pandas.DataFrame
        Data source. All values are read from this frame, not from any
        notebook-level global.
    """
    fig = go.Figure()

    labels = df[class_partition]
    values = df[col_name]
    # X axis is the 0-based row position, independent of df's index labels.
    positions = np.arange(df.shape[0])

    # Missing labels are skipped: they cannot be turned into an int legend name.
    for cl in np.sort(labels.dropna().unique()):
        # Boolean masking keeps rows of this class and blanks out the rest
        # (NaN), which works for any index, unlike positional indexing with
        # index labels.
        class_values = values.where(labels == cl)
        fig.add_trace(go.Scatter(x=positions, y=class_values.to_numpy(),
                                 name='class ' + str(int(cl))))

    fig.update_layout(
        title=col_name + ' ' + class_partition,
        xaxis_title="Observation",
        yaxis_title=units,
        width=800,
        height=500)
    fig.update_xaxes(rangeslider_visible=True)
    fig.show()

In [6]:
# Read the labelled level-1 data, discard the stray 'Unnamed: 0' column that
# the CSV carries, then append the wavelet-denoised price features.
lv1_df = add_wavelets_final(
    pd.read_csv('lv1_labs_LOF.csv').drop(columns=['Unnamed: 0'])
)

Unnamed: 0,ask_price,ask_price_grad,ask_volat,bid_price,bid_price_grad,bid_volat,imbalance,spread,LOF,ask_dwt,bid_dwt,ask_dwt_grad,bid_dwt_grad
0,13440.0,4.0,1.342804e-07,13439.719423,3.831104,1.350915e-07,-0.409106,0.002088,-1.01906,13440.133975,13439.877586,8.098076,7.5714
1,13448.0,4.0,4.671656e-07,13447.17504,4.091773,4.345228e-07,-0.428571,0.006135,-1.200498,13448.232051,13447.448986,3.56863,3.554989
2,13448.0,0.0,4.581312e-07,13447.902969,0.363964,4.152889e-07,0.712644,0.000722,-2.07421,13447.271234,13446.987565,0.252845,0.614773
3,13448.0,1.5,4.581312e-07,13447.902969,1.169078,4.152889e-07,-0.91412,0.000722,-4.772669,13448.73774,13448.678532,1.489383,1.357157
4,13451.0,1.5,5.078854e-07,13450.241124,1.169078,4.455136e-07,0.714286,0.005642,-115.658778,13450.25,13449.70188,1.50613,1.112792


In [12]:
# Rows whose LOF score is below this cutoff are flagged as outliers.
LOF_OUTLIER_THRESHOLD = -100

# Build the 0/1 flag in a single assignment. The chained form
# `lv1_df['outlier'][mask] = 1` writes through an intermediate object: it
# triggers SettingWithCopyWarning and does nothing under pandas Copy-on-Write.
lv1_df['outlier'] = (lv1_df['LOF'] < LOF_OUTLIER_THRESHOLD).astype('int64')
lv1_df.head()

Unnamed: 0,ask_price,ask_price_grad,ask_volat,bid_price,bid_price_grad,bid_volat,imbalance,spread,LOF,ask_dwt,bid_dwt,ask_dwt_grad,bid_dwt_grad,outlier
0,13440.0,4.0,1.342804e-07,13439.719423,3.831104,1.350915e-07,-0.409106,0.002088,-1.01906,13440.133975,13439.877586,8.098076,7.5714,0
1,13448.0,4.0,4.671656e-07,13447.17504,4.091773,4.345228e-07,-0.428571,0.006135,-1.200498,13448.232051,13447.448986,3.56863,3.554989,0
2,13448.0,0.0,4.581312e-07,13447.902969,0.363964,4.152889e-07,0.712644,0.000722,-2.07421,13447.271234,13446.987565,0.252845,0.614773,0
3,13448.0,1.5,4.581312e-07,13447.902969,1.169078,4.152889e-07,-0.91412,0.000722,-4.772669,13448.73774,13448.678532,1.489383,1.357157,0
4,13451.0,1.5,5.078854e-07,13450.241124,1.169078,4.455136e-07,0.714286,0.005642,-115.658778,13450.25,13449.70188,1.50613,1.112792,1


In [24]:
# Bid price per observation, one trace for each value of the 'outlier' flag.
plot_class_graph(col_name='bid_price', class_partition='outlier',
                 units='Dollars', df=lv1_df)

In [17]:
def plot_3D(df, label):
    """Show a 3-D scatter of the first three linear-kernel PCA components.

    The projection is fitted on every column of ``df`` except ``'LOF'`` and
    ``'outlier'``; points are coloured by ``df[label]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``'LOF'`` and ``'outlier'`` columns plus ``label``.
    label : str
        Column of ``df`` used for colouring. It is shown under the legend
        title ``'outlier'`` regardless of its name.
        NOTE(review): a ``label`` other than ``'LOF'``/``'outlier'`` stays
        among the projected features -- confirm whether that is intended.
    """
    X = df.drop(columns=['LOF', 'outlier'])
    # gamma is not passed: it is ignored by the linear kernel.
    kpca_3d = KernelPCA(n_components=3, kernel='linear')
    kpca_3d_df = pd.DataFrame(kpca_3d.fit_transform(X),
                              columns=['PC1', 'PC2', 'PC3'])
    # Take labels from the frame that was passed in, and attach them by
    # position so a non-default index on df cannot misalign them.
    kpca_3d_df['outlier'] = df[label].to_numpy()
    fig_3D = px.scatter_3d(kpca_3d_df, x='PC1', y='PC2', z='PC3',
                           color='outlier')
    fig_3D.show()

In [18]:
# 3-D projection of the feature columns, coloured by the outlier flag.
plot_3D(df=lv1_df, label='outlier')

In [19]:
def plot_2D(df, label):
    """Show a 2-D scatter of the first two linear-kernel PCA components.

    The projection is fitted on every column of ``df`` except ``'LOF'`` and
    ``'outlier'``; points are coloured by ``df[label]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``'LOF'`` and ``'outlier'`` columns plus ``label``.
    label : str
        Column of ``df`` used for colouring. It is shown under the legend
        title ``'outlier'`` regardless of its name.
        NOTE(review): a ``label`` other than ``'LOF'``/``'outlier'`` stays
        among the projected features -- confirm whether that is intended.
    """
    X = df.drop(columns=['LOF', 'outlier'])
    kpca_2d = KernelPCA(n_components=2, kernel='linear')
    kpca_2d_df = pd.DataFrame(kpca_2d.fit_transform(X),
                              columns=['PC1', 'PC2'])
    # Take labels from the frame that was passed in, and attach them by
    # position so a non-default index on df cannot misalign them.
    kpca_2d_df['outlier'] = df[label].to_numpy()
    fig_2D = px.scatter(kpca_2d_df, x='PC1', y='PC2', color='outlier')
    fig_2D.show()

In [21]:
# 2-D projection of the feature columns, coloured by the outlier flag.
plot_2D(df=lv1_df, label='outlier')

In [22]:
# Column means for each value of the 'outlier' flag, side by side.
lv1_df.groupby(by='outlier').mean()

Unnamed: 0_level_0,ask_price,ask_price_grad,ask_volat,bid_price,bid_price_grad,bid_volat,imbalance,spread,LOF,ask_dwt,bid_dwt,ask_dwt_grad,bid_dwt_grad
outlier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,13443.346514,0.002273,5.154201e-07,13442.491983,0.002886,5.103357e-07,-0.117885,0.006357,-2.489163,13443.346639,13442.492846,0.002864,0.003064
1,13477.027602,0.023139,2.981953e-07,13476.283957,-0.000355,2.946229e-07,-0.006039,0.005518,-1282.89542,13477.024187,13476.256917,0.01243,0.000965
