<a href="https://colab.research.google.com/github/ajaxx/cs5262-notebooks/blob/main/Crypto_Currency_Trading_Technical_Bias_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Background

This notebook analyzes the data collected for the network latency and loss project. It focuses on the samples observed while running on AWS-based hardware in us-east-2 and us-west-2.

## Observed Link Time

As measured from us-west-2 to us-east-2:

```
PING x.xxx.xxx.xxx (x.xxx.xxx.xxx) 56(84) bytes of data.
64 bytes from x.xxx.xxx.xxx: icmp_seq=1 ttl=105 time=48.4 ms
64 bytes from x.xxx.xxx.xxx: icmp_seq=2 ttl=105 time=48.4 ms
64 bytes from x.xxx.xxx.xxx: icmp_seq=3 ttl=105 time=48.4 ms
64 bytes from x.xxx.xxx.xxx: icmp_seq=4 ttl=105 time=48.3 ms
64 bytes from x.xxx.xxx.xxx: icmp_seq=5 ttl=105 time=48.3 ms
64 bytes from x.xxx.xxx.xxx: icmp_seq=6 ttl=105 time=48.3 ms
64 bytes from x.xxx.xxx.xxx: icmp_seq=7 ttl=105 time=48.3 ms
64 bytes from x.xxx.xxx.xxx: icmp_seq=8 ttl=105 time=48.3 ms
64 bytes from x.xxx.xxx.xxx: icmp_seq=9 ttl=105 time=49.4 ms
```

Similar times were observed when measuring from us-east-2 to us-west-2.

In [None]:
# Import libraries we will need

import os.path
import pandas as pd
import matplotlib.pyplot as plt


In [None]:
def load_dataset(file_prefix):
    """Load one capture as a DataFrame, caching the parsed result as parquet.

    If ``{file_prefix}.parquet`` exists it is read and returned as-is.
    Otherwise ``{file_prefix}.json`` is parsed (``orient='values'``), its
    positional columns 0..3 are renamed to ``id``, ``time_recv``, ``latency``
    and ``trade``, the ``trade`` records are flattened into ``seq_no``,
    ``time_sent`` and ``symbol``, an ``offset`` column is derived, and the
    result is written to ``{file_prefix}.parquet`` for the next call.

    Parameters
    ----------
    file_prefix : str
        Path of the capture without its extension.

    Returns
    -------
    pandas.DataFrame
        The cached or freshly parsed capture.

    Raises
    ------
    KeyError
        If the JSON rows carry no ``trade`` column, because ``time_sent`` is
        then unavailable for computing ``offset``.
    """
    parquet_path = f'{file_prefix}.parquet'
    if os.path.exists(parquet_path):
        return pd.read_parquet(parquet_path)

    df = pd.read_json(f'{file_prefix}.json', orient='values')
    # Name the positional input columns (non-mutating rename).
    df = df.rename(columns={0: 'id', 1: 'time_recv', 2: 'latency', 3: 'trade'})
    if 'trade' in df:
        # Each `trade` cell is a mapping; lift the fields we need into columns.
        trades = df['trade']
        df['seq_no'] = trades.map(lambda trade: trade['seq_no'])
        df['time_sent'] = trades.map(lambda trade: trade['timestamp'])
        df['symbol'] = trades.map(lambda trade: trade['symbol'])
        df = df.drop(columns='trade')

    # Receive time relative to the earliest send time in this capture.
    # Series.min() is vectorized and skips NaN, unlike the builtin min().
    df['offset'] = df['time_recv'] - df['time_sent'].min()
    df.to_parquet(parquet_path, engine='fastparquet')

    return df

In [None]:
# Load the various TCP datasets (files named data-us-east-2-tcp-<N>).
# NOTE(review): the meaning of the numeric suffix (1, 2, 4, 6, 8) is not
# stated in this notebook — presumably a load/sender count; TODO confirm.
df_us_east_tcp_1 = load_dataset('./data-us-east-2-tcp-1')
df_us_east_tcp_2 = load_dataset('./data-us-east-2-tcp-2')
df_us_east_tcp_4 = load_dataset('./data-us-east-2-tcp-4')
df_us_east_tcp_6 = load_dataset('./data-us-east-2-tcp-6')
df_us_east_tcp_8 = load_dataset('./data-us-east-2-tcp-8')


In [None]:
def plot_latency(df, window=10000, step=1000):
    """Plot the rolling mean latency over time, one line per ``id``.

    For every distinct value of ``df['id']`` the matching rows are
    aggregated over a rolling window; the window's mean latency is plotted
    against the window's smallest ``offset``, and the mean of the per-window
    maximum latency is printed (one number per ``id``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``id``, ``offset`` and ``latency`` columns. It is not
        modified.
    window : int, default 10000
        Number of rows per rolling window.
    step : int, default 1000
        Evaluate the window every ``step`` rows.

    Notes
    -----
    ``latency`` and ``offset`` are divided by 1000 and labelled as
    milliseconds — presumably the raw values are in microseconds;
    TODO confirm against the collector.
    """
    for sender_id in df['id'].unique():
        # assign() builds a new frame, so nothing is written into a slice of
        # `df` (the original .loc write triggered SettingWithCopyWarning).
        sender_rows = df[df['id'] == sender_id].assign(
            latency_ms=lambda frame: frame['latency'] / 1000
        )
        # Roll over the two numeric columns we aggregate, not the whole frame.
        stats = sender_rows[['offset', 'latency_ms']].rolling(window=window, step=step).agg({
            'offset': ['min', 'max'],
            'latency_ms': ['min', 'max', 'mean']
        })

        plt.plot(
            stats[('offset', 'min')] / 1000,
            stats[('latency_ms', 'mean')],
            label=f'{sender_id} (mean)',
        )

        # Mean of the per-window maxima for this id.
        print(stats[('latency_ms', 'max')].mean())

    plt.title('Latency over time')
    plt.xlabel('Time (ms)')
    plt.ylabel('Latency (ms)')
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.show()

In [None]:
# Rolling mean latency per `id` for the data-us-east-2-tcp-1 capture.
plot_latency(df_us_east_tcp_1)

In [None]:
# Rolling mean latency per `id` for the data-us-east-2-tcp-2 capture.
plot_latency(df_us_east_tcp_2)

In [None]:
# Rolling mean latency per `id` for the data-us-east-2-tcp-4 capture.
plot_latency(df_us_east_tcp_4)

In [None]:
# Rolling mean latency per `id` for the data-us-east-2-tcp-6 capture.
plot_latency(df_us_east_tcp_6)

In [None]:
# Rolling mean latency per `id` for the data-us-east-2-tcp-8 capture.
plot_latency(df_us_east_tcp_8)

# UDP Datasets

In [None]:
# Load the various UDP datasets (files named data-us-east-2-udp-<N>).
# NOTE(review): the meaning of the numeric suffix (1, 2, 4, 6, 8) is not
# stated in this notebook — presumably a load/sender count; TODO confirm.
df_udp_1 = load_dataset('./data-us-east-2-udp-1')
df_udp_2 = load_dataset('./data-us-east-2-udp-2')
df_udp_4 = load_dataset('./data-us-east-2-udp-4')
df_udp_6 = load_dataset('./data-us-east-2-udp-6')
df_udp_8 = load_dataset('./data-us-east-2-udp-8')

In [None]:
# Rolling mean latency per `id` for the data-us-east-2-udp-1 capture.
plot_latency(df_udp_1)

In [None]:
# Rolling mean latency per `id` for the data-us-east-2-udp-2 capture.
plot_latency(df_udp_2)

In [None]:
# Rolling mean latency per `id` for the data-us-east-2-udp-4 capture.
plot_latency(df_udp_4)

In [None]:
# Rolling mean latency per `id` for the data-us-east-2-udp-6 capture.
plot_latency(df_udp_6)

In [None]:
# Rolling mean latency per `id` for the data-us-east-2-udp-8 capture.
plot_latency(df_udp_8)

In [None]:
def plot_loss(df, symbol, window=1000, step=100):
    """Plot the rolling share of received messages that follow a sequence gap.

    Rows are ordered by ``seq_no``; a row is flagged when its ``seq_no`` is
    more than 1 above the previous row's. Over a rolling window the flagged
    fraction is plotted (as a percentage) against the window's earliest
    ``time_sent``, relative to the earliest ``time_sent`` in ``df``. The line
    is added to the current matplotlib figure; the caller is responsible for
    titles, legend and ``plt.show()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``seq_no`` and ``time_sent``. It is not modified.
    symbol : object
        Only used to build the legend label ``'{symbol} (meta loss)'``.
    window : int, default 1000
        Number of rows per rolling window.
    step : int, default 100
        Evaluate the window every ``step`` rows.

    Notes
    -----
    The time axis is ``time_sent`` divided by 1000 — presumably converting
    microseconds to milliseconds; TODO confirm.
    """
    ordered = df.sort_values('seq_no')
    gap = ordered['seq_no'] - ordered['seq_no'].shift(1)
    # 1 where at least one sequence number is missing before this row.
    # The first row has no predecessor (NaN gap) and compares False -> 0.
    gap_flag = (gap > 1).astype(int)

    first_sent = df['time_sent'].min()

    flag_windows = gap_flag.rolling(window=window, step=step)
    loss_fraction = (flag_windows.sum() / flag_windows.count()).reset_index(drop=True)
    window_start = ordered['time_sent'].rolling(window=window, step=step).min()
    elapsed = (window_start - first_sent).reset_index(drop=True)

    plt.plot((elapsed / 1000), (100.0 * loss_fraction), label = f'{symbol} (meta loss)')

def plot_max_loss(df, symbol, window=1000, step=100):
    """Plot the largest sequence-number gap seen in each rolling window.

    Rows are ordered by ``seq_no`` and the difference to the previous row's
    ``seq_no`` is computed. The rolling maximum of that difference is plotted
    against the window's earliest ``time_sent``, relative to the earliest
    ``time_sent`` in ``df``. The line is added to the current matplotlib
    figure; the caller is responsible for titles, legend and ``plt.show()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``seq_no`` and ``time_sent``. It is not modified.
    symbol : object
        Only used to build the legend label ``'{symbol} (maximum loss)'``.
    window : int, default 1000
        Number of rows per rolling window.
    step : int, default 100
        Evaluate the window every ``step`` rows.

    Notes
    -----
    NOTE(review): the plotted value is the raw gap, so consecutive sequence
    numbers plot as 1 and a gap of N corresponds to N - 1 missing sequence
    numbers. The caller labels this axis 'Messages Lost'; confirm whether the
    off-by-one is intended before changing the published figures.
    """
    ordered = df.sort_values('seq_no')
    gap = ordered['seq_no'] - ordered['seq_no'].shift(1)

    first_sent = df['time_sent'].min()

    largest_gap = gap.rolling(window=window, step=step).max().reset_index(drop=True)
    window_start = ordered['time_sent'].rolling(window=window, step=step).min()
    elapsed = (window_start - first_sent).reset_index(drop=True)

    plt.plot((elapsed / 1000), largest_gap, label = f'{symbol} (maximum loss)')

def plot_loss_for_sender(df, sender_id):
    """Show two per-symbol figures for one sender's rows.

    The first figure overlays ``plot_loss`` for every distinct ``symbol`` in
    ``df``; the second overlays ``plot_max_loss`` for the same symbols. Both
    figures are titled with ``sender_id`` and shown immediately.
    """
    symbols = df['symbol'].unique()

    def draw_each_symbol(plot_fn):
        # One line per symbol, in order of first appearance.
        for symbol in symbols:
            plot_fn(df[df['symbol'] == symbol], symbol)

    def decorate(y_label):
        # Decoration shared by both figures.
        plt.title(f'{sender_id}')
        plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        plt.xlabel('Time (ms)')
        plt.ylabel(y_label)

    # Figure 1: rolling gap rate.
    draw_each_symbol(plot_loss)
    decorate('% Data Loss')
    plt.show()

    # Figure 2: rolling largest gap, with plain (non-scientific) y ticks.
    draw_each_symbol(plot_max_loss)
    decorate('Messages Lost')
    plt.ticklabel_format(style='plain', axis='y')
    plt.show()




def plot_loss_for_all(df):
    """Run ``plot_loss_for_sender`` once per distinct ``id`` in ``df``.

    Ids are visited in order of first appearance; each call receives only
    the rows carrying that id, plus the id itself.
    """
    sender_column = df['id']
    for sender in sender_column.unique():
        plot_loss_for_sender(df[sender_column == sender], sender)




In [None]:
# Per-`id`, per-symbol gap-rate and largest-gap figures for the data-us-east-2-udp-1 capture.
plot_loss_for_all(df_udp_1)



In [None]:
def print_latency_svg(df, symbol='AAPL', max_seq_no=10000, height=1000):
    """Print an SVG strip showing received runs and sequence gaps for a symbol.

    Despite the name, this visualizes loss rather than latency: the x axis is
    the sequence number, green rectangles cover runs with no gap, and dark
    red rectangles span from the last sequence number before a gap to the
    first one after it. The SVG markup is written to stdout.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``symbol`` and ``seq_no``. It is not modified.
    symbol : str, default 'AAPL'
        Only rows with this symbol are drawn.
    max_seq_no : int, default 10000
        Only rows with ``seq_no`` strictly below this value are drawn.
    height : int, default 1000
        Height of the viewBox and of every rectangle.
    """
    ordered = df[df['symbol'] == symbol].sort_values('seq_no')
    # The previous sequence number is taken before truncating, so the first
    # kept row still knows its true predecessor.
    ordered = ordered.assign(pseq_no=ordered['seq_no'].shift(1))
    ordered = ordered[ordered['seq_no'] < max_seq_no]
    jumps = ordered[(ordered['seq_no'] - ordered['pseq_no']) > 1]

    start_seq = 0
    # An empty selection would otherwise put "nan" into the viewBox.
    end_seq = 0 if ordered.empty else ordered['seq_no'].max()

    print(f'<svg viewBox="0 0 {end_seq} {height}" xmlns="http://www.w3.org/2000/svg">')

    for pseq_no, cseq_no in zip(jumps['pseq_no'], jumps['seq_no']):
        # Received run leading up to the gap, then the gap itself.
        print(f'<rect x="{start_seq}" y="0" width="{pseq_no - start_seq}" height="{height}" fill="green" stroke="black" stroke-width="0.25" />')
        print(f'<rect x="{pseq_no}" y="0" width="{cseq_no - pseq_no}" height="{height}" fill="#7c0a02" stroke="black" stroke-width="0.25"/>')
        start_seq = cseq_no

    # Received run after the last gap (or the whole strip when there is no
    # gap); previously this part of the viewBox was left blank.
    if end_seq > start_seq:
        print(f'<rect x="{start_seq}" y="0" width="{end_seq - start_seq}" height="{height}" fill="green" stroke="black" stroke-width="0.25" />')

    print('</svg>')

# Emit the received-run / gap SVG strip for the data-us-east-2-udp-1 capture.
print_latency_svg(df_udp_1)


In [None]:
# Per-`id`, per-symbol gap-rate and largest-gap figures for the data-us-east-2-udp-2 capture.
plot_loss_for_all(df_udp_2)


In [None]:
# Per-`id`, per-symbol gap-rate and largest-gap figures for the data-us-east-2-udp-4 capture.
plot_loss_for_all(df_udp_4)


In [None]:
# Per-`id`, per-symbol gap-rate and largest-gap figures for the data-us-east-2-udp-6 capture.
plot_loss_for_all(df_udp_6)


In [None]:
# Per-`id`, per-symbol gap-rate and largest-gap figures for the data-us-east-2-udp-8 capture.
plot_loss_for_all(df_udp_8)