In [None]:
# Includes
# os
import os
from time import sleep
import datetime
from datetime import timezone

# websockets and connectivity
from binance.websockets import BinanceSocketManager
from twisted.internet import reactor

# interactive shell
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell instead of only
# the last one; cells below that list several bare expressions rely on this.
InteractiveShell.ast_node_interactivity = "all"

# binance
from binance.client import Client
import binance.helpers as btime

# pandas
import pandas as pd
from pandas import DataFrame as df
# Render at most 100 rows but never truncate the column list.
pd.set_option("display.max_rows", 100, "display.max_columns", None)
# Format floats with five decimal places when frames are displayed.
pd.set_option('display.float_format', lambda x: '%.5f' % x)


# numpy
import numpy as np

# plotting
from matplotlib import pyplot as plt
import seaborn as sb
from mpl_toolkits.mplot3d import Axes3D

# data science utils
from scipy import stats
from scipy import fft
from scipy import signal
import statsmodels.api as sm
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pingouin as pg




In [None]:
# Load api login data

# Read the login file inside a context manager so the handle is closed even
# if parsing below raises.
with open("apilogin.txt", "r") as apifile:
    apilogin = apifile.read().split("\n")

# The first two lines are read as "<label>:<value>". Split on the first colon
# only, so a value that itself contains ':' is kept whole, and strip stray
# whitespace (for example a trailing '\r' left by Windows line endings).
# extract key
apikey = apilogin[0].split(":", 1)[1].strip()
# extract secret
apisecret = apilogin[1].split(":", 1)[1].strip()

# The credentials are intentionally not echoed: every top-level expression in
# this notebook is displayed, and a saved output cell would leak the secret.

# Trailing ';' keeps the cell output empty.
client=Client(api_key=apikey,api_secret=apisecret);

In [None]:
# Market under study: the base asset priced in the quote asset
base_asset, quote_asset = 'BTC', 'USDT'
trade_pair = f'{base_asset}{quote_asset}'

# Candle width used for the kline request
time_interval = Client.KLINE_INTERVAL_1MINUTE

# Start of the requested history, converted by binance.helpers to milliseconds
start_time = btime.date_to_milliseconds('Jan 1, 2020')

# Download the kline history from start_time onwards
# NOTE(review): limit=1000 is presumably the per-request page size -- confirm
# against the python-binance documentation.
candles = client.get_historical_klines(
    symbol=trade_pair,
    interval=time_interval,
    start_str=start_time,
    limit=1000,
)


In [None]:
# Number of candles returned by the request
len(candles)
# Verify the last known data point is today.
# The first field of a candle is treated as a millisecond timestamp (it is
# the field labelled 'Open Time UTC' further down); convert it to seconds.
end_unix_timestamp = candles[-1][0]/1000
# Build a timezone-aware UTC datetime. Without tz= the value would be shown in
# the local timezone of the machine, which disagrees with the UTC labelling
# used for this field everywhere else in the notebook.
datetime.datetime.fromtimestamp(end_unix_timestamp, tz=timezone.utc)

In [None]:
# Column labels for the kline fields; price and volume labels carry the
# symbol of the asset they are denominated in.
kline_headers = (
    ['Open Time UTC']
    + [f'{field} {quote_asset}' for field in ('Open', 'High', 'Low', 'Close')]
    + [f'Volume {base_asset}', 'Close Time UTC', f'Volume {quote_asset}', 'Number of Trades']
    + [f'Taker Buy Volume {asset}' for asset in (base_asset, quote_asset)]
)
kline_headers

In [None]:
# Associate every header with a numpy dtype: the two timestamp columns
# (positions 0 and 6) are unsigned 64-bit integers, all others 32-bit floats.
timestamp_headers = (kline_headers[0], kline_headers[6])
typedict = {
    header: (np.uint64 if header in timestamp_headers else np.float32)
    for header in kline_headers
}

# Build the klines frame in one chain:
#   * drop the last field of every candle so each row matches kline_headers
#   * cast the columns with the dtype dictionary
#   * order rows by ascending open time
#   * use the open time as index while keeping it as a regular column too
klines = (
    df(data=[entry[:-1] for entry in candles], columns=kline_headers)
    .astype(typedict)
    .sort_values(by=['Open Time UTC'], ascending=True)
    .set_index('Open Time UTC', drop=False)
)

klines.dtypes

klines.head()

In [None]:
# Per-candle move: close minus open, in quote currency and as a percentage of the open
open_label = 'Open ' + quote_asset
difference_label = 'Difference ' + quote_asset

klines[difference_label] = klines['Close ' + quote_asset].sub(klines[open_label])
# Multiply before dividing, matching 100 * difference / open
klines['Percent Change ' + quote_asset] = klines[difference_label].mul(100).div(klines[open_label])
klines.head()

In [None]:
def _lagged_difference(frame, periods):
    """Return ``frame.diff(periods)`` ready to be joined back onto ``frame``.

    The two timestamp columns are dropped by name (their differences carry no
    information for the analysis) and every remaining column label is
    prefixed with ``'DIFF <periods> '``.
    """
    return (
        frame.diff(axis='index', periods=periods)
        .drop(columns=['Open Time UTC', 'Close Time UTC'])
        .add_prefix('DIFF ' + str(periods) + ' ')
    )


# create Maker-Taker ratio: share of the quote volume that is not taker-buy volume
klines['Maker-Taker Volume ' + quote_asset + ' Ratio'] = ((klines['Volume ' + quote_asset] - klines['Taker Buy Volume ' + quote_asset]) / klines['Volume ' + quote_asset])

# create generic Ratios
# NOTE(review): a candle with zero (taker buy) volume yields inf or NaN in
# these ratios -- confirm whether such candles occur in the data.
klines['Price-Volume ' + quote_asset + ' Ratio'] = klines['Open ' + quote_asset] / klines['Volume ' + quote_asset]
klines['Trade-Volume ' + quote_asset + ' Ratio'] = klines['Number of Trades']/klines['Volume ' + quote_asset]
klines['Price-Taker Buy Volume ' + quote_asset + ' Ratio'] = klines['Open ' + quote_asset] / klines['Taker Buy Volume ' + quote_asset]
klines['Trade-Taker Buy Volume ' + quote_asset + ' Ratio'] = klines['Number of Trades']/klines['Taker Buy Volume ' + quote_asset]

# Create multiple diffs: row-over-row differences at 1, 2 and 3 rows back
klines_diff_1 = _lagged_difference(klines, 1)
klines_diff_2 = _lagged_difference(klines, 2)
klines_diff_3 = _lagged_difference(klines, 3)

# Attach the difference columns; all frames share the klines index
klines = klines.join(klines_diff_1,how='inner').join(klines_diff_2,how='inner').join(klines_diff_3,how='inner')
klines.head()

In [None]:
# Verify no NaNs except in first three rows, then remove them in place
# Build the row mask once instead of recomputing it for every statement.
nan_rows = klines.isna().any(axis=1)
klines[nan_rows].head()
klines[nan_rows].shape
# dropna removes exactly the masked rows and does not depend on the index
# labels being equal to the 'Open Time UTC' column values.
klines.dropna(axis='index', how='any', inplace=True)


In [None]:
# Single-column frame holding the open price of every candle
price_df = klines['Open ' + quote_asset].to_frame(name='Price')
price_df.head()

# First-order model: straight-line fit of price against the index values
time_idx = price_df.index.to_numpy()
price = price_df['Price'].to_numpy()
order1_regression = stats.linregress(time_idx, price)
# Evaluate the fitted line at every sample, then take what the line leaves unexplained
order1_prediction = order1_regression.intercept + order1_regression.slope*time_idx
order1_residual = price - order1_prediction

In [None]:
# Graph true price vs 1st order approximation
# Both curves are drawn on the same figure against the index values.
plt.figure(figsize=(22,6))
plt.plot(time_idx, price)
plt.plot(time_idx, order1_prediction)

# Graph 1st order residual
# A second figure is opened so the residual gets its own axes.
plt.figure(figsize=(22,6))
plt.plot(time_idx, order1_residual)

In [None]:
# Centre the first-order residual on zero (removes its DC component)
order1_residual = order1_residual - order1_residual.mean()

# n = number of samples in the price signal
n = price.shape[0]

# T = sample spacing expressed in days (24*60 samples per day)
timestep = 1/(24*60)

In [None]:
# Raw signal DCT
# scipy's unnormalised DCT-II projects the signal onto
# cos(pi*k*(2j+1)/(2n)), so coefficient k oscillates at k/(2*n*timestep)
# cycles per day. That is half the bin spacing fftfreq(n, timestep) returns,
# so the frequency axis is built from the DCT definition directly.
dct_bins = np.arange(1, n//2) # bin numbers kept below; k=0 (DC) is skipped
price_fft = fft.dct(order1_residual,norm=None)[1:n//2] # cosine amplitude of price signal
fft_freq = dct_bins/(2*n*timestep)*7 # cycles per week
fft_period = 1/fft_freq # period in weeks

# A cosine of amplitude A sitting exactly on bin k produces a coefficient of
# n*A under this normalisation, so dividing by n recovers the amplitude.
price_amplitude = np.abs(price_fft)/n

# Plot DCT by wavelength
plt.figure(figsize=(24,8))
ax = plt.gca()
ax.plot(fft_period, price_amplitude,linewidth=1,marker='o')
#ax.xaxis.set_major_locator(plt.MultipleLocator(0.1))
plt.grid()
plt.title('Discrete Cosine Transform: ' + base_asset + '-' + quote_asset)
plt.ylabel('Price Amplitude')
plt.xlabel('Period (weeks)')

# Plot DCT by frequency
plt.figure(figsize=(24,8));
ax = plt.gca();
ax.plot(fft_freq, price_amplitude,linewidth=1);
plt.grid();
plt.title('Discrete Cosine Transform: ' + base_asset + '-' + quote_asset);
plt.ylabel('Price Amplitude');
plt.xlabel('Cycles/Week');

# Raw signal
order0_residual = price-np.mean(price) # remove DC component
# In 'same' mode the zero-lag term of the autocorrelation of a length-n signal
# sits at index n//2. Slice from there so the first kept value is lag 0; a
# slice starting at n//2+1 would begin at lag 1 and shift the whole lag axis.
ac_residual = np.correlate(order0_residual, order0_residual, 'same')[n//2:] # calculate autocorrelation
ac_residual /= ac_residual[0] # normalize by the zero-lag value, so the curve starts at 1

lag = np.arange(len(ac_residual))*(1/(60*24))/7 # lag in weeks

# plot autocorrelation
plt.figure(figsize=(22,6));
ax = plt.gca();
plt.axhline(y=0, color='k',linewidth=1);
plt.plot(lag, ac_residual, linewidth=3);
plt.grid();
plt.title('Autocorrelation Function: ' + base_asset + '-' + quote_asset);
plt.ylabel('Autocorrelation');
plt.xlabel('Lag (weeks)');
ax.xaxis.set_major_locator(plt.MultipleLocator(1));

# Partial Autocorrelation
# Partial autocorrelation of the mean-removed price for lags 0..500 samples,
# computed with statsmodels' 'ld' method.
plt.figure(figsize=(22,6));
pac = sm.tsa.stattools.pacf(order0_residual, nlags=500, method='ld');
# The x axis converts the lag from samples to days via the 1/(24*60) factor.
plt.plot(np.arange(len(pac))*(1/(24*60)), pac, linewidth=3);
plt.grid();
plt.title('Partial Autocorrelation');
plt.ylabel('Partial Correlation Coefficient');
plt.xlabel('Lag (days)');



# Elapsed time of every sample in days (24*60 samples per day)
time = np.arange(len(price)) / (60*24)

# LOWESS fit of price against time; frac is chosen so that it corresponds to
# 50 samples out of the whole series. The inputs are already ordered, and the
# fitted values are returned in the input order.
price_loess_p1 = sm.nonparametric.lowess(
    endog=price,
    exog=time,
    frac=(50/len(price)),
    is_sorted=True,
    return_sorted=False,
)

# Partial autocorrelation of the smoothed price for lags 0..500 samples
price_lowess_p1_pac = sm.tsa.stattools.pacf(price_loess_p1, nlags=500, method='ld');


# plot LOWESS: raw and smoothed price over the first 1000 samples
head_window = slice(None, 1000)
plt.figure(figsize=(22,6));
ax = plt.gca();
ax.plot(time[head_window], price[head_window], linewidth=3);
ax.plot(time[head_window], price_loess_p1[head_window], linewidth=3);
plt.title('LOWESS Smoothing on Price Data Sample');
plt.xlabel('Time (days)');
plt.ylabel('Price');

# What the LOWESS curve leaves unexplained, re-centred on zero
residual_lowess = price - price_loess_p1
residual_lowess -= residual_lowess.mean()

# Scatter of the residual over the full time range
plt.figure(figsize=(22,4))
plt.scatter(time, residual_lowess, s=1)
plt.title('First-Pass LOWESS Residual')
plt.ylabel('Price Residual')
plt.xlabel('Time (days)')

nr = residual_lowess.size

# Residual DCT
# The transform applied here is scipy's unnormalised DCT-II (a cosine
# transform), so comments and titles below say "Cosine".
# NOTE(review): earlier labels called this a sine transform; switch to
# fft.dst if a DST was actually intended.
# Coefficient k of a DCT-II oscillates at k/(2*nr*timestep) cycles per day,
# which is half the bin spacing fftfreq(nr, timestep) would give.
residual_bins = np.arange(1, nr//2) # bin numbers kept below; k=0 (DC) is skipped
residual_fft = fft.dct(residual_lowess,norm=None)[1:nr//2] # cosine amplitude of residual signal
residual_freq = residual_bins/(2*nr*timestep)*7 # cycles per week
residual_period = 1/residual_freq # period in weeks (derived from this block's own frequency axis)

# A cosine of amplitude A on bin k gives a coefficient of nr*A, so divide by
# nr (the residual's own length) to recover the amplitude.
residual_amplitude = np.abs(residual_fft)/nr

# Plot DCT by wavelength
plt.figure(figsize=(24,8))
ax = plt.gca()
ax.plot(residual_period, residual_amplitude,linewidth=1,marker='o')
ax.xaxis.set_major_locator(plt.MultipleLocator(5))
plt.grid()
plt.title('Discrete Cosine Transform of LOWESS-Smoothed Price Residual: ' + base_asset + '-' + quote_asset)
plt.ylabel('Residual Amplitude')
plt.xlabel('Period (weeks)')

# Plot DCT by frequency
plt.figure(figsize=(24,8));
ax = plt.gca();
ax.plot(residual_freq, residual_amplitude,linewidth=1);
plt.grid();
plt.title('Discrete Cosine Transform of LOWESS-Smoothed Price Residual: ' + base_asset + '-' + quote_asset);
plt.ylabel('Price Amplitude');
plt.xlabel('Cycles/Week');

# Smoothed signal residual
# In 'same' mode the zero-lag term of a correlation of two length-nr signals
# sits at index nr//2. Slice from there so the first kept value is lag 0; a
# slice starting at nr//2+1 would begin at lag 1 and shift the lag axis.
ac_lowess_residual = np.correlate(residual_lowess, residual_lowess, 'same')[nr//2:] # calculate autocorrelation
ac_lowess_residual /= ac_lowess_residual[0] # normalize by the zero-lag value, so the curve starts at 1

lag = np.arange(len(ac_lowess_residual))*(1/(60*24))/7 # lag in weeks

# plot autocorrelation
plt.figure(figsize=(22,6));
ax = plt.gca();
plt.axhline(y=0, color='k',linewidth=1);
plt.plot(lag, ac_lowess_residual, linewidth=3);
plt.grid();
plt.title('LOWESS Residual Autocorrelation Function: ' + base_asset + '-' + quote_asset);
plt.ylabel('Autocorrelation');
plt.xlabel('Lag (weeks)');
ax.xaxis.set_major_locator(plt.MultipleLocator(1));

# Partial Autocorrelation of the LOWESS residual for lags 0..500 samples
plt.figure(figsize=(22,6));
lowess_pac = sm.tsa.stattools.pacf(residual_lowess, nlags=500, method='ld');
plt.plot(np.arange(len(lowess_pac))*(1/(24*60)), lowess_pac, linewidth=3);
plt.grid();
plt.title('Partial Autocorrelation');
plt.ylabel('Partial Correlation Coefficient');
plt.xlabel('Lag (days)');


klines.shape

# Correlation of the smoothed price with the raw price, sliced from the same
# zero-lag index so it lines up with the lag axis built above.
# NOTE(review): neither input has its mean removed and the title says
# "Autocorrelation" although two different series are correlated -- confirm
# that this is the intended quantity.
plt.figure(figsize=(22,4))
smoothed_price_xcorrelation = np.correlate(price_loess_p1, price, 'same')[nr//2:]
smoothed_price_xcorrelation /= smoothed_price_xcorrelation[0]
plt.plot(lag, smoothed_price_xcorrelation)
plt.title('Loess-Smoothed Autocorrelation')


In [None]:
# Analysis frame: every klines column except the two timestamps
complete_data = klines.drop(['Open Time UTC','Close Time UTC'],axis='columns')

# Labels of the four price columns, which are left out of the shifted copies
ohlc_labels = [field + ' ' + quote_asset for field in ('Open', 'High', 'Low', 'Close')]

# add difference data to complete data df
# Three copies of the frame shifted up by 1, 2 and 3 rows, each with its
# column labels suffixed ' DIFF -<shift>'.
diff1, diff2, diff3 = (
    complete_data.shift(-step)
    .drop(labels=ohlc_labels, axis='columns')
    .rename(columns={x: x + ' DIFF -' + str(step) for x in klines.columns})
    for step in (1, 2, 3)
)

# Join on the shared index and discard rows left incomplete by the shifts
complete_data = complete_data.join(other=[diff1,diff2,diff3],how='inner').dropna()

complete_data.columns
complete_data


In [None]:
# pairplot = sb.pairplot(complete_data.iloc[:,:].sample(n=1660, axis='index'), diag_kind='kde', kind='scatter',height=5, corner=True,plot_kws=dict(marker='+',linewidth=1))

# pairplot.savefig('pairplot_' + trade_pair + '.png',transparent=False)


In [None]:
# Spearman correlation for every pair of columns, Bonferroni-adjusted, on a
# random subset of rows. The sample is seeded so the table is reproducible
# across kernel restarts, and its size is capped at the number of available
# rows so a short history does not raise.
pairwise_sample = complete_data.sample(n=min(10000, len(complete_data)), random_state=0)
pairwise = pg.pairwise_corr(pairwise_sample, method='spearman', alternative='two-sided',padjust='bonf').sort_values(by='r',ascending=False)
pairwise

In [None]:
# First rows of pingouin's pcorr result computed on the full frame
# NOTE(review): presumably the pairwise partial-correlation matrix -- confirm against the pingouin docs.
pg.pcorr(complete_data).head()
# Top 50 rows of the pairwise table (already sorted by descending r)
pairwise.head(50)

In [None]:
# Trim the pairwise table down to what is inspected below:
#   * order by descending r and drop the bookkeeping columns
#   * keep only rows whose X label does not mention the base asset
#   * index by X and drop the p-adjust column
corr_data = (
    pairwise
    .sort_values(by='r', ascending=False)
    .drop(columns=['method', 'alternative', 'n', 'power', 'p-unc'])
    .loc[lambda frame: ~frame['X'].str.contains(base_asset)]
    .set_index('X')
    .drop(columns='p-adjust')
)

In [None]:
# Show the trimmed correlation table
corr_data
# Kernel density estimate of the r column, using Scott's rule for the bandwidth
plt.figure(figsize=(10,10))
corr_data.r.plot.density(bw_method='scott')
plt.title('Smoothed Distribution of\nSpearman Correlation Coefficient for Predictive Pairs')
plt.xlabel('r-value')
plt.ylabel('PDF')

In [None]:
# Group the correlation rows by their Y label (the X label is the index)
corr_summary = corr_data.groupby(by='Y')

In [None]:
# For every Y group that contains a 'Percent Change <quote>' row, show the
# rows whose |r| exceeds 0.10, strongest positive correlation first.
# The bare expression inside the loop is displayed because the notebook sets
# ast_node_interactivity = "all".
for x in corr_summary.groups:
    # Fetch the group once instead of once per use
    group = corr_summary.get_group(x)
    if ('Percent Change ' + quote_asset in group.index):
        # Filter before sorting so the boolean mask and the frame share the
        # same row order; masking the already-sorted frame forces pandas to
        # reindex the mask and emit a warning.
        group[group['r'].abs() > 0.10].sort_values(by='r', ascending=False)

In [None]:
# Correlation matrix of the full frame, computed once and reused for both tables
corr_matrix = complete_data.corr()

# Rows to keep: columns whose label does not mention the base asset. The
# configured base_asset is used rather than a hard-coded symbol so the tables
# stay correct when the trade pair changes.
non_base_columns = [colname for colname in complete_data.columns if base_asset not in colname]

# Correlation of every kept column with the Maker-Taker ratio, strongest first
maker_taker_label = 'Maker-Taker Volume ' + quote_asset + ' Ratio'
corr_matrix.loc[non_base_columns, [maker_taker_label]].sort_values(by=maker_taker_label, ascending=False)

# Correlation of every kept column with the percent price change, strongest first
percent_change_label = 'Percent Change ' + quote_asset
corr_matrix.loc[non_base_columns, [percent_change_label]].sort_values(by=percent_change_label, ascending=False)


In [None]:
# Correlation of the percent price change with every column shifted up by
# 0..9 rows: one output column per input column, one output row per shift.
percent_change = complete_data['Percent Change ' + quote_asset]
shift_steps = range(10)
lagged_correlation = df.from_dict({
    column: [percent_change.corr(complete_data[column].shift(-step)) for step in shift_steps]
    for column in complete_data.columns
})
# Express the coefficients as percentages
lagged_correlation = lagged_correlation.mul(100)



In [None]:
# Styled table of the lagged correlations for the quote-volume columns:
# one row per column, one table column per shift, shaded with a diverging
# colormap applied down each table column (axis=0).
quote_volume_columns = [col for col in lagged_correlation.columns if 'Volume ' + quote_asset in col]
coolwarm_cmap = sb.color_palette("coolwarm", as_cmap=True)
x_corr_vol = (
    lagged_correlation[quote_volume_columns]
    .transpose()
    .style
    .background_gradient(cmap=coolwarm_cmap, axis=0)
)



In [None]:
# Render the styled lagged-correlation table
x_corr_vol

In [None]:
# Lagged correlations of the quote-volume and ratio columns, selected once
# and reused for both the curves and the legend.
predictor_columns = [col for col in lagged_correlation.columns if 'Volume ' + quote_asset in col or 'Ratio' in col]
predictor_lags = lagged_correlation[predictor_columns]

plt.figure(figsize=(22,8));
plt.plot(predictor_lags);
plt.legend(predictor_lags.columns);
plt.title('Cross Correlation Between Price Change Percentage and Predictors');
plt.ylabel('r-value');
plt.xlabel('lag (minutes)');
plt.xticks(range(0,10));
plt.grid();



In [None]:
# List every column label of the analysis frame as a one-column table
df(complete_data.columns)

In [None]:
# check normality assumption in Maker-Taker Volume Ratio
mtvol_label = 'Maker-Taker Volume ' + quote_asset + ' Ratio'

# Reproducible random subset of the ratio. The size is capped at the number
# of available rows so a short history does not raise, and the draw is seeded
# so the figure is the same after a kernel restart.
mtvol_sample = complete_data.sample(n=min(100000, len(complete_data)), random_state=0)[mtvol_label]

# Standardise the sample to zero mean and unit standard deviation
mtvol_desc_stats = mtvol_sample.describe()
mtvol_sample_normalized = (mtvol_sample - mtvol_desc_stats['mean']) / mtvol_desc_stats['std']

# plot KDE, and flip for visual symmetry check: for a symmetric distribution
# the mirrored curve lies on top of the original one
kde_fig = plt.figure(figsize=(20,10));
mtvol_sample_normalized.plot.kde(lw=3,label='KDE Estimate');

(-1.0*mtvol_sample_normalized).plot.kde(lw=3,label='Reversed KDE Estimate');
plt.legend()

plt.title('KDE for the Maker-Taker Volume Ratio');

# plot QQ, and flip 180 deg for visual symmetry check
# The QQ plot uses the full column rather than the sample.
sm.qqplot(complete_data[mtvol_label],
fit=True,markersize=2,label='Maker-Taker QQ').set_size_inches((10,10));

ax = plt.gca();
# Re-draw the QQ points mirrored through the origin. The label is a raw
# string so that '\c' in the LaTeX markup is not parsed as an (invalid)
# Python escape sequence.
ax.scatter(-ax.lines[0].get_xdata(),-ax.lines[0].get_ydata(),s=2,c='r',label=r'$180^\circ$ rotated QQ Plot');
ax.axline([-4,-4],[4,4],c='k',label='Normal Distribution');

plt.title('Maker-Taker Volume Ratio QQ Plot');
plt.legend();

print('Maker-Taker Ratio Kurtosis = ' + str(complete_data[mtvol_label].kurtosis()))

# distribution is approximately normal with slight left-skew and slight negative kurtosis


In [None]:
plt.figure(figsize=(24,12));
# ax = plt.axes(projection='3d')
# Reference lines through the origin
plt.axhline(0,color='black',lw=1);
plt.axvline(0,color='black',lw=1);

# Reproducible random subset used for the scatter plots. The size is capped
# at the number of available rows and the draw is seeded so the figure is the
# same after a kernel restart.
plot_sample = complete_data.sample(n=min(16000, len(complete_data)), random_state=0)
plot_sample['Maker-Taker Volume ' + quote_asset + ' Ratio']

# x: the ratio shifted by -0.5, y: percent price change,
# colour: the row's index value
plt.scatter(plot_sample['Maker-Taker Volume ' + quote_asset + ' Ratio']-0.5,plot_sample['Percent Change ' + quote_asset],
c=plot_sample.index, ###
# c=np.log(1/((plot_sample['Price-Taker Buy Volume ' + quote_asset + ' Ratio']))),
s=3,alpha=0.3,cmap='jet');
plt.colorbar();
plt.xlabel('Maker-Taker Volume Ratio');
plt.ylabel('Price Change (%)');

#ax=plt.gca();
#plt.xlim([0,1])
#plt.ylim([-2,2])

#ax.yaxis.set_ticks(np.arange(-2,2.2,0.2));

# Symmetric-log axes with a linear region of +/-0.001 around zero.
# 'linthresh' is the current keyword; the axis-specific spellings
# 'linthreshx' / 'linthreshy' were removed from Matplotlib.
plt.yscale('symlog',linthresh = 0.001);
plt.xscale('symlog',linthresh = 0.001);
#plt.colorbar();


In [None]:
# Number of trades per sampled row, plotted against the row's index value
trade_counts = plot_sample['Number of Trades']
plt.figure(figsize=(22,8))
plt.scatter(plot_sample.index, trade_counts, s=1, alpha=0.2)


# Violin plot of the same data, with the index values as the categorical axis
# NOTE(review): this draws one violin per distinct index value -- confirm
# that this grouping is intended.
plt.figure(figsize=(22,8))
sb.violinplot(y=trade_counts, x=plot_sample.index)

# Quote-volume column of the sample as a single-column frame
quote_volume_frame = plot_sample[['Volume ' + quote_asset]]

# Histogram (500 bins) of 10000 randomly drawn rows
quote_volume_frame.sample(10000).plot.hist(bins=500)

# Summary statistics of the quote volume over the whole sample
quote_volume_frame.describe()
