# Statistical Arbitrage Crypto

A collection of calculations for statistical arbitrage on Binance using perpetual futures.

In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import coint
import statsmodels.api as sm
import requests
import datetime
import time
import csv
import ccxt

import seaborn as sns
import matplotlib.pyplot as plt

## Step One (Correlation)

We start by gathering a universe of USDT perpetual trading pairs that are available on Binance. Before testing for cointegration, we filter out the pairs whose prices are not strongly correlated with each other.

In [None]:
# Base assets of the candidate universe; every symbol below is quoted in USDT.
_BASE_ASSETS = (
    "BTC", "ETH", "LINK", "BNB", "TRX", "DOT", "ADA", "EOS", "LTC",
    "BCH", "XRP", "ETC", "FIL", "EGLD", "DOGE", "UNI", "THETA", "XLM",
    "SOL", "FTM", "SAND", "MANA", "AVAX", "GALA", "MATIC", "NEAR",
    "ATOM", "AAVE", "AXS", "ROSE", "XTZ", "ICX", "ALGO", "RUNE",
    "APE", "VET", "ZIL", "KNC", "XMR", "GMT", "OP", "ENS", "CHZ", "APT",
)

# Exchange symbols ("<BASE>USDT") in the same order as the base assets above.
futures_pairs = [f"{base}USDT" for base in _BASE_ASSETS]

# futures_pairs =  [
#     "BTCUSDT", "ETHUSDT"
# ]


def fetch_historical_data(pair, since, timeframe="1h"):
    """Download candles for ``pair`` from ``since`` until now and return the closes.

    The history is paged through ``fetch_ohlcv`` 1000 candles at a time. After
    every page the cursor moves to one millisecond past the last candle
    received, and the function sleeps for the client's ``rateLimit`` before
    asking for the next page. Paging stops when a request comes back empty or
    the cursor reaches the exchange client's current time.

    Args:
        pair: Symbol handed to ``fetch_ohlcv`` unchanged (e.g. "BTCUSDT").
        since: Start of the window as a millisecond timestamp.
        timeframe: Candle interval string passed to ``fetch_ohlcv``.

    Returns:
        A pandas Series of close prices indexed by the candle timestamp
        (converted from milliseconds to datetimes).
    """
    # NOTE(review): ccxt.binance looks like the spot/unified client, while the
    # notebook text talks about perpetual futures - confirm the intended market.
    exchange = ccxt.binance({'rateLimit': 1200})  # rateLimit is in milliseconds
    page_size = 1000  # candles requested per call
    rows = []
    cursor = since

    while cursor < exchange.milliseconds():
        batch = exchange.fetch_ohlcv(pair, timeframe, cursor, page_size)
        if not batch:
            break
        rows.extend(batch)
        # Resume one millisecond after the newest candle we already hold.
        cursor = batch[-1][0] + 1
        time.sleep(exchange.rateLimit / 1000)  # rateLimit ms -> seconds

    frame = pd.DataFrame(rows, columns=['timestamp', 'open', 'high', 'low', 'close', 'volume'])
    frame['timestamp'] = pd.to_datetime(frame['timestamp'], unit='ms')
    return frame.set_index('timestamp')['close']



def visualize_heatmap(correlation_matrix):
    """Show ``correlation_matrix`` as an annotated heatmap on a dark background.

    Cells are coloured with a diverging red-to-green palette centred on zero
    (red for negative, green for positive correlation) over the fixed range
    [-1, 1], and each cell is annotated with its value to two decimals.

    Args:
        correlation_matrix: Matrix passed directly to ``seaborn.heatmap``.
    """
    plt.style.use("dark_background")

    # Diverging palette running from red (hue 10) to green (hue 130).
    red_to_green = sns.diverging_palette(10, 130, s=80, l=55, as_cmap=True)

    heatmap_options = dict(
        annot=True,
        cmap=red_to_green,
        center=0,
        vmin=-1,
        vmax=1,
        fmt=".2f",
        linewidths=.5,
        cbar_kws={"shrink": 0.75},
        # White annotation text stays readable on the dark theme.
        annot_kws={"size": 10, "color": "white"},
    )

    plt.figure(figsize=(25, 25))  # large canvas so every cell label fits
    sns.heatmap(correlation_matrix, **heatmap_options)

    plt.xticks(rotation=45)
    plt.yticks(rotation=45)
    plt.title("Correlation Matrix Heatmap", fontsize=20)
    plt.tight_layout()
    plt.show()

binance = ccxt.binance()

# Look-back window: two 30-day months, expressed in milliseconds
two_months_ms = 2 * 30 * 24 * 60 * 60 * 1000
since = binance.milliseconds() - two_months_ms

# Close-price series for every symbol in the universe, keyed by symbol
dataframes = {}
for pair in futures_pairs:
    print(f"Fetching data for {pair}...")
    dataframes[pair] = fetch_historical_data(pair, since)

# One column per symbol, rows joined on the candle timestamp
combined_df = pd.concat(dataframes, axis=1)

# Pairwise correlation of the close prices
correlation_matrix = combined_df.corr()

visualize_heatmap(correlation_matrix)

# Walk the strict upper triangle (each symbol against the ones after it) so a
# pair is never reported twice and a symbol is never paired with itself.
symbols = correlation_matrix.columns
high_correlation_pairs = []
for position, crypto1 in enumerate(symbols):
    for crypto2 in symbols[position + 1:]:
        pair_correlation = correlation_matrix.loc[crypto1, crypto2]
        if pair_correlation > 0.75:
            high_correlation_pairs.append((crypto1, crypto2, pair_correlation))

# print(high_correlation_pairs)



## Step Two (Cointegration)



In [None]:


suitable_pairs = []

# Engle-Granger cointegration screen over the highly correlated candidates.
for pair1, pair2, _correlation in high_correlation_pairs:
    # Reuse the closes downloaded for the correlation step instead of hitting
    # the exchange twice more per candidate. combined_df is joined on the
    # candle timestamp, so dropping incomplete rows leaves both legs aligned
    # candle by candle. (Trimming two separately fetched series to a common
    # length can pair candles from different hours when the fetches straddle
    # an hour boundary.)
    closes = combined_df[[pair1, pair2]].dropna()
    if closes.empty:
        # No overlapping history for this pair, so there is nothing to test.
        continue

    _, pvalue, _ = coint(closes[pair1], closes[pair2])

    if pvalue < 0.05:  # reject "no cointegration" at the 5% level
        suitable_pairs.append((pair1, pair2))

print(suitable_pairs)

## Constructing the Spread

In [None]:
# Spread of every cointegrated pair, keyed by the (asset_A, asset_B) tuple
spreads = {}

for pair in suitable_pairs:
    asset_A, asset_B = pair

    # Take both legs from the already-downloaded, timestamp-joined closes so
    # they have identical length and line up candle by candle.
    prices = combined_df[[asset_A, asset_B]].dropna()
    asset_A_prices = prices[asset_A]
    asset_B_prices = prices[asset_B]

    # Hedge ratio from the regression  A = const + hedge_ratio * B.
    # The regressor must be B so that the spread below (A - hedge_ratio * B)
    # is the quantity this regression actually fits.
    asset_B_prices_const = sm.add_constant(asset_B_prices)
    model = sm.OLS(asset_A_prices, asset_B_prices_const).fit()
    # Look the slope up by its column label rather than by position.
    hedge_ratio = model.params[asset_B]

    # Spread as a plain NumPy array: A - hedge_ratio * B
    spread = asset_A_prices.to_numpy() - hedge_ratio * asset_B_prices.to_numpy()
    spreads[pair] = spread

print(spreads)

## Spread Analysis

Next we analyze each spread for mean-reverting behavior using the z-score. The z-score measures how many standard deviations a data point lies from the mean. Each z-score series is then checked for stationarity with an Augmented Dickey-Fuller test.

In [None]:
from statsmodels.tsa.stattools import adfuller


def _plot_z_score(pair, z):
    """Plot the z-score series of ``pair`` with guide lines at 0, +2 and -2."""
    plt.figure(figsize=(12, 6))
    plt.plot(z, label=f'Z-Score of {pair}')
    plt.axhline(0, color='gray', linestyle='--')
    plt.axhline(2.0, color='red', linestyle='--', label='Upper Threshold')
    plt.axhline(-2.0, color='green', linestyle='--', label='Lower Threshold')
    plt.legend()
    plt.title(f'Z-Score for {pair}')
    plt.show()


# Standardised spread (z-score) of every pair, keyed like `spreads`
z_scores = {}

for pair, spread in spreads.items():
    mean_spread = np.mean(spread)
    std_spread = np.std(spread)
    # Distance of each observation from the mean, in standard deviations
    z_scores[pair] = (spread - mean_spread) / std_spread

# Pairs whose z-score series passes the ADF test below
mean_reverting_pairs = []

for pair, z in z_scores.items():
    # adfuller returns the test statistic first and the p-value second
    adf_statistic, p_value = adfuller(z)[:2]

    # Only pairs significant at the 5% level are kept, reported and plotted
    if not p_value < 0.05:
        continue

    mean_reverting_pairs.append(pair)
    print(f"{pair} is likely mean-reverting with p-value: {p_value:.4f}")
    _plot_z_score(pair, z)

print("Mean-reverting pairs:", mean_reverting_pairs)

