In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import tqdm
import pandas as pd
import pandas_ta as ta

import pathlib
import itertools
import warnings
import collections


In [229]:
from binance.enums import KLINE_INTERVAL_1HOUR, KLINE_INTERVAL_1DAY
from crypto_research.analysis.signals.technical import SIGNALS_1H, SIGNALS_1D

DATA_PATH = pathlib.Path("/Users/borsden/Projects/crypto_research/data")
def get_data(interval: int):
    """Use cached version of data instead of influxdb"""
    data_path = DATA_PATH / interval
    pairs_paths = data_path.glob("*.csv")
    for pair_path in tqdm.tqdm(pairs_paths):
        data = pd.read_csv(pair_path, index_col=['time'], parse_dates=['time'])
        # Todo: OHLC is for x:59:59 ...  - That is OK for testing purposes to round it to next time instead.
        data.index = data.index.round('H')
        yield pair_path.stem, data

        
INTERVALS = {
    KLINE_INTERVAL_1HOUR,
    KLINE_INTERVAL_1DAY
}

SIGNALS_DICT = {KLINE_INTERVAL_1HOUR: SIGNALS_1H, KLINE_INTERVAL_1DAY: SIGNALS_1D}

DATA = {}
for interval in INTERVALS:
    DATA[interval] = dict(get_data(interval))

50it [00:13,  3.68it/s]
50it [00:00, 106.87it/s]


In [249]:
from crypto_research.analysis.signals import combine_signals


def _iterator(data):
    for interval, interval_data in data.items():
        for pair, pair_data in interval_data.items():
            yield pair, pair_data, interval

iterator = list(_iterator(DATA))

PAIRS_FEATURES = collections.defaultdict(dict)
for pair, pair_data, interval in tqdm.tqdm(iterator):
    signal_dict = SIGNALS_DICT[interval]
    indicators, signals = combine_signals(pair_data, signal_dict)
    
    # # We shift to 1 because they used to be predictive signals.
    # indicators = indicators
    # signals = signals
    
    PAIRS_FEATURES[pair][interval] = signals
    
        

100%|██████████| 100/100 [02:00<00:00,  1.20s/it]


In [252]:
import numpy as np

HORIZONS = [1, 2, 3, 5, 8, 13, 21]

def get_returns_diff(close: pd.Series, horizons=HORIZONS):
    """Calculate diff returns for """
    returns_diff = []
    
    returns_diff_1 = close.pct_change()
    for horizon in horizons:
        returns_diff_horizon = (1 + returns_diff_1).rolling(window=horizon).apply(np.prod, raw=True) - 1
        # Shift it because they expect to be predictive variables.
        returns_diff_horizon = returns_diff_horizon.shift(-horizon)
        returns_diff_horizon.name = horizon
        returns_diff.append(returns_diff_horizon)
    returns_diff = pd.concat(returns_diff, axis=1)
    return returns_diff


In [253]:
PAIR_RETURNS = collections.defaultdict(dict)
for pair, pair_data, interval in tqdm.tqdm(iterator):
    PAIR_RETURNS[pair][interval] = get_returns_diff(pair_data.close)


100%|██████████| 100/100 [00:19<00:00,  5.00it/s]


In [254]:
PAIR_RETURNS['btcusdt']['1d']

Unnamed: 0_level_0,1,2,3,5,8,13,21
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-12-13 00:00:00+00:00,0.019951,0.025037,0.033240,0.213670,0.248562,0.315426,0.712761
2020-12-14 00:00:00+00:00,0.004987,0.013029,0.113258,0.205959,0.185399,0.382463,0.723828
2020-12-15 00:00:00+00:00,0.008002,0.107734,0.184025,0.237489,0.236436,0.365059,0.662164
2020-12-16 00:00:00+00:00,0.098939,0.174625,0.190448,0.208395,0.196313,0.395253,0.750305
2020-12-17 00:00:00+00:00,0.068872,0.083270,0.117136,0.064801,0.111991,0.283697,0.725021
...,...,...,...,...,...,...,...
2023-12-08 00:00:00+00:00,0.021099,0.010123,0.011872,,,,
2023-12-09 00:00:00+00:00,-0.010749,-0.009037,-0.042222,,,,
2023-12-10 00:00:00+00:00,0.001731,-0.031815,,,,,
2023-12-11 00:00:00+00:00,-0.033488,,,,,,


In [255]:
import collections

def add_prefix_to_columns(df, prefix):
    """
    Add a prefix to each column name in a dataframe.
    """
    return df.add_prefix(prefix)

def resample_daily_data_to_hourly(df):
    """
    Resample daily dataframe to hourly frequency by forward-filling.
    """
    return df.resample('H').ffill()

def merge_hourly_and_daily_data(hourly_df, daily_df):
    """
    Merge hourly and daily dataframes on the hourly timestamps.
    """
    hourly_df = add_prefix_to_columns(hourly_df, '1h_')
    daily_df = add_prefix_to_columns(daily_df, '1d_')
    daily_df_resampled = resample_daily_data_to_hourly(daily_df)
    
    return hourly_df.merge(daily_df_resampled, left_index=True, right_index=True, how='left')
        
for pair, features in tqdm.tqdm(PAIRS_FEATURES.items()):
    features['merged'] = merge_hourly_and_daily_data(
        features[KLINE_INTERVAL_1HOUR], features[KLINE_INTERVAL_1DAY]
    )

100%|██████████| 50/50 [00:04<00:00, 11.36it/s]


In [256]:

# 
# RESULTS = collections.defaultdict(dict)
# for pair, interval, features, ret, returns in tqdm.tqdm(_iterator()):
#     result = run_models(features, returns, models)
#     RESULTS[pair][(interval, ret)] = dict(result)
#     break
    

In [311]:

# def iterator_for_models():
#     for pair, _, interval in iterator:
#         returns_all = PAIR_RETURNS[pair][interval].fillna(0)
#         features = PAIRS_FEATURES[pair][interval]
#         for ret in returns_all.columns:
#             returns = returns_all[ret]
#             yield pair, interval, features, ret, returns

def categorize_by_percentile(data, lower_percentile, upper_percentile):
    """
    Categorize data based on percentile thresholds.
    """
    
    categorized_data = pd.Series(index=data.index)

    categorized_data[data < lower_percentile] = -1  # Below 25th percentile
    categorized_data[data > upper_percentile] = 1   # Above 75th percentile
    categorized_data[(data >= lower_percentile) & (data <= upper_percentile)] = 0  # Between 25th and 75th percentile

    return categorized_data

def backtest_strategy(buy_sell, actual_returns):
    """
    Backtest the trading strategy.

    """
    strategy_returns = buy_sell * actual_returns
    
    cumulative_returns = (1 + strategy_returns).cumprod()

    sharpe_ratio = np.mean(strategy_returns) / np.std(strategy_returns)
    print(sharpe_ratio)
    max_drawdown = np.min(cumulative_returns) / np.max(cumulative_returns) - 1
    results = {
        'Cumulative Returns': cumulative_returns.iloc[-1],
        'Sharpe Ratio': sharpe_ratio,
        'Max Drawdown': max_drawdown
    }

    return results

from crypto_research.analysis.models import CatBoostRegressor, LassoRegressor, RandomForestRegressor
from crypto_research.analysis.models.utils import get_IS_OS
from sklearn.metrics import mean_squared_error


In [312]:
RATIO = 0.8

pair = 'btcusdt'
interval = '1h'
features = PAIRS_FEATURES[pair][interval]
returns = PAIR_RETURNS[pair][interval]

LOWER, UPPER = 0.2, 0.8




RESULTS = collections.defaultdict(dict)
train_X, test_X = get_IS_OS(features, RATIO)

for ret_pred_distance in tqdm.tqdm(returns.columns):
    ret_X = returns[ret_pred_distance]
    # Todo: add some metrics to evaluate quality of models.
    train_y, test_y = get_IS_OS(ret_X.fillna(0), RATIO)
    models = [CatBoostRegressor(verbose=False), LassoRegressor()]
    for model in tqdm.tqdm(models):
        # model.fit(train_X, train_y, eval_set=(test_X, test_y))
        model.fit(train_X, train_y)
        
        train_predict = model.predict(train_X)
        test_predict = model.predict(test_X)
        
        lower_percentile = train_predict.quantile(LOWER)
        upper_percentile = train_predict.quantile(UPPER)
        
        categorized_train_predict = categorize_by_percentile(train_predict, lower_percentile, upper_percentile)
        categorized_test_predict = categorize_by_percentile(test_predict, lower_percentile, upper_percentile)
        
        train_metrics = backtest_strategy(categorized_train_predict, train_y)
        test_metrics = backtest_strategy(categorized_test_predict, test_y)
        RESULTS[ret_pred_distance][model.__class__.__name__] = [train_metrics, test_metrics]
    break
        

  0%|          | 0/7 [00:00<?, ?it/s]
  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:26<00:26, 26.79s/it][A

0.43526106696356176
0.0067020864632584545


  sharpe_ratio = np.mean(strategy_returns) / np.std(strategy_returns)

100%|██████████| 2/2 [00:27<00:00, 13.61s/it][A
  0%|          | 0/7 [00:27<?, ?it/s]

nan
nan





In [310]:
def result_iterator(result):
    for ret, vals in result.items():
        for model, (train, test) in vals.items():
            train = {f"TRAIN_{k}": v for k, v in train.items()}
            test = {f"TEST_{k}": v for k, v in test.items()}
            yield {
                'return': ret, 'model': model, **train, **test
            }
pd.DataFrame(list(result_iterator(result)))

Unnamed: 0,return,model,TRAIN_Cumulative Returns,TRAIN_Sharpe Ratio,TRAIN_Max Drawdown,TEST_Cumulative Returns,TEST_Sharpe Ratio,TEST_Max Drawdown
0,1,CatBoostRegressor,1.641723,0.435261,-0.772423,1.434186,0.006702,-0.441095
1,1,LassoRegressor,1.641723,,-0.772423,1.434186,,-0.441095


In [307]:
result

defaultdict(dict,
            {1: {'CatBoostRegressor': [{'Cumulative Returns': 1.641722729980654,
                'Sharpe Ratio': 0.43526106696356176,
                'Max Drawdown': -0.7724232825946666},
               {'Cumulative Returns': 1.4341862943004715,
                'Sharpe Ratio': 0.0067020864632584545,
                'Max Drawdown': -0.4410953064704307}],
              'LassoRegressor': [{'Cumulative Returns': 1.641722729980654,
                'Sharpe Ratio': nan,
                'Max Drawdown': -0.7724232825946666},
               {'Cumulative Returns': 1.4341862943004715,
                'Sharpe Ratio': nan,
                'Max Drawdown': -0.4410953064704307}]}})