In [1]:
import pandas as pd
import numpy as np
import gc
from tqdm import tqdm

# Project root on the authoring machine. Every path below is derived from it,
# so relocating the project only requires editing this single line.
PROJECT_DIR = r'C:\Users\e0817820\Desktop\tokka'
RAW_DATA_DIR = PROJECT_DIR + r'\data\raw'

# Raw input files (and the combined train file written by the load cell).
TRAIN_CSV = RAW_DATA_DIR + r'\train.csv'
TRAIN_COMBINE_CSV = RAW_DATA_DIR + r'\train_combine.csv'
TEST_CSV = RAW_DATA_DIR + r'\test.csv'
ADDITIONAL_TRAIN_CSV = RAW_DATA_DIR + r'\add_train.csv'
# Directory for model artefacts.
MODELS_DIR = PROJECT_DIR + r'\models'

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Toggle: when True, the load cell also reads ADDITIONAL_TRAIN_CSV, appends it to the
# base training frame and writes the combined frame to TRAIN_COMBINE_CSV.
use_additional_train = True

In [3]:
# Load the base training set and remember the timestamp range it covers.
train = pd.read_csv(TRAIN_CSV)
lower_limit, upper_limit = train['timestamp'].min(), train['timestamp'].max()

if use_additional_train:
    # Append the extra data, persist the combined frame, and extend the upper bound.
    additional_train = pd.read_csv(ADDITIONAL_TRAIN_CSV)
    train = pd.concat([train, additional_train], axis=0)
    train.to_csv(TRAIN_COMBINE_CSV, index=False)
    upper_limit = train['timestamp'].max()
    del additional_train

# Remove row-identifier columns when present, then discard incomplete rows.
for bookkeeping_col in ('id', 'Unnamed: 0'):
    if bookkeeping_col in train.columns:
        train.drop(columns=[bookkeeping_col], inplace=True)
train.dropna(inplace=True)

In [4]:
# (rows, columns) of the training frame after the id columns and NaN rows were removed.
print(train.shape)

(13790220, 12)


In [5]:
# Preview the first rows of the long-format training data (each row carries a `symbol`).
train.head()

Unnamed: 0,timestamp,open,high,low,close,volume,quote_asset_volume,number_of_trades,taker_buy_volume,taker_sell_volume,symbol,log_return
0,1625097600000,11.94,11.962,11.94,11.95,714.795,8542.886,24,309.891,3704.539178,AVAX,-0.008066
1,1625097600000,1.3853,1.3862,1.3821,1.3822,235752.6,326467.6,443,100826.38,139621.840322,ADA,-0.007261
2,1625097600000,35.51,35.567,35.365,35.365,5133.412,182152.9,339,2189.544,77751.422759,SOL,0.000113
3,1625097600000,303.75,304.0,302.71,302.78,3306.352,1003334.0,1194,1063.7045,322869.467118,BNB,-0.007459
4,1625097600000,0.06828,0.06832,0.06817,0.06818,1696205.0,115809.8,336,835699.0,57068.976098,TRX,-0.0121


In [6]:
# Asset symbols handled by this notebook. The order is the order in which the
# per-asset column groups are merged into the wide frame and processed.
list_cryptocurrencies = ['AVAX', 'ADA', 'SOL', 'BNB', 'TRX', 'DOGE', 'LINK', 'XRP', 'BTC', 'ETH']

In [7]:
# Pivot the long-format `train` frame into a wide frame keyed on `timestamp`,
# with one `<field>_<symbol>` column per asset and field.
per_asset_fields = ['timestamp', 'open', 'high', 'low', 'close', 'volume', 'log_return']

# Seed with an empty frame that carries every `train` column: each merge then sees
# a name clash on the un-suffixed fields and applies the `_<symbol>` suffix to the
# incoming (right-hand) columns. The un-suffixed seed columns are dropped afterwards.
combined_train = pd.DataFrame()
combined_train[train.columns] = 0
for symbol in tqdm(list_cryptocurrencies):
    asset_rows = train.loc[train["symbol"] == symbol, per_asset_fields].copy()
    combined_train = combined_train.merge(
        asset_rows, on="timestamp", how='outer', suffixes=('', "_" + symbol)
    )
combined_train = combined_train.drop(train.columns.drop("timestamp"), axis=1)
combined_train = combined_train.sort_values('timestamp', ascending=True)
# Forward-fill gaps left by the outer merge. `.ffill()` replaces the deprecated
# `fillna(method='ffill')`, which raised a FutureWarning.
combined_train = combined_train.ffill()
display(combined_train.head())

100%|██████████| 10/10 [00:17<00:00,  1.76s/it]
  combined_train = combined_train.fillna(method='ffill')


Unnamed: 0,timestamp,open_AVAX,high_AVAX,low_AVAX,close_AVAX,volume_AVAX,log_return_AVAX,open_ADA,high_ADA,low_ADA,...,low_BTC,close_BTC,volume_BTC,log_return_BTC,open_ETH,high_ETH,low_ETH,close_ETH,volume_ETH,log_return_ETH
0,1625097600000,11.94,11.962,11.94,11.95,714.795,-0.008066,1.3853,1.3862,1.3821,...,34966.09,34980.47,100.833562,-0.00303,2275.68,2275.93,2269.77,2270.06,699.78407,-0.007508
1,1625097660000,11.946,11.949,11.929,11.929,420.382,-0.008503,1.3822,1.383,1.3787,...,34974.08,34974.51,53.304605,-0.003791,2270.03,2274.49,2267.06,2267.25,709.3209,-0.008393
2,1625097720000,11.909,11.914,11.894,11.9,127.602,-0.007761,1.3788,1.3791,1.3766,...,34901.0,34951.5,102.61487,-0.0046,2267.03,2268.89,2262.71,2266.11,913.31677,-0.009965
3,1625097780000,11.908,11.908,11.896,11.9,107.587,-0.00683,1.3771,1.3785,1.3765,...,34946.43,34950.0,30.444503,-0.003673,2266.11,2268.44,2263.0,2263.01,383.03679,-0.007554
4,1625097840000,11.909,11.919,11.909,11.917,114.725,-0.009782,1.3768,1.3804,1.3768,...,34949.99,34962.89,17.500656,-0.003813,2263.01,2268.86,2263.01,2265.45,527.44357,-0.008712


In [8]:
# (rows, columns) of the merged wide frame.
print(combined_train.shape)

(1379022, 61)


In [9]:
def delay(x, d):
    """Return ``x`` lagged by ``d`` periods (rows); the first ``d`` entries become NaN."""
    lagged = x.shift(periods=d)
    return lagged

def correlation(x, y, d):
    """Rolling correlation between ``x`` and ``y`` over a trailing window of ``d`` periods."""
    trailing_window = x.rolling(window=d)
    return trailing_window.corr(y)

def delta(x, d):
    """Change in ``x`` over ``d`` periods: the current value minus the value ``d`` rows earlier."""
    change = x.diff(periods=d)
    return change

def ts_max(x, d):
    """Rolling maximum of ``x`` over a trailing window of ``d`` periods."""
    trailing_window = x.rolling(window=d)
    return trailing_window.max()

def ts_min(x, d):
    """Rolling minimum of ``x`` over a trailing window of ``d`` periods."""
    trailing_window = x.rolling(window=d)
    return trailing_window.min()

In [10]:
def bollinger_mavg(close, window=20):
    """Middle Bollinger Band: the simple moving average of ``close`` over ``window`` periods."""
    trailing_window = close.rolling(window=window)
    return trailing_window.mean()

def bollinger_hband(close, window=20, k=2):
    """Upper Bollinger Band: moving average plus ``k`` rolling standard deviations."""
    centre = bollinger_mavg(close, window)
    spread = close.rolling(window=window).std() * k
    return centre + spread

def bollinger_lband(close, window=20, k=2):
    """Lower Bollinger Band: moving average minus ``k`` rolling standard deviations."""
    centre = bollinger_mavg(close, window)
    spread = close.rolling(window=window).std() * k
    return centre - spread

def bollinger_wband(close, window=20, k=2):
    """Width of the Bollinger Bands as a percentage of the middle band.

    Parameters
    ----------
    close : pandas.Series
        Price series.
    window : int, default 20
        Number of periods in the rolling window.
    k : float, default 2
        Number of rolling standard deviations between the middle band and each outer band.

    Returns
    -------
    pandas.Series
        ``(upper - lower) / middle * 100``; NaN for the first ``window - 1`` rows.
    """
    # upper - lower = (mavg + k*std) - (mavg - k*std) = 2*k*std, so the width needs
    # only one rolling mean and one rolling std instead of rebuilding both bands
    # (which recomputed the mean three times and the std twice).
    trailing_window = close.rolling(window=window)
    mavg = trailing_window.mean()
    mstd = trailing_window.std()
    return ((2 * (mstd * k)) / mavg) * 100

def exponential_moving_average(close, span=20):
    """Exponential moving average of ``close`` with the given ``span`` (recursive form, ``adjust=False``)."""
    weighted = close.ewm(span=span, adjust=False)
    return weighted.mean()

def relative_strength_index(close, window=14):
    """Relative Strength Index from simple rolling means of gains and losses.

    The result is NaN while the window is filling and wherever both the average
    gain and the average loss are zero (0 / 0).
    """
    change = close.diff(1)
    # `where` swaps every entry that fails the condition (the leading NaN included) for 0.
    up_moves = change.where(change > 0, 0)
    down_moves = -change.where(change < 0, 0)
    avg_gain = up_moves.rolling(window=window).mean()
    avg_loss = down_moves.rolling(window=window).mean()
    relative_strength = avg_gain / avg_loss
    return 100 - (100 / (1 + relative_strength))
  
def stochastic_oscillator(high, low, close, window=14):
    """Stochastic %K: position of ``close`` within the rolling low/high range, scaled to 0-100."""
    floor = low.rolling(window=window).min()
    ceiling = high.rolling(window=window).max()
    position_in_range = (close - floor) / (ceiling - floor)
    return position_in_range * 100

def williams_r(high, low, close, window=14):
    """Williams %R: distance of ``close`` below the rolling high, as a share of the rolling range, times -100."""
    ceiling = high.rolling(window=window).max()
    floor = low.rolling(window=window).min()
    distance_from_top = (ceiling - close) / (ceiling - floor)
    return -100 * distance_from_top
  
def rate_of_change(close, window=14):
    """Rate of Change: percentage change of ``close`` relative to its value ``window`` periods earlier."""
    earlier = close.shift(window)
    return ((close - earlier) / earlier) * 100

In [11]:
def getFeatures(df):
    """Add technical-analysis features for every asset to the wide price frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame with a ``timestamp`` column and ``open_/high_/low_/close_/volume_<symbol>``
        columns for every symbol in ``list_cryptocurrencies``. Rows are assumed to be
        in time order, since every feature uses trailing windows or lags over rows.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, mutated in place: feature columns added, rows with any
        NaN removed, the raw ``high/low/open/volume`` columns dropped and duplicate
        timestamps removed.

    Notes
    -----
    The lagged features use ``rolling``/``shift``, so rows without a full look-back
    window are NaN and are removed by the ``dropna`` below. ``np.roll`` is avoided
    because it wraps values from the end of the series into the first ``lag`` rows,
    which yields plausible-looking but meaningless values.
    """
    lags_normal = [8, 20, 120]
    lags_ema = [15]
    lags_rsi = [8, 14]
    lags_stoch_osci = [10, 25]
    roc_window = [8, 15]
    williams_percentage_r = [10, 20]
    lags_wbands = [10, 50]

    # 1) Per asset: log(close / trailing mean) and log return over each lag.
    for symbol in list_cryptocurrencies:
        close = df[f'close_{symbol}']
        for lag in lags_normal:
            df[f'log_close/mean_{lag}_{symbol}'] = np.log(close / close.rolling(window=lag).mean())
            df[f'log_return_{lag}_{symbol}'] = np.log(close / close.shift(lag))

    # 2) Cross-asset averages of those features, and each asset's deviation from them.
    for lag in lags_normal:
        df[f'mean_close/mean_{lag}'] = df.loc[:, df.columns.str.startswith(f'log_close/mean_{lag}_')].mean(axis=1)
        df[f'mean_log_returns_{lag}'] = df.loc[:, df.columns.str.startswith(f'log_return_{lag}_')].mean(axis=1)
        for symbol in list_cryptocurrencies:
            df[f'log_close/mean_{lag}-mean_close/mean_{lag}_{symbol}'] = df[f'log_close/mean_{lag}_{symbol}'] - df[f'mean_close/mean_{lag}']
            df[f'log_return_{lag}-mean_log_returns_{lag}_{symbol}'] = df[f'log_return_{lag}_{symbol}'] - df[f'mean_log_returns_{lag}']

    # 3) Per asset: indicators from the notebook's helper functions, then the alpha factors.
    for symbol in list_cryptocurrencies:
        open_ = df[f'open_{symbol}']
        high = df[f'high_{symbol}']
        low = df[f'low_{symbol}']
        close = df[f'close_{symbol}']
        volume = df[f'volume_{symbol}']

        for lag in lags_ema:
            df[f'ema_close_{lag}_{symbol}'] = exponential_moving_average(close, span=lag)
        for lag in lags_rsi:
            df[f'rsi_close_{lag}_{symbol}'] = relative_strength_index(close, window=lag)
        for lag in lags_stoch_osci:
            df[f'stoch_oscil_close_{lag}_{symbol}'] = stochastic_oscillator(high, low, close, window=lag)
        for lag in roc_window:
            df[f'roc_close_{lag}_{symbol}'] = rate_of_change(close, window=lag)
        for lag in williams_percentage_r:
            df[f'williams_close_{lag}_{symbol}'] = williams_r(high, low, close, window=lag)
        for lag in lags_wbands:
            df[f'wbbands_close_{lag}_{symbol}'] = bollinger_wband(close, window=lag)

        # One-period close change, shared by Alpha#12 and Alpha#9.
        close_change = delta(close, 1)
        # Alpha#12: (sign(delta(volume, 1)) * (-1 * delta(close, 1)))
        df['alpha12_'+symbol] = np.sign(delta(volume, 1)) * (-1 * close_change)
        # Alpha#41: (((high * low)^0.5) - close)
        df['alpha41_'+symbol] = ((high * low)**0.5) - close
        # Alpha#53: (-1 * delta((((close - low) - (high - close)) / (close - low + 0.0000001)), 9))
        df['alpha53_'+symbol] = -1 * delta((((close - low) - (high - close)) / (close - low + 0.0000001)), 9)
        # Alpha#6: (-1 * correlation(open, volume, 10))
        df['alpha6_'+symbol] = -1 * correlation(open_, volume, 10)
        # Alpha#9: ((0 < ts_min(delta(close, 1), 5)) ? delta(close, 1) : ((ts_max(delta(close, 1), 5) < 0) ? delta(close, 1) : (-1 * delta(close, 1))))
        df['alpha9_'+symbol] = np.where(
            0 < ts_min(close_change, 5),
            close_change,
            np.where(ts_max(close_change, 5) < 0, close_change, -1 * close_change),
        )

    gc.collect()
    # Remove warm-up rows and any row where an indicator is undefined.
    df.dropna(inplace=True)
    # The raw OHLV inputs are not kept as features; close and log_return remain.
    raw_columns = [
        f'{field}_{symbol}'
        for symbol in list_cryptocurrencies
        for field in ('high', 'low', 'open', 'volume')
    ]
    df.drop(columns=raw_columns, inplace=True)
    df.drop_duplicates(subset=['timestamp'], inplace=True)
    # df.to_csv(r'C:\Users\e0817820\Desktop\tokka\data\processed\enriched_train2.csv', index=False)
    return df

In [12]:
def separateFeaturesForEachAsset(df):
    """Write one CSV per asset with that asset's columns plus ``timestamp``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame whose per-asset columns carry the symbol as an
        underscore-delimited token (e.g. ``close_BTC``), plus a ``timestamp`` column.

    For every symbol in ``list_cryptocurrencies`` the selected columns are written to
    ``train_<symbol>2.csv`` in the processed-data directory and the shape of the
    written frame is printed. Returns ``None``.
    """
    for symbol in list_cryptocurrencies:
        # Match the symbol as a whole token, not a substring, so a ticker that is
        # contained in another ticker cannot pull in the other asset's columns.
        columns = [col for col in df.columns if symbol in col.split('_')] + ['timestamp']
        df_asset = df[columns]
        df_asset.to_csv(f'C:\\Users\\e0817820\\Desktop\\tokka\\data\\processed\\train_{symbol}2.csv', index=False)
        print(df_asset.shape)
        gc.collect()

In [13]:
# Build the feature columns. getFeatures modifies the frame in place and returns it.
combined_train = getFeatures(combined_train)

  df[f'log_close/mean_{lag}-mean_close/mean_{lag}_{symbol}'] = np.array( df[f'log_close/mean_{lag}_{symbol}']) - np.array( df[f'mean_close/mean_{lag}']  )
  df[f'log_return_{lag}-mean_log_returns_{lag}_{symbol}'] = np.array( df[f'log_return_{lag}_{symbol}']) - np.array( df[f'mean_log_returns_{lag}'] )
  df[f'log_close/mean_{lag}-mean_close/mean_{lag}_{symbol}'] = np.array( df[f'log_close/mean_{lag}_{symbol}']) - np.array( df[f'mean_close/mean_{lag}']  )
  df[f'log_return_{lag}-mean_log_returns_{lag}_{symbol}'] = np.array( df[f'log_return_{lag}_{symbol}']) - np.array( df[f'mean_log_returns_{lag}'] )
  df[f'log_close/mean_{lag}-mean_close/mean_{lag}_{symbol}'] = np.array( df[f'log_close/mean_{lag}_{symbol}']) - np.array( df[f'mean_close/mean_{lag}']  )
  df[f'log_return_{lag}-mean_log_returns_{lag}_{symbol}'] = np.array( df[f'log_return_{lag}_{symbol}']) - np.array( df[f'mean_log_returns_{lag}'] )
  df[f'mean_close/mean_{lag}'] =  np.mean(df.iloc[:,df.columns.str.startswith(f'log_close/m

In [14]:
# Write one CSV per asset (that asset's columns plus timestamp); each written shape is printed.
separateFeaturesForEachAsset(combined_train)

(1374186, 31)
(1374186, 31)
(1374186, 31)
(1374186, 31)
(1374186, 31)
(1374186, 31)
(1374186, 31)
(1374186, 31)
(1374186, 31)
(1374186, 31)


In [15]:
# (rows, columns) of the full feature frame after feature generation.
print(combined_train.shape)

(1374186, 307)
