# Loading the data

References: [Tutorial to the G-Research Crypto Competition](https://www.kaggle.com/cstein06/tutorial-to-the-g-research-crypto-competition)

In [1]:
import os
import gc
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
from numba import jit
from lightgbm import LGBMRegressor

import gresearch_crypto

# from cuml import ForestInference


TRAIN_CSV = '../input/g-research-crypto-forecasting/train.csv'
TRAIN_SUPPLEMENTAL_CSV = '../input/g-research-crypto-forecasting/supplemental_train.csv'
ASSET_DETAILS_CSV = '../input/g-research-crypto-forecasting/asset_details.csv'

SEED = 42

REMOVE_LB_TEST_OVERLAPPING_DATA = False
REMOVE_SOME_DATA_AT_THE_BEGINNING = True

EVALUATION = False

In [2]:
def fix_all_seeds(seed):
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

fix_all_seeds(SEED)

In [3]:
%%time

train_df = pd.read_csv(TRAIN_CSV, dtype={
                'timestamp': np.int32, 'Asset_ID': np.int8,
                'Open': np.float32, 'High': np.float32,
                'Low': np.float32, 'Close': np.float32,
                'Count': np.int32, 'Volume': np.float32,
                'VWAP': np.float32})

train_supplemental_df = pd.read_csv(TRAIN_SUPPLEMENTAL_CSV, dtype={
                'timestamp': np.int32, 'Asset_ID': np.int8,
                'Open': np.float32, 'High': np.float32,
                'Low': np.float32, 'Close': np.float32,
                'Count': np.int32, 'Volume': np.float32,
                'VWAP': np.float32})


gc.collect()
train_df.head()

CPU times: user 38.9 s, sys: 3.49 s, total: 42.4 s
Wall time: 1min 15s


Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1514764860,2,40,2376.580078,2399.5,2357.139893,2374.590088,19.233006,2373.116455,-0.004218
1,1514764860,0,5,8.53,8.53,8.53,8.53,78.379997,8.53,-0.014399
2,1514764860,1,229,13835.194336,14013.799805,13666.110352,13850.175781,31.550062,13827.0625,-0.014643
3,1514764860,5,32,7.6596,7.6596,7.6567,7.6576,6626.713379,7.657713,-0.013922
4,1514764860,7,5,25.92,25.92,25.874001,25.877001,121.087311,25.891363,-0.008264


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24236806 entries, 0 to 24236805
Data columns (total 10 columns):
 #   Column     Dtype  
---  ------     -----  
 0   timestamp  int32  
 1   Asset_ID   int8   
 2   Count      int32  
 3   Open       float32
 4   High       float32
 5   Low        float32
 6   Close      float32
 7   Volume     float32
 8   VWAP       float32
 9   Target     float64
dtypes: float32(6), float64(1), int32(2), int8(1)
memory usage: 947.7 MB


## Keep only values _before_ the LB test set

In [5]:
# Remove the future
train_df['datetime'] = pd.to_datetime(train_df['timestamp'], unit='s')

if EVALUATION:
    test_df = train_df[train_df['datetime'] >= '2021-06-13 00:00:00'].copy()

if REMOVE_LB_TEST_OVERLAPPING_DATA:
    train_df = train_df[train_df['datetime'] < '2021-06-13 00:00:00']
    
if REMOVE_SOME_DATA_AT_THE_BEGINNING:
    train_df = train_df[train_df['datetime'] >= '2018-07-01 00:00:00']

In [6]:
df_asset_details = pd.read_csv(ASSET_DETAILS_CSV).sort_values("Asset_ID")
df_asset_details

Unnamed: 0,Asset_ID,Weight,Asset_Name
1,0,4.304065,Binance Coin
2,1,6.779922,Bitcoin
0,2,2.397895,Bitcoin Cash
10,3,4.406719,Cardano
13,4,3.555348,Dogecoin
3,5,1.386294,EOS.IO
5,6,5.894403,Ethereum
4,7,2.079442,Ethereum Classic
11,8,1.098612,IOTA
6,9,2.397895,Litecoin


# Preprocessing

Concat train and train_supplemental

In [7]:
train_df = pd.concat([train_df, train_supplemental_df])

In [8]:
train_df.sort_values(by='timestamp', inplace=True)

### Forward filling missing values

In [9]:
def asset_ffill(grp):
    new_index = range(grp.index.get_level_values(1)[0],grp.index.get_level_values(1)[-1]+60,60)
    return grp.reset_index(level=0,drop=True).reindex(new_index, method='pad')

train_df = train_df.set_index(['Asset_ID', 'timestamp'])
train_df = train_df.groupby(level='Asset_ID').apply(asset_ffill).reset_index()
train_df['datetime'] = pd.to_datetime(train_df['timestamp'], unit='s')


# Training

## Feature engineering

In [10]:
%%time

# Two new features from the competition tutorial
def upper_shadow(df):
    return df['High'] - np.maximum(df['Close'], df['Open'])

def lower_shadow(df):
    return np.minimum(df['Close'], df['Open']) - df['Low']

# define function to compute log returns
def log_return(series, periods=1):
    return np.log(series).diff(periods=periods)

# Consecutive price trends
@jit(nopython=True)
def create_up_down(price_array, window):
    pastUD = np.zeros(len(price_array))
    for i in range(window+1, len(price_array)):
        pastUD[i] = window - 2*np.sum(price_array[i-window:i] < price_array[i-window-1:i-1])
    return pastUD

# A utility function to build features from the original df
# It works for rows to, so we can reutilize it.
def calc_features(df, row=False):
    df_feat = df
    df_feat["Upper_Shadow"] = upper_shadow(df_feat)
    df_feat["Lower_Shadow"] = lower_shadow(df_feat)

    ## Adding some more features
    df_feat["Close/Open"] = (df_feat["Close"] / df_feat["Open"]).astype(np.float32)
    df_feat["Close-Open"] = (df_feat["Close"] - df_feat["Open"]).astype(np.float32) 
    df_feat["High-Low"] = (df_feat["High"] - df_feat["Low"]).astype(np.float32)
    df_feat["High/Low"] = (df_feat["High"] / df_feat["Low"]).astype(np.float32)
    
    df_feat['Mean'] = df_feat[['Open', 'High', 'Low', 'Close']].mean(axis=1).astype(np.float32)

    df_feat["High/Mean"] = (df_feat["High"] / df_feat["Mean"]).astype(np.float32)
#     df_feat["Low/Mean"] = df_feat["Low"] / df_feat["Mean"]
    df_feat["Volume/Count"] = (df_feat["Volume"] / (df_feat["Count"] + 1) ).astype(np.float32)

    ## possible seasonality, datetime  features (unlikely to me meaningful, given very short time-frames)
    ### to do: add cyclical features for seasonality
    times = pd.to_datetime(df_feat["timestamp"],unit="s",infer_datetime_format=True)

    df_feat["hour"] = times.dt.hour  # .dt
    df_feat["dayofweek"] = times.dt.dayofweek 
    df_feat["day"] = times.dt.day 

#     df_feat["Median"] = df_feat[["Open", "High", "Low", "Close"]].median(axis=1)
#     df_feat["High/Median"] = df_feat["High"] / df_feat["Median"]
#     df_feat["Low/Median"] = df_feat["Low"] / df_feat["Median"]
    
    # [log_return(btc.VWAP,periods=5), log_return(btc.VWAP,periods=1).abs()
    
    df_feat['Close/VWAP'] = (df_feat['Close'] / df_feat['VWAP']).astype(np.float32)
    
    # Lags and rollings
    df_feat_close_grpby = df_feat.groupby('Asset_ID')['Close']
    df_feat_vwap_grpby = df_feat.groupby('Asset_ID')['VWAP']
    
    for window in [15, 30, 60]:#
        df_feat['sma{}'.format(window)] = df_feat_close_grpby.transform(lambda x: x.rolling(window).mean() / x - 1).astype(np.float32)
        df_feat['log_return{}'.format(window)] = df_feat_close_grpby.transform(lambda x: log_return(x, window)).astype(np.float32)
        df_feat['up_down{}'.format(window)] = df_feat_close_grpby.transform(lambda x: create_up_down(x.values, window)).astype(np.float32)
        
#     df_feat['up_down30'] = df_feat_close_grpby.transform(lambda x: create_up_down(x.values, 30)).astype(np.float32)

    df_feat['sma_diff15-60'] = (df_feat['sma15'] - df_feat['sma60']).astype(np.float32)
    
    df_feat['ma_close/vwap15'.format(15)] = df_feat.groupby('Asset_ID')['Close/VWAP'].transform(lambda x: x.rolling(15).mean()).astype(np.float32)
    
    df_feat['log_return1'] = df_feat_close_grpby.transform(lambda x: log_return(x, 1)).astype(np.float32)
    
    df_feat['vwap_log_return1'] = df_feat_vwap_grpby.transform(lambda x: log_return(x, 1)).astype(np.float32)
    df_feat['vwap_log_return5'] = df_feat_vwap_grpby.transform(lambda x: log_return(x, 5)).astype(np.float32)
    df_feat['vwap_log_return15'] = df_feat_vwap_grpby.transform(lambda x: log_return(x, 15)).astype(np.float32)
    
    df_feat['volume_rmean5'] = df_feat.groupby('Asset_ID')['Volume'].transform(lambda x: x.rolling(5).mean()).astype(np.float32)
    df_feat['volume_rmean15'] = df_feat.groupby('Asset_ID')['Volume'].transform(lambda x: x.rolling(15).mean()).astype(np.float32)
    
#     df_feat['close/open_rmean15'] = df_feat.groupby('Asset_ID')['Close/Open'].transform(lambda x: x.rolling(15).mean())
    
    # Grouping by timestamp (current market condition)
    df_feat['time_sma15'] = df_feat.groupby('timestamp')['sma15'].transform('mean').astype(np.float32)
#     df_feat['time_up_down30'] = df_feat.groupby('timestamp')['up_down30'].transform('mean').astype(np.float32)
#     df_feat['time_sma1440'] = df_feat.groupby('timestamp')['sma1440'].transform('mean')
#     df_feat['time_sma60'] = df_feat.groupby('timestamp')['sma60'].transform('mean') # Score got worse, need CV test

#     df_feat['time_log_return15'] = df_feat.groupby('timestamp')['log_return15'].transform('mean') # Score slightly got worse
    
#     df_feat['time_volume_rmean15'] = df_feat.groupby('timestamp')['volume_rmean15'].transform('mean') # Score got worse
#     df_feat['time_volume_rmean5'] = df_feat.groupby('timestamp')['volume_rmean5'].transform('mean')
#     df_feat['time_ma_close/vwap_mean'] = df_feat.groupby('timestamp')['ma_close/vwap15'].transform('mean')vwap_log_return15

#     df_feat['time_close/open_rmean15_mean'] = df_feat.groupby('timestamp')['close/open_rmean15'].transform('mean')

    df_feat = df_feat.fillna(0)
        
    return df_feat

train_df = calc_features(train_df)
train_df = train_df.dropna(how="any")
gc.collect()

  result = getattr(ufunc, method)(*inputs, **kwargs)


CPU times: user 3min 48s, sys: 18.3 s, total: 4min 6s
Wall time: 4min 6s


22

In [11]:
if EVALUATION:
    test_df = calc_features(test_df)

In [12]:
train_df.head(30)

Unnamed: 0,Asset_ID,timestamp,Count,Open,High,Low,Close,Volume,VWAP,Target,...,up_down60,sma_diff15-60,ma_close/vwap15,log_return1,vwap_log_return1,vwap_log_return5,vwap_log_return15,volume_rmean5,volume_rmean15,time_sma15
0,0,1530403260,29,14.6931,14.722,14.69,14.69,604.219971,14.699802,0.003204,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1530403320,109,14.69,14.7222,14.6501,14.6701,4506.430176,14.694633,-0.000143,...,0.0,0.0,0.0,-0.001355,-0.000352,0.0,0.0,0.0,0.0,0.0
2,0,1530403380,26,14.6783,14.6918,14.662,14.678,591.23999,14.676436,-0.004589,...,0.0,0.0,0.0,0.000538,-0.001239,0.0,0.0,0.0,0.0,0.0
3,0,1530403440,36,14.683,14.73,14.675,14.7076,823.119995,14.712523,-0.000531,...,0.0,0.0,0.0,0.002014,0.002456,0.0,0.0,0.0,0.0,0.0
4,0,1530403500,16,14.708,14.7181,14.6896,14.6896,211.289993,14.706444,-0.004728,...,0.0,0.0,0.0,-0.001225,-0.000413,0.0,0.0,1347.26001,0.0,0.0
5,0,1530403560,10,14.7128,14.7128,14.69,14.701,87.800003,14.705889,-0.006988,...,0.0,0.0,0.0,0.000776,-3.8e-05,0.000414,0.0,1243.976074,0.0,0.0
6,0,1530403620,11,14.71,14.71,14.701,14.71,236.759995,14.705564,-0.005472,...,0.0,0.0,0.0,0.000612,-2.2e-05,0.000744,0.0,390.041992,0.0,0.0
7,0,1530403680,89,14.71,14.7485,14.694,14.694,2551.590088,14.721313,-0.011141,...,0.0,0.0,0.0,-0.001088,0.00107,0.003053,0.0,782.112,0.0,0.0
8,0,1530403740,68,14.6851,14.7499,14.6746,14.7204,1385.589966,14.705611,-0.011512,...,0.0,0.0,0.0,0.001795,-0.001067,-0.00047,0.0,894.606018,0.0,0.0
9,0,1530403800,42,14.7204,14.7428,14.6939,14.7413,1356.959961,14.717756,-0.006135,...,0.0,0.0,0.0,0.001419,0.000826,0.000769,0.0,1123.73999,0.0,0.0


In [13]:
train_df['datetime']

0          2018-07-01 00:01:00
1          2018-07-01 00:02:00
2          2018-07-01 00:03:00
3          2018-07-01 00:04:00
4          2018-07-01 00:05:00
                   ...        
25573784   2022-01-09 23:56:00
25573785   2022-01-09 23:57:00
25573786   2022-01-09 23:58:00
25573787   2022-01-09 23:59:00
25573788   2022-01-10 00:00:00
Name: datetime, Length: 25573789, dtype: datetime64[ns]

### Prepare weights for evaluation

In [14]:
# https://stackoverflow.com/questions/38641691/weighted-correlation-coefficient-with-pandas
def wmean(x, w):
    return np.sum(x * w) / np.sum(w)

def wcov(x, y, w):
    return np.sum(w * (x - wmean(x, w)) * (y - wmean(y, w))) / np.sum(w)

def wcorr(x, y, w):
    return wcov(x, y, w) / np.sqrt(wcov(x, x, w) * wcov(y, y, w))

def eval_wcorr(preds, train_data):
    w = train_data.add_w.values.flatten()
    y_true = train_data.get_label()
    return 'eval_wcorr', wcorr(preds, y_true, w), True

#create dictionnary of weights
dict_weights = {}
for i in range(df_asset_details.shape[0]):
    dict_weights[df_asset_details.iloc[i,0]] = df_asset_details.iloc[i,1]

#     train_df['weights'] = train_df.Asset_ID.map(dict_weights).astype('float32')
if EVALUATION:
    test_df['weights'] = test_df.Asset_ID.map(dict_weights).astype('float32')
    test_df['preds'] = 0

## Loop over all assets

In [15]:
%%time

params = {'n_estimators': 600,
        'objective': 'regression',
        'metric': 'None',
        'boosting_type': 'gbdt',
        'max_depth': -1, 
        'learning_rate': 0.01,
        'seed': SEED,
        'verbose': -1,
        }


models = {}

for asset_id, asset_name in zip(df_asset_details['Asset_ID'], df_asset_details['Asset_Name']):
    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")
    
    asset_df = train_df[train_df["Asset_ID"] == asset_id]
    
        
    features = train_df.columns.difference(['timestamp', 'Open', 'High', 'Low', 'Close', 
                                        'VWAP', 'Mean', 'Asset_ID', 'datetime', 'Target'])  

    X = asset_df[features]
    y = asset_df['Target']
    
    model = LGBMRegressor(**params)
    model.fit(X, y)
    
    models[asset_id] = model
    
    # Evaluation
    if EVALUATION:
        x_test = test_df.loc[test_df["Asset_ID"] == asset_id, features]
        preds = model.predict(x_test)
        test_df.loc[test_df["Asset_ID"] == asset_id, 'preds'] = preds
        score = np.corrcoef(test_df.loc[test_df['Asset_ID'] == asset_id, 'preds'],
                            test_df.loc[test_df['Asset_ID'] == asset_id, 'Target'])[0,1]
        print(f'Test score for {asset_name}: ', f"{score:.4f}")

Training model for Binance Coin     (ID=0 )
Training model for Bitcoin          (ID=1 )
Training model for Bitcoin Cash     (ID=2 )
Training model for Cardano          (ID=3 )
Training model for Dogecoin         (ID=4 )
Training model for EOS.IO           (ID=5 )
Training model for Ethereum         (ID=6 )
Training model for Ethereum Classic (ID=7 )
Training model for IOTA             (ID=8 )
Training model for Litecoin         (ID=9 )
Training model for Maker            (ID=10)
Training model for Monero           (ID=11)
Training model for Stellar          (ID=12)
Training model for TRON             (ID=13)
CPU times: user 41min 42s, sys: 8.01 s, total: 41min 50s
Wall time: 10min 47s


In [16]:
if EVALUATION:
    final_score = wcorr(test_df['preds'], test_df['Target'], test_df['weights'])
    print(f'Final test score: {final_score:.4f}')

In [17]:
# Check the model interface
x = train_df.iloc[1]
x = x[features]
y_pred = models[0].predict([x])
y_pred[0]

-6.7466836125097135e-06

# Predict & submit

References: [Detailed API Introduction](https://www.kaggle.com/sohier/detailed-api-introduction)

Something that helped me understand this iterator was adding a pdb checkpoint inside of the for loop:

```python
import pdb; pdb.set_trace()
```

See [Python Debugging With Pdb](https://realpython.com/python-debugging-pdb/) if you want to use it and you don't know how to.


In [18]:
all_df_test = []

# gresearch_crypto.make_env.__called__ = False

env = gresearch_crypto.make_env()
iter_test = env.iter_test()

history = pd.DataFrame()

for i, (df_test, df_pred) in tqdm(enumerate(iter_test)):
    df_test_opt = df_test.astype({
                'timestamp': np.int32, 'Asset_ID': np.int8,
                'Open': np.float32, 'High': np.float32,
                'Low': np.float32, 'Close': np.float32,
                'Count': np.int32, 'Volume': np.float32,
                'VWAP': np.float32})
    
    history = pd.concat([history, df_test_opt])
    history = history.iloc[-856:] # Keeping only last needed rows
    df_test_feat = calc_features(history.copy()) 
    
    df_test_feat = df_test_feat.iloc[-len(df_test):] # Getting latest test data
    try:
        df_pred['Target'] = df_test_feat.apply(lambda row: models[row['Asset_ID']].predict([row[features]])[0], axis=1)
    except:
        df_pred['Target'] = 0

    # Send submissions
    env.predict(df_pred)

0it [00:00, ?it/s]

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


4it [00:00,  7.76it/s]
