# G-Research predictions
Ekaterina Kryukova, Ayman Mezghani

In [None]:
import os
import gc
from datetime import datetime
import time

import math
from itertools import product

import numpy as np
import pandas as pd

from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.arima_model import ARMA, ARIMA
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statsmodels.api as sm

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler, StandardScaler

from scipy import stats

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

# https://github.com/philipperemy/keras-tcn
#from tcn import TCN

import tensorflow_probability as tfp

from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Root directory for the input data. Use the commented-out path instead of the
# local one when running inside a Kaggle kernel.
#data_path = '/kaggle/input/'
data_path = 'data/'

## Data Exploration

In [None]:
# Load the asset metadata table and the training candles from the
# competition folder under data_path.
competition_dir = data_path + "g-research-crypto-forecasting/"
info = pd.read_csv(competition_dir + "asset_details.csv")
train = pd.read_csv(competition_dir + "train.csv")

In [None]:
# Show the distinct asset names listed in the metadata table.
info.Asset_Name.unique()

In [None]:
# Show the column labels of the training table.
train.columns

In [None]:
# Convert timestamp: the raw column is interpreted as seconds since the epoch
# and replaced in place by pandas datetimes.
epoch_seconds = train['timestamp']
train['timestamp'] = pd.to_datetime(epoch_seconds, unit='s')
train

### Candlestick charts

In [None]:
# Dictionary for assets: maps each Asset_ID to its Asset_Name.
asset_details = info

# Build the mapping in a single pass over the two columns instead of
# re-filtering the whole table once per asset. Asset IDs are expected to be
# unique; with duplicates the last name listed for an ID is kept.
rename_dict = dict(zip(asset_details['Asset_ID'], asset_details['Asset_Name']))

display(rename_dict)

In [None]:
# Resample the per-asset candles to daily bars and lay them out as one wide
# table (one column per (field, Asset_ID) pair).

# How each field is aggregated over a calendar day.
daily_aggregations = {
    'Count': 'sum',
    'Open': 'first',
    'High': 'max',
    'Low': 'min',
    'Close': 'last',
    'Volume': 'sum',
}

# Collect one daily frame per asset and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the accumulated frame
# on every iteration.
daily_frames = []

for asset_id in asset_details.Asset_ID:
    train_single = train[train.Asset_ID == asset_id]

    # A single resample pass computes every aggregate for this asset.
    train_single_new = (
        train_single[['timestamp'] + list(daily_aggregations)]
        .resample('D', on='timestamp')
        .agg(daily_aggregations)
    )
    train_single_new['Asset_ID'] = asset_id

    daily_frames.append(train_single_new.reset_index(drop=False))

train_daily = pd.concat(daily_frames)

train_daily = train_daily.sort_values(by=['timestamp', 'Asset_ID']).reset_index(drop=True)

train_daily = train_daily.pivot(index='timestamp', columns='Asset_ID')[['Count', 'Open', 'High', 'Low', 'Close', 'Volume']]

train_daily.reset_index(drop=False, inplace=True)

display(train_daily.head(10))

In [None]:
# Visualize Bitcoin for recent data rows - last 200 rows
crypto_df = train

# Put the time index on the filtered Bitcoin rows only. `crypto_df` is the same
# object as `train`, so assigning `crypto_df.index` would re-index `train`
# itself and leave 'timestamp' as both an index name and a column label.
btc = crypto_df[crypto_df["Asset_ID"] == 1]  # Asset_ID = 1 for Bitcoin
btc.index = pd.to_datetime(btc["timestamp"], unit='s')
btc_mini = btc.iloc[-200:]  # Select recent data rows

fig = go.Figure(data=[go.Candlestick(x=btc_mini.index, open=btc_mini['Open'], high=btc_mini['High'], low=btc_mini['Low'], close=btc_mini['Close'])])
fig.show()

### Training Data Distribution among different Assets

In [None]:
# Number of training rows per asset, in the same Asset_ID order as the names
# shown on the x axis.
assets_sorted = asset_details.sort_values("Asset_ID")

# One value_counts pass replaces a full-table scan per asset, and the IDs come
# from the metadata table rather than a hard-coded range(14). Assets without
# any training rows get a count of 0.
asset_count = (
    train["Asset_ID"]
    .value_counts()
    .reindex(assets_sorted["Asset_ID"], fill_value=0)
    .tolist()
)

fig = px.bar(x=assets_sorted["Asset_Name"],
             y=asset_count,
             color=asset_count,
             color_continuous_scale="Emrld")

fig.update_xaxes(title="Assets")
fig.update_yaxes(title="Number of Rows")

fig.update_layout(showlegend = True,
                  title={'text': 'Data Distribution ',
                         'y':0.95,
                         'x':0.5,
                         'xanchor': 'center',
                         'yanchor': 'top'})

fig.show()

1. Data visualization for 14 popular cryptocurrencies
2. Price History for selected individual cryptocurrency
3. Basic Arima Model for price prediction

### Time history of 3 coins and their returns
https://www.kaggle.com/fangya/cryptocurrency-data-visualization-arima

In [None]:
# Impute missing time values
def fill_timestamp(asset_id, data=None):
    """Return one asset's rows on a gap-free one-minute time grid.

    The rows whose ``Asset_ID`` equals ``asset_id`` are indexed by their
    ``timestamp`` column, sorted, and re-indexed onto every minute between the
    first and last timestamp. Minutes with no row are forward-filled from the
    previous available row.

    Parameters
    ----------
    asset_id
        Value to match against the ``Asset_ID`` column.
    data
        Frame with ``Asset_ID`` and datetime ``timestamp`` columns. When
        omitted, the module-level ``train`` frame is used, looked up at call
        time.

    Returns
    -------
    A new frame indexed by minute; the input frame is not modified.
    """
    if data is None:
        # Resolve the default at call time so a reloaded or reassigned `train`
        # is picked up.
        data = train
    # Filter the frame that was passed in (the previous version always read
    # the global `train`, silently ignoring `data`).
    df = data[data["Asset_ID"] == asset_id].copy()
    df = df.set_index("timestamp").sort_index()
    df = df.reindex(pd.date_range(df.index[0], df.index[-1], freq='min'), method="pad")
    return df

In [None]:
# Gap-filled minute histories for the three coins studied below:
# Bitcoin (Asset_ID 1), Ethereum (Asset_ID 6) and Cardano (Asset_ID 3).
btc, eth, ada = (fill_timestamp(asset_id=coin_id) for coin_id in (1, 6, 3))

In [None]:
# Display the gap-filled Bitcoin frame.
btc

### Cryptocurrency Log Return Correlation Plot for 2021

In [None]:
# Log return
def log_return(series, periods=1):
    """Return the change in log(series) over ``periods`` steps.

    The first ``periods`` entries of the result are NaN because they have no
    earlier value to be compared with.
    """
    log_values = np.log(series)
    return log_values.diff(periods=periods)

In [None]:
# Display the Bitcoin rows whose index falls in 2021.
btc[btc.index.year == 2021]

In [None]:
# One-minute log returns of the close price for every asset, restricted to 2021.
all2021 = []
for asset_id, asset_name in zip(info.Asset_ID, info.Asset_Name):
    asset = fill_timestamp(asset_id, data=train)

    # take the specific timeframe
    asset = asset[asset.index.year == 2021]

    # Missing closes are left as NaN rather than replaced by 0: log(0) is -inf,
    # which turns into infinite or NaN returns. NaN returns are simply skipped
    # by the correlation computed from these series.
    # The first element is dropped because it has no previous value.
    lret = log_return(asset.Close)[1:].rename(asset_name)

    all2021.append(lret)

In [None]:
# Combine the per-asset return series into one frame, one column per asset name.
asset_names = [returns.name for returns in all2021]
all2021 = pd.concat(all2021, axis=1, keys=asset_names)

In [None]:
# Display the combined 2021 log-return frame.
all2021

In [None]:
# Plot the pairwise correlation of the 2021 log returns as a heatmap.
corr = all2021.corr()
asset_labels = corr.columns
fig = sns.heatmap(corr, xticklabels=asset_labels, yticklabels=asset_labels)

# Save the drawing through the figure that owns the returned object.
heatmap_figure = fig.get_figure()
heatmap_figure.savefig('corr.jpg')

In [None]:
def get_redundant_pairs(df):
    '''Return the (column, column) label pairs on or below the diagonal.

    For columns c0, c1, ... this is every pair (ci, cj) with j <= i, i.e. the
    self-pairs plus one ordering of each distinct pair.
    '''
    names = df.columns
    return {
        (names[row], names[col])
        for row in range(df.shape[1])
        for col in range(row + 1)
    }

def get_top_abs_correlations(df, n=5):
    """Return the ``n`` largest absolute pairwise correlations between columns.

    The result is a Series keyed by (column, column) pairs, sorted in
    descending order. Self-pairs and mirrored duplicates are removed first, so
    each pair of distinct columns appears once.
    """
    abs_corr = df.corr().abs().unstack()
    redundant = get_redundant_pairs(df)
    ranked = abs_corr.drop(labels=redundant).sort_values(ascending=False)
    return ranked.iloc[:n]

# Show the ten most strongly correlated asset pairs in the 2021 log returns.
get_top_abs_correlations(all2021, n=10)

In [None]:
# Join the BTC and ETH log returns in a single DataFrame.
lret_btc_long = log_return(btc.Close)[1:].rename('lret_btc')
lret_eth_long = log_return(eth.Close)[1:].rename('lret_eth')
two_assets = pd.concat([lret_btc_long, lret_eth_long], axis=1)

# Group consecutive rows into windows of 10000 minutes and use .corr() for the
# correlation between the two columns inside each window.
# The index is a DatetimeIndex (fill_timestamp re-indexes onto pd.date_range),
# which cannot be floor-divided by an integer. The window number is therefore
# derived from the time elapsed since the Unix epoch, giving the same bins as
# epoch_seconds // (10000 * 60).
window_length = pd.Timedelta(minutes=10000)
window_id = ((two_assets.index - pd.Timestamp(0)) // window_length).to_numpy()
corr_time = two_assets.groupby(window_id).corr().loc[:, "lret_btc"].loc[:, "lret_eth"]

corr_time.plot();
plt.xticks([])
plt.ylabel("Correlation")
plt.title("Correlation between BTC and ETH over time");

From the Correlation plot we can see Bitcoin Cash is highly correlated with EOS.IO.

Binance Coin is correlated with many cryptocurrencies, such as Bitcoin, Bitcoin Cash, Cardano and Ethereum.

We can roughly conclude that Binance Coin and Bitcoin Cash prices are strongly related to those of other cryptocurrencies, whereas Dogecoin and Monero prices are quite independent. Earlier we mentioned that Monero is donation-based, so it makes sense that it is not strongly correlated with other coins.



The correlation changes over time.

In [None]:
# Show the (rows, columns) size of the combined 2021 log-return frame.
all2021.shape

In [None]:
# Plot the Closing Price for BTC, ETH, ADA
# `f` is the module-level figure that gplot() adds its subplots to.
f = plt.figure(figsize=(10,12))  

def gplot(no , data, price, label, ylabel, color):
    """Draw one column of ``data`` as a line in a subplot of the global figure ``f``.

    Parameters
    ----------
    no
        Subplot position passed to ``f.add_subplot`` (e.g. 311).
    data
        Frame holding the series to plot.
    price
        Name of the column in ``data`` to draw.
    label
        Legend entry for the line.
    ylabel
        Label for the y axis.
    color
        Line colour.

    Returns
    -------
    The ``matplotlib.pyplot`` module, as before.
    """
    ax = f.add_subplot(no)
    # Draw on the axes that was just created rather than on pyplot's implicit
    # "current axes", so the line always lands in this subplot of `f`.
    ax.plot(data[price], label=label, color=color)
    ax.legend()
    ax.set_xlabel("Time")
    ax.set_ylabel(ylabel)
    return plt

# One stacked panel per coin: (subplot position, frame, legend label, y label, colour).
closing_price_panels = [
    (311, btc, "BTC 2021 Overall Performance", "BTC Closing Price", "Lightskyblue"),
    (312, eth, "ETH 2021 Overall Performance", "ETH Closing Price", "Coral"),
    (313, ada, "Cardano 2021 Overall Performance", "ADA Closing Price", "khaki"),
]
for panel_no, panel_data, panel_label, panel_ylabel, panel_color in closing_price_panels:
    gplot(no=panel_no, data=panel_data, price="Close", label=panel_label, ylabel=panel_ylabel, color=panel_color)

plt.tight_layout()
plt.show()

In [None]:
# Target: 15 minute residualized returns
# Residual return: an asset's residual return equals its excess return minus
# beta times the benchmark excess return.

f = plt.figure(figsize=(10,12))

# One stacked panel per coin: (subplot position, frame, legend label, y label, colour).
target_panels = [
    (311, btc, "BTC 2021 15min Return Residue", "BTC residual return", "Aqua"),
    (312, eth, "ETH 2021 15min Return Residue", "ETH residual return", "Pink"),
    (313, ada, "ADA 2021 15min Return Residue", "ADA residual return", "gold"),
]
for panel_no, panel_data, panel_label, panel_ylabel, panel_color in target_panels:
    gplot(no=panel_no, data=panel_data, price="Target", label=panel_label, ylabel=panel_ylabel, color=panel_color)

plt.tight_layout()
plt.show()

As we can see, the ETH and BTC residual returns are relatively stable compared to ADA. This suggests that ADA is the better choice for an investor looking for short-term trading opportunities.

If the investor is risk averse, BTC or ETH will be a better fit

In [None]:
# Candlestick
def c_chart(data,label):
    """Build a plotly candlestick figure from an OHLC frame.

    ``data`` must provide 'Open', 'High', 'Low' and 'Close' columns; its index
    is used for the x axis. ``label`` is inserted into the chart title. The
    figure is returned without being shown.
    """
    ohlc_trace = go.Candlestick(
        x=data.index,
        open=data['Open'],
        high=data['High'],
        low=data['Low'],
        close=data['Close'],
    )
    candlestick = go.Figure(data=[ohlc_trace])

    candlestick.update_xaxes(title_text='Time', rangeslider_visible=True)

    chart_title = {
        'text': '{:} Candelstick Chart'.format(label),
        "y": 0.8,
        "x": 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    }
    candlestick.update_layout(title=chart_title)

    candlestick.update_yaxes(title_text='Price in USD', ticksuffix='$')
    return candlestick

In [None]:
# IPython magic selecting the inline matplotlib backend for the notebook.
%matplotlib inline
# Candlestick chart of the last 90 rows of the gap-filled Bitcoin frame.
btc_candle = c_chart(btc[-90:], label="BTC Price")
btc_candle.show()

# Same chart for Ethereum.
eth_candle = c_chart(eth[-90:], label="ETH Price ")
eth_candle.show()

## Submit to Kaggle

In [None]:
# This is used to emulate env.iter_test(): a list of
# (test batch, matching sample-submission rows) pairs, one per group_num.

# Read from data_path like the training data does, so the cell also works
# when data_path points at the Kaggle input folder.
competition_dir = data_path + 'g-research-crypto-forecasting/'
test_df = pd.read_csv(competition_dir + 'example_test.csv')
sub_df = pd.read_csv(competition_dir + 'example_sample_submission.csv')
sub_df['Target'] = sub_df.Target.astype(float)


def _rows_for_group(frame, group):
    """Return the rows of `frame` in batch `group`, re-indexed from 0 and without the group_num column."""
    return frame[frame.group_num == group].reset_index(drop=True).drop(columns='group_num')


iter_test = [
    (_rows_for_group(test_df, g), _rows_for_group(sub_df, g))
    for g in test_df.group_num.unique()
]

In [None]:
# Load the data that comes just before the test set, ordered by time and then
# by asset. The file is read from data_path (not a hard-coded 'data/' folder)
# so the cell also works when data_path points at the Kaggle input folder.
sup = pd.read_csv(data_path + 'g-research-crypto-forecasting/supplemental_train.csv').sort_values(by=['timestamp', 'Asset_ID'])

In [None]:
# Only the most recent WINDOW_SIZE time steps are useful: keep the last
# WINDOW_SIZE * N_ASSETS rows.
# NOTE(review): this presumes every time step has exactly N_ASSETS rows - confirm.
rows_to_keep = WINDOW_SIZE * (N_ASSETS)
sup = sup.iloc[-rows_to_keep:]

In [None]:
# Prepare the sup data as the initial model input window. Test data will be
# appended to it one batch at a time.
sup = feature_eng(sup)

# Missing feature values are replaced by 0, matching what the prediction loop
# does for every incoming test batch; otherwise NaNs in this warm-up window
# would stay in the model input until the window has been fully replaced.
test_sample = np.array(sup[feature_cols].fillna(0))

# Arrange as (1, time steps, N_ASSETS, features): a single sample whose rows
# are grouped N_ASSETS at a time.
test_sample = test_sample.reshape(1, -1, (N_ASSETS), test_sample.shape[-1])

In [None]:
# On Kaggle, replace the emulated iterator with the real one:
#   env = gresearch_crypto.make_env()
#   iter_test = env.iter_test()
for (test_df, sample_prediction_df) in iter_test:
    # get the features
    test_df = feature_eng(test_df)

    # always sort by time, then by Asset_ID
    test_df = test_df.sort_values(by=['timestamp', 'Asset_ID'])

    # Asset and submission row of every row in this batch.
    asset_ids = test_df['Asset_ID'].to_numpy().astype(int)
    row_ids = test_df['row_id'].to_numpy()

    # Matrix of features with one slot per asset, placed by Asset_ID. Assets
    # absent from this batch keep an all-zero row, so the reshape below no
    # longer fails when a batch does not contain every asset.
    batch_features = test_df[feature_cols].fillna(0).to_numpy()
    test = np.zeros((N_ASSETS, batch_features.shape[-1]), dtype=batch_features.dtype)
    test[asset_ids] = batch_features

    # reshaping (similar to the train): one sample, one time step
    test = test.reshape(1, 1, N_ASSETS, -1)

    # Append the new time step to the previous data, keep only the last
    # WINDOW_SIZE steps
    test_sample = np.concatenate([test_sample, test], axis=1)[:, -WINDOW_SIZE:]

    # Flatten the model output to one value per asset slot.
    # NOTE(review): assumes the model emits its predictions ordered by
    # Asset_ID, as the previous index-based lookup did - confirm.
    y_pred = np.asarray(model.predict(test_sample)).reshape(-1)

    # Write each prediction to the submission row of its asset. Only assets
    # present in the batch are written; other rows keep their current Target.
    prediction_by_row_id = dict(zip(row_ids, y_pred[asset_ids]))
    new_target = sample_prediction_df['row_id'].map(prediction_by_row_id)
    sample_prediction_df['Target'] = new_target.fillna(sample_prediction_df['Target'])

    # On Kaggle, submit the batch: env.predict(sample_prediction_df)