In [1]:
import pandas as pd
import numpy as np
import gresearch_crypto
import xgboost as xgb
import traceback
import datetime


# Input files, read from the Kaggle dataset mount point.
TRAIN_CSV = '/kaggle/input/g-research-crypto-forecasting/train.csv'
ASSET_DETAILS_CSV = '/kaggle/input/g-research-crypto-forecasting/asset_details.csv'

df = pd.read_csv(TRAIN_CSV)

# Asset metadata ordered by Asset_ID, so position k of the weight array below
# belongs to the k-th smallest Asset_ID.
df_asset_details = pd.read_csv(ASSET_DETAILS_CSV)
df_asset_details = df_asset_details.sort_values("Asset_ID")

# Positional lookup table of weights.
# NOTE(review): indexing it with a raw Asset_ID assumes the IDs are exactly
# 0..n-1 with no gaps -- confirm against asset_details.csv.
asset_to_weight = df_asset_details["Weight"].values
df["Weight"] = df["Asset_ID"].map(lambda asset_id: asset_to_weight[asset_id])

In [2]:
def clean(df):
    """Drop, in place, every row of ``df`` containing a NaN or infinite value.

    Infinite entries are first rewritten to NaN so that a single ``dropna``
    pass removes both kinds of bad row.

    Args:
        df: DataFrame to sanitise. It is mutated in place.

    Returns:
        The same ``df`` object, so the function can be called for its side
        effect (``clean(df)``) or used in an assignment (``df = clean(df)``).
    """
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    # ``dropna(inplace=True)`` returns None, so its result must not be bound
    # back to ``df``.
    df.dropna(how="any", inplace=True)
    return df

def test_train_split(df):
    X_train = df[df['timestamp'] <= 1623542400].drop('Target', axis=1)
    y_train = df[df['timestamp'] <= 1623542400].Target
    X_test = df[df['timestamp'] > 1623542400].iloc[:-1].drop('Target', axis=1)
    y_test = df[df['timestamp'] > 1623542400].iloc[:-1].Target
    return X_train, y_train, X_test, y_test

# Sanitise the raw frame in place, then split it by timestamp into the
# train/test feature frames and target series used by the following cells.
clean(df)
X_train, y_train, X_test, y_test = test_train_split(df)

In [3]:
def process(df):
    """Add the engineered feature columns to ``df`` in place and return it.

    Columns added, in this order:
        Unity: constant 1.
        log_ret: ``log(Close / Open)``; NaN and infinite values become 0.
        weird_feature: negated difference between a row's ``log_ret`` and the
            ``Weight``-weighted mean ``log_ret`` of all rows sharing that
            row's ``timestamp``.
        norm_Open, norm_VWAP: ``Open`` and ``VWAP`` divided by ``Close``.

    Args:
        df: DataFrame with ``Open``, ``Close``, ``VWAP`` and ``Weight``
            columns. If a ``timestamp`` column is present the weighted mean
            is taken per timestamp; otherwise over the whole frame.

    Returns:
        The same ``df`` object, now carrying the extra columns.
    """
    df["Unity"] = 1

    # A zero Open or Close makes the log infinite rather than NaN, which
    # fillna alone would let through.
    raw_log_ret = np.log(df.Close / df.Open)
    df['log_ret'] = raw_log_ret.replace([np.inf, -np.inf], np.nan).fillna(0)

    weighted_ret = df['Weight'] * df['log_ret']
    if 'timestamp' in df.columns:
        # Average per timestamp so the feature means the same thing for a
        # multi-timestamp frame as for a frame holding a single time step.
        # Grouping by a plain array keeps the grouping positional.
        time_keys = df['timestamp'].to_numpy()
        market_ret = (
            weighted_ret.groupby(time_keys).transform('sum')
            / df['Weight'].groupby(time_keys).transform('sum')
        )
    else:
        market_ret = weighted_ret.sum() / df['Weight'].sum()
    df['weird_feature'] = -(df['log_ret'] - market_ret)

    # Express Open and VWAP relative to the Close price.
    ref = "Close"
    for col in ('Open', 'VWAP'):
        df["norm_" + col] = df[col] / df[ref]

    return df

# Engineer features on both splits; process() hands back the frame it was given.
X_train, X_test = process(X_train), process(X_test)

In [4]:
from sklearn.preprocessing import StandardScaler


class BestModel:
    """Ordinary least-squares linear model with no implicit intercept.

    ``fit`` estimates the coefficient vector ``beta`` that minimises
    ``||X @ beta - y||``; ``predict`` returns ``X @ beta``. No constant
    column is added here, so any intercept must be supplied as a feature.
    """

    def __init__(self):
        # Coefficient vector; stays None until fit() has been called.
        self.beta = None
        # Kept so the attribute remains available, but fit() and predict()
        # work on the unscaled features and never touch it.
        self.scaler = StandardScaler()

    def fit(self, X_train, y_train):
        """Estimate ``beta`` from the training data.

        Args:
            X_train: DataFrame of shape (n_samples, n_features).
            y_train: Series of n_samples target values.
        """
        X = X_train.values
        y = y_train.values
        # Solve the least-squares problem directly rather than forming
        # inv(X.T @ X): the explicit inverse is numerically fragile for
        # near-collinear columns and raises LinAlgError when X.T @ X is
        # singular, whereas lstsq returns the minimum-norm solution.
        self.beta, *_ = np.linalg.lstsq(X, y, rcond=None)

    def predict(self, X_test):
        """Return the predictions ``X_test @ beta`` as a NumPy array.

        Args:
            X_test: DataFrame with the same feature columns, in the same
                order, as the frame passed to ``fit``.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.beta is None:
            raise RuntimeError("BestModel.predict() called before fit()")
        return X_test.values @ self.beta

In [5]:
# Columns handed to the models, in the order the coefficients are fitted.
features = [
    "Count",
    "norm_Open",
    "Open",
    "Close",
    "Volume",
    "norm_VWAP",
    "VWAP",
    "Unity",
    'weird_feature',
]
print(X_train.columns)

Index(['timestamp', 'Asset_ID', 'Count', 'Open', 'High', 'Low', 'Close',
       'Volume', 'VWAP', 'Weight', 'Unity', 'log_ret', 'weird_feature',
       'norm_Open', 'norm_VWAP'],
      dtype='object')


In [6]:
# One independent model per row of the asset table; list position doubles as
# the Asset_ID it is trained on.
# NOTE(review): this assumes Asset_ID values are exactly 0..n-1 -- confirm.
n_assets = len(df_asset_details)
models = [BestModel() for _ in range(n_assets)]

# Hold-out predictions, initialised to NaN and filled asset by asset.
y_pred = pd.Series(data=np.full_like(y_test.values, np.nan), index=y_test.index)
for asset_ID, model in enumerate(models):
    in_train = X_train.Asset_ID == asset_ID
    in_test = X_test.Asset_ID == asset_ID

    X_asset_train, y_asset_train = X_train[in_train], y_train[in_train]
    X_asset_test = X_test[in_test]

    model.fit(X_asset_train[features], y_asset_train)
    y_pred[in_test] = model.predict(X_asset_test[features])
    print(f"Trained model for asset {asset_ID}")

Trained model for asset 0
Trained model for asset 1
Trained model for asset 2
Trained model for asset 3
Trained model for asset 4
Trained model for asset 5
Trained model for asset 6
Trained model for asset 7
Trained model for asset 8
Trained model for asset 9
Trained model for asset 10
Trained model for asset 11
Trained model for asset 12
Trained model for asset 13


# Predict & submit

References: [Detailed API Introduction](https://www.kaggle.com/sohier/detailed-api-introduction)

Something that helped me understand this iterator was adding a pdb checkpoint inside of the for loop:

```python
import pdb; pdb.set_trace()
```

See [Python Debugging With Pdb](https://realpython.com/python-debugging-pdb/) if you would like to try this but are not yet familiar with pdb.


In [7]:
# Competition submission loop: the environment yields one (features,
# prediction template) pair per iteration, and every batch must be answered
# with env.predict() before the next one is served.
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

for i, (df_test, df_pred) in enumerate(iter_test):
    print(i)
    df_pred['Target'] = np.nan

    # Rebuild the same engineered columns the models were trained on.
    df_test["Weight"] = df_test["Asset_ID"].apply(lambda x: asset_to_weight[x])
    df_test = process(df_test)
    for asset_ID, model in enumerate(models):
        # NOTE(review): the mask is built on df_test but applied to df_pred,
        # which assumes both frames share the same index -- confirm.
        asset_rows = df_test.Asset_ID == asset_ID
        if not asset_rows.any():
            # Asset absent from this batch: nothing to predict.
            continue
        df_pred.loc[asset_rows, 'Target'] = model.predict(df_test.loc[asset_rows, features])

    # Rows whose features contained NaN produce NaN predictions. 'nearest'
    # interpolation fills gaps between valid rows but can leave NaNs at the
    # start or end of the batch, so fall back to 0 for whatever remains
    # rather than submitting missing values.
    df_pred['Target'] = df_pred['Target'].interpolate('nearest').fillna(0)
    env.predict(df_pred)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
0
1
2
3


In [8]:
# Prints a fixed message (presumably a marker that the notebook ran to the end).
print("oh yes!")

oh yes!
