In [None]:
# BRANCH_NAME: start_over
# COMMIT: 997cdced3a412ba7b3382a37bf345cfd19457985
# COMMIT_MSG: renaming preprocessing to pipeline, and moving ingest data to data.py

# MESSAGE: MAYOR REFACTORING, SAME MODEL FIXING BUG OF LEAKAGE
# LASTEST_COMMIT_DATE: 2021-12-02 20:35:37
# DATE: 2021-12-02 21:13:56

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from pathlib import Path
import os
import sys
from IPython.display import display

In [3]:
def on_kaggle() -> bool:
    try:
        import gresearch_crypto
        return True
    except ModuleNotFoundError:
        return False

In [4]:
# HYPER PARAMETERS
ON_KAGGLE = on_kaggle()
SAMPLE_LEVEL = 1
FORCE_REWRITE = True

In [5]:
if not ON_KAGGLE and os.path.abspath('.').endswith('notebook'):
    os.chdir('../')

In [6]:
# IMPORTED FROM src/data.py
import pandas as pd
import numpy as np

# INGEST DATA
# DATASET DTYPES FOR SAVING MEMORY
DTYPES = {'Asset_ID': 'int32',
          'Open': 'float32',
          'High': 'float32',
          'Low': 'float32',
          'Close': 'float32',
          'VWAP': 'float32'}


def merge_asset_details(df: pd.DataFrame, asset_details_path: str) -> pd.DataFrame:
    asset_details = pd.read_csv(asset_details_path)
    df = df.merge(asset_details[['Asset_ID', 'Asset_Name']], on='Asset_ID', how='left')
    assert df['Asset_Name'].isna().sum() == 0, 'unexpected Asset ID'
    return df


def infer_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    # replace inf with NaNs
    df = df.replace([np.inf, -np.inf], np.nan)
    return df.astype(DTYPES)


def date_to_timestamp(dates: pd.Series) -> pd.Series:
    return dates.astype(np.int64) // 10 ** 9


def create_valid_timestamp_range(data: pd.DataFrame, dt_col: str = 'timestamp') -> np.ndarray:
    start_ttp, end_ttp = data[dt_col].agg(('min', 'max'))
    return np.arange(start_ttp, end_ttp+60, 60)
    

def fill_gaps_with_timestmap(data: pd.DataFrame, dt_col: str = 'timestamp') -> pd.DataFrame:
    assert data[dt_col].duplicated().sum() == 0, f'{dt_col} contains duplicates, cant reindex from duplicated values'
    valid_ttp_range = create_valid_timestamp_range(data, dt_col)
    data = data.set_index(dt_col)
    filled_data = data.reindex(valid_ttp_range)
    return filled_data.reset_index().rename(columns={'index': dt_col})


def fill_gaps_crypto_data(data: pd.DataFrame,
                          dt_col: str = 'timestamp'):
    
    asset_id = np.unique(data['Asset_ID'])
    assert len(asset_id) == 1, 'expected one Asset_ID'
    data = fill_gaps_with_timestmap(data, dt_col)
    data['Asset_ID'] = int(asset_id[0])
    return data


def get_mask_for_asset(data: pd.DataFrame, asset_id: int):
    return (data['Asset_ID'] == asset_id)


def get_data_for_asset(data: pd.DataFrame, asset_id: int):
    mask = get_mask_for_asset(data, asset_id)
    return data.loc[mask, :].reset_index(drop=True)



# IMPORTED FROM src/pipeline/feature_gen.py
import pandas as pd
import numpy as np
from typing import List, Callable, Dict
from scipy.stats import linregress


feature_dict_dtype = Dict[str, List[Callable]]
RAW_FEATURES = ['Count', 'Open', 'High', 'Low', 'Close',
                'Volume', 'VWAP']
SUFFIX_FOMRAT = '__{n}min_ft'


# FEATURE GEN FUNCTION
def log_return(x: pd.Series, periods: int = 1) -> pd.Series:
    return np.log(x).diff(periods=periods).fillna(0)


def realized_volatility(series: pd.Series) -> float:
    return np.sqrt(np.sum(np.power(series.to_numpy(), 2)))


def linear_slope(series: pd.Series) -> float:
    linreg = linregress(np.arange(len(series)), series)
    return linreg.slope

# UTIL
def join_columns(columns):
    return list(map(lambda f: '__'.join(map(str, f)), columns))

BASE_FEATURES_TO_DROP = ['Open']
def compute_instant_features(df: pd.DataFrame) -> pd.DataFrame:
    assert np.isin(RAW_FEATURES, df.columns).all(), \
           'missing raw features'

    # normalize High and Low features
    df['High'] = df['High'] / df['Open']
    df['Low'] = df['Low'] / df['Open']
    # create price features 
    # TODO: is it better to take the log or not?
    # TODO: is it better to take the ratio or the difference?
    # ratio will normalize features but if we use a single model for each asset will make no difference
    # for the time, lets take the ratio
    df['high_low_return'] = np.log1p(df['High'] / df['Low'])
    df['open_close_return'] = np.log1p(df['Close'] / df['Open'])
    df['upper_shadow'] = df['High'] / np.maximum(df['Close'], df['Open'])
    df['lower_shadow'] = np.minimum(df['Close'], df['Open']) / df['Low']

    # vol and count features
    # TODO: is it useful dolar_amount?
    df['dolar_amount'] = df['Close'] * df['Volume']
    df['vol_per_trades'] = df['Volume'] / df['Count']
    return df.drop(BASE_FEATURES_TO_DROP, axis=1)



# # FEATURES TO COMPUTE
# FEATURE_DICT = {'High': [np.max],
#                 'Low': [np.min],
#                 'Close': [np.mean],
#                 'price_return_1': [np.sum, realized_volatility],
#                 'vwap_return_1': [np.sum, realized_volatility],
#                 'Count': [np.sum, np.max],
#                 'Volume': [np.sum, np.max],
#                 'high_low_return': [np.mean],
#                 'open_close_return': [np.mean],
#                }

# FEATURE_DICT = {
#                 'Close': [np.mean],
#                }



# def map_function_to_dataframe(X: pd.DataFrame,
#                  feature_dict: feature_dict_dtype) -> Dict[str, float]:
#     features = {f'{name}__{func.__name__}': func(X[name])
#                 for name, func_list in feature_dict.items()
#                 for func in func_list}
#     return features


# def compute_features_on_inference(X: pd.DataFrame, n: int,
#                                  feature_dict: feature_dict_dtype) -> pd.DataFrame:
#     features = map_function_to_dataframe(X.tail(n), feature_dict)
#     return pd.DataFrame([features]).add_suffix(SUFFIX_FOMRAT.format(n=n)).astype(np.float32)


# def compute_features_on_train(X: pd.DataFrame, n: int,
#                              feature_dict: feature_dict_dtype) -> pd.DataFrame:
#     assert X['Asset_ID'].nunique() == 1, \
#            'expected only one Asset_ID'
    
#     mov_features = X.rolling(n, min_periods=1).agg(feature_dict)
#     mov_features.columns = join_columns(mov_features.columns)
#     mov_features = mov_features.add_suffix(SUFFIX_FOMRAT.format(n=n))
    
#     assert len(mov_features) == len(X), 'output lenght do not match the input lenght'
#     return mov_features.astype(np.float32)


# IMPORTED FROM src/pipeline/transforms.py
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from typing import List
import numpy as np
from pandas.api.types import is_numeric_dtype


def check_target(y):
    assert np.isnan(y).sum() == 0, 'target has nan'
    assert np.isinf(y).sum() == 0, 'target has inf'


def check_features(X: pd.DataFrame,
                   allow_nan: bool = False):
    for name, value in X.items():
        assert np.isinf(value).sum() == 0, f'{name} has inf values'
        assert allow_nan or np.isnan(value), f'{name} has NaNs'
        assert is_numeric_dtype(value), f'{name} is not numeric'


class FilterFeatures(BaseEstimator, TransformerMixin):
    def __init__(self,
                 features: List[str] = None,
                 sort: bool = False,
                  allow_nan: bool = True):
        self.sort = sort
        self.features = features[:]
        self.allow_nan = allow_nan
        if self.sort:
            self.features.sort()
    
    def fit(self, X: pd.DataFrame, y=None):
        return self
    
    def transform(self, X: pd.DataFrame):
        outX = X.loc[:, self.features]
        check_features(outX, allow_nan=self.allow_nan)
        return outX


# IMPORTED FROM src/pipeline/__init__.py
import pandas as pd
import time
import numpy as np
import gc
from typing import Tuple, Dict, Any


EXPECTED_RAW_COLS = ['timestamp', 'Asset_ID', 'Count',
                     'Open', 'High', 'Low', 'Close',
                     'Volume', 'VWAP']

# def process_train_data(df: pd.DataFrame,
#                        window: int = 60) -> pd.DataFrame:
#     asset_ids = sorted(df['Asset_ID'].unique())
    
#     global_features = []
#     for asset_id in asset_ids:
#         print(f'processing asset_id={asset_id}')
#         raw_local_data = df.query("Asset_ID==@asset_id").reset_index(drop=True)
#         # fill nan gaps
#         raw_local_data = fill_gaps_crypto_data(raw_local_data)
#         raw_local_data = infer_dtypes(raw_local_data)
#         # base features
#         raw_features = compute_base_features(raw_local_data)
        
#         # compute history features
#         start_time = time.time()
#         features = compute_features_on_train(raw_features, window, FEATURE_DICT)
#         elapsed_time = (time.time() - start_time) / 60
        
#         print(f'elapsed time: {elapsed_time:.4f}min')
#         # add timestamp
#         features['timestamp'] = raw_features['timestamp'].to_numpy()
#         features['Asset_ID'] = asset_id
#         global_features.append(features)

#         del raw_local_data, raw_features
#         gc.collect()
#     print('joining datasets')
#     global_features = pd.concat(global_features, axis=0, ignore_index=True)
#     assert global_features['Asset_ID'].nunique() == len(asset_ids), \
#            f'missing Asset_IDs'
#     return global_features


# def process_test_data(test_dict: Dict[str, float], local_history_df: pd.DataFrame,
#                       window: int = 60) -> Tuple[pd.DataFrame, pd.DataFrame]:

#     last_timestamp = local_history_df.iloc[-1]['timestamp']
#     current_timestamp = test_dict['timestamp']
#     # add new observation forget the last first row
#     local_history_df = local_history_df.append([test_dict], ignore_index=True)
#     minute_diff = (current_timestamp - last_timestamp) // 60

#     assert minute_diff > 0, f'current timestamp included in history df, {current_timestamp} <= {last_timestamp}'

#     if minute_diff > 1:
#         print(f'missing more than one minut of data, missing minutes: {minute_diff}')
#         print(f'filling gaps')
#         local_history_df = fill_gaps_crypto_data(local_history_df)
#     raw_features = compute_base_features(local_history_df)
#     features = compute_features_on_inference(raw_features, n=window, feature_dict=FEATURE_DICT)

#     return features, local_history_df


def test_submission_format(submission: pd.DataFrame, expected_len: int = 14):
    assert list(submission.columns) == ['row_id', "Target"], 'submission do not match expected columns'
    assert len(submission) == expected_len, 'submission do not match expected lenght'
    assert submission['Target'].isna().sum() == 0, 'target includes NaNs'
    assert submission['row_id'].dtype == np.int32
    assert submission['Target'].dtype == np.float64
    assert submission['Target'].isna().sum() == 0, 'submission contains NaN values'
    assert np.isinf(submission['Target']).sum() == 0 ,'submission contains inf values'


def inference(test_data: pd.DataFrame, submission: pd.DataFrame,
             models: Dict[str, Any],
             ) -> pd.DataFrame:
    expected_len = len(submission)
    test_data = infer_dtypes(test_data)
    features = compute_instant_features(test_data.loc[:, EXPECTED_RAW_COLS])
    records = features.to_dict('records')
    for index, asset_features in enumerate(records):
        # get the asset ID
        asset_id = int(asset_features['Asset_ID'])
        assert asset_id in models, f'{asset_id} not in TRAINED MODELS'
        # get model
        model = models[asset_id]
        asset_frame = pd.DataFrame([asset_features])
        local_test_yhat = model.predict(asset_frame)
        # add to submission format
        submission.iloc[index, 1] = local_test_yhat[0]
    # testing submission format
    test_submission_format(submission, expected_len=expected_len)
    return submission


# IMPORTED FROM src/metrics.py
import numpy as np
from typing import Tuple
import pandas as pd


ASSET_WEIGHT = {
'Bitcoin Cash': 2.3978952727983707,
'Binance Coin': 4.30406509320417,
'Bitcoin': 6.779921907472252,
'EOS.IO': 1.3862943611198906,
'Ethereum Classic': 2.079441541679836,
'Ethereum': 5.8944028342648505,
'Litecoin': 2.3978952727983707,
'Monero': 1.6094379124341005,
'TRON': 1.791759469228055,
'Stellar': 2.079441541679836,
'Cardano': 4.406719247264253,
'IOTA': 1.0986122886681098,
'Maker': 1.0986122886681098,
'Dogecoin': 3.555348061489413}


TOTAL_WEIGHT_SUM = sum(ASSET_WEIGHT.values())

#### weighted correlation cofficient
def compute_weighted_mean(x: np.ndarray, w: np.ndarray) -> float:
    return np.average(x, weights=w)


def compute_weighted_var(x: np.ndarray, w: np.ndarray) -> float:
    mean = compute_weighted_mean(x, w)
    dev = np.square(x - mean)
    return compute_weighted_mean(dev, w)


def compute_weighted_cov(y: np.ndarray, yhat: np.ndarray, w: np.ndarray) -> float:
    y_mean = compute_weighted_mean(y, w)
    yhat_mean = compute_weighted_mean(yhat, w)
    return compute_weighted_mean((y - y_mean) * (yhat - yhat_mean), w)


def compute_weighted_corr(y: np.ndarray, yhat: np.ndarray,
                          w: np.ndarray = None) -> float:
    if w is None:
        w = np.ones(len(y))
    assert len(y) == len(yhat)
    var_y = compute_weighted_var(y, w)
    var_yhat = compute_weighted_var(yhat, w)
    
    return compute_weighted_cov(y, yhat, w) / np.sqrt(var_y * var_yhat)


def compute_correlation(df: pd.DataFrame,
                        target_name: str = 'Target',
                        yhat_name: str = 'yhat',
                        group_col: str = 'Asset_ID') -> pd.DataFrame:
    def _spearman_corr(d: pd.DataFrame):
        return np.corrcoef(d[target_name], d[yhat_name])[0, 1]
    
    assert df[target_name].isna().sum() == 0, f'{target_name} includes NaN'
    corrs = df.groupby(group_col).apply(_spearman_corr)
    return corrs.to_frame('corr').reset_index()


def compute_sharpe(df: pd.DataFrame,
                   period: int = 60*24*7,   # weekly
                   target_name: str = 'Target',
                   yhat_name: str = 'yhat',
                   weight_name: str = 'weight',
                   ) -> float:
    
    timesteps = (df['timestamp'].max() - df['timestamp']) // 60   # from 0 up to n min,
    time_groups = timesteps // period
    corrs = df.groupby(time_groups).apply(lambda d: compute_weighted_corr(y=d[target_name].to_numpy(),
                                                                          yhat=d[yhat_name].to_numpy(),
                                                                          w=d[weight_name].to_numpy()))
    assert np.isnan(corrs).sum() == 0, 'period corrs contains NaN values'
    mean = corrs.mean()
    std = corrs.std()
    consistency = (corrs > 0.001).mean()
    return {'sharpe': mean / (std + 1e-15),
            'corr_period_mean': mean,
            'corr_period_std': std,
            'consistency': consistency,
            'min_period_corr': corrs.min()}


def compute_metrics(df: pd.DataFrame,
                    target_name: str = 'Target',
                    yhat_name: str = 'yhat',
                    group_col: str = 'Asset_Name') -> Tuple[pd.Series, pd.DataFrame]:

    # BASE APPROACH, COMPUTE CORR AND THE WEIGHTED THEM
    corrs_df = compute_correlation(df, target_name=target_name,
                                   yhat_name=yhat_name,
                                   group_col=group_col)    
    corr_stats = corrs_df['corr'].agg(('min', 'max', 'std')).add_prefix('corr_').to_dict()
    # COMPUTE WEIGHTED CORRELATION USING FORMULA
    df['_weight'] = df[group_col].map(ASSET_WEIGHT)
    theor_corr = compute_weighted_corr(y=df[target_name], yhat=df[yhat_name], w=df['_weight'].to_numpy())
    # DIVIDE IT INTO DAILY CHUNKS AND COMPUTE SHARPE
    sharpe_scores = compute_sharpe(df, target_name=target_name, yhat_name=yhat_name, weight_name='_weight')
    scores = {'corr': theor_corr,
              'crypto_consistency': (corrs_df['corr'] >= 0.001).sum()}
    scores.update(sharpe_scores)
    scores.update(corr_stats)
    df.drop('_weight', axis=1, inplace=True)
    return pd.Series(scores)


# IMPORTED FROM src/cv.py
import numpy as np
from sklearn.model_selection import KFold
from sklearn.utils.validation import _deprecate_positional_args
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from typing import List, Tuple
import pandas as pd
from datetime import datetime



def get_date_range(dates: pd.Series):
    return dates.agg(('min', 'max'))


class TimeSeriesSplit(_BaseKFold):
    def __init__(self, periods: List[Tuple[str, str]],
                 train_days: int = None,
                 gap: int = 1,
                 gap_unit: int = 'd',
                 dt_col: str = 'date'):
        self.dt_col = dt_col
        self.periods = periods
        self.train_days = train_days
        self.gap = gap
        self.gap_unit = gap_unit
        
    def __len__(self) -> int:
        return len(self.periods)
    
    def check_input(self, X: pd.DataFrame, y=None, groups=None):
        assert self.dt_col in X.columns, f'{self.dt_col} do not exits in input dataframe'
        
    def split(self, X: pd.DataFrame, y=None, groups=None):
        dates = X[self.dt_col]
        self.check_input(X)
        
        first_date = dates.min()
        
        indices = np.arange(len(X))
        for period in self.periods:
            first_valid_date = pd.to_datetime(period[0])
            
            last_train_date = first_valid_date - pd.to_timedelta(self.gap, unit=self.gap_unit)
            
            if self.train_days:
                first_train_date = last_train_date - pd.to_timedelta(self.train_days, unit='d')
                first_train_date = np.maximum(first_train_date, first_date)
            else:
                first_train_date = first_date
            
            valid_mask = dates.between(*period)
            train_mask = (dates.between(first_train_date, last_train_date)) & (dates < first_valid_date)
            
            yield indices[train_mask], indices[valid_mask]



def gen_eval_periods(start_date: str,
                     n_test: int,
                     n_splits: int,
                     unit: str = 'd') -> List[Tuple[datetime, datetime]]:
    start_date = pd.to_datetime(start_date)
    eval_periods = []
    for _ in range(n_splits):
        end_date = start_date + pd.to_timedelta(n_test, unit=unit)
        eval_periods.append([start_date, end_date])
        start_date = end_date + pd.to_timedelta(1, unit=unit)
    return eval_periods
    
    
    

In [7]:
def setup_dir(on_kaggle: bool = True, sample_level: int = 0):
    if on_kaggle:
        data_dir = Path('../input/g-research-crypto-forecasting/')
        raw_train_dir = (Path('../input/create-sample-dataset/data/raw/')
                         if sample_level > 0 else data_dir) 
    else:
        data_dir = raw_train_dir = Path('data/raw')
    
    if sample_level > 0:
        raw_train_dir = raw_train_dir.joinpath('sample', str(sample_level))
    
    return data_dir, raw_train_dir

In [8]:
DATA_DIR, RAW_TRAIN_DIR = setup_dir(ON_KAGGLE, sample_level=SAMPLE_LEVEL)
RAW_TRAIN_PATH = RAW_TRAIN_DIR / 'train.csv'
ASSET_DETAILS_PATH = DATA_DIR / 'asset_details.csv'

In [9]:
RAW_TRAIN_DIR

PosixPath('data/raw')

In [10]:
DATA_DIR

PosixPath('data/raw')

In [11]:
%%time 
raw_data = pd.read_csv(RAW_TRAIN_PATH)

CPU times: user 30.8 s, sys: 3.88 s, total: 34.6 s
Wall time: 36 s


## create train set

In [12]:
PREPRO_PARAMS = {'window': 60}
MAIN_INDEX = ['timestamp', 'Asset_ID']

In [13]:
# get valid data only, drop where the target is NaN 
data = raw_data.dropna(subset=['Target'])
# format time to human readable 
data['time'] = pd.to_datetime(data['timestamp'], unit='s')
# merge asset names
data = merge_asset_details(data, ASSET_DETAILS_PATH)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [14]:
data.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,time,Asset_Name
0,1514764860,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218,2018-01-01 00:01:00,Bitcoin Cash
1,1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399,2018-01-01 00:01:00,Binance Coin
2,1514764860,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643,2018-01-01 00:01:00,Bitcoin
3,1514764860,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922,2018-01-01 00:01:00,EOS.IO
4,1514764860,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264,2018-01-01 00:01:00,Ethereum Classic


In [15]:
data.tail()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,time,Asset_Name
23486463,1632181440,9,163.0,156.502,156.62,156.0,156.075667,738.1963,156.260253,-0.001154,2021-09-20 23:44:00,Litecoin
23486464,1632181440,10,23.0,2420.146667,2421.0,2412.779,2414.069667,1.315794,2417.326038,0.009831,2021-09-20 23:44:00,Maker
23486465,1632181440,13,512.0,0.090989,0.091133,0.090831,0.090885,1900695.0,0.090943,0.004163,2021-09-20 23:44:00,TRON
23486466,1632181440,12,193.0,0.280952,0.2813,0.280175,0.280403,162869.9,0.280655,0.000449,2021-09-20 23:44:00,Stellar
23486467,1632181440,11,38.0,232.075,232.3,231.31,231.68,49.14804,231.794078,0.002568,2021-09-20 23:44:00,Monero


In [16]:
data = infer_dtypes(data)

In [17]:
data = compute_instant_features(data)

In [18]:
data.head()

Unnamed: 0,timestamp,Asset_ID,Count,High,Low,Close,Volume,VWAP,Target,time,Asset_Name,high_low_return,open_close_return,upper_shadow,lower_shadow,dolar_amount,vol_per_trades
0,1514764860,2,40.0,1.009644,0.99182,2374.590088,19.233005,2373.116455,-0.004218,2018-01-01 00:01:00,Bitcoin Cash,0.702093,0.692728,0.000425,2394.174316,45670.503485,0.480825
1,1514764860,0,5.0,1.0,1.0,8.53,78.38,8.53,-0.014399,2018-01-01 00:01:00,Binance Coin,0.693147,0.693147,0.117233,8.53,668.581379,15.676
2,1514764860,1,229.0,1.01291,0.987779,13850.175781,31.550062,13827.0625,-0.014643,2018-01-01 00:01:00,Bitcoin,0.705788,0.693689,7.3e-05,14006.370117,436973.897961,0.137773
3,1514764860,5,32.0,1.0,0.999621,7.6576,6626.71337,7.657713,-0.013922,2018-01-01 00:01:00,EOS.IO,0.693337,0.693017,0.130555,7.6605,50744.719811,207.084793
4,1514764860,7,5.0,1.0,0.998225,25.877001,121.08731,25.891363,-0.008264,2018-01-01 00:01:00,Ethereum Classic,0.694036,0.692317,0.03858,25.923006,3133.376419,24.217462


In [19]:
data.isna().mean()

timestamp            0.000000e+00
Asset_ID             0.000000e+00
Count                0.000000e+00
High                 0.000000e+00
Low                  0.000000e+00
Close                0.000000e+00
Volume               0.000000e+00
VWAP                 3.831994e-07
Target               0.000000e+00
time                 0.000000e+00
Asset_Name           0.000000e+00
high_low_return      0.000000e+00
open_close_return    0.000000e+00
upper_shadow         0.000000e+00
lower_shadow         0.000000e+00
dolar_amount         0.000000e+00
vol_per_trades       0.000000e+00
dtype: float64

In [20]:
data.fillna({'VWAP': data['VWAP'].mean()}, inplace=True)

In [21]:
data.isna().mean()

timestamp            0.0
Asset_ID             0.0
Count                0.0
High                 0.0
Low                  0.0
Close                0.0
Volume               0.0
VWAP                 0.0
Target               0.0
time                 0.0
Asset_Name           0.0
high_low_return      0.0
open_close_return    0.0
upper_shadow         0.0
lower_shadow         0.0
dolar_amount         0.0
vol_per_trades       0.0
dtype: float64

In [22]:
list(data.columns)

['timestamp',
 'Asset_ID',
 'Count',
 'High',
 'Low',
 'Close',
 'Volume',
 'VWAP',
 'Target',
 'time',
 'Asset_Name',
 'high_low_return',
 'open_close_return',
 'upper_shadow',
 'lower_shadow',
 'dolar_amount',
 'vol_per_trades']

In [23]:
FEATURES = [
'Count',
'High',
'Low',
'Close',
'Volume',
'VWAP',
'high_low_return',
'open_close_return',
'upper_shadow',
'lower_shadow',
'dolar_amount',
'vol_per_trades'
]

In [24]:
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.pipeline import Pipeline

def build_model(params={'alpha': 0.001}):
    model = Pipeline([('filter', FilterFeatures(FEATURES)),
                      ('norm', MinMaxScaler()),
                      ('model', Ridge(params['alpha'], random_state=1))])
    return model

In [25]:
def train_model(config, train_data, valid_data, pipeline=None):
    model  = build_model(config['model'])

    if config['training']['time_decay_alpha'] is not None:
        time_decay_alpha = config['training']['time_decay_alpha']
        print(f'using exponential_time_decay with alpha {time_decay_alpha}')
        timesteps = ((train_data['timestamp'].max() - train_data['timestamp'])//60//60//24)
        weight = time_decay_alpha ** timesteps
    else:
        weight = None

    model.fit(train_data, train_data['Target'], model__sample_weight=weight)
    return model

In [26]:
PULIC_LB_RANGE = ['2021-06-13',
                  '2021-09-13'] # 3 MONTH WORTH OF DATA

In [33]:
EVAL_PERIODS = [PULIC_LB_RANGE]

In [36]:
CV_PARAMS = {'gap_unit': 'min', 'dt_col': 'time'}

CV = TimeSeriesSplit(EVAL_PERIODS, **CV_PARAMS)

In [37]:
train_idx, valid_idx = next(iter(CV.split(data)))

In [38]:
train_data = data.loc[train_idx, :].reset_index(drop=True)

In [39]:
valid_data = data.loc[valid_idx, :].reset_index(drop=True)

In [40]:
MODEL_CONFIG = {'alpha': 0.001}
TRAIN_CONFIG = {'time_decay_alpha': 0.99}
CONFIG = {'model': MODEL_CONFIG, 'training': TRAIN_CONFIG}

In [41]:
asset_ids = data['Asset_ID'].unique()
MODELS = {}
for asset_id in asset_ids:
    print(f'training asset_id = {asset_id}')
    train_asset_data = get_data_for_asset(train_data, asset_id)
    model = train_model(CONFIG, train_asset_data, valid_data)
    MODELS[asset_id] = model

training asset_id = 2
using exponential_time_decay with alpha 0.99
training asset_id = 0
using exponential_time_decay with alpha 0.99
training asset_id = 1
using exponential_time_decay with alpha 0.99
training asset_id = 5
using exponential_time_decay with alpha 0.99
training asset_id = 7
using exponential_time_decay with alpha 0.99
training asset_id = 6
using exponential_time_decay with alpha 0.99
training asset_id = 9
using exponential_time_decay with alpha 0.99
training asset_id = 11
using exponential_time_decay with alpha 0.99
training asset_id = 13
using exponential_time_decay with alpha 0.99
training asset_id = 12
using exponential_time_decay with alpha 0.99
training asset_id = 3
using exponential_time_decay with alpha 0.99
training asset_id = 8
using exponential_time_decay with alpha 0.99
training asset_id = 10
using exponential_time_decay with alpha 0.99
training asset_id = 4
using exponential_time_decay with alpha 0.99


In [42]:
if not ON_KAGGLE:
    sys.path.append(str(DATA_DIR))

import gresearch_crypto
import traceback
env = gresearch_crypto.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission

In [43]:
for i, (raw_test_df, submission) in enumerate(iter_test):

    submission = inference(test_data=raw_test_df, submission=submission,
                            models=MODELS)
    if i % 1000 == 0 or i < 10:
        display(submission)    
    env.predict(submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


Unnamed: 0,row_id,Target
0,0,-0.000199
1,1,-5.2e-05
2,2,-0.000152
3,3,7.5e-05
4,4,-0.000214
5,5,-0.000119
6,6,-6.9e-05
7,7,-1.6e-05
8,8,0.000194
9,9,-8.3e-05


Unnamed: 0,row_id,Target
0,14,6.196665e-07
1,15,-8.949383e-06
2,16,-4.630478e-05
3,17,8.507234e-05
4,18,-1.035209e-05
5,19,-0.0001562369
6,20,1.403401e-05
7,21,-6.647741e-06
8,22,0.0002407233
9,23,3.95601e-05


Unnamed: 0,row_id,Target
0,28,0.000126
1,29,8.5e-05
2,30,-4e-05
3,31,7.1e-05
4,32,9.3e-05
5,33,-0.000103
6,34,0.000193
7,35,-4e-06
8,36,0.000247
9,37,5.2e-05


Unnamed: 0,row_id,Target
0,42,0.000275
1,43,3.5e-05
2,44,1.7e-05
3,45,1.8e-05
4,46,-3.3e-05
5,47,-2.2e-05
6,48,-3.5e-05
7,49,5.8e-05
8,50,0.000226
9,51,8.3e-05
