In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from pathlib import Path
import os
import sys
from IPython.display import display

In [2]:
# Execution environment: selects between the Kaggle input paths and the local data folder.
ON_KAGGLE = False

# Dataset size switches. The toy sample is checked first, then the regular
# sample; with both off the full raw training file is used.
USE_SAMPLE = True
USE_TOY_SAMPLE = False

In [3]:
# Local runs only: when the kernel was started from a directory whose path ends
# with 'notebook', move one level up (presumably to the project root) so that
# the relative paths used below resolve.
if os.path.abspath('.').endswith('notebook') and not ON_KAGGLE:
    os.chdir('../')

In [4]:
#IMPORT_SCRIPT!
# THIS CELL WILL BE REMOVE WITH SCRIPTS IN SRC
# Make the project-local modules under ./src importable.
# The membership test must use exactly the string that gets appended:
# checking for './src/' while appending './src' never matches, so every
# re-run of the cell would add another duplicate entry to sys.path.
if './src' not in sys.path:
    sys.path.append('./src')

# NOTE(review): the star imports hide where names such as ingest_data come
# from; consider importing the needed names explicitly.
from preprocessing import *
from metrics import *
from cv import PurgedGroupTimeSeriesSplit

In [5]:
# Data locations differ between a local checkout and a Kaggle kernel.
if not ON_KAGGLE:
    # Local layout: the sample folders live inside the raw data directory.
    RAW_DIR = Path('data/raw')
    SAMPLE_DIR = RAW_DIR / 'sample'
    TOY_SAMPLE_DIR = RAW_DIR / 'toy_sample'
else:
    # Kaggle layout: raw data and samples come from two separate inputs.
    RAW_DIR = Path('../input/g-research-crypto-forecasting/')
    TOY_SAMPLE_DIR = Path('../input/create-sample-dataset/data/raw/toy_sample/')
    SAMPLE_DIR = Path('../input/create-sample-dataset/data/raw/sample/')

# filename
TRAIN_FILE = 'train.csv'
# The asset details file is always read from the raw directory.
ASSET_DETAILS_PATH = RAW_DIR.joinpath('asset_details.csv')

In [6]:
# Pick the training file. Precedence: toy sample, then sample, then full raw data.
if not (USE_TOY_SAMPLE or USE_SAMPLE):
    print('USING RAW DATASET')
    RAW_TRAIN_PATH = RAW_DIR.joinpath(TRAIN_FILE)

elif USE_TOY_SAMPLE:
    print('USING TOY DATASET')
    RAW_TRAIN_PATH = TOY_SAMPLE_DIR.joinpath(TRAIN_FILE)

else:
    print('USING SAMPLE DATASET')
    RAW_TRAIN_PATH = SAMPLE_DIR.joinpath(TRAIN_FILE)

# Fail early if the selected file is missing.
assert RAW_TRAIN_PATH.exists()

USING SAMPLE DATASET


In [7]:
# List the contents of the raw data directory (shell command; RAW_DIR is interpolated by IPython).
!ls {RAW_DIR}

asset_details.csv		   sample
example_sample_submission.csv	   supplemental_train.csv
example_test.csv		   toy_sample
g-research-crypto-forecasting.zip  train.csv
gresearch_crypto


In [8]:
%%time 
# Load the selected training CSV; the %%time cell magic above reports the load duration.
raw_df = pd.read_csv(RAW_TRAIN_PATH)

CPU times: user 5.15 s, sys: 438 ms, total: 5.59 s
Wall time: 5.59 s


In [9]:
# ingest_data is not defined in this notebook; presumably it comes from the
# star-imported project modules (preprocessing) -- TODO confirm. Judging by the
# preview further down, it appears to add the Asset_Name and date columns.
# NOTE(review): raw_df is rebound to the result, so re-running this cell feeds
# the already-ingested frame back into ingest_data.
raw_df = ingest_data(raw_df, asset_details_path=ASSET_DETAILS_PATH)

In [10]:
# Order rows by asset, then chronologically within each asset, and renumber
# the index from 0. Chained, non-inplace calls are used instead of
# inplace=True: the result is the same frame content, and the cell rebinds
# raw_df in a single expression.
raw_df = (
    raw_df
    .sort_values(by=['Asset_ID', 'date'])
    .reset_index(drop=True)
)

In [11]:
# Preview the first rows of the ingested, sorted frame.
raw_df.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,Asset_Name,date
0,1609459260,0,219.0,37.384998,37.4221,37.348701,37.389,2749.5692,37.387058,-0.001669,Binance Coin,2021-01-01 00:01:00
1,1609459320,0,133.0,37.390499,37.402,37.3298,37.331299,778.868,37.351677,-0.001542,Binance Coin,2021-01-01 00:02:00
2,1609459380,0,151.0,37.317051,37.3367,37.2729,37.2915,890.921,37.301258,-0.001028,Binance Coin,2021-01-01 00:03:00
3,1609459440,0,123.0,37.299149,37.323002,37.250198,37.291901,489.9361,37.297272,0.000174,Binance Coin,2021-01-01 00:04:00
4,1609459500,0,623.0,37.212799,37.285,37.104801,37.213001,15144.3836,37.184082,0.00095,Binance Coin,2021-01-01 00:05:00


In [36]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.utils.validation import _deprecate_positional_args
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from typing import List, Tuple

class TimeSeriesSplit(_BaseKFold):
    """Cross-validation splitter whose validation folds are explicit date periods.

    One fold is produced per ``(start, end)`` entry of ``periods``. The
    validation set holds the rows whose ``dt_col`` value lies between ``start``
    and ``end`` (both bounds included). The training set holds rows dated
    before the validation start, optionally separated from it by a gap and
    optionally limited to a trailing window of ``train_days`` days.

    Parameters
    ----------
    periods : list of (start, end) pairs
        Validation periods. ``start`` is parsed with ``pd.to_datetime``; both
        bounds are passed to ``Series.between``.
    train_days : int, optional
        Length in days of the training window ending at the last training
        date. When falsy (``None`` or ``0``) training starts at the earliest
        date found in ``X``.
    gap : int, default 0
        Amount of time, expressed in ``gap_unit``, between the last training
        date and the validation start.
    gap_unit : str, default 'd'
        Unit passed to ``pd.to_timedelta`` for ``gap``.
    dt_col : str, default 'date'
        Name of the column of ``X`` that holds the row dates.
    """

    def __init__(self, periods: List[Tuple[str, str]],
                 train_days: int = None,
                 gap: int = 0,
                 gap_unit: str = 'd',
                 dt_col: str = 'date'):
        # The parent initializer is intentionally not called: the number of
        # folds is defined by ``periods`` (see ``get_n_splits``).
        self.dt_col = dt_col
        self.periods = periods
        self.train_days = train_days
        self.gap = gap
        self.gap_unit = gap_unit

    def __len__(self):
        """Return the number of folds, i.e. the number of validation periods."""
        return len(self.periods)

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds.

        Overrides the inherited implementation, which reads ``self.n_splits``;
        that attribute is never set on this class.
        """
        return len(self.periods)

    def check_input(self, X: pd.DataFrame, y=None, groups=None):
        """Assert that the date column is present in ``X``."""
        assert self.dt_col in X.columns, (
            f'{self.dt_col} does not exist in input dataframe'
        )

    def split(self, X: pd.DataFrame, y=None, groups=None):
        """Yield ``(train_indices, valid_indices)`` for every validation period.

        Both elements are arrays of positional row indices into ``X``.
        ``y`` and ``groups`` are accepted for API compatibility and ignored.
        """
        # Validate before touching the column so a missing column is reported
        # by the check rather than by a bare KeyError.
        self.check_input(X)
        dates = X[self.dt_col]

        first_date = dates.min()
        indices = np.arange(len(X))

        # These do not depend on the period, so compute them once.
        gap_delta = pd.to_timedelta(self.gap, unit=self.gap_unit)
        train_window = (pd.to_timedelta(self.train_days, unit='d')
                        if self.train_days else None)

        for period in self.periods:
            first_valid_date = pd.to_datetime(period[0])
            last_train_date = first_valid_date - gap_delta
            first_train_date = (last_train_date - train_window
                                if train_window is not None else first_date)

            valid_mask = dates.between(*period)
            # ``between`` is inclusive, so with a zero gap the extra strict
            # comparison keeps the validation start out of the training set.
            train_mask = (dates.between(first_train_date, last_train_date)
                          & (dates < first_valid_date))

            yield indices[train_mask.to_numpy()], indices[valid_mask.to_numpy()]

In [90]:

def gen_eval_periods(start_date: str,
                     n_test: int,
                     n_splits: int,
                     unit: str = 'd') -> List[List[pd.Timestamp]]:
    """Generate consecutive evaluation periods starting at ``start_date``.

    Each period is a ``[start, end]`` list with ``end = start + n_test`` units.
    The next period starts one unit after the previous period's end.

    Parameters
    ----------
    start_date : str
        Start of the first period; parsed with ``pd.to_datetime``.
    n_test : int
        Span of each period, expressed in ``unit``.
    n_splits : int
        Number of periods to generate.
    unit : str, default 'd'
        Time unit passed to ``pd.to_timedelta``.

    Returns
    -------
    list of [pd.Timestamp, pd.Timestamp]
        One ``[start, end]`` pair per split; empty when ``n_splits`` is 0.
    """
    # Loop-invariant offsets.
    period_span = pd.to_timedelta(n_test, unit=unit)
    one_unit = pd.to_timedelta(1, unit=unit)

    period_start = pd.to_datetime(start_date)
    eval_periods = []
    for _ in range(n_splits):
        period_end = period_start + period_span
        eval_periods.append([period_start, period_end])
        period_start = period_end + one_unit
    return eval_periods
    
    
    

In [91]:
# Sanity check: the value returned by pd.to_datetime for a single date string
# is not an instance of pd.DatetimeTZDtype (the cell evaluates to False), so
# that class is not a suitable annotation for such values.
isinstance(pd.to_datetime('2021-01-01'), pd.DatetimeTZDtype)

False

In [51]:
# Hand-written [start, end] date-string pairs (presumably the validation
# periods fed to TimeSeriesSplit -- confirm against usage).
periods = [
    ['2021-04-01', '2021-06-01'],
    ['2021-07-28', '2021-08-15'],
    ['2021-09-01', '2021-09-18'],
]