In [None]:
# BRANCH_NAME: start_over
# COMMIT: 1cb067b4a788e493f6eb146bf23c8bf762ac90b5
# COMMIT_MSG: updating notebooks

# MESSAGE: REMOVING MOV STATS BECAUSE THEY TOOK TOO MUCH TIME, SIMPLER MINUTE FEATURES, USING ILOC INSTEAD OF LOC IN SUBMISSION
# LASTEST_COMMIT_DATE: 2021-11-28 19:33:42
# DATE: 2021-11-28 19:42:17

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from pathlib import Path
import os
import sys
from IPython.display import display

In [3]:
def on_kaggle() -> bool:
    """Tell whether the `gresearch_crypto` module can be imported.

    The import is attempted for real (not just looked up), so a successful
    call leaves the module loaded in `sys.modules`.

    Returns:
        True when the import succeeds, False when it raises
        ModuleNotFoundError.
    """
    try:
        import gresearch_crypto  # noqa: F401 -- imported only as a probe
    except ModuleNotFoundError:
        return False
    else:
        return True

In [4]:
# HYPER PARAMETERS
# True when the `gresearch_crypto` module is importable (see on_kaggle()).
ON_KAGGLE = on_kaggle()
# Dataset selector: 1 -> sample dataset, 2 -> toy sample,
# any other value -> full raw dataset (see the dataset selection cell below).
SAMPLE_LEVEL = 1
USE_SAMPLE = SAMPLE_LEVEL == 1
USE_TOY_SAMPLE = SAMPLE_LEVEL == 2

# Hard-coded for now; the previous conditional value was (ON_KAGGLE and SAMPLE_LEVEL == 0).
# NOTE(review): not referenced anywhere in the visible part of this notebook.
FORCE_REWRITE = True

In [5]:
# When running locally from a directory whose absolute path ends with
# 'notebook', move one level up before anything else runs.
# NOTE(review): presumably so that relative paths such as 'data/raw' resolve
# from the project root -- confirm against the repository layout.
if not ON_KAGGLE and os.path.abspath('.').endswith('notebook'):
    os.chdir('../')

In [6]:
# IMPORTED FROM src/preprocessing/ingest_data.py
import pandas as pd
import numpy as np

# INGEST DATA
# DATASET DTYPES FOR SAVING MEMORY
# Column -> dtype mapping applied by infer_dtypes() via DataFrame.astype.
# Every column listed here must be present in the frame being cast.
DTYPES = {'Asset_ID': 'int32',
          'Open': 'float32',
          'High': 'float32',
          'Low': 'float32',
          'Close': 'float32',
          'VWAP': 'float32'}


def merge_asset_details(df: pd.DataFrame, asset_details_path: str) -> pd.DataFrame:
    """Attach the `Asset_Name` column to `df` by joining on `Asset_ID`.

    Args:
        df: frame containing an `Asset_ID` column.
        asset_details_path: location of the asset details CSV; it must
            provide `Asset_ID` and `Asset_Name` columns.

    Returns:
        A new frame: `df` left-joined with the asset names.

    Raises:
        AssertionError: if any row ends up without an `Asset_Name`.
    """
    name_lookup = pd.read_csv(asset_details_path).loc[:, ['Asset_ID', 'Asset_Name']]
    merged = df.merge(name_lookup, on='Asset_ID', how='left')
    n_unmatched = merged['Asset_Name'].isna().sum()
    assert n_unmatched == 0, 'unexpected Asset ID'
    return merged


def infer_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `df` with infinities turned into NaN and the columns
    listed in `DTYPES` cast to their compact dtypes."""
    # +/-inf become NaN before casting
    without_inf = df.replace([np.inf, -np.inf], np.nan)
    return without_inf.astype(DTYPES)


def date_to_timestamp(dates: pd.Series) -> pd.Series:
    """Convert `dates` to integers and floor-divide them by 10**9.

    For nanosecond-resolution datetimes this yields whole seconds since the
    Unix epoch.
    """
    as_integers = dates.astype(np.int64)
    return as_integers.floordiv(10 ** 9)


def create_valid_timestamp_range(data: pd.DataFrame, dt_col: str = 'timestamp') -> np.ndarray:
    """Build the gap-free grid of timestamps covered by `data[dt_col]`.

    Returns:
        An array going from the column minimum up to (and including) the
        column maximum in steps of 60.
    """
    first_ttp, last_ttp = data[dt_col].agg(['min', 'max'])
    step = 60
    # the stop is pushed one step further so the maximum itself is included
    return np.arange(first_ttp, last_ttp + step, step)
    

def fill_gaps_with_timestmap(data: pd.DataFrame, dt_col: str = 'timestamp') -> pd.DataFrame:
    """Insert the missing rows so `dt_col` covers every step of its range.

    The frame is re-indexed on the grid produced by
    `create_valid_timestamp_range`; rows created for missing timestamps hold
    NaN in every other column.

    Raises:
        AssertionError: if `dt_col` holds duplicated values.
    """
    n_duplicates = data[dt_col].duplicated().sum()
    assert n_duplicates == 0, f'{dt_col} contains duplicates, cant reindex from duplicated values'

    full_grid = create_valid_timestamp_range(data, dt_col)
    regular = data.set_index(dt_col).reindex(full_grid)
    regular = regular.reset_index()
    # covers the case where the index name was lost while re-indexing
    return regular.rename(columns={'index': dt_col})


def fill_gaps_crypto_data(data: pd.DataFrame,
                          dt_col: str = 'timestamp'):
    """Fill the timestamp gaps of a single-asset frame.

    The rows added for missing timestamps get the asset's `Asset_ID`, every
    other column of those rows stays NaN.

    Raises:
        AssertionError: if `data` holds more than one (or no) `Asset_ID`.
    """
    unique_ids = np.unique(data['Asset_ID'])
    assert len(unique_ids) == 1, 'expected one Asset_ID'

    filled = fill_gaps_with_timestmap(data, dt_col)
    # the re-index left NaN in Asset_ID for the new rows: restore it everywhere
    filled['Asset_ID'] = int(unique_ids[0])
    return filled


# IMPORTED FROM src/preprocessing/feature_gen.py
import pandas as pd
import numpy as np
from typing import List, Callable, Dict
from scipy.stats import linregress


# Type alias: maps a name (string) to a list of callables.
feature_dict_dtype = Dict[str, List[Callable]]
# Columns that compute_instant_features() requires in its input frame.
RAW_FEATURES = ['Count', 'Open', 'High', 'Low', 'Close',
                'Volume', 'VWAP']
# Column-name suffix template with an `{n}` placeholder.
# NOTE(review): only referenced by commented-out code in the visible source.
SUFFIX_FOMRAT = '__{n}min_ft'


# FEATURE GEN FUNCTION
def log_return(x: pd.Series, periods: int = 1) -> pd.Series:
    """Difference of `log(x)` over `periods` steps, with NaN replaced by 0.

    The leading positions (which have no previous value) are therefore 0.
    """
    log_values = np.log(x)
    deltas = log_values.diff(periods=periods)
    return deltas.fillna(0)


def realized_volatility(series: pd.Series) -> float:
    """Square root of the sum of the squared values of `series`."""
    values = series.to_numpy()
    sum_of_squares = np.sum(np.power(values, 2))
    return np.sqrt(sum_of_squares)


def linear_slope(series: pd.Series) -> float:
    """Slope of the least-squares line fitted to `series` against the
    positions 0, 1, ..., len(series) - 1."""
    positions = np.arange(len(series))
    return linregress(positions, series).slope

# UTIL
def join_columns(columns):
    """Flatten multi-level column labels into single strings.

    Each label (an iterable of parts) becomes its parts converted to `str`
    and joined with '__'.
    """
    return ['__'.join(str(part) for part in label) for label in columns]

# Raw columns removed from the output of compute_instant_features().
BASE_FEATURES_TO_DROP = ['Open']


def compute_instant_features(df: pd.DataFrame) -> pd.DataFrame:
    """Derive per-row features from the raw OHLCV columns.

    The input frame is left untouched: all the work happens on a copy, so
    the function can be called repeatedly on the same frame (or on a slice
    of another frame) without side effects.

    Args:
        df: frame holding at least the `RAW_FEATURES` columns.

    Returns:
        A new frame with `High` and `Low` divided by `Open`, the derived
        columns `high_low_return`, `open_close_return`, `upper_shadow`,
        `lower_shadow`, `dolar_amount` and `vol_per_trades` added, and the
        `BASE_FEATURES_TO_DROP` columns removed.

    Raises:
        AssertionError: if any of the `RAW_FEATURES` columns is missing.
    """
    assert np.isin(RAW_FEATURES, df.columns).all(), \
           'missing raw features'

    # never write into the caller's frame (it may be a slice of another one)
    df = df.copy()

    # normalize High and Low features
    df['High'] = df['High'] / df['Open']
    df['Low'] = df['Low'] / df['Open']
    # create price features
    # TODO: is it better to take the log or not?
    # TODO: is it better to take the ratio or the difference?
    # ratio will normalize features but if we use a single model for each asset will make no difference
    # for the time being, lets take the ratio
    df['high_low_return'] = np.log1p(df['High'] / df['Low'])
    df['open_close_return'] = np.log1p(df['Close'] / df['Open'])
    # NOTE(review): High/Low are already divided by Open at this point while
    # Close/Open are still raw prices, so the two shadows mix scales --
    # confirm this is intended before changing it (trained models depend on it).
    df['upper_shadow'] = df['High'] / np.maximum(df['Close'], df['Open'])
    df['lower_shadow'] = np.minimum(df['Close'], df['Open']) / df['Low']

    # vol and count features
    # TODO: is it useful dolar_amount?
    df['dolar_amount'] = df['Close'] * df['Volume']
    df['vol_per_trades'] = df['Volume'] / df['Count']
    return df.drop(BASE_FEATURES_TO_DROP, axis=1)



# # FEATURES TO COMPUTE
# FEATURE_DICT = {'High': [np.max],
#                 'Low': [np.min],
#                 'Close': [np.mean],
#                 'price_return_1': [np.sum, realized_volatility],
#                 'vwap_return_1': [np.sum, realized_volatility],
#                 'Count': [np.sum, np.max],
#                 'Volume': [np.sum, np.max],
#                 'high_low_return': [np.mean],
#                 'open_close_return': [np.mean],
#                }

# FEATURE_DICT = {
#                 'Close': [np.mean],
#                }



# def map_function_to_dataframe(X: pd.DataFrame,
#                  feature_dict: feature_dict_dtype) -> Dict[str, float]:
#     features = {f'{name}__{func.__name__}': func(X[name])
#                 for name, func_list in feature_dict.items()
#                 for func in func_list}
#     return features


# def compute_features_on_inference(X: pd.DataFrame, n: int,
#                                  feature_dict: feature_dict_dtype) -> pd.DataFrame:
#     features = map_function_to_dataframe(X.tail(n), feature_dict)
#     return pd.DataFrame([features]).add_suffix(SUFFIX_FOMRAT.format(n=n)).astype(np.float32)


# def compute_features_on_train(X: pd.DataFrame, n: int,
#                              feature_dict: feature_dict_dtype) -> pd.DataFrame:
#     assert X['Asset_ID'].nunique() == 1, \
#            'expected only one Asset_ID'
    
#     mov_features = X.rolling(n, min_periods=1).agg(feature_dict)
#     mov_features.columns = join_columns(mov_features.columns)
#     mov_features = mov_features.add_suffix(SUFFIX_FOMRAT.format(n=n))
    
#     assert len(mov_features) == len(X), 'output lenght do not match the input lenght'
#     return mov_features.astype(np.float32)


# IMPORTED FROM src/preprocessing/__init__.py
import pandas as pd
import time
import numpy as np
import gc
from typing import Tuple, Dict, Any


# Raw columns selected (in this order) from the train/test frames before
# the features are computed.
EXPECTED_RAW_COLS = ['timestamp', 'Asset_ID', 'Count',
                     'Open', 'High', 'Low', 'Close',
                     'Volume', 'VWAP']

# def process_train_data(df: pd.DataFrame,
#                        window: int = 60) -> pd.DataFrame:
#     asset_ids = sorted(df['Asset_ID'].unique())
    
#     global_features = []
#     for asset_id in asset_ids:
#         print(f'processing asset_id={asset_id}')
#         raw_local_data = df.query("Asset_ID==@asset_id").reset_index(drop=True)
#         # fill nan gaps
#         raw_local_data = fill_gaps_crypto_data(raw_local_data)
#         raw_local_data = infer_dtypes(raw_local_data)
#         # base features
#         raw_features = compute_base_features(raw_local_data)
        
#         # compute history features
#         start_time = time.time()
#         features = compute_features_on_train(raw_features, window, FEATURE_DICT)
#         elapsed_time = (time.time() - start_time) / 60
        
#         print(f'elapsed time: {elapsed_time:.4f}min')
#         # add timestamp
#         features['timestamp'] = raw_features['timestamp'].to_numpy()
#         features['Asset_ID'] = asset_id
#         global_features.append(features)

#         del raw_local_data, raw_features
#         gc.collect()
#     print('joining datasets')
#     global_features = pd.concat(global_features, axis=0, ignore_index=True)
#     assert global_features['Asset_ID'].nunique() == len(asset_ids), \
#            f'missing Asset_IDs'
#     return global_features


# def process_test_data(test_dict: Dict[str, float], local_history_df: pd.DataFrame,
#                       window: int = 60) -> Tuple[pd.DataFrame, pd.DataFrame]:

#     last_timestamp = local_history_df.iloc[-1]['timestamp']
#     current_timestamp = test_dict['timestamp']
#     # add new observation forget the last first row
#     local_history_df = local_history_df.append([test_dict], ignore_index=True)
#     minute_diff = (current_timestamp - last_timestamp) // 60

#     assert minute_diff > 0, f'current timestamp included in history df, {current_timestamp} <= {last_timestamp}'

#     if minute_diff > 1:
#         print(f'missing more than one minut of data, missing minutes: {minute_diff}')
#         print(f'filling gaps')
#         local_history_df = fill_gaps_crypto_data(local_history_df)
#     raw_features = compute_base_features(local_history_df)
#     features = compute_features_on_inference(raw_features, n=window, feature_dict=FEATURE_DICT)

#     return features, local_history_df


def test_submission_format(submission: pd.DataFrame, expected_len: int = 14):
    assert list(submission.columns) == ['row_id', "Target"], 'submission do not match expected columns'
    assert len(submission) == expected_len, 'submission do not match expected lenght'
    assert submission['Target'].isna().sum() == 0, 'target includes NaNs'
    assert submission['row_id'].dtype == np.int32
    assert submission['Target'].dtype == np.float64
    assert submission['Target'].isna().sum() == 0, 'submission contains NaN values'
    assert np.isinf(submission['Target']).sum() == 0 ,'submission contains inf values'


def inference(test_data: pd.DataFrame, submission: pd.DataFrame,
             models: Dict[str, Any],
             ) -> pd.DataFrame:
    """Fill `submission` with one prediction per row of `test_data`.

    Every test row is turned into instant features and scored by the model
    registered for its asset; the prediction is written by position into the
    second column of `submission`, which is modified in place.

    Args:
        test_data: raw test rows holding the `EXPECTED_RAW_COLS` columns.
        submission: frame to fill.
            NOTE(review): its rows are assumed to follow the order of
            `test_data` -- confirm against the competition API.
        models: fitted models looked up by integer `Asset_ID`.

    Returns:
        The filled `submission`.

    Raises:
        AssertionError: if an asset has no model, or the filled submission
            fails `test_submission_format`.
    """
    n_expected_rows = len(submission)
    typed_test_data = infer_dtypes(test_data)
    feature_rows = compute_instant_features(typed_test_data.loc[:, EXPECTED_RAW_COLS]).to_dict('records')

    for position, row_features in enumerate(feature_rows):
        asset_id = int(row_features['Asset_ID'])
        assert asset_id in models, f'{asset_id} not in TRAINED MODELS'
        # the per-asset model receives a one-row frame
        prediction = models[asset_id].predict(pd.DataFrame([row_features]))
        # column 1 is the prediction column of the submission
        submission.iloc[position, 1] = prediction[0]

    # the submission must pass the format checks before being returned
    test_submission_format(submission, expected_len=n_expected_rows)
    return submission


# IMPORTED FROM src/metrics.py
import numpy as np
from typing import Tuple
import pandas as pd


# Weight of each asset, keyed by asset name; compute_metrics() maps it onto
# the grouping column to weight the correlations.
# NOTE(review): presumably taken from the competition's asset details file --
# confirm the values against it.
ASSET_WEIGHT = {
'Bitcoin Cash': 2.3978952727983707,
'Binance Coin': 4.30406509320417,
'Bitcoin': 6.779921907472252,
'EOS.IO': 1.3862943611198906,
'Ethereum Classic': 2.079441541679836,
'Ethereum': 5.8944028342648505,
'Litecoin': 2.3978952727983707,
'Monero': 1.6094379124341005,
'TRON': 1.791759469228055,
'Stellar': 2.079441541679836,
'Cardano': 4.406719247264253,
'IOTA': 1.0986122886681098,
'Maker': 1.0986122886681098,
'Dogecoin': 3.555348061489413}


# Sum of all the asset weights, used as the denominator of the weighted
# average of the per-asset correlations.
TOTAL_WEIGHT_SUM = sum(ASSET_WEIGHT.values())

#### weighted correlation cofficient
def compute_weighted_mean(x: np.ndarray, w: np.ndarray) -> float:
    """Average of `x` where each value counts according to its weight in `w`."""
    weighted_mean = np.average(x, weights=w)
    return weighted_mean


def compute_weighted_var(x: np.ndarray, w: np.ndarray) -> float:
    """Weighted variance of `x`: the weighted mean of the squared deviations
    from the weighted mean."""
    deviations = x - compute_weighted_mean(x, w)
    return compute_weighted_mean(np.square(deviations), w)


def compute_weighted_cov(y: np.ndarray, yhat: np.ndarray, w: np.ndarray) -> float:
    """Weighted covariance of `y` and `yhat`: the weighted mean of the product
    of their deviations from their own weighted means."""
    y_deviations = y - compute_weighted_mean(y, w)
    yhat_deviations = yhat - compute_weighted_mean(yhat, w)
    return compute_weighted_mean(y_deviations * yhat_deviations, w)


def compute_weighted_corr(y: np.ndarray, yhat: np.ndarray,
                          w: np.ndarray = None) -> float:
    """Weighted correlation coefficient between `y` and `yhat`.

    Args:
        y: observed values.
        yhat: predicted values, same length as `y`.
        w: one weight per observation; every observation weighs 1 when
            omitted.

    Returns:
        The weighted covariance divided by the square root of the product of
        the two weighted variances.
    """
    assert len(y) == len(yhat)
    if w is None:
        w = np.ones(len(y))

    covariance = compute_weighted_cov(y, yhat, w)
    variance_product = compute_weighted_var(y, w) * compute_weighted_var(yhat, w)
    return covariance / np.sqrt(variance_product)


def compute_correlation(df: pd.DataFrame,
                        target_name: str = 'Target',
                        yhat_name: str = 'yhat',
                        group_col: str = 'Asset_ID') -> pd.DataFrame:
    """Correlation between target and prediction inside every group.

    The coefficient comes from `np.corrcoef`, i.e. it is the Pearson
    correlation.

    Args:
        df: frame holding the target, prediction and grouping columns.
        target_name: column with the observed values.
        yhat_name: column with the predicted values.
        group_col: column defining the groups.

    Returns:
        A frame with one row per group and the columns `group_col` and `corr`.

    Raises:
        AssertionError: if the target column holds NaN.
    """
    n_missing_targets = df[target_name].isna().sum()
    assert n_missing_targets == 0, f'{target_name} includes NaN'

    def _pearson_corr(group: pd.DataFrame):
        corr_matrix = np.corrcoef(group[target_name], group[yhat_name])
        # off-diagonal entry = correlation between the two columns
        return corr_matrix[0, 1]

    per_group = df.groupby(group_col).apply(_pearson_corr)
    per_group_frame = per_group.to_frame('corr')
    return per_group_frame.reset_index()


def compute_sharpe(df: pd.DataFrame,
                   period: int = 60*24*7,   # weekly
                   target_name: str = 'Target',
                   yhat_name: str = 'yhat',
                   weight_name: str = 'weight',
                   ) -> Dict[str, float]:
    """Sharpe-like ratio of the weighted correlation computed per time period.

    Rows are bucketed into consecutive periods counted backwards from the
    latest timestamp (bucket 0 holds the most recent `period` minutes). The
    weighted correlation is computed inside every bucket, then summarized.

    Args:
        df: frame with a `timestamp` column (in seconds) plus the target,
            prediction and weight columns.
        period: bucket length in minutes (one week by default).
        target_name: column with the observed values.
        yhat_name: column with the predicted values.
        weight_name: column with the per-row weights.

    Returns:
        A dict with `sharpe` (mean / std of the per-period correlations),
        `corr_period_mean` and `corr_period_std`.

    Raises:
        AssertionError: if any per-period correlation is NaN.
    """
    timesteps = (df['timestamp'].max() - df['timestamp']) // 60   # from 0 up to n min,
    time_groups = timesteps // period
    corrs = df.groupby(time_groups).apply(lambda d: compute_weighted_corr(y=d[target_name].to_numpy(),
                                                                          yhat=d[yhat_name].to_numpy(),
                                                                          w=d[weight_name].to_numpy()))
    assert np.isnan(corrs).sum() == 0, 'period corrs contains NaN values'
    mean = corrs.mean()
    std = corrs.std()
    # the tiny epsilon guards against a division by zero when std is 0
    return {'sharpe': mean / (std + 1e-15), 'corr_period_mean': mean, 'corr_period_std': std}


def compute_metrics(df: pd.DataFrame,
                    target_name: str = 'Target',
                    yhat_name: str = 'yhat',
                    group_col: str = 'Asset_Name') -> Tuple[pd.Series, pd.DataFrame]:
    """Compute the evaluation scores of a set of predictions.

    The caller's frame is never modified: the per-row weights live in a
    temporary copy.

    Args:
        df: frame with `timestamp`, the target, the prediction and the
            grouping columns.
        target_name: column with the observed values.
        yhat_name: column with the predicted values.
        group_col: column whose values are keys of `ASSET_WEIGHT`.

    Returns:
        A tuple `(scores, corrs_df)`: `scores` is a Series holding
        `theor_corr`, `weighted_corr`, the `compute_sharpe` outputs and the
        min/max/std of the per-group correlations; `corrs_df` holds the
        per-group correlation, weight and weighted correlation.

    Raises:
        AssertionError: if the target holds NaN, a group has no weight in
            `ASSET_WEIGHT`, or a period correlation is NaN.
    """
    # BASE APPROACH, COMPUTE CORR AND THE WEIGHTED THEM
    corrs_df = compute_correlation(df, target_name=target_name,
                                   yhat_name=yhat_name,
                                   group_col=group_col)
    corrs_df['weight'] = corrs_df[group_col].map(ASSET_WEIGHT)
    # an unmapped group would get a NaN weight, which `prod` silently skips
    assert corrs_df['weight'].isna().sum() == 0, \
        f'{group_col} contains values without a weight in ASSET_WEIGHT'
    corrs_df['weighted_corr'] = corrs_df[['corr', 'weight']].prod(axis=1)
    # NOTE(review): the denominator is the weight of ALL assets, even when
    # some of them are absent from `df` -- confirm this is intended.
    corr = corrs_df['weighted_corr'].sum() / TOTAL_WEIGHT_SUM

    corr_stats = corrs_df['corr'].agg(('min', 'max', 'std')).add_prefix('corr_').to_dict()
    # COMPUTE WEIGHTED CORRELATION USING FORMULA
    # the weights are attached to a copy, the input frame stays untouched
    weighted_df = df.assign(_weight=df[group_col].map(ASSET_WEIGHT))
    theor_corr = compute_weighted_corr(y=weighted_df[target_name],
                                       yhat=weighted_df[yhat_name],
                                       w=weighted_df['_weight'].to_numpy())
    # DIVIDE IT INTO PERIODS (compute_sharpe's default period is weekly) AND COMPUTE SHARPE
    sharpe_scores = compute_sharpe(weighted_df, target_name=target_name,
                                   yhat_name=yhat_name, weight_name='_weight')
    scores = {'theor_corr': theor_corr, 'weighted_corr': corr}
    scores.update(sharpe_scores)
    scores.update(corr_stats)
    return pd.Series(scores), corrs_df


# IMPORTED FROM src/cv.py
import numpy as np
from sklearn.model_selection import KFold
from sklearn.utils.validation import _deprecate_positional_args
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from typing import List, Tuple
import pandas as pd
from datetime import datetime



def get_date_range(dates: pd.Series):
    """Return the smallest and the largest value of `dates` as a Series
    indexed by 'min' and 'max'."""
    bounds = ['min', 'max']
    return dates.agg(bounds)


class TimeSeriesSplit(_BaseKFold):
    """Time-based splitter driven by explicit validation periods.

    Every `(start, end)` period yields one split: the validation rows are
    those whose date falls inside the period (both bounds included); the
    training rows end `gap` (in `gap_unit`) before the period starts and,
    when `train_days` is set, span at most that many days.

    Args:
        periods: `(start, end)` bounds of each validation period.
        train_days: maximum length of the training window in days; the whole
            history before the validation period is used when falsy.
        gap: distance kept between the last training date and the start of
            the validation period.
        gap_unit: pandas timedelta unit of `gap`.
        dt_col: name of the date column of the frame being split.
    """

    def __init__(self, periods: List[Tuple[str, str]],
                 train_days: int = None,
                 gap: int = 1,
                 gap_unit: str = 'd',
                 dt_col: str = 'date'):
        self.dt_col = dt_col
        self.periods = periods
        self.train_days = train_days
        self.gap = gap
        self.gap_unit = gap_unit

    def __len__(self) -> int:
        return len(self.periods)

    def get_n_splits(self, X=None, y=None, groups=None) -> int:
        """Number of splits: one per validation period.

        Overrides the inherited implementation, which reads `self.n_splits`
        -- an attribute this class never sets.
        """
        return len(self.periods)

    def check_input(self, X: pd.DataFrame, y=None, groups=None):
        """Raise AssertionError when the date column is missing from `X`."""
        assert self.dt_col in X.columns, f'{self.dt_col} do not exits in input dataframe'

    def split(self, X: pd.DataFrame, y=None, groups=None):
        """Yield `(train_indices, valid_indices)` for every period.

        The indices are positions (0 .. len(X) - 1), not index labels.
        """
        # validate first, so a missing column fails with the explicit message
        self.check_input(X)
        dates = X[self.dt_col]

        first_date = dates.min()

        indices = np.arange(len(X))
        for period in self.periods:
            first_valid_date = pd.to_datetime(period[0])

            last_train_date = first_valid_date - pd.to_timedelta(self.gap, unit=self.gap_unit)

            if self.train_days:
                first_train_date = last_train_date - pd.to_timedelta(self.train_days, unit='d')
                # the training window cannot start before the data does
                first_train_date = max(first_train_date, first_date)
            else:
                first_train_date = first_date

            # plain boolean arrays: the masks select by position whatever the index of X is
            valid_mask = dates.between(*period).to_numpy()
            train_mask = ((dates.between(first_train_date, last_train_date))
                          & (dates < first_valid_date)).to_numpy()

            yield indices[train_mask], indices[valid_mask]



def gen_eval_periods(start_date: str,
                     n_test: int,
                     n_splits: int,
                     unit: str = 'd') -> List[Tuple[datetime, datetime]]:
    """Generate `n_splits` consecutive evaluation periods.

    Each period is a `[start, end]` pair with `end = start + n_test` units;
    the next period starts one unit after the end of the previous one.

    Args:
        start_date: start of the first period (anything `pd.to_datetime`
            accepts).
        n_test: length of each period, in `unit`.
        n_splits: number of periods to generate.
        unit: pandas timedelta unit.
    """
    period_start = pd.to_datetime(start_date)
    periods = []
    for _ in range(n_splits):
        period_end = period_start + pd.to_timedelta(n_test, unit=unit)
        periods.append([period_start, period_end])
        # leave one unit between two consecutive periods
        period_start = period_end + pd.to_timedelta(1, unit=unit)
    return periods
    
    
    

In [7]:
# Data locations: Kaggle input directories when running on Kaggle,
# paths relative to the working directory otherwise.
if ON_KAGGLE:
    RAW_DIR = Path('../input/g-research-crypto-forecasting/')
    SAMPLE_DIR = Path('../input/create-sample-dataset/data/raw/sample/')
    TOY_SAMPLE_DIR = Path('../input/create-sample-dataset/data/raw/toy_sample/')
else:
    RAW_DIR =  Path('data/raw')
    TOY_SAMPLE_DIR = RAW_DIR.joinpath('toy_sample')
    SAMPLE_DIR = RAW_DIR.joinpath('sample')

# file name shared by the raw, sample and toy sample train sets
TRAIN_FILE = 'train.csv'
# the asset details file is always read from the raw directory
ASSET_DETAILS_PATH = RAW_DIR / 'asset_details.csv'

In [8]:
# Pick the train file matching SAMPLE_LEVEL: toy sample first, then sample,
# the full raw dataset otherwise.
if USE_TOY_SAMPLE:
    print('USING TOY DATASET')
    RAW_TRAIN_PATH = TOY_SAMPLE_DIR / TRAIN_FILE

elif USE_SAMPLE:
    print('USING SAMPLE DATASET')
    RAW_TRAIN_PATH = SAMPLE_DIR / TRAIN_FILE

else:
    print('USING RAW DATASET')
    RAW_TRAIN_PATH = RAW_DIR / TRAIN_FILE

# fail early when the selected file is not available
assert RAW_TRAIN_PATH.exists()

USING SAMPLE DATASET


In [9]:
!ls {RAW_DIR}

asset_details.csv		   sample
example_sample_submission.csv	   supplemental_train.csv
example_test.csv		   toy_sample
g-research-crypto-forecasting.zip  train.csv
gresearch_crypto


In [None]:
%%time 
raw_data = pd.read_csv(RAW_TRAIN_PATH)

## create train set

In [None]:
# NOTE(review): PREPRO_PARAMS is not referenced anywhere in the visible part
# of this notebook.
PREPRO_PARAMS = {'window': 60}
# Columns identifying a row; used as the join key between targets and features.
MAIN_INDEX = ['timestamp', 'Asset_ID']

In [None]:
# get valid data only, drop where the target is NaN 
data = raw_data[MAIN_INDEX + ['Target']].dropna(subset=['Target'])
# format time to human readable 
data['time'] = pd.to_datetime(data['timestamp'], unit='s')
# merge asset names
data = merge_asset_details(data, ASSET_DETAILS_PATH)

In [None]:
data.head()

In [None]:
data.tail()

In [None]:
# raw columns needed by the feature computation, in the expected order
features_df = raw_data.loc[:, EXPECTED_RAW_COLS]
# replace +/-inf with NaN and cast to the compact dtypes
features_df = infer_dtypes(features_df)

In [None]:
features_df = compute_instant_features(features_df)

In [None]:
features_df

In [None]:
assert features_df['timestamp'].isna().sum() == 0

In [None]:
data = data.merge(features_df, on=MAIN_INDEX, how='left')

In [None]:
data.isna().mean()

In [None]:
list(data.columns)

In [None]:
# Model inputs: the columns selected by FilterFeatures inside build_model()
# and passed to the fit in train_model().
FEATURES = [
'Count',
'High',
'Low',
'Close',
'Volume',
'VWAP',
'high_low_return',
'open_close_return',
'upper_shadow',
'lower_shadow',
'dolar_amount',
'vol_per_trades'
]

In [None]:
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from typing import List


class FilterFeatures(BaseEstimator, TransformerMixin):
    """Transformer keeping a fixed list of columns, in a fixed order.

    The constructor stores its arguments untouched, as scikit-learn requires
    for `get_params` / `clone` to work; the column list is resolved only when
    transforming.

    Args:
        features: columns to keep, in output order. Every column of the input
            frame is kept when None.
        sort: whether to output the kept columns sorted by name.
    """

    def __init__(self,
                 features: List[str] = None,
                 sort: bool = False):
        self.sort = sort
        self.features = features

    def _selected_features(self, X: pd.DataFrame) -> List[str]:
        """Resolve the columns to keep without touching `self.features`."""
        selected = list(X.columns) if self.features is None else list(self.features)
        if self.sort:
            selected.sort()
        return selected

    def fit(self, X: pd.DataFrame, y=None):
        """Nothing to learn: returns the transformer itself."""
        return self

    def transform(self, X: pd.DataFrame):
        """Return the selected columns of `X`, in the configured order."""
        return X.loc[:, self._selected_features(X)]

    
def test_filter_features():
    """Check that FilterFeatures restores the expected column order whatever
    the order of the columns of the input frame."""
    n_columns, n_rows, n_trials = 100, 500, 100
    expected_columns = [f'{f}_ft' for f in np.arange(n_columns)]

    expected_df = pd.DataFrame(np.random.randn(n_rows, n_columns), columns=expected_columns)
    tmf = FilterFeatures(expected_columns)
    tmf.fit(expected_df)

    for i in range(n_trials):
        # same data, columns in a random order
        shuffled_df = expected_df.loc[:, np.random.permutation(expected_columns)]
        actual_columns = list(tmf.transform(shuffled_df).columns)
        assert (actual_columns == expected_columns), f'cols do not match at iter {i}'


# run the check as soon as the cell is executed
test_filter_features()

def build_model(params=None):
    """Build the (unfitted) regression pipeline.

    The pipeline selects the `FEATURES` columns, scales them with
    `MinMaxScaler` and fits a `Ridge` regression.

    Args:
        params: dict holding the Ridge regularization strength under
            'alpha'; `{'alpha': 0.001}` is used when None.

    Returns:
        The pipeline with the steps 'filter', 'norm' and 'model'.
    """
    # the default is created per call: no dict shared between calls
    if params is None:
        params = {'alpha': 0.001}
    model = Pipeline([('filter', FilterFeatures(FEATURES)),
                      ('norm', MinMaxScaler()),
                      ('model', Ridge(alpha=params['alpha']))])
    return model

In [None]:
def train_model(config, train_data, valid_data, pipeline=None):
    """Fit the pipeline returned by `build_model` on `train_data`.

    Args:
        config: dict with the model parameters under 'model' and the
            training options under 'training'; `time_decay_alpha` (None to
            disable) is the base of the exponential sample weights.
        train_data: frame with the `FEATURES`, 'Target' and 'timestamp'
            columns.
        valid_data: not used by this function.
        pipeline: not used by this function.

    Returns:
        The fitted pipeline.
    """
    model = build_model(config['model'])

    time_decay_alpha = config['training']['time_decay_alpha']
    if time_decay_alpha is None:
        weight = None
    else:
        print(f'using exponential_time_decay with alpha {time_decay_alpha}')
        # age of each row in whole days, counted back from the latest timestamp
        seconds_from_latest = train_data['timestamp'].max() - train_data['timestamp']
        timesteps = seconds_from_latest // 60 // 60 // 24
        # the older the row, the smaller the weight (for alpha < 1)
        weight = time_decay_alpha ** timesteps

    model.fit(train_data[FEATURES], train_data['Target'], model__sample_weight=weight)
    return model

In [None]:
# Date range used as the validation period.
# NOTE(review): presumably the range covered by the public leaderboard --
# confirm against the competition description.
PULIC_LB_RANGE = ['2021-06-13 00:00:00',
                  '2021-09-13 00:00:00'] # 3 MONTH WORTH OF DATA

# the toy sample gets its own one-week validation period
if USE_TOY_SAMPLE:
    EVAL_PERIODS = [['2021-09-15', '2021-09-22']]

else:
    EVAL_PERIODS = [PULIC_LB_RANGE]

In [None]:
# Dates are read from the 'time' column; with the default gap of 1 the
# training rows stop one minute before each validation period.
CV_PARAMS = {'gap_unit': 'min', 'dt_col': 'time'}

# one split per evaluation period
CV = TimeSeriesSplit(EVAL_PERIODS, **CV_PARAMS)

In [None]:
train_idx, valid_idx = next(iter(CV.split(data)))

In [None]:
# CV.split yields positions (0..n-1), not index labels: select by position
train_data = data.iloc[train_idx, :].reset_index(drop=True)

In [None]:
# CV.split yields positions (0..n-1), not index labels: select by position
valid_data = data.iloc[valid_idx, :].reset_index(drop=True)

In [None]:
get_date_range(train_data['time'])

In [None]:
get_date_range(valid_data['time'])

In [None]:
# Ridge regularization strength, read by build_model().
MODEL_CONFIG = {'alpha': 0.001}
# Base of the exponential sample weights used by train_model(): a row that is
# d whole days older than the latest one weighs 0.99 ** d.
TRAIN_CONFIG = {'time_decay_alpha': 0.99}
CONFIG = {'model': MODEL_CONFIG, 'training': TRAIN_CONFIG}

In [None]:
# One fitted model per asset, keyed by Asset_ID (the key inference() looks up).
# NOTE: the models are fitted on every row of `data`, not only on `train_data`;
# the same frame is passed as both the train and the valid argument.
MODELS = {}
for asset_id, train_asset_data in data.groupby("Asset_ID"):
    print(f'training model for asset_ID == {asset_id}')
    train_asset_data = train_asset_data.reset_index(drop=True)    
    model = train_model(CONFIG, train_asset_data, train_asset_data)
    MODELS[asset_id] = model

In [None]:
# Outside Kaggle the raw data directory is added to the import path.
# NOTE(review): presumably because a local copy of the `gresearch_crypto`
# package lives there -- confirm.
if not ON_KAGGLE:
    sys.path.append(str(RAW_DIR))

import gresearch_crypto
# NOTE(review): `traceback` is not used in the visible part of this notebook.
import traceback
env = gresearch_crypto.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission

In [None]:
# Score every test batch with the per-asset models and hand the filled
# submission back to the environment.
for i, (raw_test_df, submission) in enumerate(iter_test):

    submission = inference(test_data=raw_test_df, submission=submission,
                            models=MODELS)
    # show the first ten submissions, then one every thousand iterations
    if i % 1000 == 0 or i < 10:
        display(submission)    
    env.predict(submission)