In [None]:
!pip install /kaggle/input/scikit/scikit_learn-1.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

#  UMP TF-Record: CombinatorialPurgedGroupKFold

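In this notebook, I create TFRecord files for the UMP (Ubiquant Market Prediction) dataset. The `CombinatorialPurgedGroupKFold` CV strategy is defined and demonstrated below; note that the final export cell currently uses `StratifiedGroupKFold`, with the combinatorial splitter left commented out.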

In [None]:
import os
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import tensorflow as tf
import json
import numpy as np
from scipy.special import comb
from itertools import combinations
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedGroupKFold

class CombinatorialPurgedGroupKFold():
    """Combinatorial purged group K-fold cross-validator.

    The unique groups, taken in order of first appearance, are cut into
    ``n_splits`` contiguous blocks (the last block absorbs any remainder).
    Every combination of ``n_test_splits`` blocks is used once as the test
    set. Groups adjacent to a test block are excluded from training:
    ``purge`` groups immediately before it, and ``purge`` plus the embargo
    (``int(n_groups * pctEmbargo)``) groups immediately after it.

    Parameters
    ----------
    n_splits : int
        Number of contiguous blocks the ordered groups are divided into.
    n_test_splits : int
        Number of blocks used as the test set in each fold.
    purge : int
        Number of groups dropped on each side of every test block.
    pctEmbargo : float
        Fraction of the total number of groups additionally dropped after
        every test block.
    **kwargs
        Accepted and ignored.
    """

    def __init__(self, n_splits = 6, n_test_splits = 2, purge = 1, pctEmbargo = 0.01, **kwargs):
        self.n_splits = n_splits
        self.n_test_splits = n_test_splits
        self.purge = purge
        self.pctEmbargo = pctEmbargo

    def split(self, X, y = None, groups = None):
        """Yield ``(train_idx, test_idx)`` lists of positional row indices.

        Parameters
        ----------
        X : sized
            Only ``len(X)`` is used.
        y : ignored
        groups : array-like
            Group label of every row. Labels may be any hashable, orderable
            values; they do not need to be the integers ``0..n-1``.

        Raises
        ------
        ValueError
            If ``groups`` is None, if ``purge`` or the embargo is negative,
            or if there are more folds than unique groups.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        if self.purge < 0:
            raise ValueError(
                "The number of purged groups should not be negative")

        # Work on a plain array so that indexing is positional even when a
        # pandas Series with a non-default index is passed.
        group_labels = np.asarray(groups)
        u, ind = np.unique(group_labels, return_index = True)
        # Unique groups in order of first appearance (np.unique sorts by value).
        ordered_groups = u[np.argsort(ind)].tolist()
        n_groups = len(ordered_groups)

        group_labels = group_labels.tolist()
        group_dict = {}
        for idx in range(len(X)):
            group_dict.setdefault(group_labels[idx], []).append(idx)

        n_folds = comb(self.n_splits, self.n_test_splits, exact = True)
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        mbrg = int(n_groups * self.pctEmbargo)
        if mbrg < 0:
            raise ValueError(
                "The number of 'embargoed' groups should not be negative")

        # Each block is stored as a half-open range of POSITIONS into
        # `ordered_groups`. Purge and embargo windows must be computed from
        # positions, not from the group labels themselves, otherwise they are
        # only correct when the labels happen to be 0..n-1.
        group_test_size = n_groups // self.n_splits
        split_bounds = {}
        for split in range(self.n_splits):
            start = split * group_test_size
            if split == self.n_splits - 1:
                stop = n_groups
            else:
                stop = (split + 1) * group_test_size
            split_bounds[split] = (start, stop)

        for test_splits in combinations(range(self.n_splits), self.n_test_splits):
            test_groups = []
            excluded = set()
            for split in test_splits:
                start, stop = split_bounds[split]
                test_groups += ordered_groups[start:stop]
                # Purge before the block; clamp at 0 so a negative start does
                # not wrap around to the end of the list.
                excluded.update(ordered_groups[max(0, start - self.purge):start])
                # Purge plus embargo after the block.
                excluded.update(ordered_groups[stop:stop + self.purge + mbrg])
            excluded.update(test_groups)
            train_groups = [g for g in ordered_groups if g not in excluded]

            train_idx = []
            test_idx = []
            for train_group in train_groups:
                train_idx += group_dict[train_group]
            for test_group in test_groups:
                test_idx += group_dict[test_group]
            yield train_idx, test_idx

Let's use a small sample dataset to see how this CV strategy splits the data.

In [None]:
n_splits = 6
n_test_splits = 1
# Toy data: 70 consecutive integers, bucketed six at a time into groups 0..11
# (the last group holds only four elements).
elements = list(range(10 * (n_splits + n_test_splits)))
groups = [value // n_splits for value in elements]
data = pd.DataFrame({"group": groups, "element": elements})
kfold = CombinatorialPurgedGroupKFold(n_splits, n_test_splits)

# Print the positional train/test indices of every fold.
fold_iterator = kfold.split(data, groups=data["group"])
for index, (train_indices, test_indices) in enumerate(fold_iterator):
    print("=" * 100, f"Fold {index}", "=" * 100, sep="\n")
    print("Train indices:", train_indices, "Length:", len(train_indices))
    print("Test Indices:", test_indices, "Length:", len(test_indices))

## 

## Import dataset

In [None]:
%%time
# Feature count and the names f_0 ... f_299.
# NOTE(review): presumably these are the feature column names of `train`; confirm against the pickle.
n_features = 300
features = [f'f_{i}' for i in range(n_features)]
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
train = pd.read_pickle('../input/ubiquant-market-prediction-half-precision-pickle/train.pkl')
# Show the first rows as a quick sanity check.
train.head()

## Set time features

In [None]:
test_time_id_len = 942961

# The dates are ISO formatted (YYYY-MM-DD), which `parse_dates` handles natively;
# the `date_parser` callback is deprecated in recent pandas releases.
calendar_df = pd.read_csv("../input/chinese-holidays/holidays_of_china_from_2014_to_2030.csv", parse_dates=["date"])

# keep only national holidays and common local holidays
calendar_df = calendar_df.loc[(calendar_df.type.isin(["National holiday", "Common local holiday"]))]

# one row per calendar day from 2014-01-01 through 2023-01-01, with holiday info attached where present
calendar_df = (
    pd.DataFrame({"date": pd.date_range(start="2014-01-01", end="2023-01-01")}).merge(calendar_df, on="date", how="left")
    .assign(weekday=lambda x: x.date.dt.day_name(), year=lambda x: x.date.dt.year)
)

# remove weekends and holidays, then keep as many remaining days as there are time_ids
# NOTE(review): this assumes time_id k corresponds to the k-th remaining day starting 2014-01-01 — confirm.
calendar_df = (
    calendar_df.loc[(~calendar_df.weekday.isin(["Sunday", "Saturday"]))&(calendar_df.name.isna())]
    .reset_index(drop=True)
    .head(train.time_id.max()+1)
    .dropna(axis=1)  # drops the holiday columns, which are entirely NaN after the filter above
)

calendar_df['quarter'] = calendar_df['date'].dt.to_period('Q')
calendar_df['time_id'] = calendar_df.index

# Encode each year-quarter period as an integer label.
le = LabelEncoder()
calendar_df['quarter'] = le.fit_transform(calendar_df['quarter'])

train = train.merge(calendar_df[['time_id', 'year', 'quarter']], how='left', on='time_id')

# Detach the time columns from the feature frame; they are written to the TFRecords separately.
time_id = train.pop("time_id")
quarter = train.pop("quarter")
year = train.pop("year")

del calendar_df
gc.collect()

## Set cluster_id feature

In [None]:
%%time
inv_id_to_cluster = pd.read_pickle('../input/part-1-1-ubiquant-clustering/clustered_inv_index.pkl')
inv_id_to_cluster.index.name = 'investment_id'

train = train.merge(inv_id_to_cluster, how='left', on='investment_id')
train.cluster.fillna(0, inplace=True)
train.cluster = train.cluster.astype(np.uint16)

cluster = train.pop('cluster')
num_clusters = cluster.unique().shape[0] - 1
cluster = pd.get_dummies(cluster, drop_first=True, prefix='cluster_')

# Sorted groups 'investment_id' by 'cluster_id': 
# investment_id_cluster_dict = dict()
# for unique in cluster_id_feature.unique():
#     investment_id_cluster_dict.update({unique: inv_id_to_cluster[inv_id_to_cluster['cluster'] == unique].index.to_numpy()})
    
# Hot Encode cluster feature
# cluster_id_feature = tf.keras.utils.to_categorical(cluster_id_feature, num_classes=num_classes, dtype='uint8')
    
del inv_id_to_cluster
gc.collect()

## Set A-shares sub section feature

In [None]:
map_info = (
    pd.read_csv('../input/ubiquant-a-shares/map_info.csv')[['Sub_Section', 'Main_Section', 'Market_Value', 'investment_id']]
    .rename({'Sub_Section': 'sub_section', 'Main_Section': 'main_section', 'Market_Value': 'market_value'}, axis=1)
)

le = LabelEncoder()
map_info['sub_section'] = le.fit_transform(map_info['sub_section'])
# Shift both section codes by one so that 0 stays free for investments missing from map_info.
map_info['sub_section'] += 1
map_info['main_section'] += 1

# `validate` makes the merge fail loudly if an investment_id appears more than once in
# map_info. Duplicates would add rows to `train` and silently misalign it with the series
# that were already popped from it (time_id, year, quarter, cluster).
train = train.merge(map_info, how='left', on='investment_id', validate='many_to_one')
train[['main_section', 'sub_section']] = train[['main_section', 'sub_section']].fillna(0)
train[['main_section', 'investment_id', 'sub_section']] = train[['main_section', 'investment_id', 'sub_section']].astype(np.uint16)
# Unknown market values are replaced by the mean of the known ones.
train['market_value'] = train['market_value'].fillna(train['market_value'].mean())
train[['market_value']] = train[['market_value']].astype(np.uint64)

sub_section = train.pop('sub_section')
# Full one-hot encoding: one column per distinct sub section (including the 0 "unknown" code).
num_sub_sections = sub_section.unique().shape[0]
# Pin the dummy dtype so the columns are integers regardless of the pandas version.
sub_section = pd.get_dummies(sub_section, dtype=np.uint8)

main_section = train.pop('main_section')
# One column fewer than the number of distinct main sections, matching `drop_first=True` below.
num_main_sections = main_section.unique().shape[0] - 1
main_section = pd.get_dummies(main_section, drop_first=True, dtype=np.uint8)

market_value = train.pop('market_value')

del map_info
gc.collect()

## Drop unnecessary features

In [None]:
# Remove investment_id from the feature frame but keep it as a Series: it is passed as the
# stratification labels to the fold splitter and is the source of investment_ids.csv.
investment_id = train.pop("investment_id")

## Set target features

In [None]:
# Detach the target column from the feature frame; create_record() stores it under the "target" key.
y = train.pop("target")

## Create TF-Record

In [None]:
def create_record(i):
    """Serialize row ``i`` of the prepared data into a ``tf.train.Example`` byte string.

    Reads the notebook-level objects ``train``, ``time_id``, ``year``,
    ``quarter``, ``cluster``, ``sub_section``, ``main_section``,
    ``market_value`` and ``y`` by position ``i``.
    """
    def as_floats(values):
        return tf.train.Feature(float_list=tf.train.FloatList(value=values))

    def as_int64s(values):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

    # Scalars are wrapped in one-element lists; one-hot rows are stored as whole vectors.
    fields = {
        "features": as_floats(list(train.iloc[i])),
        "time_id": as_int64s([time_id.iloc[i]]),
        "year": as_int64s([year.iloc[i]]),
        "quarter": as_int64s([quarter.iloc[i]]),
        "cluster": as_int64s(list(cluster.iloc[i])),
        "sub_section": as_int64s(list(sub_section.iloc[i])),
        "main_section": as_int64s(list(main_section.iloc[i])),
        "market_value": as_int64s([market_value.iloc[i]]),
        "target": as_floats([y.iloc[i]]),
    }
    example = tf.train.Example(features=tf.train.Features(feature=fields))
    return example.SerializeToString()
    
def decode_function(record_bytes):
    """Parse one serialized example written by ``create_record`` into a dict of tensors.

    The one-hot vector lengths come from the notebook-level ``num_clusters``,
    ``num_sub_sections`` and ``num_main_sections``.
    """
    schema = {
        "features": tf.io.FixedLenFeature([300], dtype=tf.float32),
        "time_id": tf.io.FixedLenFeature([], dtype=tf.int64),
        "year": tf.io.FixedLenFeature([], dtype=tf.int64),
        "quarter": tf.io.FixedLenFeature([], dtype=tf.int64),
        "cluster": tf.io.FixedLenFeature([num_clusters], dtype=tf.int64),
        "sub_section": tf.io.FixedLenFeature([num_sub_sections], dtype=tf.int64),
        "main_section": tf.io.FixedLenFeature([num_main_sections], dtype=tf.int64),
        "market_value": tf.io.FixedLenFeature([], dtype=tf.int64),
        "target": tf.io.FixedLenFeature([], dtype=tf.float32),
    }
    return tf.io.parse_single_example(record_bytes, schema)

Now create the whole dataset; this will take a long time. As written, the cell below only writes the files for fold 4.

In [None]:
%%time
import time
n_splits = 5
n_test_splits = 1
# kfold = CombinatorialPurgedGroupKFold(n_splits, n_test_splits)
kfold = StratifiedGroupKFold(n_splits)
for fold, (train_indices, test_indices) in enumerate(kfold.split(train, investment_id, groups=time_id)):
    if fold != 4:
        continue
    print("=" * 100)
    print(f"Fold {fold}")
    print("=" * 100)
    print("Train Sample size:", len(train_indices))
    print("Test Sample size:", len(test_indices))
    train_save_path = f"fold_{fold}_train.tfrecords"
    begin = time.time()
    print(f"Creating {train_save_path}")
    with tf.io.TFRecordWriter(train_save_path) as file_writer:
        for i in train_indices:
            file_writer.write(create_record(i))
    print("Elapsed time: %.2f"%(time.time() - begin))
    test_save_path = f"fold_{fold}_test.tfrecords"
    begin = time.time()
    print(f"Creating {test_save_path}")
    with tf.io.TFRecordWriter(test_save_path) as file_writer:
        for i in test_indices:
            file_writer.write(create_record(i))
    print("Elapsed time: %.2f"%(time.time() - begin))

## Write unique Investment Ids

In [None]:
# Save the distinct investment ids (in order of first appearance) as a one-column CSV.
investment_ids = investment_id.unique()
investment_id_df = pd.DataFrame(investment_ids, columns=["investment_id"])
investment_id_df.to_csv("investment_ids.csv", index=False)