In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from scipy.special import comb
from itertools import combinations

In [2]:
## requires 
## import numpy as np 
## from scipy.special import comb
## from itertools import combinations

class CombinatorialPurgedGroupKFold():
    """Combinatorial purged group K-fold cross-validation splitter.

    The distinct groups, taken in order of first appearance, are cut into
    ``n_splits`` contiguous folds (the last fold absorbs any remainder).
    Every combination of ``n_test_splits`` folds is used once as the test
    set.  Groups adjacent to a test fold are removed from the training set:
    ``purge`` groups immediately before the fold, and ``purge`` plus the
    embargo immediately after it.

    Parameters
    ----------
    n_splits : int, default 6
        Number of contiguous folds the ordered groups are cut into.
    n_test_splits : int, default 2
        Number of folds combined to form each test set.
    purge : int, default 1
        Number of groups dropped from training on each side of a test fold.
    pctEmbargo : float, default 0.01
        Fraction of the number of distinct groups that is additionally
        dropped after each test fold (``int(n_groups * pctEmbargo)``).
    **kwargs
        Accepted and ignored.
    """

    def __init__(self, n_splits = 6, n_test_splits = 2, purge = 1, pctEmbargo = 0.01, **kwargs):
        self.n_splits = n_splits
        self.n_test_splits = n_test_splits
        self.purge = purge
        self.pctEmbargo = pctEmbargo

    def split(self, X, y = None, groups = None):
        """Yield ``(train_idx, test_idx)`` pairs of positional row indices.

        Parameters
        ----------
        X : sized
            Only ``len(X)`` is used.
        y : ignored
        groups : array-like
            Group label of every row.  Labels may be of any type that
            ``numpy.unique`` can handle; they are ordered by first appearance.

        Yields
        ------
        train_idx, test_idx : list of int
            Zero-based row positions (use them with positional indexing).

        Raises
        ------
        ValueError
            If ``groups`` is None, its length differs from ``len(X)``, there
            are more fold combinations or more folds than groups, or the
            embargo size is negative.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")

        # Convert to a plain array so every lookup below is positional,
        # regardless of any index a pandas Series might carry.
        group_labels = np.asarray(groups).ravel()
        if len(group_labels) != len(X):
            raise ValueError(
                ("'groups' has length {0} but X has length {1}"
                 ).format(len(group_labels), len(X)))

        uniq, first_seen, inverse = np.unique(
            group_labels, return_index = True, return_inverse = True)
        n_groups = len(uniq)

        # position_of[k] is the first-appearance rank of the k-th sorted label.
        position_of = np.empty(n_groups, dtype = np.intp)
        position_of[np.argsort(first_seen)] = np.arange(n_groups)

        # members[p] lists the row positions belonging to the group of rank p.
        members = [[] for _ in range(n_groups)]
        row_positions = position_of[np.reshape(inverse, -1)].tolist()
        for idx, pos in enumerate(row_positions):
            members[pos].append(idx)

        n_folds = comb(self.n_splits, self.n_test_splits, exact = True)
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))
        if self.n_splits > n_groups:
            # Otherwise some folds would be empty.
            raise ValueError(
                ("Cannot have number of splits={0} greater than"
                 " the number of groups={1}").format(self.n_splits,
                                                     n_groups))

        mbrg = int(n_groups * self.pctEmbargo)
        if mbrg < 0:
            raise ValueError(
                "The number of 'embargoed' groups should not be negative")

        # Half-open [start, stop) ranges of group ranks for each fold.
        fold_size = n_groups // self.n_splits
        bounds = []
        for k in range(self.n_splits):
            start = k * fold_size
            stop = n_groups if k == self.n_splits - 1 else start + fold_size
            bounds.append((start, stop))

        for test_splits in combinations(range(self.n_splits), self.n_test_splits):
            test_positions = []
            excluded = set()
            for k in test_splits:
                start, stop = bounds[k]
                test_positions.extend(range(start, stop))
                # Purge before the fold; clamp at 0 so the window never wraps.
                excluded.update(range(max(0, start - self.purge), start))
                # Purge + embargo after the fold, clamped at the last group.
                excluded.update(
                    range(stop, min(n_groups, stop + self.purge + mbrg)))
            excluded.update(test_positions)

            train_idx = []
            for pos in range(n_groups):
                if pos not in excluded:
                    train_idx += members[pos]
            test_idx = []
            for pos in test_positions:
                test_idx += members[pos]
            yield train_idx, test_idx

In [3]:
# Load the training set from a parquet file; the path is relative to the
# notebook's working directory.
df = pd.read_parquet('parquet/train_low_mem.parquet')

In [4]:
# Feature column names f_0 ... f_299.
f_col = ['f_' + str(n) for n in range(300)]
# The same feature names, preceded by the 'time_id' column.
time_col = ['time_id', *f_col]


In [5]:
#polars implementation
#df.select(pl.col('*').cast(pl.Float16))

# Cast time_id to int32 straight from its source dtype, *before* the frame is
# downcast. float16 cannot represent every integer above 2048, so sending
# time_id through float16 first could merge distinct ids into one group.
time_id = df['time_id'].astype('int32')
df = df.astype('float16')
df['time_id'] = time_id
# df_x: time_id plus the feature columns; df_y: single-column target frame.
df_x = df[time_col]
df_y = pd.DataFrame(df['target'])

## Consider deleting df for memory
## del df

In [6]:
%%time

# Generator of (train, validation) folds; the splitter yields *positional*
# row indices, grouped by time_id.
cv = CombinatorialPurgedGroupKFold().split(df_x, df_y, groups = df_x['time_id'])

# One LightGBM regressor is trained per fold and kept for later averaging.
# NOTE(review): `val` is never used, so no out-of-fold score is computed, and
# no eval_set is passed to fit(), so eval_metric has nothing to be evaluated
# on -- confirm whether validation scoring was intended.
models = []
for i, (tr, val) in enumerate(cv):
    print(i)
    # .iloc, not .loc: `tr` holds row positions, which only coincide with
    # index labels when the frame has a default RangeIndex.
    X_train = df_x.iloc[tr][f_col]
    y_train = df_y.iloc[tr]

    model = lgb.LGBMRegressor()
    model.fit(X_train, y_train, eval_metric='rmse')
    models.append(model)
print('Done')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
Done
CPU times: user 1h 39min 35s, sys: 5min 51s, total: 1h 45min 27s
Wall time: 17min 39s


In [9]:
# Select the feature columns once instead of re-copying the frame for every
# model, then average the per-model predictions element-wise.
# Note: these predictions are made on the same rows the models were fitted on.
X_all = df_x[f_col]
preds = [model.predict(X_all) for model in models]
del X_all  # release the temporary feature frame
finalpred = np.mean(preds, axis=0)

In [10]:
# Display the averaged prediction array as the cell output.
finalpred

array([-0.00869107, -0.04353364,  0.04574885, ...,  0.02985028,
        0.01785094,  0.01530847])

Pickling the models for submission

In [None]:
import pickle
# Write each trained model to 'lgbmmodel<i>.pkl' in the working directory.
# The `with` block guarantees the file is closed even if pickling fails.
for i, model in enumerate(models):
    with open('lgbmmodel'+str(i)+'.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)