In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import lightgbm as lgb

In [None]:
class TrainTestHelper:
    """Stack a train and a test set together and cut them apart again.

    ``combine`` records the number of train rows; ``split`` uses that count to
    recover the two parts from any object that has the same row layout.
    """

    def __init__(self):
        # Row count of the train part given to the last ``combine`` call;
        # stays None until ``combine`` has been called.
        self.ntrain = None

    def combine(self, train, test):
        """Stack ``train`` on top of ``test`` and remember the train row count.

        Args:
            train: np.ndarray or pandas DataFrame/Series holding the train rows.
            test: object of the same kind holding the test rows.

        Returns:
            An np.ndarray when ``train`` is an ndarray; otherwise a pandas
            object with a fresh 0..n-1 index (columns are not sorted).
        """
        self.ntrain = train.shape[0]
        if isinstance(train, np.ndarray):
            # np.row_stack is a deprecated alias of np.vstack.
            return np.vstack((train, test))
        # DataFrame.append was removed in pandas 2.0; pd.concat is what it
        # delegated to and is available on every pandas version.
        return pd.concat([train, test], sort=False).reset_index(drop=True)

    def split(self, train_test):
        """Cut a combined object back into its train and test parts.

        Args:
            train_test: np.ndarray (any number of dimensions) or pandas
                DataFrame/Series whose first ``ntrain`` rows are the train part.

        Returns:
            ``(train, test)``; pandas parts are copies with a reset index.
            Returns None when ``combine`` has not been called yet, so there is
            no row count to split on.
        """
        if self.ntrain is None:
            return None
        if isinstance(train_test, np.ndarray):
            # Slice rows only, so 1-D vectors work as well as 2-D matrices.
            train = train_test[:self.ntrain]
            test = train_test[self.ntrain:]
        else:
            # Row-only .iloc slicing works for both DataFrame and Series.
            train = train_test.iloc[:self.ntrain].copy().reset_index(drop=True)
            test = train_test.iloc[self.ntrain:].copy().reset_index(drop=True)
        return train, test

def rmse(y_true, y_pred):
    """Return the square root of the mean squared error of ``y_pred`` vs ``y_true``."""
    mse = mean_squared_error(y_true, y_pred)
    return mse ** 0.5

In [None]:
# Read the three CSV inputs: train rows, test rows and the submission template.
train, test, ss = (
    pd.read_csv(path)
    for path in ('Train.csv', 'Test.csv', 'SampleSubmission.csv')
)

# Keep only train rows that have an 'open' value, then detach the 'close'
# column as the regression target so it is not used as a feature.
train = train.dropna(subset=['open']).reset_index(drop=True)
target = train.pop('close')

# Stack train and test so features are engineered on both at once; the
# 'id' and 'asset_id' columns are dropped from the combined frame.
helper = TrainTestHelper()
data = helper.combine(train, test).drop(columns=['id', 'asset_id'])

# Engineered features. Columns are added in a fixed order because the model
# is trained on positional arrays later on.

# 0/1 flags: does the 'open' value coincide with the 'low' / 'high' value?
for flag_col, ref_col in (('open_is_low', 'low'), ('open_is_high', 'high')):
    data[flag_col] = (data['open'] == data[ref_col]).astype(int)

# Absolute high-low spread.
data['high-low'] = data['high'] - data['low']

# Ratio features as (new column, numerator column, denominator column).
ratio_specs = (
    ('high2low', 'high', 'low'),
    ('cap2low', 'market_cap', 'low'),
    ('cap2open', 'market_cap', 'open'),
    ('gcap2low', 'market_cap_global', 'low'),
    ('gcap2open', 'market_cap_global', 'open'),
    ('gcap2cap', 'market_cap_global', 'market_cap'),
)
for ratio_col, numerator, denominator in ratio_specs:
    data[ratio_col] = data[numerator] / data[denominator]


def _integer_part_length(value):
    """Length of the text before the first '.' in ``str(value)``."""
    return len(str(value).split('.')[0])


# NOTE(review): this measures the printed text, so a NaN 'open' yields 3
# ('nan') and floats that Python prints in scientific notation (e.g. 1e-05)
# are measured on that notation -- confirm this is the intended magnitude.
data['price_mag'] = data['open'].apply(_integer_part_length)

# Distance-to-centroid features on the 'high' column: the 8 centroids are
# fitted on the non-null values only, while rows with a missing 'high' are
# transformed as if their value were 0.
high = data['high']
cluster = KMeans(n_clusters=8, random_state=0)
cluster.fit(high.dropna().values.reshape(-1, 1))

centroid_distances = cluster.transform(high.fillna(0).values.reshape(-1, 1))
distance_columns = [f'cl_high_{c}' for c in range(cluster.n_clusters)]
data = pd.concat(
    (data, pd.DataFrame(centroid_distances, columns=distance_columns)),
    axis=1,
)
del high, centroid_distances, distance_columns

# Cut the combined feature frame back into its train and test parts.
train, test = helper.split(data)

In [None]:
# 10 differently seeded shuffled 5-fold splits of the train rows, flattened
# into one list of 50 (train_idx, valid_idx) pairs.
folds = [
    index_pair
    for seed in range(10)
    for index_pair in KFold(n_splits=5, shuffle=True, random_state=seed).split(train)
]

In [None]:
# Directory the trained boosters are saved into. exist_ok=True keeps the cell
# re-runnable: plain os.mkdir raises FileExistsError on the second execution.
os.makedirs('gbm_models', exist_ok=True)

# LightGBM booster parameters (L2 regression scored by RMSE, with linear
# models in the leaves). The training loop adds a 'seed' key to this dict.
lgb_params = {
    'objective': 'regression_l2',
    'metric': 'rmse',
    'learning_rate': 0.01,
    'linear_tree': True,
    'feature_fraction': 0.7,
    'bagging_freq': 1,
    'bagging_fraction': 0.9,
    'max_bin': 700,
    'min_data_in_bin': 10,
    'min_data': 50,
    'linear_lambda': 10,
    'num_leaves': 9,
    'path_smooth': 3,
    'verbose': -1,
}

# Arguments for lgb.train. 'params' is the very same dict object as
# lgb_params (not a copy), so updates through either name are shared.
config = {
    'params': lgb_params,
    'num_boost_round': 5000,
    'early_stopping_rounds': 100,
    'verbose': False,
}

# errors: hold-out RMSE of every trained model (one entry per fold/seed pair).
# test_prediction: running SUM of the test predictions of all models; it has
# to be divided by the number of models before it is used.
errors = []
test_prediction = np.zeros(len(test))

# Materialise the feature matrix and target vector once instead of per fold.
train_values, target_values = train.values, target.values

for fold, (train_idx, valid_idx) in enumerate(folds):
    xtrain, ytrain = train_values[train_idx], target_values[train_idx]
    xvalid, yvalid = train_values[valid_idx], target_values[valid_idx]

    # The validation set reuses the training set's binning via `reference`.
    dtrain = lgb.Dataset(xtrain, label=ytrain)
    dvalid = lgb.Dataset(xvalid, label=yvalid, reference=dtrain)

    # Train three boosters per fold that differ only in their random seed.
    for seed in range(3):
        config['params']['seed'] = seed
        # NOTE(review): the early_stopping_rounds / verbose_eval keywords were
        # removed from lgb.train in LightGBM 4.0 (callbacks replace them) --
        # confirm the installed version or pin lightgbm<4.
        model = lgb.train(
            config['params'],
            dtrain,
            num_boost_round=config['num_boost_round'],
            valid_sets=[dtrain, dvalid],
            early_stopping_rounds=config['early_stopping_rounds'],
            verbose_eval=config['verbose'],
        )

        # Score on the held-out rows, then add this model's test predictions
        # to the running sum.
        error = np.round(rmse(yvalid, model.predict(xvalid)), 4)
        errors.append(error)
        ypred = model.predict(test)
        test_prediction += ypred

        print(f'Fold {fold} seed {seed} error: {error}')
        model.save_model(f'gbm_models/GBM_{fold}_{seed}_{error}.txt')

print(f'Mean CV: {np.mean(errors)}')

Fold 0 seed 0 error: 54.7423
Fold 0 seed 1 error: 54.4782
Fold 0 seed 2 error: 55.61
Fold 1 seed 0 error: 49.8144
Fold 1 seed 1 error: 48.6273
Fold 1 seed 2 error: 49.8539
Fold 2 seed 0 error: 59.149
Fold 2 seed 1 error: 57.8452
Fold 2 seed 2 error: 58.225
Fold 3 seed 0 error: 57.5019
Fold 3 seed 1 error: 56.963
Fold 3 seed 2 error: 58.5491
Fold 4 seed 0 error: 60.0478
Fold 4 seed 1 error: 58.8554
Fold 4 seed 2 error: 59.5512
Fold 5 seed 0 error: 56.8141
Fold 5 seed 1 error: 53.1774
Fold 5 seed 2 error: 54.9422
Fold 6 seed 0 error: 56.4129
Fold 6 seed 1 error: 53.7982
Fold 6 seed 2 error: 54.729
Fold 7 seed 0 error: 60.5636
Fold 7 seed 1 error: 58.9305
Fold 7 seed 2 error: 60.9383
Fold 8 seed 0 error: 68.9403
Fold 8 seed 1 error: 70.2321
Fold 8 seed 2 error: 68.628
Fold 9 seed 0 error: 52.8034
Fold 9 seed 1 error: 49.3605
Fold 9 seed 2 error: 50.5453
Fold 10 seed 0 error: 57.4836
Fold 10 seed 1 error: 59.364
Fold 10 seed 2 error: 57.0134
Fold 11 seed 0 error: 53.8371
Fold 11 seed 1 err

In [None]:
# test_prediction holds the SUM of all models' predictions, so average it over
# the number of models actually trained (one entry in `errors` per model)
# rather than a hard-coded 150, which is only right for 50 folds x 3 seeds
# and a training loop that ran to completion.
n_models = len(errors)
if n_models == 0:
    raise RuntimeError('No trained models found; run the training cell first.')

# Averaged prediction, floored at 1.
ss['close'] = (test_prediction / n_models).clip(1, np.inf)

# Rows without an 'open' value get a prediction of 0.
ss.loc[test['open'].isnull(), 'close'] = 0

# Keep every prediction inside the row's [low, high] range: cap at 'high'
# first, then raise to 'low'. Comparisons against NaN bounds are False, so
# such rows are left unchanged.
above_high = ss['close'] > test['high']
ss.loc[above_high, 'close'] = test.loc[above_high, 'high']
below_low = ss['close'] < test['low']
ss.loc[below_low, 'close'] = test.loc[below_low, 'low']

ss.to_csv('gbm_final.csv', index=False)