In [None]:
import pandas as pd
import numpy as np
import glob
import gc
from numba import njit
import xgboost as xgb
from fastai_timeseries import *

In [None]:
# List files

# glob.glob returns matches in arbitrary order; sort so that the slices below
# pair files deterministically. files_train[i] is files[i] and files_test[i]
# is files[i + 1]; the last file in the listing is used by neither list.
# NOTE(review): this is a plain lexicographic sort - confirm it matches the
# intended chronological order of the data files.
files = sorted(glob.glob('data.*'))
files_train = files[:-2]
files_test = files[1:-1]

In [None]:
# Fast standardization

@njit(parallel = True)
def standardize(X_train,X_test,means,stds):
    """Standardize the columns of two 2-D arrays in place with shared statistics.

    Column ``i`` of both arrays is replaced by ``(column - means[i]) / stds[i]``.
    The loop runs over ``X_train.shape[1]`` columns, so ``X_test``, ``means``
    and ``stds`` must provide at least that many columns / entries.

    Parameters
    ----------
    X_train, X_test : 2-D arrays, overwritten column by column.
    means, stds : 1-D arrays indexed by column number.

    Returns
    -------
    The same ``X_train`` and ``X_test`` objects, after modification.
    """
    for i in range(X_train.shape[1]):
        # A constant column has zero spread; dividing by it would fill the
        # column with NaN/inf, so fall back to a unit divisor (centering only).
        scale = stds[i]
        if scale == 0:
            scale = 1.0
        X_train[:,i] = (X_train[:,i] - means[i]) / scale
        X_test[:,i] = (X_test[:,i] - means[i]) / scale
    return X_train, X_test

In [None]:
# Training / Testing batch

# Names assigned to the columns of every data file:
# 'time', b1..b41, s1..s50, five y_* columns and three y_hat* columns.
columns = (['time']
           + ['b' + str(k) for k in range(1, 42)]
           + ['s' + str(k) for k in range(1, 51)]
           + ['y_1m', 'y_5m', 'y_10m', 'y_15m', 'y_30m', 'y_hat1', 'y_hat2', 'y_hat3'])

# Columns removed before modelling (applied identically to train and test).
dropped_columns = ['s19','s24','time',"y_1m","y_5m","y_10m","y_15m","y_30m","y_hat1","y_hat2","y_hat3"]

train_rows = 100000      # only the last `train_rows` rows of each training file are used
test_batch_rows = 50000  # test rows transformed and predicted per batch
num_kernels = 1000       # passed to generate_kernels

for i in range(len(files_train)):
    print(files_train[i])
    # File loading: the first training file is read with its own header row,
    # which is then replaced by `columns`; every other file is read header-less.
    if i == 0:
        df = pd.read_csv(files_train[i])
        df.columns = columns
    else:
        df = pd.read_csv(files_train[i],names=columns)
    test = pd.read_csv(files_test[i],header=None,names=columns)

    # Preprocessing: index by the first column parsed as datetimes, keep
    # y_10m as the target, drop unused columns and replace missing values by 0.
    df = df.set_index(pd.DatetimeIndex(pd.to_datetime(df[df.columns[0]])))
    target = df.y_10m
    df = df.drop(dropped_columns,axis=1)
    df = df.fillna(0)

    test = test.set_index(pd.DatetimeIndex(pd.to_datetime(test[test.columns[0]])))
    target_test = test.y_10m
    test = test.drop(dropped_columns,axis=1)
    test = test.fillna(0)
    test2 = np.array(test)

    y_train = np.array(target).reshape(len(target),)
    kernels = generate_kernels(df.shape[1], num_kernels)

    # (1) standardize the raw inputs. `standardize` indexes means/stds by
    # column, so the statistics must be per column (axis=0), not per row.
    train_values = np.array(df.tail(train_rows))
    means = np.mean(train_values,axis=0)
    stds = np.std(train_values,axis=0)
    stds[stds == 0] = 1.0  # constant columns: avoid dividing by zero
    X_train, X_test = standardize(train_values,test2,means,stds)

    # (2) then transform the normalised time series
    # np.asarray is a no-op when apply_kernels already returns float64; it
    # only guarantees the in-place float arithmetic below is valid.
    X_training_transform = np.asarray(apply_kernels(X_train, kernels), dtype=np.float64)
    gc.collect()

    # (3) standardize the kernel features once, with statistics taken from the
    # training features themselves; the same statistics are reused for every
    # test batch. Done in place to avoid a second copy of the feature matrix.
    feature_means = np.mean(X_training_transform,axis=0)
    feature_stds = np.std(X_training_transform,axis=0)
    feature_stds[feature_stds == 0] = 1.0
    X_training_transform -= feature_means
    X_training_transform /= feature_stds

    classifier = xgb.XGBRegressor(max_depth=3,
                              learning_rate=0.1,
                               n_estimators=100,
                               verbosity=2,
                               booster='gbtree',
                               tree_method='auto',
                               n_jobs=7,
                               gpu_id=0,
                               gamma=0,
                               min_child_weight=1,
                               max_delta_step=4,
                               subsample=.5,
                               colsample_bytree=1,
                               colsample_bylevel=1,
                               colsample_bynode=1,
                               reg_alpha=0,
                               reg_lambda=1,
                               scale_pos_weight=1,
                               base_score=0.5,
                               random_state=0,
                               missing=None)
                               #weight=np.sqrt(target_train**2)))
    print('training...')
    # The labels are trimmed to the same trailing rows as the features.
    classifier.fit(X_training_transform, y_train[-train_rows:])

    # Predict the test set batch by batch; stepping the range by the batch
    # size never produces an empty trailing batch.
    batch_preds = []
    for j, start in enumerate(range(0, X_test.shape[0], test_batch_rows)):
        stop = min(X_test.shape[0], start + test_batch_rows)
        X_test_transform = np.asarray(apply_kernels(X_test[start:stop], kernels), dtype=np.float64)
        X_test_transform -= feature_means
        X_test_transform /= feature_stds
        print('testing:' + str(j) + '...')
        preds = classifier.predict(X_test_transform)
        batch_preds.append(preds)
        gc.collect()
    # Join the batches once; an empty test file yields an empty prediction array.
    all_preds = np.concatenate(batch_preds) if batch_preds else np.empty(0)

    # Summary over all predictions for this file pair, then persist them.
    lin = np.linspace(0,1,5)
    print(pd.DataFrame(all_preds).quantile(lin))
    print("cor: " + str(np.corrcoef(target_test,all_preds)[0,1]))
    pd.DataFrame({'index':test.index,'y_pred':all_preds,'target_test':target_test}).to_csv("rocket" + str(i) + ".csv")

In [None]:
# list files

# glob.glob returns matches in arbitrary order, so sort before slicing.
# Sorting by (length, name) puts names such as rocket2.csv before rocket10.csv,
# i.e. numeric order for files named "rocket<i>.csv".
# NOTE(review): the [2:11] window is kept from the original - confirm that
# skipping the first two prediction files is intended.
files = sorted(glob.glob('rocket*'), key=lambda name: (len(name), name))[2:11]
files

In [None]:
# Concatenate predictions

# Read the y_pred column of every listed prediction file, indexed by the
# file's first column parsed as datetimes, and stack them in listing order.
pred_series = []
for file in files:
    df = pd.read_csv(file)
    df = df.set_index(pd.DatetimeIndex(pd.to_datetime(df[df.columns[0]])))
    pred_series.append(df.y_pred)

# Fail with a clear message instead of a NameError when nothing was listed.
if not pred_series:
    raise ValueError("no prediction files to concatenate")

# Concatenate once rather than growing the series inside the loop.
all_df = pd.concat(pred_series)
all_df.to_csv('all_rocket.csv')