In [3]:
import os
import glob
from joblib import Parallel, delayed
import pandas as pd
import numpy as np
import scipy as sc
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
import joblib
from statsmodels.tsa.arima.model import ARIMA as ARIMA
from arch import arch_model
# Notebook-wide pandas settings: show up to 100 rows / 200 columns when a
# frame is displayed, and silence the chained-assignment check.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 200
pd.set_option('mode.chained_assignment', None)

In [16]:
# Base directory for the input files (Kaggle-style relative path) — presumably
# where train.csv lives; verify against the cell that loads the data.
data_dir = '../input/optiver-trading-at-the-close/'

def gts_cv(date_ids, holdout_size):
    """Yield ``(train_index, val_index)`` pairs for a grouped time-series CV.

    The sorted unique ``date_id`` values are cut into
    ``len(unique) // holdout_size`` consecutive chunks with ``np.array_split``
    (so every chunk holds at least ``holdout_size`` dates when enough dates
    exist) and the chunks are visited from the most recent to the oldest.
    Inside a chunk, the rows of the last date form the validation fold and the
    rows of every other date in the chunk form the training fold.

    Parameters
    ----------
    date_ids : pandas.DataFrame
        Frame containing a ``date_id`` column; its index labels are what the
        generator yields.
    holdout_size : int
        Target number of dates per chunk (training dates plus the single
        validation date). Must be >= 1. When fewer than ``holdout_size``
        distinct dates exist, all dates are used as one chunk.

    Yields
    ------
    train_index, val_index : pandas.Index
        Index labels of the training and validation rows, in the same order
        as they appear in ``date_ids`` (the order matters for sequential
        models such as ARIMA).

    Raises
    ------
    ValueError
        If ``holdout_size`` is smaller than 1.
    """
    if holdout_size < 1:
        raise ValueError("holdout_size must be a positive integer")

    X = date_ids[['date_id']]
    unique_date_ids = np.unique(X['date_id'].values)
    if len(unique_date_ids) == 0:
        # Nothing to split: behave as an empty generator.
        return

    # array_split rejects 0 sections, which is what the integer division gives
    # when there are fewer dates than holdout_size.
    n_chunks = max(1, len(unique_date_ids) // holdout_size)
    chunks = np.array_split(unique_date_ids, n_chunks)[::-1]

    for chunk in chunks:
        in_chunk = X['date_id'].isin(chunk).to_numpy()
        is_val = (X['date_id'] == chunk[-1]).to_numpy()

        # Boolean masks (rather than a set difference) keep the original row
        # order of the training labels.
        val_index = X.index[is_val]
        train_index = X.index[in_chunk & ~is_val]

        yield train_index, val_index


def mae(y_true, y_pred):
    """Return the mean absolute error between ``y_true`` and ``y_pred``.

    When the two inputs have different shapes that differ only by length-1
    axes (for example a ``(n,)`` forecast and a ``(n, 1)`` single-column
    frame), both are flattened first. Without that, the subtraction would
    broadcast to an ``(n, n)`` matrix and the result would be the mean over
    all pairs instead of over matching elements.

    Inputs of identical shape, and genuine broadcasts such as an array
    against a scalar, are subtracted as given.

    Parameters
    ----------
    y_true, y_pred : array-like or scalar
        Observed and predicted values.

    Returns
    -------
    float
        Mean of ``|y_true - y_pred|``.
    """
    true_shape, pred_shape = np.shape(y_true), np.shape(y_pred)
    if true_shape != pred_shape:
        core_true = tuple(dim for dim in true_shape if dim != 1)
        core_pred = tuple(dim for dim in pred_shape if dim != 1)
        if core_true == core_pred:
            # Same data layout apart from singleton axes: compare element-wise.
            y_true, y_pred = np.ravel(y_true), np.ravel(y_pred)
    return np.abs(y_true - y_pred).mean()


def feval_mae(y_pred, lgb_train):
    """Custom evaluation hook returning ``('MAE', value, False)``.

    ``lgb_train`` only needs a ``get_label()`` method — presumably a LightGBM
    ``Dataset``; the trailing ``False`` presumably marks the metric as
    lower-is-better (confirm against the training call, which is not shown).
    """
    labels = lgb_train.get_label()
    score = mae(labels, y_pred)
    return 'MAE', score, False



def train_and_evaluate_ARIMA(train, stock_id=0, holdout_size=7, order=(2, 1, 6)):
    """Cross-validate an ARIMA model on the ``target`` series of one stock.

    For every fold produced by ``gts_cv`` an ARIMA model is fitted on the
    training window and used to forecast as many steps as there are
    validation rows; the fold score is the MAE between forecast and target.
    The per-fold date ranges and the mean validation MAE are printed.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with ``stock_id``, ``date_id`` and ``target`` columns.
        Assumes the rows are already in chronological order — TODO confirm.
    stock_id : int, default 0
        Stock whose series is modelled.
    holdout_size : int, default 7
        Number of dates per CV chunk, forwarded to ``gts_cv``.
    order : tuple, default (2, 1, 6)
        ARIMA ``(p, d, q)`` order.

    Returns
    -------
    ARIMA or None
        The (unfitted) model object built for the last evaluated fold, or
        ``None`` when no fold could be evaluated.
    """
    train = train[train['stock_id'] == stock_id]
    date_ids_train = train[['date_id', 'target']].reset_index(drop=True)

    cvs_dict = {'Fold': [], 'MAE_val': []}
    model = None  # stays None when the splitter yields no usable fold
    fold = 1
    for train_index, val_index in gts_cv(date_ids_train, holdout_size=holdout_size):
        # After reset_index the labels equal the row positions. Sorting them
        # restores the row order in case the splitter returned the labels
        # unordered — ARIMA needs the series in sequence.
        train_pos = np.sort(np.asarray(train_index))
        val_pos = np.sort(np.asarray(val_index))
        if len(train_pos) == 0 or len(val_pos) == 0:
            # A chunk with a single date has nothing to train on; skip it.
            continue

        target_train_cv = date_ids_train.iloc[train_pos]
        target_val_cv = date_ids_train.iloc[val_pos]

        # Printed labels: "training-set date range" / "validation-set date range".
        print("训练集日期范围: {} -- {}".format(target_train_cv.date_id.min(), target_train_cv.date_id.max()))
        print("验证集日期范围: {} -- {}".format(target_val_cv.date_id.min(), target_val_cv.date_id.max()))

        y_train = target_train_cv['target'].to_numpy()
        y_val = target_val_cv['target'].to_numpy()

        model = ARIMA(y_train, order=order)
        fitted = model.fit()
        # Keep forecast and target both 1-D: comparing a (n,) forecast with a
        # (n, 1) column would broadcast to (n, n) and distort the MAE.
        fc = np.asarray(fitted.forecast(len(y_val))).ravel()
        score = mae(fc, y_val)

        cvs_dict['Fold'].append(fold)
        cvs_dict['MAE_val'].append(score)
        fold += 1

    print('CV Val MAE:', np.asarray(cvs_dict['MAE_val']).mean())

    return model

def train_and_evaluate_GARCH(train, stock_id=0, holdout_size=30, p=2, q=6):
    """Cross-validate a zero-mean GARCH model on the ``target`` of one stock.

    For every fold produced by ``gts_cv`` a GARCH(p, q) model is fitted on the
    training window and its mean forecast over the validation horizon is
    scored with MAE against the validation target. Per-fold date ranges,
    per-fold scores and the mean validation MAE are printed.

    NOTE(review): with ``mean='Zero'`` the mean forecast is presumably all
    zeros, so this score would just be the mean absolute target — confirm
    whether the variance forecast was the intended quantity.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with ``stock_id``, ``date_id`` and ``target`` columns.
        Assumes the rows are already in chronological order — TODO confirm.
    stock_id : int, default 0
        Stock whose series is modelled.
    holdout_size : int, default 30
        Number of dates per CV chunk, forwarded to ``gts_cv``.
    p, q : int, defaults 2 and 6
        GARCH lag orders passed to ``arch_model``.

    Returns
    -------
    model object or None
        The model built by ``arch_model`` for the last evaluated fold, or
        ``None`` when no fold could be evaluated.
    """
    train = train[train['stock_id'] == stock_id]
    date_ids_train = train[['date_id', 'target']].reset_index(drop=True)

    cvs_dict = {'Fold': [], 'MAE_val': []}
    model = None  # stays None when the splitter yields no usable fold
    fold = 1
    for train_index, val_index in gts_cv(date_ids_train, holdout_size=holdout_size):
        # After reset_index the labels equal the row positions. Sorting them
        # restores the row order in case the splitter returned the labels
        # unordered.
        train_pos = np.sort(np.asarray(train_index))
        val_pos = np.sort(np.asarray(val_index))
        if len(train_pos) == 0 or len(val_pos) == 0:
            # A chunk with a single date has nothing to train on; skip it.
            continue

        target_train_cv = date_ids_train.iloc[train_pos]
        target_val_cv = date_ids_train.iloc[val_pos]

        # Printed labels: "training-set date range" / "validation-set date range".
        print("训练集日期范围: {} -- {}".format(target_train_cv.date_id.min(), target_train_cv.date_id.max()))
        print("验证集日期范围: {} -- {}".format(target_val_cv.date_id.min(), target_val_cv.date_id.max()))

        target_train_cv = target_train_cv.drop(['date_id'], axis=1)
        y_val = target_val_cv['target'].to_numpy()

        model = arch_model(target_train_cv, mean='Zero', vol='GARCH', p=p, q=q)
        # disp='off' keeps the optimizer's iteration log out of the cell output.
        model_fit = model.fit(disp='off')
        yhat = model_fit.forecast(horizon=len(y_val))
        # The forecast frame has one column per horizon step; the last row
        # holds the forecasts made from the final training observation. Taking
        # that row gives a 1-D vector aligned with y_val, so the MAE does not
        # broadcast a row against a column.
        mean_fc = np.asarray(yhat.mean)[-1]
        score = mae(mean_fc, y_val)
        print(score)

        cvs_dict['Fold'].append(fold)
        cvs_dict['MAE_val'].append(score)
        fold += 1

    print('CV Val MAE:', np.asarray(cvs_dict['MAE_val']).mean())

    return model




In [17]:
# Compact integer dtypes for the id/flag columns to reduce memory on load.
dtypes = {
    'stock_id' : np.uint8,
    'date_id' : np.uint16,
    'seconds_in_bucket' : np.uint16,
    'imbalance_buy_sell_flag' : np.int8,
    'time_id' : np.uint16
    }

# Prefer the configured data_dir; fall back to the working directory so the
# notebook keeps running where train.csv sits next to it.
train_path = os.path.join(data_dir, 'train.csv')
if not os.path.exists(train_path):
    train_path = 'train.csv'

df = pd.read_csv(train_path, dtype = dtypes)

In [18]:
# Run the ARIMA cross-validation on the loaded frame; the call prints the
# date range of every fold and the cell displays the returned model object.
train_and_evaluate_ARIMA(df)

训练集日期范围: 474 -- 479
验证集日期范围: 480 -- 480
训练集日期范围: 467 -- 472
验证集日期范围: 473 -- 473
训练集日期范围: 460 -- 465
验证集日期范围: 466 -- 466
训练集日期范围: 453 -- 458
验证集日期范围: 459 -- 459
训练集日期范围: 446 -- 451
验证集日期范围: 452 -- 452
训练集日期范围: 439 -- 444
验证集日期范围: 445 -- 445
训练集日期范围: 432 -- 437
验证集日期范围: 438 -- 438
训练集日期范围: 425 -- 430
验证集日期范围: 431 -- 431
训练集日期范围: 418 -- 423
验证集日期范围: 424 -- 424
训练集日期范围: 411 -- 416
验证集日期范围: 417 -- 417
训练集日期范围: 404 -- 409
验证集日期范围: 410 -- 410
训练集日期范围: 397 -- 402
验证集日期范围: 403 -- 403
训练集日期范围: 390 -- 395
验证集日期范围: 396 -- 396
训练集日期范围: 383 -- 388
验证集日期范围: 389 -- 389
训练集日期范围: 376 -- 381
验证集日期范围: 382 -- 382
训练集日期范围: 369 -- 374
验证集日期范围: 375 -- 375
训练集日期范围: 362 -- 367
验证集日期范围: 368 -- 368
训练集日期范围: 355 -- 360
验证集日期范围: 361 -- 361
训练集日期范围: 348 -- 353
验证集日期范围: 354 -- 354
训练集日期范围: 341 -- 346
验证集日期范围: 347 -- 347
训练集日期范围: 334 -- 339
验证集日期范围: 340 -- 340
训练集日期范围: 327 -- 332
验证集日期范围: 333 -- 333
训练集日期范围: 320 -- 325
验证集日期范围: 326 -- 326
训练集日期范围: 313 -- 318
验证集日期范围: 319 -- 319
训练集日期范围: 306 -- 311
验证集日期范围: 312 -- 312


<statsmodels.tsa.arima.model.ARIMA at 0x14372de2510>

In [7]:
# Run the GARCH cross-validation on the loaded frame.
# NOTE(review): the recorded output of this cell is a ValueError from
# execution count 7, which is earlier than the count of the cell defining the
# functions (16) — presumably a stale result from an older version of the
# code; re-run after Restart & Run All to confirm.
train_and_evaluate_GARCH(df)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (16,) + inhomogeneous part.