<a href="https://colab.research.google.com/github/anirbanghoshsbi/.github.io/blob/master/project/TimeSeriesSplit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
!pip install yfinance



In [34]:
import yfinance as yf

In [35]:
import pandas as pd
import datetime
from datetime import datetime as dt
from dateutil.relativedelta import *

class TimeBasedCV(object):
    '''
    Parameters 
    ----------
    train_period: int
        number of time units to include in each train set
        default is 30
    test_period: int
        number of time units to include in each test set
        default is 7
    freq: string
        frequency of input parameters. possible values are: days, months, years, weeks, hours, minutes, seconds
        possible values designed to be used by dateutil.relativedelta class
        deafault is days
    '''
    
    
    def __init__(self, train_period=30, test_period=7, freq='days'):
        self.train_period = train_period
        self.test_period = test_period
        self.freq = freq

        
        
    def split(self, data, validation_split_date=None, date_column='Date', gap=0):
        '''
        Generate indices to split data into training and test set
        
        Parameters 
        ----------
        data: pandas DataFrame
            your data, contain one column for the record date 
        validation_split_date: datetime.date()
            first date to perform the splitting on.
            if not provided will set to be the minimum date in the data after the first training set
        date_column: string, deafult='record_date'
            date of each record
        gap: int, default=0
            for cases the test set does not come right after the train set,
            *gap* days are left between train and test sets
        
        Returns 
        -------
        train_index ,test_index: 
            list of tuples (train index, test index) similar to sklearn model selection
        '''
        
        # check that date_column exist in the data:
        try:
            data[date_column]
        except:
            raise KeyError(date_column)
                    
        train_indices_list = []
        test_indices_list = []

        if validation_split_date==None:
            validation_split_date = data[date_column].min().date() + eval('relativedelta('+self.freq+'=self.train_period)')
        
        start_train = validation_split_date - eval('relativedelta('+self.freq+'=self.train_period)')
        end_train = start_train + eval('relativedelta('+self.freq+'=self.train_period)')
        start_test = end_train + eval('relativedelta('+self.freq+'=gap)')
        end_test = start_test + eval('relativedelta('+self.freq+'=self.test_period)')

        while end_test < data[date_column].max().date():
            # train indices:
            cur_train_indices = list(data[(data[date_column].dt.date>=start_train) & 
                                     (data[date_column].dt.date<end_train)].index)

            # test indices:
            cur_test_indices = list(data[(data[date_column].dt.date>=start_test) &
                                    (data[date_column].dt.date<end_test)].index)
            
            print("Train period:",start_train,"-" , end_train, ", Test period", start_test, "-", end_test,
                  "# train records", len(cur_train_indices), ", # test records", len(cur_test_indices))

            train_indices_list.append(cur_train_indices)
            test_indices_list.append(cur_test_indices)

            # update dates:
            start_train = start_train + eval('relativedelta('+self.freq+'=self.test_period)')
            end_train = start_train + eval('relativedelta('+self.freq+'=self.train_period)')
            start_test = end_train + eval('relativedelta('+self.freq+'=gap)')
            end_test = start_test + eval('relativedelta('+self.freq+'=self.test_period)')

        # mimic sklearn output  
        index_output = [(train,test) for train,test in zip(train_indices_list,test_indices_list)]

        self.n_splits = len(index_output)
        
        return index_output
    
    
    def get_n_splits(self):
        """Returns the number of splitting iterations in the cross-validator
        Returns
        -------
        n_splits : int
            Returns the number of splitting iterations in the cross-validator.
        """
        return self.n_splits 

In [36]:
data_for_modeling=yf.download('^nsei',start='2015-01-01',parse_dates=['Date']).reset_index()

[*********************100%***********************]  1 of 1 completed


In [37]:
data_for_modeling.head(1)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2015-01-02,8288.700195,8410.599609,8288.700195,8395.450195,8395.450195,101900


In [38]:
data_for_modeling['Signal']=data_for_modeling['Close'].transform(lambda x : np.sign(x.diff(1)))


In [41]:
# How to use TimeBasedCV
#data_for_modeling=pd.read_csv('data.csv', parse_dates=['record_date'])
tscv = TimeBasedCV(train_period=30,
                   test_period=7,
                   freq='days')
for train_index, test_index in tscv.split(data_for_modeling,
                   validation_split_date=datetime.date(2019,2,1), date_column='Date'):
    print(train_index, test_index)

# get number of splits
tscv.get_n_splits()

#### Example- compute average test sets score: ####
X = data_for_modeling[['Date','Open',"High",'Low','Volume']]
y = data_for_modeling['Signal']
from sklearn.linear_model import LogisticRegression
import numpy as np

scores = []
for train_index, test_index in tscv.split(X, validation_split_date=datetime.date(2019,2,1)):

    data_train   = X.loc[train_index].drop('Date', axis=1)
    target_train = y.loc[train_index]

    data_test    = X.loc[test_index].drop('Date', axis=1)
    target_test  = y.loc[test_index]

    # if needed, do preprocessing here

    clf = LogisticRegression()
    clf.fit(data_train,target_train)

    preds = clf.predict(data_test)

    # accuracy for the current fold only    
    r2score = clf.score(data_test,target_test)

    scores.append(r2score)

# this is the average accuracy over all folds
average_r2score = np.mean(scores)
#### End of example ####
print(average_r2score)

Train period: 2019-01-02 - 2019-02-01 , Test period 2019-02-01 - 2019-02-08 # train records 22 , # test records 5
Train period: 2019-01-09 - 2019-02-08 , Test period 2019-02-08 - 2019-02-15 # train records 22 , # test records 4
Train period: 2019-01-16 - 2019-02-15 , Test period 2019-02-15 - 2019-02-22 # train records 21 , # test records 5
Train period: 2019-01-23 - 2019-02-22 , Test period 2019-02-22 - 2019-03-01 # train records 21 , # test records 5
Train period: 2019-01-30 - 2019-03-01 , Test period 2019-03-01 - 2019-03-08 # train records 21 , # test records 4
Train period: 2019-02-06 - 2019-03-08 , Test period 2019-03-08 - 2019-03-15 # train records 20 , # test records 5
Train period: 2019-02-13 - 2019-03-15 , Test period 2019-03-15 - 2019-03-22 # train records 20 , # test records 4
Train period: 2019-02-20 - 2019-03-22 , Test period 2019-03-22 - 2019-03-29 # train records 20 , # test records 5
Train period: 2019-02-27 - 2019-03-29 , Test period 2019-03-29 - 2019-04-05 # train reco

In [40]:

#### Example- RandomizedSearchCV ####
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMRegressor
from random import randint, uniform

tscv = TimeBasedCV(train_period=10, test_period=3)
index_output = tscv.split(data_for_modeling, validation_split_date=datetime.date(2019,2,1))

lgbm = randomforestclassifier()

lgbmPd = {" max_depth": [-1,2]
         }

model = RandomizedSearchCV(
    estimator = lgbm,
    param_distributions = lgbmPd,
    n_iter = 10,
    n_jobs = -1,
    iid = True,
    cv = index_output,
    verbose=5,
    pre_dispatch='2*n_jobs',
    random_state = None,
    return_train_score = True)

model.fit(X.drop('record_date', axis=1),y)
model.cv_results_
#### End of example ####

Train period: 2019-01-22 - 2019-02-01 , Test period 2019-02-01 - 2019-02-04 # train records 8 , # test records 1
Train period: 2019-01-25 - 2019-02-04 , Test period 2019-02-04 - 2019-02-07 # train records 6 , # test records 3
Train period: 2019-01-28 - 2019-02-07 , Test period 2019-02-07 - 2019-02-10 # train records 8 , # test records 2
Train period: 2019-01-31 - 2019-02-10 , Test period 2019-02-10 - 2019-02-13 # train records 7 , # test records 2
Train period: 2019-02-03 - 2019-02-13 , Test period 2019-02-13 - 2019-02-16 # train records 7 , # test records 2
Train period: 2019-02-06 - 2019-02-16 , Test period 2019-02-16 - 2019-02-19 # train records 7 , # test records 1
Train period: 2019-02-09 - 2019-02-19 , Test period 2019-02-19 - 2019-02-22 # train records 5 , # test records 3
Train period: 2019-02-12 - 2019-02-22 , Test period 2019-02-22 - 2019-02-25 # train records 7 , # test records 1
Train period: 2019-02-15 - 2019-02-25 , Test period 2019-02-25 - 2019-02-28 # train records 6 , 

NameError: ignored