<a href="https://colab.research.google.com/github/anirbanghoshsbi/.github.io/blob/master/project/07nov2021/TimeSeriesSplit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [114]:
!pip install yfinance
!pip install pandas-ta==0.2.45b



In [115]:
import yfinance as yf
import pandas_ta as ta
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report,f1_score

In [116]:
import pandas as pd
import datetime
from datetime import datetime as dt
from dateutil.relativedelta import *

class TimeBasedCV(object):
    '''
    Parameters 
    ----------
    train_period: int
        number of time units to include in each train set
        default is 30
    test_period: int
        number of time units to include in each test set
        default is 7
    freq: string
        frequency of input parameters. possible values are: days, months, years, weeks, hours, minutes, seconds
        possible values designed to be used by dateutil.relativedelta class
        deafault is days
    '''
    
    
    def __init__(self, train_period=60, test_period=14, freq='days'):
        self.train_period = train_period
        self.test_period = test_period
        self.freq = freq

        
        
    def split(self, data, validation_split_date=None, date_column='Date', gap=0):
        '''
        Generate indices to split data into training and test set
        
        Parameters 
        ----------
        data: pandas DataFrame
            your data, contain one column for the record date 
        validation_split_date: datetime.date()
            first date to perform the splitting on.
            if not provided will set to be the minimum date in the data after the first training set
        date_column: string, deafult='record_date'
            date of each record
        gap: int, default=0
            for cases the test set does not come right after the train set,
            *gap* days are left between train and test sets
        
        Returns 
        -------
        train_index ,test_index: 
            list of tuples (train index, test index) similar to sklearn model selection
        '''
        
        # check that date_column exist in the data:
        try:
            data[date_column]
        except:
            raise KeyError(date_column)
                    
        train_indices_list = []
        test_indices_list = []

        if validation_split_date==None:
            validation_split_date = data[date_column].min().date() + eval('relativedelta('+self.freq+'=self.train_period)')
        
        start_train = validation_split_date - eval('relativedelta('+self.freq+'=self.train_period)')
        end_train = start_train + eval('relativedelta('+self.freq+'=self.train_period)')
        start_test = end_train + eval('relativedelta('+self.freq+'=gap)')
        end_test = start_test + eval('relativedelta('+self.freq+'=self.test_period)')

        while end_test < data[date_column].max().date():
            # train indices:
            cur_train_indices = list(data[(data[date_column].dt.date>=start_train) & 
                                     (data[date_column].dt.date<end_train)].index)

            # test indices:
            cur_test_indices = list(data[(data[date_column].dt.date>=start_test) &
                                    (data[date_column].dt.date<end_test)].index)
            
            print("Train period:",start_train,"-" , end_train, ", Test period", start_test, "-", end_test,
                  "# train records", len(cur_train_indices), ", # test records", len(cur_test_indices))

            train_indices_list.append(cur_train_indices)
            test_indices_list.append(cur_test_indices)

            # update dates:
            start_train = start_train + eval('relativedelta('+self.freq+'=self.test_period)')
            end_train = start_train + eval('relativedelta('+self.freq+'=self.train_period)')
            start_test = end_train + eval('relativedelta('+self.freq+'=gap)')
            end_test = start_test + eval('relativedelta('+self.freq+'=self.test_period)')

        # mimic sklearn output  
        index_output = [(train,test) for train,test in zip(train_indices_list,test_indices_list)]

        self.n_splits = len(index_output)
        
        return index_output
    
    
    def get_n_splits(self):
        """Returns the number of splitting iterations in the cross-validator
        Returns
        -------
        n_splits : int
            Returns the number of splitting iterations in the cross-validator.
        """
        return self.n_splits 

In [117]:
data_for_modeling=yf.download('^nsei',start='2015-01-01',parse_dates=['Date']).reset_index()

[*********************100%***********************]  1 of 1 completed


In [118]:
data_for_modeling.ta.macd(append=True)
data_for_modeling.ta.vortex(append=True)
print()




In [119]:
data_for_modeling.rename(columns={'date':'Date'},inplace=True)

In [120]:
features=data_for_modeling.columns.tolist()

In [121]:
features

['Date',
 'open',
 'high',
 'low',
 'close',
 'adj_close',
 'volume',
 'MACD_12_26_9',
 'MACDh_12_26_9',
 'MACDs_12_26_9',
 'VTXP_14',
 'VTXM_14']

In [122]:
data_for_modeling['Signal']=data_for_modeling['close'].transform(lambda x : np.sign(x.diff(14)))


In [123]:
data_for_modeling=data_for_modeling.dropna()

In [124]:

# How to use TimeBasedCV
#data_for_modeling=pd.read_csv('data.csv', parse_dates=['record_date'])
tscv = TimeBasedCV(train_period=60,
                   test_period=14,
                   freq='days')
for train_index, test_index in tscv.split(data_for_modeling,
                   validation_split_date=datetime.date(2019,2,1), date_column='Date'):
    print(train_index, test_index)

# get number of splits
tscv.get_n_splits()
'''
#### Example- compute average test sets score: ####
X = data_for_modeling[features]
y = data_for_modeling['Signal']
from sklearn.linear_model import LogisticRegression
import numpy as np

scores = []
for train_index, test_index in tscv.split(X, validation_split_date=datetime.date(2019,2,1)):

    data_train   = X.loc[train_index].drop('Date', axis=1)
    target_train = y.loc[train_index]

    data_test    = X.loc[test_index].drop('Date', axis=1)
    target_test  = y.loc[test_index]

    # if needed, do preprocessing here

    clf = RandomForestClassifier()
    clf.fit(data_train,target_train)

    preds = clf.predict(data_test)

    # accuracy for the current fold only    
    r2score = clf.score(data_test,target_test)

    scores.append(r2score)

# this is the average accuracy over all folds
average_r2score = np.mean(scores)
#### End of example ####
print(average_r2score)
'''

Train period: 2018-12-03 - 2019-02-01 , Test period 2019-02-01 - 2019-02-15 # train records 42 , # test records 9
Train period: 2018-12-17 - 2019-02-15 , Test period 2019-02-15 - 2019-03-01 # train records 41 , # test records 10
Train period: 2018-12-31 - 2019-03-01 , Test period 2019-03-01 - 2019-03-15 # train records 42 , # test records 9
Train period: 2019-01-14 - 2019-03-15 , Test period 2019-03-15 - 2019-03-29 # train records 42 , # test records 9
Train period: 2019-01-28 - 2019-03-29 , Test period 2019-03-29 - 2019-04-12 # train records 41 , # test records 9
Train period: 2019-02-11 - 2019-04-12 , Test period 2019-04-12 - 2019-04-26 # train records 40 , # test records 8
Train period: 2019-02-25 - 2019-04-26 , Test period 2019-04-26 - 2019-05-10 # train records 39 , # test records 8
Train period: 2019-03-11 - 2019-05-10 , Test period 2019-05-10 - 2019-05-24 # train records 38 , # test records 10
Train period: 2019-03-25 - 2019-05-24 , Test period 2019-05-24 - 2019-06-07 # train re

"\n#### Example- compute average test sets score: ####\nX = data_for_modeling[features]\ny = data_for_modeling['Signal']\nfrom sklearn.linear_model import LogisticRegression\nimport numpy as np\n\nscores = []\nfor train_index, test_index in tscv.split(X, validation_split_date=datetime.date(2019,2,1)):\n\n    data_train   = X.loc[train_index].drop('Date', axis=1)\n    target_train = y.loc[train_index]\n\n    data_test    = X.loc[test_index].drop('Date', axis=1)\n    target_test  = y.loc[test_index]\n\n    # if needed, do preprocessing here\n\n    clf = RandomForestClassifier()\n    clf.fit(data_train,target_train)\n\n    preds = clf.predict(data_test)\n\n    # accuracy for the current fold only    \n    r2score = clf.score(data_test,target_test)\n\n    scores.append(r2score)\n\n# this is the average accuracy over all folds\naverage_r2score = np.mean(scores)\n#### End of example ####\nprint(average_r2score)\n"

In [125]:
n_estimators = [int(x) for x in np.linspace(start = 5 , stop = 15, num = 10)] # returns 10 numbers 

max_features = ['auto', 'log2']

max_depth = [int(x) for x in np.linspace(5, 10, num = 2)] 

max_depth.append(None)

bootstrap = [True, False]

r_grid = {'n_estimators': n_estimators,

               'max_features': max_features,

               'max_depth': max_depth,

               'bootstrap': bootstrap}

print(r_grid)

{'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 15], 'max_features': ['auto', 'log2'], 'max_depth': [5, 10, None], 'bootstrap': [True, False]}


In [None]:

#### Example- RandomizedSearchCV ####
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMRegressor
from random import randint, uniform

tscv = TimeBasedCV(train_period=60, test_period=7)
index_output = tscv.split(data_for_modeling, validation_split_date=datetime.date(2019,2,1))

rf = RandomForestClassifier()



rf_random = RandomizedSearchCV(estimator = rf, param_distributions = r_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
X=X.fillna(0.01)

# Fit the random search model############################~~~~~~~~~~~~~~~~~~################
rf_random.fit(X.drop('Date', axis=1),y)
print(f"Best score: {rf_random.best_score_}")
print("Best parameters set:")
best_parameters = rf_random.best_estimator_.get_params()
for param_name in sorted(r_grid.keys()):
  print(f"\t{param_name}: {best_parameters[param_name]}")






#model.fit(X.drop('date', axis=1),y)
#model.cv_results_
#### End of example ####

In [128]:
for param_name in sorted(r_grid.keys()):
  print(f"\t{param_name}: {best_parameters[param_name]}")

	bootstrap: False
	max_depth: 5
	max_features: auto
	n_estimators: 13


In [129]:
X.rename(columns={'date':'Date'},inplace=True)

In [None]:
scores = []
for train_index, test_index in tscv.split(X, validation_split_date=datetime.date(2019,2,1)):

    data_train   = X.loc[train_index].drop('Date', axis=1)
    target_train = y.loc[train_index]

    data_test    = X.loc[test_index].drop('Date', axis=1)
    target_test  = y.loc[test_index]

    # if needed, do preprocessing here

    clf = RandomForestClassifier(bootstrap=True,max_depth=5,max_features='log2',n_estimators=5)
    clf.fit(data_train,target_train)

    preds = clf.predict(data_test)
    scores.extend(preds)

In [133]:
len(scores)

672