# Return signals regression

In [1]:
import pandas as pd
import numpy as np
import sys, os

In [2]:
dataset = pd.read_pickle("data/2y_tickers_features.pkl")

In [3]:
dataset.shape[0]

3696

In [4]:
from utils import MultipleTimeSeriesCV, format_time

## Cross Validation

In [158]:
cv = MultipleTimeSeriesCV(n_splits=6, test_period_length=30, lookahead=1, train_period_length=100, date_idx = "Date")

In [159]:
cv.preview_split_dates(dataset)

Training: 2019-11-06-2020-04-07  (100 days) | Test: 2020-04-08-2020-05-22 (30 days)
Training: 2019-09-24-2020-02-18  (100 days) | Test: 2020-02-19-2020-04-07 (30 days)
Training: 2019-08-12-2020-01-07  (100 days) | Test: 2020-01-08-2020-02-18 (30 days)
Training: 2019-06-27-2019-11-20  (100 days) | Test: 2019-11-21-2020-01-07 (30 days)


## Pipeline de preprocesamiento

In [160]:
dataset.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits',
       'Ticker', 'MACD', 'RSI', 'BB_High', 'BB_Mid', 'BB_Low', 'ATR', 'NATR',
       'Currency_Volume', 'Return_1m', 'Return_2m', 'Return_3m', 'Year',
       'Month', 'Weekday', 'Forward_Return_1m', 'Forward_Return_2m',
       'Forward_Return_3m'],
      dtype='object')

In [207]:
continuous_features = ['Open', 'High', 'Low', 'Close', 'Volume',
                       'MACD', 'RSI', 'BB_High', 'BB_Mid', 'BB_Low',
                       'ATR', 'NATR', 'Currency_Volume']
categorical_features = ['Month', 'Weekday', 'Ticker']

In [208]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import make_union

In [209]:
from sklearn.base import BaseEstimator, TransformerMixin
class NoTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)
        return X

In [210]:
preprocessing_pipeline = ColumnTransformer(transformers = [
    ('continuous', NoTransformer(), continuous_features),
    ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

## Entrenamiendo por grilla

In [211]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [212]:
pipeline = Pipeline(steps = [
    ('preprocessing', preprocessing_pipeline),
    ('estimator', LinearRegression())
])

In [213]:
accepted_targets = [column for column in dataset.columns if 'Forward_' in column]

In [214]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMClassifier
from catboost import CatBoostRegressor

In [231]:
params_grid =[
    {'estimator':[LinearRegression()]},
    {
        'estimator': [RandomForestRegressor()],
        'estimator__max_depth': range(5, 16, 5),
        'estimator__min_samples_split': [2, 5, 10, 15, 20]
    },
    {
        'estimator': [LGBMClassifier(random_state = 42, silent = True)],
        'estimator__n_estimators': range(10, 101, 10)
    },
    {
        'estimator': [CatBoostRegressor(loss_function='MSE', verbose = False)],
    }
]

In [236]:
models = []

for target in accepted_targets:
    model = GridSearchCV(pipeline, params_grid, cv = cv, n_jobs = -1,
                         scoring='neg_mean_squared_error', verbose = 1,
                         return_train_score = True)
    model.fit(dataset, dataset[target])
    models.append(model)
    
scores = [model.best_score_ for model in models]
estimators = [model.best_params_['estimator'] for model in models]

results = dict(zip(accepted_targets, list(zip(estimators, scores))))

Fitting 6 folds for each of 27 candidates, totalling 162 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 162 out of 162 | elapsed:   19.6s finished


Fitting 6 folds for each of 27 candidates, totalling 162 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done 162 out of 162 | elapsed:   25.0s finished


Fitting 6 folds for each of 27 candidates, totalling 162 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 162 out of 162 | elapsed:   22.2s finished


In [261]:
results

{'Forward_Return_1m': (RandomForestRegressor(max_depth=15),
  -0.06671163780575254),
 'Forward_Return_2m': (LinearRegression(), -0.06103851703460597),
 'Forward_Return_3m': (LinearRegression(), -0.08940922234148024)}