# Return signals regression

In [1]:
import pandas as pd
import numpy as np
import sys, os

In [2]:
dataset = pd.read_pickle("data/all_tickers_features.pkl")

## Pipeline de preprocesamiento

In [3]:
dataset.columns

Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Ticker', 'MACD',
       'RSI', 'BB_High', 'BB_Mid', 'BB_Low', 'ATR', 'NATR', 'Currency_Volume',
       'Return_1m', 'Return_2m', 'Return_3m', 'Year', 'Month', 'Weekday',
       'Forward_Return_1m', 'Forward_Return_2m', 'Forward_Return_3m'],
      dtype='object')

In [4]:
continuous_features = ['Open', 'High', 'Low', 'Close', 'Volume',
                       'MACD', 'RSI', 'BB_High', 'BB_Mid', 'BB_Low',
                       'ATR', 'NATR', 'Currency_Volume', 'Adj Close']
categorical_features = ['Month', 'Weekday', 'Ticker', 'Currency']

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import make_union

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin
class NoTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)
        return X

In [7]:
preprocessing_pipeline = ColumnTransformer(transformers = [
    ('continuous', NoTransformer(), continuous_features),
    ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

In [8]:
features = continuous_features + categorical_features

## Entrenamiendo por grilla

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

In [10]:
pipeline = Pipeline(steps = [
    ('preprocessing', preprocessing_pipeline),
    ('estimator', LinearRegression())
])

In [11]:
accepted_targets = [column for column in dataset.columns if 'Forward_' in column]

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [13]:
params_grid =[
    {'estimator':[LinearRegression()]},
    {
        'estimator': [RandomForestRegressor()],
        'estimator__max_depth': range(5, 16, 5),
        'estimator__min_samples_split': [2, 5, 10, 15, 20]
    },
    {
        'estimator': [LGBMRegressor(random_state = 42, silent = True)],
        'estimator__n_estimators': range(10, 101, 10)
    },
    {
        'estimator': [CatBoostRegressor(verbose = False)],
    },
]

Como queremos estimar varios targets en los cuales hay nulos en distintos rangos de tiempo, haremos cross-validation en cada caso.

In [14]:
from sklearn.model_selection import TimeSeriesSplit, train_test_split

In [15]:
target = 'Forward_Return_1m'

model_data = dataset[features + [target]].dropna()
X = model_data[features]
y = model_data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, shuffle = False, train_size = 0.8)
cv = TimeSeriesSplit(n_splits = 2)
model = GridSearchCV(pipeline, params_grid, cv = cv,
                    scoring='neg_mean_squared_error',
                    return_train_score = True)
model.fit(X_train, y_train)

KeyError: "['Currency'] not in index"

In [35]:
from sklearn.metrics import r2_score, mean_squared_error

mse = mean_squared_error(y_test, model.predict(X_test))
r2 = r2_score(y_test, model.predict(X_test))
rmse = np.sqrt(mse)

mse, rmse

(0.06444161908384105, 0.25385353864746707)

In [36]:
from sklearn.metrics import r2_score, mean_squared_error

metrics = {
    'mse': [],
    'rmse': [],
    'ticker': []
}

for ticker in X_test.Ticker.unique():
    # Compute metrics by ticker
    mask = X_test.Ticker == ticker
    mse = mean_squared_error(y_test[mask], model.predict(X_test[mask]))
    rmse = np.sqrt(mse)
    # Save in dict
    metrics['ticker'].append(ticker)
    metrics['mse'].append(mse)
    metrics['rmse'].append(rmse)

pd.DataFrame(metrics).sort_by("").head(10)


Unnamed: 0,mse,rmse,ticker
0,0.008389,0.091591,TECO2
1,0.01079,0.103876,ALUA
2,0.026015,0.161292,PAMP
3,0.08288,0.28789,BHIP
4,0.091155,0.301918,GGAL
5,0.331363,0.575641,CEPU
6,0.053514,0.231332,EDN
7,0.022187,0.148954,BMA
8,0.051138,0.226137,BBAR
9,0.051927,0.227875,SUPV


In [41]:
import pickle
file = open('models/trained_model_1m.pkl', 'wb')
pickle.dump(model, file)
file.close()

In [44]:
import pickle
file = open('data/test_data.pkl', 'wb')
pickle.dump(X_train, file)
file.close()

## Modelo por representation learning

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn  import KerasRegressor
from tensorflow.keras.callbacks import EarlyStopping

def create_dnn(units_a = 32, units_b = 16):
    n = len(features)
    model = Sequential()
    Dense(units_a, activation = 'relu', input_shape=(n,))
    model.add(Dense(units_b, activation = 'relu'))
    model.add(Dense(1))
    model.compile(loss="mean_squared_error", optimizer="adam")   
    return model