# Feature Engineering

Preparo el dataset para ser posteriormente usado en el modelo

Indicadores a utilizar:

* Percentage Price Oscillator (PPO)
* Moving average convergence/divergence (MACD)
* Normalized average true range (NATR)
* Relative Strength Index (RSI)
* Bollinger Bands

## Unimos los tickers

In [4]:
# !pip install ta-lib

In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
#from talib import MACD, PPO, MACD, NATR, RSI, BBANDS, SMA, ATR
from tqdm import tqdm

tqdm.pandas()

  from pandas import Panel


Indexamos por ticker para facilitar los cálculos

In [14]:
# Load the raw price table, order it by its existing index and append
# 'Ticker' as an additional index level.
dataset = (
    pd.read_pickle("data/all_tickers_raw.pkl")
    .sort_index()
    .set_index('Ticker', append=True)
)
dataset.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Adj Close,Volume,Currency
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1962-01-02,GE,0.751202,0.763722,0.74369,0.748698,0.001775989,2156500.0,USD
1962-01-02,MO,0.0,0.19184,0.189236,0.189236,0.0002221978,345600.0,USD
1962-01-02,CVX,0.0,3.296131,3.244048,3.296131,0.05051204,105600.0,USD
1962-01-02,JNJ,0.0,0.22338,0.222222,0.22338,6.196926e-07,0.0,USD
1962-01-02,CAT,1.604167,1.619792,1.588542,1.604167,0.1347026,163200.0,USD


## Calculamos indicadores

### Compute Moving Average Convergence/Divergence (MACD)

In [12]:
def compute_macd(close):
    """Return the MACD line for one ticker's closing prices.

    NOTE(review): `MACD` has to be in scope; the `talib` import at the top
    of the notebook is currently commented out, so this raises NameError
    until that import is restored.

    Parameters
    ----------
    close : closing prices of a single ticker.

    Returns
    -------
    The first element of the tuple returned by `MACD(close)`.
    """
    return MACD(close)[0]


# The index only has the levels (Date, Ticker), so `level = 2` referred to a
# level that does not exist. Group by the 'Ticker' level name alone, as the
# other indicator cells do; `group_keys=False` keeps the result on the
# original (Date, Ticker) index so the column assignment aligns.
dataset["MACD"] = dataset.groupby('Ticker', group_keys=False).Close.apply(compute_macd)

NameError: name 'MACD' is not defined

In [None]:
# Boolean row mask: True where the 'Ticker' index level equals ALUA.BA.
alua_mask = (
    dataset.index.get_level_values('Ticker') == "ALUA.BA"
)

In [None]:
# Distribution of the MACD column for the rows selected by `alua_mask`.
sns.displot(
    data=dataset[alua_mask],
    x="MACD",
)

### Relative Strength Index (RSI)

In [None]:
# RSI computed separately on each ticker's closing prices.
# `group_keys=False` matches the ATR/NATR cells and keeps the result on the
# original (Date, Ticker) index, so the assignment still aligns on pandas
# versions where `apply` would otherwise prepend the group key.
dataset["RSI"] = dataset.groupby('Ticker', group_keys=False).Close.apply(RSI)

In [None]:
# Distribution of the RSI column for the rows selected by `alua_mask`.
sns.displot(
    data=dataset[alua_mask],
    x="RSI",
)

### Bollinger Bands

In [None]:
def compute_bb(close):
    """Build the Bollinger-band columns for one ticker's closing prices.

    `BBANDS` is run on `log1p(close)` with a 20-period window. The three
    arrays it returns are placed in a DataFrame that shares the index of
    `close`, under the columns BB_High, BB_Mid and BB_Low.
    """
    log_close = np.log1p(close)
    upper, middle, lower = BBANDS(log_close, timeperiod=20)
    bands = {'BB_High': upper, 'BB_Mid': middle, 'BB_Low': lower}
    return pd.DataFrame(bands, index=close.index)

In [None]:
# Compute the bands per ticker and attach them as new columns.
# `group_keys=False` (as in the ATR/NATR cells) stops `apply` from prepending
# the group key to the result's index, so the band frame keeps the
# (Date, Ticker) index that `join` aligns on.
bollinger_bands = dataset.groupby('Ticker', group_keys=False).Close.apply(compute_bb)
dataset = dataset.join(bollinger_bands)

### Average True Range

In [None]:
def compute_atr(stock_data):
    """Return the 14-period ATR for one ticker's rows.

    `stock_data` must expose `High`, `Low` and `Close` columns; they are
    handed to `ATR` in that order.
    """
    high, low, close = stock_data.High, stock_data.Low, stock_data.Close
    return ATR(high, low, close, timeperiod=14)

In [None]:
# ATR per ticker; group keys are not added to the result's index.
dataset["ATR"] = (
    dataset
    .groupby("Ticker", group_keys=False)
    .apply(compute_atr)
)

### Normalized average true range

In [None]:
# NATR per ticker, computed from each group's High, Low and Close columns.
dataset["NATR"] = (
    dataset
    .groupby("Ticker", group_keys=False)
    .apply(lambda group: NATR(high=group.High, low=group.Low, close=group.Close))
)

### Volumen en moneda

In [8]:
# Row-wise product of the Close and Volume columns.
dataset["Currency_Volume"] = dataset['Close'] * dataset['Volume']

### Retorno semanal, mensual, bimestral y trimestral

In [9]:
def remove_outliers(x):
    """Clip `x` to its own 1st-99th percentile range.

    Values below the 0.01 quantile are raised to it and values above the
    0.99 quantile are lowered to it; everything in between is unchanged.
    """
    lower_bound = x.quantile(0.01)
    upper_bound = x.quantile(0.99)
    return x.clip(lower=lower_bound, upper=upper_bound)

In [10]:
# Percentage change of Close over N rows within each ticker, one column per
# holding horizon. The frame is sorted and grouped once instead of once per
# column. The 1st-99th percentile clipping is applied to each whole column
# (all tickers pooled), exactly as before.
# NOTE(review): horizons are counted in rows, so "1w" is 7 rows and "1m" is
# 30 rows; confirm this matches the intended calendar periods.
close_by_ticker = dataset.sort_index().groupby("Ticker").Close
for column, periods in [("Return_1w", 7), ("Return_1m", 30), ("Return_2m", 60), ("Return_3m", 90)]:
    dataset[column] = close_by_ticker.pct_change(periods).pipe(remove_outliers)

In [11]:
# Distribution of the Return_1m column for the rows selected by `alua_mask`.
sns.displot(
    data=dataset[alua_mask],
    x="Return_1m",
)

AttributeError: module 'seaborn' has no attribute 'displot'

In [None]:
# Distribution of the Return_2m column for the rows selected by `alua_mask`.
sns.displot(
    data=dataset[alua_mask],
    x="Return_2m",
)

In [None]:
# Distribution of the Return_3m column for the rows selected by `alua_mask`.
sns.displot(
    data=dataset[alua_mask],
    x="Return_3m",
)

In [None]:
# Distribution of the Return_1w column for the rows selected by `alua_mask`.
sns.displot(
    data=dataset[alua_mask],
    x="Return_1w",
)

### Confección de target con features

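Utilizando el % de diferencia a 1 semana y a 1, 2 y 3 meses confeccionamos el target, desplazando cada retorno hacia el pasado tantos períodos como dura el holding, de modo que cada fila contenga el retorno futuro de ese período.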

In [None]:
# Turn each trailing return into a forward-looking target. Within each
# ticker, shifting by -N places on a row the return that was computed N rows
# later, i.e. the return of holding from that row for N rows. The last N rows
# of every ticker become NaN.
# The column names are plain strings: the previous f-strings had no
# placeholders.
dataset["Forward_Return_1w"] = dataset.groupby('Ticker')['Return_1w'].shift(-7)
dataset["Forward_Return_1m"] = dataset.groupby('Ticker')['Return_1m'].shift(-30)
dataset["Forward_Return_2m"] = dataset.groupby('Ticker')['Return_2m'].shift(-60)
dataset["Forward_Return_3m"] = dataset.groupby('Ticker')['Return_3m'].shift(-90)

Creamos feature de fechas

In [None]:
# Calendar features taken from index level 0 (the dates). Reading them from
# the index directly avoids running `DataFrame.apply` over every column only
# to keep the result for 'Close'.
dataset['Year'] = dataset.index.get_level_values(0).year
dataset['Month'] = dataset.index.get_level_values(0).month
dataset['Weekday'] = dataset.index.get_level_values(0).weekday

Eliminamos el índice de ticker para llevarlo de vuelta a su propia columna

In [None]:
# Move index level 1 (the appended 'Ticker' level) back into a regular column.
dataset = dataset.reset_index(level=1)
dataset.sample(5)

Ordenamos por fecha para evitar esta reingeniería durante los procesos de entrenamiento e iteración de modelos

In [None]:
# Sort rows by the index, then confirm the index is non-decreasing.
dataset.sort_index(inplace = True)
# `Index.is_monotonic` was deprecated and later removed from pandas;
# `is_monotonic_increasing` is the equivalent property.
dataset.index.is_monotonic_increasing

### Salvado de los datos

In [None]:
# Display the (rows, columns) size of the feature table.
dataset.shape

In [None]:
# Print the per-column summary, including non-null counts.
# NOTE(review): `null_counts` was deprecated in pandas 1.2 in favour of
# `show_counts` and removed in 2.0; rename it once the installed pandas
# version supports the new keyword.
dataset.info(null_counts = True)

In [None]:
# Display the (rows, columns) size once more before writing the file.
dataset.shape

In [None]:
# Persist the engineered feature table as a pickle file.
dataset.to_pickle(path="data/all_tickers_features.pkl")