# Módulo 6c - Treinando os modelos com dados textuais

In [196]:
from numpy import loadtxt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score

In [197]:
# Load the training feature matrix from a comma-delimited text file into a
# NumPy array (the file name suggests the features were scaled upstream —
# not verified in this notebook).
TrainX_Scaled2 = loadtxt('TrainX_Scaled2.csv', delimiter=',')

# Print the array to eyeball the loaded values.
print(TrainX_Scaled2)

[[0.36133199 0.7375     0.27275164 ... 0.         0.         0.89754763]
 [0.42373204 0.625      0.51479182 ... 0.         1.         0.49402108]
 [0.41765741 0.4125     0.1716134  ... 0.         0.         0.25248277]
 ...
 [0.31259221 0.25       0.36556714 ... 0.         0.         0.45490474]
 [0.4065331  0.4375     0.27939567 ... 0.         0.         0.99908796]
 [0.45120385 0.4125     0.20304323 ... 0.         0.         0.98758614]]


In [198]:
TrainX_Scaled2.shape

(481, 13)

In [199]:
# Read the training targets, using the "date" column directly as the index
# so no separate column copy / in-place drop is needed afterwards.
TrainY2 = pd.read_csv("TrainY2.csv", index_col="date")
# Convert the index from text to timestamps (raises if a value cannot be parsed).
TrainY2.index = pd.to_datetime(TrainY2.index)

In [200]:
TrainY2.shape

(481, 2)

In [201]:
TrainY2.head()

Unnamed: 0_level_0,target_value,target_class
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-02-01 10:00:00,0.011487,1.0
2022-02-01 12:00:00,-0.006911,0.0
2022-02-01 13:00:00,-0.003026,0.0
2022-02-01 14:00:00,0.00091,1.0
2022-02-02 10:00:00,-0.010046,0.0


In [202]:
# Load the test feature matrix from a comma-delimited text file into a
# NumPy array (the file name suggests the features were scaled upstream —
# not verified in this notebook), then print it for a quick visual check.
TestX_Scaled2 = loadtxt('TestX_Scaled2.csv', delimiter=',')
print(TestX_Scaled2)

[[0.40264879 0.08823529 0.41101697 ... 0.         1.         0.38872434]
 [0.38732201 0.41176471 0.40973411 ... 0.         0.         0.84988309]
 [0.40210006 0.61764706 0.53313108 ... 0.         1.         0.95676799]
 ...
 [0.26851365 0.76470588 0.2689709  ... 0.         0.         0.        ]
 [0.38622276 0.76470588 0.34045117 ... 0.         0.         0.26214601]
 [0.39772575 0.67647059 0.30421479 ... 0.         0.         0.62312289]]


In [203]:
TestX_Scaled2.shape

(86, 13)

In [204]:
# Read the test targets, using the "date" column directly as the index
# so no separate column copy / in-place drop is needed afterwards.
TestY2 = pd.read_csv("TestY2.csv", index_col="date")
# Convert the index from text to timestamps (raises if a value cannot be parsed).
TestY2.index = pd.to_datetime(TestY2.index)

In [205]:
TestY2.head()

Unnamed: 0_level_0,target_value,target_class
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-05-20 14:00:00,-0.001443,0.0
2022-05-20 15:00:00,0.004626,1.0
2022-05-20 16:00:00,0.019281,1.0
2022-05-23 10:00:00,0.011293,1.0
2022-05-23 11:00:00,0.001396,1.0


In [206]:
TestY2.shape

(86, 2)

### Random Forest

In [207]:
def backtest(features, targets, model, start=300, step=50):
    """
    Train and evaluate ``model`` with an expanding-window backtest.

    This is a validation scheme for time series, where shuffled
    cross-validation must not be used. For every split point
    ``i = start, start + step, ...`` the model is fitted on the first ``i``
    rows and used to predict the next ``step`` rows (fewer on the last fold).

    Parameters
    ----------
    features : array
        Training feature matrix, sliced by row position
        (``features[0:i]``); must expose ``.shape`` and ``.copy()``.

    targets : pandas Series
        Training targets, sliced by position with ``.iloc`` and aligned
        row-for-row with ``features``.

    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``. It is
        fitted in place on every fold, so after this function returns it
        holds the fit of the LAST fold (not of the full data set).

    start : int
        Number of rows used for the first training window.

    step : int
        Number of rows added to the training window at each iteration,
        which is also the size of each validation window.

    Returns
    -------
    pandas DataFrame
        Two columns, ``"Target"`` (true values) and ``"Predictions"``
        (0.0 / 1.0), indexed like the validation rows of ``targets``.

    Raises
    ------
    ValueError
        If no fold can be built (``start`` is not smaller than the number
        of rows, or ``step`` is not positive).
    """

    fold_frames = []

    # Walk through the training data in increments of ``step``.
    for split in range(start, features.shape[0], step):
        # Expanding training window and the validation window right after it.
        xtrain = features[0:split].copy()
        ytrain = targets.iloc[0:split].copy()

        xval = features[split:(split + step)].copy()
        yval = targets.iloc[split:(split + step)].copy()

        # Refit on everything seen so far.
        model.fit(xtrain, ytrain)

        # Second predict_proba column, thresholded at 0.5:
        # strictly above -> 1.0, otherwise -> 0.0.
        proba = pd.Series(model.predict_proba(xval)[:, 1], index=yval.index)
        preds = (proba > 0.5).astype(float)

        # Put true values and predictions side by side for this fold.
        fold_frames.append(
            pd.concat({"Target": yval, "Predictions": preds}, axis=1)
        )

    if not fold_frames:
        # Without this guard pd.concat([]) fails with an opaque message.
        raise ValueError(
            f"backtest built no folds: start={start} must be smaller than the "
            f"number of samples ({features.shape[0]}) and step={step} must be positive"
        )

    return pd.concat(fold_frames)

In [208]:
def tunning(features, targets, n_estimators, min_samples_leaf, max_features):
    """
    Grid-search Random Forest hyperparameters using ``backtest``.

    Every combination of the three value lists is used to build a
    ``RandomForestClassifier(random_state=42)``, which is evaluated with
    ``backtest`` (default ``start``/``step``) and scored with
    ``precision_score``.

    Parameters
    ----------
    features : array
        Training feature matrix, forwarded to ``backtest``.

    targets : pandas Series
        Training targets, forwarded to ``backtest``.

    n_estimators : list
        Values of ``n_estimators`` to try.

    min_samples_leaf : list
        Values of ``min_samples_leaf`` to try.

    max_features : list
        Values of ``max_features`` to try.

    Returns
    -------
    pandas DataFrame
        One row per combination with columns ``n_estimators``,
        ``min_samples_leaf``, ``max_features`` and ``precision``. Each
        column keeps its own dtype (integer hyperparameters stay integers).
    """

    columns = ["n_estimators", "min_samples_leaf", "max_features", "precision"]
    records = []

    for n_trees in n_estimators:
        for leaf_size in min_samples_leaf:
            for feature_share in max_features:
                RF_model = RandomForestClassifier(
                    n_estimators=n_trees,
                    min_samples_leaf=leaf_size,
                    max_features=feature_share,
                    random_state=42,
                )
                predictions = backtest(features=features, targets=targets, model=RF_model)
                score = precision_score(predictions["Target"], predictions["Predictions"])
                records.append((n_trees, leaf_size, feature_share, score))

    # Build the frame row-wise so columns are not coerced to one shared dtype
    # (the previous from_dict(...).transpose() turned integer settings into floats).
    return pd.DataFrame(records, columns=columns)

In [209]:
RF_model = RandomForestClassifier(random_state=42)

In [210]:
predictions = backtest(features = TrainX_Scaled2, targets = TrainY2.iloc[:,1], model = RF_model)

In [211]:
precision_score(predictions["Target"], predictions["Predictions"])

0.5

In [17]:
predictions["Predictions"].value_counts()

0.0    95
1.0    86
Name: Predictions, dtype: int64

In [18]:
predictions.shape

(181, 2)

In [19]:
accuracy_score(predictions["Target"], predictions["Predictions"])

0.4861878453038674

In [20]:
from random import Random
from random import random
from random import randint

# Dedicated, seeded generator so the sampled grid is identical on every run.
SEARCH_SEED = 42
_grid_rng = Random(SEARCH_SEED)

max_features = [0.3,0.6,0.9]
# Lower bound is 1, not 0: a forest needs at least one tree.
n_estimators = [_grid_rng.randint(1, 1000) for _ in range(15)]
min_samples_leaf = [2,4,7,6,11,18,9,13,15,5]

In [21]:
n_estimators

[38, 110, 711, 974, 101, 458, 332, 585, 699, 403, 344, 251, 771, 287, 848]

In [22]:
# Exhaustive Random Forest search: tunning() backtests one model for every
# (n_estimators, min_samples_leaf, max_features) combination, i.e.
# len(n_estimators) * len(min_samples_leaf) * len(max_features) models.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# hyper_parms was not assigned in that session — consider a smaller grid.
hyper_parms = tunning(features = TrainX_Scaled2, 
                      targets = TrainY2.iloc[:,1],
                      n_estimators = n_estimators, 
                      min_samples_leaf = min_samples_leaf, 
                      max_features = max_features
                     )

KeyboardInterrupt: 

In [None]:
hyper_parms.loc[hyper_parms["precision"]>0.5]

n_estimators: 13, min_samples_leaf: 7, max_features: 0,9 => 0.563218

In [212]:
RF_model = RandomForestClassifier(n_estimators=100, min_samples_leaf=10, max_features = 0.3, random_state=42)

In [213]:
predictions = backtest(features = TrainX_Scaled2, targets = TrainY2.iloc[:,1], model = RF_model)

In [214]:
precision_score(predictions["Target"], predictions["Predictions"])

0.5045871559633027

In [215]:
predictions["Predictions"].value_counts()

1.0    109
0.0     72
Name: Predictions, dtype: int64

In [216]:
# NOTE(review): RF_model is not refit here; it carries whatever state the last
# backtest() call left behind. With the defaults start=300 / step=50 and 481
# training rows, the final fit used rows 0-449 only, so the last 31 training
# rows never inform these test predictions — confirm this is intended.
# Keeps the second predict_proba column (class 1.0, assuming classes_ is [0.0, 1.0]).
test_pred = RF_model.predict_proba(TestX_Scaled2)[:,1]

In [217]:
# Wrap the raw probabilities in a Series indexed like the test targets, then
# binarise: strictly above 0.5 -> 1, at or below 0.5 -> 0.
preds = (
    pd.Series(test_pred, index = TestY2.index)
    .mask(lambda s: s > .5, 1)
    .mask(lambda s: s <= .5, 0)
)

In [218]:
combined = pd.concat({"Target": TestY2["target_class"], "TargetValue":TestY2["target_value"], "Predictions": preds}, axis=1)

In [219]:
precision_score(combined["Target"], combined["Predictions"])

0.5098039215686274

In [220]:
combined.shape

(86, 3)

In [221]:
pred_1 = combined.loc[combined["Predictions"]==1]

## Calculando lucro/prejuízo

In [224]:
# Compounded gain/loss over every hour where the model predicted class 1:
# the product of (1 + return). skipna=False keeps a missing return visible
# as NaN instead of silently dropping it, as the explicit loop did.
gp = (1 + pred_1["TargetValue"]).prod(skipna=False)
print(gp)

1.2402496010334867


In [229]:
extremes = combined.loc[abs(combined["TargetValue"])>0.025]

In [230]:
extremes

Unnamed: 0_level_0,Target,TargetValue,Predictions
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-05-23 16:00:00,0.0,-0.130795,0.0
2022-05-24 15:00:00,1.0,0.160256,1.0
2022-05-24 16:00:00,0.0,-0.112155,0.0
2022-05-26 16:00:00,0.0,-0.029256,0.0
2022-05-27 16:00:00,0.0,-0.02641,0.0
2022-06-06 16:00:00,1.0,0.027107,1.0


### Adaboost

In [37]:
AB_model = AdaBoostClassifier(random_state=42)

In [39]:
AB_predictions = backtest(features = TrainX_Scaled2, targets = TrainY2.iloc[:,1], model = AB_model)

In [40]:
AB_predictions["Predictions"].value_counts()

1.0    106
0.0     75
Name: Predictions, dtype: int64

In [41]:
precision_score(AB_predictions["Target"], AB_predictions["Predictions"])

0.4811320754716981

In [42]:
def AB_tunning(features, targets, n_estimators, learning_rate):
    """
    Grid-search AdaBoost hyperparameters using ``backtest``.

    Every combination of the two value lists is used to build an
    ``AdaBoostClassifier(random_state=42)``, which is evaluated with
    ``backtest`` (default ``start``/``step``) and scored with
    ``precision_score``.

    Parameters
    ----------
    features : array
        Training feature matrix, forwarded to ``backtest``.

    targets : pandas Series
        Training targets, forwarded to ``backtest``.

    n_estimators : list
        Values of ``n_estimators`` to try.

    learning_rate : list
        Values of ``learning_rate`` to try.

    Returns
    -------
    pandas DataFrame
        One row per combination with columns ``n_estimators``,
        ``learning_rate`` and ``precision``. Each column keeps its own
        dtype (integer hyperparameters stay integers).
    """

    columns = ["n_estimators", "learning_rate", "precision"]
    records = []

    for n_rounds in n_estimators:
        for rate in learning_rate:
            AB_model = AdaBoostClassifier(n_estimators=n_rounds, learning_rate=rate, random_state=42)
            predictions = backtest(features=features, targets=targets, model=AB_model)
            score = precision_score(predictions["Target"], predictions["Predictions"])
            records.append((n_rounds, rate, score))

    # Build the frame row-wise so columns are not coerced to one shared dtype
    # (the previous from_dict(...).transpose() turned integer settings into floats).
    return pd.DataFrame(records, columns=columns)

In [43]:
n_estimators = [50,100,200,800,1600]
learning_rate = [1,2,3]

In [44]:
AB_hyper_parms = AB_tunning(features = TrainX_Scaled2, 
                      targets = TrainY2.iloc[:,1],
                      n_estimators = n_estimators,
                      learning_rate = learning_rate)

In [45]:
AB_hyper_parms

Unnamed: 0,n_estimators,learning_rate,precision
0,50.0,1.0,0.481132
1,50.0,2.0,0.555556
2,50.0,3.0,0.527473
3,100.0,1.0,0.486726
4,100.0,2.0,0.55
5,100.0,3.0,0.527473
6,200.0,1.0,0.504762
7,200.0,2.0,0.561644
8,200.0,3.0,0.505747
9,800.0,1.0,0.5


In [231]:
AB_model = AdaBoostClassifier(n_estimators = 800, learning_rate = 1, random_state=42)

In [232]:
AB_predictions = backtest(features = TrainX_Scaled2, targets = TrainY2.iloc[:,1], model = AB_model)

In [233]:
precision_score(AB_predictions["Target"], AB_predictions["Predictions"])

0.5

In [234]:
AB_test_pred = AB_model.predict_proba(TestX_Scaled2)[:,1]

In [235]:
AB_preds = pd.Series(AB_test_pred, index = TestY2.index)

In [236]:
AB_combined = pd.concat({"Target": TestY2["target_class"], "TargetValue":TestY2["target_value"], "Predictions": AB_preds}, axis=1)

In [237]:
AB_combined

Unnamed: 0_level_0,Target,TargetValue,Predictions
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-05-20 14:00:00,0.0,-0.001443,0.502503
2022-05-20 15:00:00,1.0,0.004626,0.501018
2022-05-20 16:00:00,1.0,0.019281,0.501421
2022-05-23 10:00:00,1.0,0.011293,0.520503
2022-05-23 11:00:00,1.0,0.001396,0.499100
...,...,...,...
2022-06-07 11:00:00,0.0,-0.002259,0.498346
2022-06-07 12:00:00,1.0,0.000970,0.500610
2022-06-07 13:00:00,0.0,-0.005816,0.502832
2022-06-07 14:00:00,1.0,0.002275,0.499418


In [238]:
# Binarise the AdaBoost probabilities: strictly above 0.5 -> 1, at or below 0.5 -> 0.
# Assign with a single .loc[row_mask, column] on the frame itself; the previous
# AB_combined["Predictions"].loc[...] = ... was a chained assignment, which
# raises SettingWithCopyWarning and does not update the frame under pandas
# Copy-on-Write.
AB_combined.loc[AB_combined["Predictions"] > 0.5, "Predictions"] = 1
AB_combined.loc[AB_combined["Predictions"] <= 0.5, "Predictions"] = 0

In [239]:
AB_combined

Unnamed: 0_level_0,Target,TargetValue,Predictions
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-05-20 14:00:00,0.0,-0.001443,1.0
2022-05-20 15:00:00,1.0,0.004626,1.0
2022-05-20 16:00:00,1.0,0.019281,1.0
2022-05-23 10:00:00,1.0,0.011293,1.0
2022-05-23 11:00:00,1.0,0.001396,0.0
...,...,...,...
2022-06-07 11:00:00,0.0,-0.002259,0.0
2022-06-07 12:00:00,1.0,0.000970,1.0
2022-06-07 13:00:00,0.0,-0.005816,1.0
2022-06-07 14:00:00,1.0,0.002275,0.0


In [240]:
precision_score(AB_combined["Target"], AB_combined["Predictions"])

0.5576923076923077

In [241]:
AB_pred_1 = AB_combined.loc[AB_combined["Predictions"]==1]

In [242]:
AB_pred_1

Unnamed: 0_level_0,Target,TargetValue,Predictions
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-05-20 14:00:00,0.0,-0.001443,1.0
2022-05-20 15:00:00,1.0,0.004626,1.0
2022-05-20 16:00:00,1.0,0.019281,1.0
2022-05-23 10:00:00,1.0,0.011293,1.0
2022-05-24 10:00:00,0.0,-0.018095,1.0
2022-05-24 11:00:00,1.0,0.007436,1.0
2022-05-24 13:00:00,1.0,0.000322,1.0
2022-05-24 14:00:00,1.0,0.005155,1.0
2022-05-24 15:00:00,1.0,0.160256,1.0
2022-05-25 10:00:00,0.0,-0.003423,1.0


## Calculando o lucro/prejuízo

$$ \text{ganho/perda} = (1+\text{ganho/perda}_1)(1+\text{ganho/perda}_2)\cdots(1+\text{ganho/perda}_n) $$

In [243]:
# Compounded gain/loss over every hour where AdaBoost predicted class 1:
# the product of (1 + return). skipna=False keeps a missing return visible
# as NaN instead of silently dropping it, as the explicit loop did.
gp = (1 + AB_pred_1["TargetValue"]).prod(skipna=False)
print(gp)

1.1942394414823536


In [244]:
AB_pred_1["TargetValue"].sum()

0.19158256911297963

In [245]:
AB_extremes = AB_combined.loc[abs(AB_combined["TargetValue"])>0.025]

In [246]:
AB_extremes

Unnamed: 0_level_0,Target,TargetValue,Predictions
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-05-23 16:00:00,0.0,-0.130795,0.0
2022-05-24 15:00:00,1.0,0.160256,1.0
2022-05-24 16:00:00,0.0,-0.112155,0.0
2022-05-26 16:00:00,0.0,-0.029256,1.0
2022-05-27 16:00:00,0.0,-0.02641,1.0
2022-06-06 16:00:00,1.0,0.027107,1.0
