# 8.7. Aplicação Prática: Árvore de Decisão

In [1]:
import numpy as np
import pandas as pd
import scipy.stats

from sklearn.model_selection import train_test_split 
from sklearn import metrics 
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Capturando os dados para a análise

def busca_titulos_tesouro_direto(url=None):
    """Load the Tesouro Direto price/rate history into an indexed DataFrame.

    Parameters
    ----------
    url : str, path or file-like object, optional
        Source handed to ``pandas.read_csv``. When omitted, the public
        Tesouro Transparente CSV is downloaded. Passing a local path or an
        in-memory buffer allows cached or offline runs.

    Returns
    -------
    pandas.DataFrame
        All columns after the third one, indexed by a three-level MultiIndex
        built from the first three columns of the file. 'Data Vencimento'
        and 'Data Base' are parsed as day-first dates before indexing.
    """
    if url is None:
        url = 'https://www.tesourotransparente.gov.br/ckan/dataset/df56aa42-484a-4a59-8184-7676580c81e3/resource/796d2059-14e9-44e3-80c9-2d9e30b405c1/download/PrecoTaxaTesouroDireto.csv'

    # The file uses ';' as field separator and ',' as decimal mark.
    df = pd.read_csv(url, sep=';', decimal=',')

    for coluna in ('Data Vencimento', 'Data Base'):
        df[coluna] = pd.to_datetime(df[coluna], dayfirst=True)

    # set_index moves the first three columns into the index and drops them
    # from the data columns in a single step.
    return df.set_index(list(df.columns[:3]))

In [3]:
# Load every bond, then keep only the Tesouro IPCA+ maturing on 2035-05-15.
titulos = busca_titulos_tesouro_direto()

# sort_index() returns a new, date-ordered frame; the selection taken from
# `titulos` is never mutated in place, so re-running this cell is safe and
# no chained-assignment warning is involved.
ipca = titulos.loc[('Tesouro IPCA+', '2035-05-15')].sort_index()

In [4]:
# Work on a copy so the columns added in the following cells do not touch `ipca`.
ipca_limpo = ipca.copy()

In [5]:
# One-period simple return of the morning base price; the first row has no
# previous observation, so its return is NaN.
ipca_limpo["Retornos"] = ipca_limpo["PU Base Manha"].pct_change(periods=1)

In [6]:
# Quick look at the first rows, including the new "Retornos" column.
ipca_limpo.head()

Unnamed: 0_level_0,Taxa Compra Manha,Taxa Venda Manha,PU Compra Manha,PU Venda Manha,PU Base Manha,Retornos
Data Base,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-03-08,6.23,6.33,415.52,405.82,405.61,
2010-03-09,6.25,6.35,413.78,404.12,403.91,-0.004191
2010-03-10,6.25,6.35,413.99,404.33,404.12,0.00052
2010-03-11,6.25,6.35,414.2,404.54,404.33,0.00052
2010-03-12,6.27,6.37,412.69,403.07,402.82,-0.003735


In [7]:
# Build the model target and the explanatory variables.
#
# Note: an alternative target, the 20-period rolling standard deviation of the
# returns shifted one step ahead, was tried before and is not used here.


def _spread_percentual(venda, compra):
    """Relative gap between the sell and buy series, in percent."""
    return (venda / compra - 1) * 100


# Target: next row's sell/buy rate spread (shift(-1) pulls it one row back).
ipca_limpo["Alvo"] = _spread_percentual(
    ipca_limpo["Taxa Venda Manha"].shift(-1),
    ipca_limpo["Taxa Compra Manha"].shift(-1),
)

# var1: current rate spread; var2: current price spread.
ipca_limpo["var1"] = _spread_percentual(
    ipca_limpo["Taxa Venda Manha"], ipca_limpo["Taxa Compra Manha"]
)
ipca_limpo["var2"] = _spread_percentual(
    ipca_limpo["PU Venda Manha"], ipca_limpo["PU Compra Manha"]
)

# var3: previous row's return; var4: 20-row rolling std of returns, lagged one row.
ipca_limpo["var3"] = ipca_limpo["Retornos"].shift(periods=1)
ipca_limpo["var4"] = ipca_limpo["Retornos"].rolling(window=20).std().shift(periods=1)

ipca_limpo.head(10)

Unnamed: 0_level_0,Taxa Compra Manha,Taxa Venda Manha,PU Compra Manha,PU Venda Manha,PU Base Manha,Retornos,Alvo,var1,var2,var3,var4
Data Base,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2010-03-08,6.23,6.33,415.52,405.82,405.61,,1.6,1.605136,-2.334424,,
2010-03-09,6.25,6.35,413.78,404.12,403.91,-0.004191,1.6,1.6,-2.334574,,
2010-03-10,6.25,6.35,413.99,404.33,404.12,0.00052,1.6,1.6,-2.33339,-0.004191,
2010-03-11,6.25,6.35,414.2,404.54,404.33,0.00052,1.594896,1.6,-2.332207,0.00052,
2010-03-12,6.27,6.37,412.69,403.07,402.82,-0.003735,1.592357,1.594896,-2.331048,0.00052,
2010-03-15,6.28,6.38,411.87,402.27,402.12,-0.001738,1.594896,1.592357,-2.330833,-0.003735,
2010-03-16,6.27,6.37,413.0,403.37,403.22,0.002736,1.597444,1.594896,-2.331719,-0.001738,
2010-03-17,6.26,6.36,414.13,404.47,404.32,0.002728,1.597444,1.597444,-2.332601,0.002736,
2010-03-18,6.26,6.36,414.28,404.62,404.47,0.000371,1.607717,1.597444,-2.331756,0.002728,
2010-03-19,6.22,6.32,418.46,408.71,408.46,0.009865,1.607717,1.607717,-2.329972,0.000371,


In [8]:
# Last rows: the final "Alvo" is NaN because there is no following row to shift from.
ipca_limpo.tail(10)

Unnamed: 0_level_0,Taxa Compra Manha,Taxa Venda Manha,PU Compra Manha,PU Venda Manha,PU Base Manha,Retornos,Alvo,var1,var2,var3,var4
Data Base,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2022-10-07,5.74,5.86,1959.05,1931.55,1931.55,-0.009329,2.083333,2.090592,-1.403742,0.003686,0.011176
2022-10-10,5.76,5.88,1954.64,1926.8,1926.8,-0.002459,2.065404,2.083333,-1.424303,-0.009329,0.011387
2022-10-11,5.81,5.93,1943.65,1916.15,1916.15,-0.005527,2.086957,2.065404,-1.414864,-0.002459,0.011369
2022-10-13,5.75,5.87,1957.77,1929.88,1929.88,0.007165,2.076125,2.086957,-1.42458,-0.005527,0.011174
2022-10-14,5.78,5.9,1951.48,1923.29,1923.29,-0.003415,2.090592,2.076125,-1.444545,0.007165,0.011081
2022-10-17,5.74,5.86,1961.39,1933.08,1933.08,0.00509,2.083333,2.090592,-1.443364,-0.003415,0.011118
2022-10-18,5.76,5.88,1957.4,1929.15,1929.15,-0.002033,2.076125,2.083333,-1.443241,0.00509,0.011094
2022-10-19,5.78,5.9,1953.41,1925.23,1925.23,-0.002032,2.086957,2.076125,-1.442605,-0.002033,0.011069
2022-10-20,5.75,5.87,1961.0,1932.72,1932.72,0.00389,2.076125,2.086957,-1.442121,-0.002032,0.010836
2022-10-21,5.78,5.9,1955.11,1926.52,1926.52,-0.003208,,2.076125,-1.462322,0.00389,0.010701


In [9]:
def limpa_base(df):
    """Return a float64 copy of ``df`` without rows holding NaN or +/-inf.

    The input frame is left untouched: rows are filtered on a new object
    instead of being dropped in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns can all be cast to float64.

    Returns
    -------
    pandas.DataFrame
        Only the rows in which every value is finite, cast to ``np.float64``.
    """
    sem_nulos = df.dropna()
    # `axis` must be passed by keyword: pandas 2.x no longer accepts it
    # positionally in DataFrame.any.
    tem_infinito = sem_nulos.isin([np.inf, -np.inf]).any(axis=1)
    return sem_nulos[~tem_infinito].astype(np.float64)

In [10]:
# Drop the rows with NaN/inf (e.g. the warm-up rows of the rolling window and
# the last row, whose target is missing) and cast everything to float64.
ipca_limpo = limpa_base(ipca_limpo)

# Árvore de Decisão para tarefa de regressão

In [11]:
# Features: every column except the target and the raw rate/price columns.
colunas_descartadas = [
    "Alvo",
    "Taxa Compra Manha", "Taxa Venda Manha",
    "PU Compra Manha", "PU Venda Manha", "PU Base Manha",
]
x = ipca_limpo.drop(columns=colunas_descartadas)
y = ipca_limpo["Alvo"]

# Chronological split by year labels on the date index (used here instead of a
# random train_test_split): 2015-2019 for training, 2020-2022 for testing.
periodo_treino = slice("2015", "2019")
periodo_teste = slice("2020", "2022")

x_train, y_train = x.loc[periodo_treino], y.loc[periodo_treino]
x_test, y_test = x.loc[periodo_teste], y.loc[periodo_teste]

In [12]:
# Training the model

from sklearn.tree import DecisionTreeRegressor

# random_state fixes how ties between equally good splits are broken, so a
# fresh "Restart & Run All" rebuilds exactly the same tree and metrics.
mod_arvore = DecisionTreeRegressor(
    criterion="absolute_error",
    max_depth=20,
    random_state=42,
)

mod_arvore.fit(x_train, y_train)

DecisionTreeRegressor(criterion='absolute_error', max_depth=20)

In [13]:
# Predictions for the training set and for the test set, in that order
y_pred_train, y_pred_test = (
    mod_arvore.predict(amostra) for amostra in (x_train, x_test)
)

### Métricas de avaliação: MAE, R2 e RMSE

In [14]:
# Evaluating the results on the training and test sets
from math import sqrt


def _calcula_metricas(y_real, y_previsto):
    """Return (MAE, R2, RMSE) for a pair of actual / predicted series."""
    mae = metrics.mean_absolute_error(y_real, y_previsto)
    r2 = metrics.r2_score(y_real, y_previsto)  # coefficient of determination
    rmse = sqrt(metrics.mean_squared_error(y_real, y_previsto))
    return mae, r2, rmse


def _imprime_metricas(titulo, mae, r2, rmse):
    """Print one evaluation section with the metrics rounded to two decimals."""
    print(titulo)
    print("MAE: ", round(mae, 2))
    print('R2: ', round(r2, 2))
    print('RMSE: ', round(rmse, 2))


# `metrics` is the sklearn.metrics module imported at the top of the notebook.
MAE_train, R2_train, RMSE_train = _calcula_metricas(y_train, y_pred_train)
_imprime_metricas("----- Avaliaçao do treinamento -----", MAE_train, R2_train, RMSE_train)

MAE_test, R2_test, RMSE_test = _calcula_metricas(y_test, y_pred_test)

print("")
_imprime_metricas("----- Avaliaçao do teste -----", MAE_test, R2_test, RMSE_test)


----- Avaliaçao do treinamento -----
MAE:  0.0
R2:  1.0
RMSE:  0.0

----- Avaliaçao do teste -----
MAE:  0.06
R2:  0.97
RMSE:  0.08


### Entendendo os resultados

In [15]:
# Actual vs. predicted values side by side, plus the residual (actual - predicted)
resultados = pd.DataFrame({'Real': y_test, 'Previsto': y_pred_test}).assign(
    Residuos=lambda tabela: tabela["Real"] - tabela["Previsto"]
)
resultados

Unnamed: 0_level_0,Real,Previsto,Residuos
Data Base,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-02,3.539823,3.603604,-0.063781
2020-01-03,3.539823,3.498542,0.041281
2020-01-06,3.508772,3.498542,0.010230
2020-01-07,3.519062,3.519062,0.000000
2020-01-08,3.529412,3.592814,-0.063403
...,...,...,...
2022-10-14,2.090592,2.135231,-0.044639
2022-10-17,2.083333,2.068966,0.014368
2022-10-18,2.076125,2.068966,0.007159
2022-10-19,2.086957,2.135231,-0.048275


In [18]:
# Comparing the actual and predicted spreads over the test period

# Number of rows in the rolling mean used to smooth both series.
JANELA_MEDIA_MOVEL = 50

fig = make_subplots(rows=1, cols=1, shared_xaxes=True, vertical_spacing=0.05)

# One trace per series; both use the same smoothing so they are comparable.
for coluna, cor in (("Previsto", "red"), ("Real", "blue")):
    fig.add_trace(
        go.Scatter(
            x=resultados.index,
            y=resultados[coluna].rolling(JANELA_MEDIA_MOVEL).mean(),
            name=coluna,
            line=dict(color=cor),
        ),
        row=1, col=1,
    )

# The x axis is the date index and the y axis is the smoothed spread, so the
# axis titles describe those quantities (the series names are in the legend).
# A single `font` setting is kept; the previous extra `font_color="blue"`
# targeted the same property with a conflicting value.
fig.update_layout(
    height=600, width=800,
    title_text="Tesouro IPCA+ 2035 - Spread Real x Previsto",
    title_font_color="black",
    xaxis_title="Data Base",
    yaxis_title=f"Spread (%) - média móvel de {JANELA_MEDIA_MOVEL} dias",
    font=dict(size=15, color="Black"),
)
fig.show()