In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns

from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.api as sm
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

import utils
from prophet import Prophet
from prophet.plot import plot_plotly, plot_components_plotly
from prophet.diagnostics import cross_validation

### Carregando o dataframe e preparando para trabalhar com o Prophet:

In [22]:
df = pd.read_csv('./Ibovespa.csv')
df = df[['Data', 'Último']]
df.rename(columns={'Data':'ds','Último':'y'},inplace=True)
df['ds'] = pd.to_datetime(df['ds'], format='%d.%m.%Y')
df.head()

Unnamed: 0,ds,y
0,2024-01-09,131.447
1,2024-01-08,132.427
2,2024-01-05,132.023
3,2024-01-04,131.226
4,2024-01-03,132.834


## Separando somente os dados > 31/12/2020

In [23]:
df = df[df['ds'] > '2020-12-31']

In [41]:
df.count()

ds    751
y     751
dtype: int64

## Inserindo os feriados importantes:

In [24]:
import holidays
import holidays

# Capturando feriados dos EUA e da NYSE
us_holidays = holidays.country_holidays('US', years=[2021,2022,2023, 2024, 2025])  # Exemplo para anos específicos
nyse_holidays = holidays.financial_holidays('NYSE', years=[2021,2022,2023, 2024, 2025])

br_holidays = holidays.country_holidays('BR', years=[2021,2022,2023, 2024, 2025])

# Capturando os feriados do estado de São Paulo
sp_holidays = holidays.Brazil(state='SP', years=[2021,2022,2023, 2024, 2025])

us_holidays_df = pd.DataFrame(list(us_holidays.items()), columns=['ds', 'holiday'])
nyse_holidays_df = pd.DataFrame(list(nyse_holidays.items()), columns=['ds', 'holiday'])
br_holidays_df = pd.DataFrame(list(br_holidays.items()), columns=['ds', 'holiday'])
sp_holidays_df = pd.DataFrame(list(sp_holidays.items()), columns=['ds', 'holiday'])

total_holidays = pd.concat([us_holidays_df, nyse_holidays_df, br_holidays_df, sp_holidays_df]).drop_duplicates().reset_index(drop=True)
total_holidays['ds'] = pd.to_datetime(total_holidays['ds'])

total_holidays.count()

ds         123
holiday    123
dtype: int64

## Seprando os dados em treino e teste

In [25]:
train_data = df.sample(frac=0.8, random_state=0)
test_data = df.drop(train_data.index)
train_data.sort_values(by='ds', inplace=True)
test_data.sort_values(by='ds', inplace=True)
print(f'training data size : {train_data.shape}')
print(f'testing data size : {test_data.shape}')

training data size : (601, 2)
testing data size : (150, 2)


## Treinando o Modelo

In [26]:
m = Prophet(holidays=total_holidays)
m.fit(train_data)
future = m.make_future_dataframe(periods=20, freq='M')
forecast = m.predict(future)
forecast.head()

12:00:16 - cmdstanpy - INFO - Chain [1] start processing
12:00:16 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,Christmas Day,Christmas Day_lower,Christmas Day_upper,Christmas Day (observed),...,weekly,weekly_lower,weekly_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2021-01-04,120.40664,114.181673,120.665528,120.40664,120.40664,0.0,0.0,0.0,0.0,...,0.580016,0.580016,0.580016,-3.687806,-3.687806,-3.687806,0.0,0.0,0.0,117.29885
1,2021-01-05,120.337179,113.876234,120.527057,120.337179,120.337179,0.0,0.0,0.0,0.0,...,0.444431,0.444431,0.444431,-3.551149,-3.551149,-3.551149,0.0,0.0,0.0,117.23046
2,2021-01-06,120.267717,114.193503,120.929807,120.267717,120.267717,0.0,0.0,0.0,0.0,...,0.700414,0.700414,0.700414,-3.404334,-3.404334,-3.404334,0.0,0.0,0.0,117.563797
3,2021-01-07,120.198256,114.5612,121.054169,120.198256,120.198256,0.0,0.0,0.0,0.0,...,0.865703,0.865703,0.865703,-3.247319,-3.247319,-3.247319,0.0,0.0,0.0,117.81664
4,2021-01-08,120.128795,114.187229,120.766767,120.128795,120.128795,0.0,0.0,0.0,0.0,...,0.589469,0.589469,0.589469,-3.080241,-3.080241,-3.080241,0.0,0.0,0.0,117.638022


In [27]:
m.train_holiday_names

0                                      New Year's Day
1                           New Year's Day (observed)
2                                        Memorial Day
3                Juneteenth National Independence Day
4     Juneteenth National Independence Day (observed)
5                                    Independence Day
6                         Independence Day (observed)
7                                           Labor Day
8                                        Veterans Day
9                                        Thanksgiving
10                                      Christmas Day
11                           Christmas Day (observed)
12                         Martin Luther King Jr. Day
13                              Washington's Birthday
14                                       Columbus Day
15                            Veterans Day (observed)
16                                        Good Friday
17                                   Thanksgiving Day
18                         C

In [28]:
plot_plotly(m, forecast)

In [29]:
plot_components_plotly(m, forecast)

In [30]:
# Extrair as colunas relevantes dos DataFrames
forecast_cols = ['ds', 'yhat']
valores_reais_cols = ['ds', 'y']

forecast = forecast[forecast_cols]
valores_reais = train_data[valores_reais_cols]

# Mesclar os DataFrames nas colunas 'ds' para comparar previsões e valores reais
resultados = pd.merge(forecast, valores_reais, on='ds', how='inner')

# Calcular o erro percentual absoluto para cada ponto de dados
resultados['erro_percentual_absoluto'] = np.abs((resultados['y'] - resultados['yhat']) / resultados['y']) * 100

# Calcular o MAPE
mape = np.mean(resultados['erro_percentual_absoluto'])

print(f"MAPE: {mape:.2f}%")

MAPE: 1.85%


In [45]:
df_cv = cross_validation(m, initial='365 days', period='30 days', horizon = '7 days')

Seasonality has period of 365.25 days which is larger than initial window. Consider increasing initial.
  0%|          | 0/25 [00:00<?, ?it/s]

12:32:19 - cmdstanpy - INFO - Chain [1] start processing
12:32:19 - cmdstanpy - INFO - Chain [1] done processing
  4%|▍         | 1/25 [00:00<00:09,  2.63it/s]12:32:20 - cmdstanpy - INFO - Chain [1] start processing
12:32:20 - cmdstanpy - INFO - Chain [1] done processing
  8%|▊         | 2/25 [00:00<00:06,  3.81it/s]12:32:20 - cmdstanpy - INFO - Chain [1] start processing
12:32:20 - cmdstanpy - INFO - Chain [1] done processing
 12%|█▏        | 3/25 [00:00<00:05,  4.36it/s]12:32:20 - cmdstanpy - INFO - Chain [1] start processing
12:32:20 - cmdstanpy - INFO - Chain [1] done processing
 16%|█▌        | 4/25 [00:00<00:04,  5.02it/s]12:32:20 - cmdstanpy - INFO - Chain [1] start processing
12:32:20 - cmdstanpy - INFO - Chain [1] done processing
 20%|██        | 5/25 [00:01<00:03,  5.13it/s]12:32:20 - cmdstanpy - INFO - Chain [1] start processing
12:32:20 - cmdstanpy - INFO - Chain [1] done processing
 24%|██▍       | 6/25 [00:01<00:03,  5.25it/s]12:32:21 - cmdstanpy - INFO - Chain [1] start 

In [46]:
df_cv.tail()

Unnamed: 0,ds,yhat,yhat_lower,yhat_upper,y,cutoff
84,2024-01-03,131.165647,127.838468,134.649393,132.834,2024-01-02
85,2024-01-04,131.598819,128.544909,135.045971,131.226,2024-01-02
86,2024-01-05,131.58274,128.33758,135.055048,132.023,2024-01-02
87,2024-01-08,132.454278,129.168268,135.857448,132.427,2024-01-02
88,2024-01-09,132.60801,129.070393,135.775669,131.447,2024-01-02


In [47]:
from prophet.diagnostics import performance_metrics
df_p = performance_metrics(df_cv)
df_p

Unnamed: 0,horizon,mse,rmse,mae,mape,mdape,smape,coverage
0,1 days,15.961603,3.995197,3.298434,0.029967,0.028593,0.030147,0.375
1,2 days,31.994171,5.656339,4.770579,0.04268,0.041832,0.043243,0.333333
2,3 days,21.564597,4.64377,4.022869,0.035986,0.04079,0.03608,0.230769
3,4 days,26.136764,5.112413,4.443618,0.040305,0.041431,0.040316,0.230769
4,5 days,41.185132,6.417564,5.3173,0.047898,0.036345,0.048348,0.230769
5,6 days,33.676896,5.80318,4.349499,0.038943,0.045823,0.039601,0.416667
6,7 days,48.805169,6.98607,5.41685,0.048562,0.034,0.049202,0.4


Os resultados mostram o seguinte:

MAPE: Varia de cerca de 3% para um horizonte de 1 dia até cerca de 4.8% para um horizonte de 7 dias. Esses valores indicam que as previsões são relativamente precisas, com erros percentuais aumentando ligeiramente à medida que o horizonte de previsão se estende.
Cobertura: A cobertura do intervalo de previsão parece diminuir com horizontes de previsão mais longos, o que é esperado, pois previsões mais distantes tendem a ser menos precisas.
Os valores do MAPE na faixa de 3% a 5% são geralmente considerados bons para muitas aplicações de séries temporais, especialmente em domínios como previsões de mercado de ações, onde a incerteza é inerente e difícil de prever.

A cobertura do intervalo de previsão (a proporção de pontos de dados futuros reais que caem dentro do intervalo de previsão) parece estar em torno de 40% para um horizonte de 6 dias e cai para 0% no 7º dia. Isso pode indicar que os intervalos de confiança podem ser muito estreitos ou que o modelo está se tornando menos confiável à medida que tenta prever mais adiante no futuro.