In [130]:
import pandas as pd
import numpy as np
import yfinance as yf
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
from prophet import Prophet
from prophet.plot import plot_plotly, plot_components_plotly
from prophet.diagnostics import cross_validation

In [131]:

end_data = datetime.today().strftime('%Y-%m-%d')
df = yf.download("^BVSP", start="2021-01-01", end=end_data, progress=False)
df.reset_index(inplace=True)
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2021-01-04,119024.0,120354.0,118062.0,118558.0,118558.0,8741400
1,2021-01-05,118835.0,119790.0,116756.0,119223.0,119223.0,9257100
2,2021-01-06,119377.0,120924.0,118917.0,119851.0,119851.0,11638200
3,2021-01-07,119103.0,121983.0,119101.0,121956.0,121956.0,11774800
4,2021-01-08,122387.0,125324.0,122386.0,125077.0,125077.0,11085800
...,...,...,...,...,...,...,...
754,2024-01-15,130988.0,131606.0,130253.0,131521.0,131521.0,5746600
755,2024-01-16,131515.0,131517.0,129147.0,129294.0,129294.0,11911300
756,2024-01-17,129293.0,129296.0,128312.0,128524.0,128524.0,9952500
757,2024-01-18,128524.0,129047.0,127316.0,127316.0,127316.0,12460800


### Carregando o dataframe e preparando para trabalhar com o Prophet:

In [132]:
#df = pd.read_csv('./Ibovespa.csv')
df = df[['Date', 'Close']]
df.rename(columns={'Date':'ds','Close':'y'},inplace=True)
df['ds'] = pd.to_datetime(df['ds'], format='%d.%m.%Y')
df.head()

Unnamed: 0,ds,y
0,2021-01-04,118558.0
1,2021-01-05,119223.0
2,2021-01-06,119851.0
3,2021-01-07,121956.0
4,2021-01-08,125077.0


## Separando somente os dados > 31/12/2020

In [133]:
df = df[df['ds'] > '2020-12-31']

In [134]:
df.count()

ds    759
y     759
dtype: int64

## Inserindo os feriados importantes:

In [135]:
import holidays

years = list(range(2021, 2026))

# Capturando feriados dos EUA e da NYSE
us_holidays = holidays.country_holidays('US', years=years)
nyse_holidays = holidays.financial_holidays('NYSE', years=years)

br_holidays = holidays.country_holidays('BR', years=years)

# Capturando os feriados do estado de São Paulo
sp_holidays = holidays.Brazil(state='SP', years=years)

us_holidays_df = pd.DataFrame(list(us_holidays.items()), columns=['ds', 'holiday'])
nyse_holidays_df = pd.DataFrame(list(nyse_holidays.items()), columns=['ds', 'holiday'])
br_holidays_df = pd.DataFrame(list(br_holidays.items()), columns=['ds', 'holiday'])
sp_holidays_df = pd.DataFrame(list(sp_holidays.items()), columns=['ds', 'holiday'])

total_holidays = pd.concat([us_holidays_df, nyse_holidays_df, br_holidays_df, sp_holidays_df]).drop_duplicates().reset_index(drop=True)
total_holidays['ds'] = pd.to_datetime(total_holidays['ds'])

total_holidays.count()

ds         123
holiday    123
dtype: int64

## Separando os dados em treino e teste

In [136]:
train_data = df.sample(frac=0.8, random_state=0)
test_data = df.drop(train_data.index)
print(f'training data size : {train_data.shape}')
print(f'testing data size : {test_data.shape}')

training data size : (607, 2)
testing data size : (152, 2)


## Treinando o Modelo

In [137]:
m = Prophet(holidays=total_holidays)
m.fit(train_data)
future = m.make_future_dataframe(len(test_data))
forecast = m.predict(future)
forecast.head()

22:08:43 - cmdstanpy - INFO - Chain [1] start processing
22:08:43 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,Christmas Day,Christmas Day_lower,Christmas Day_upper,Christmas Day (observed),...,weekly,weekly_lower,weekly_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2021-01-04,120143.619812,114839.322201,122187.372812,120143.619812,120143.619812,0.0,0.0,0.0,0.0,...,613.614372,613.614372,613.614372,-2424.888358,-2424.888358,-2424.888358,0.0,0.0,0.0,118332.345827
1,2021-01-05,120086.496569,114928.8003,121992.860846,120086.496569,120086.496569,0.0,0.0,0.0,0.0,...,844.018868,844.018868,844.018868,-2429.573093,-2429.573093,-2429.573093,0.0,0.0,0.0,118500.942345
2,2021-01-06,120029.373327,114502.324458,121547.072832,120029.373327,120029.373327,0.0,0.0,0.0,0.0,...,570.242025,570.242025,570.242025,-2430.785538,-2430.785538,-2430.785538,0.0,0.0,0.0,118168.829814
3,2021-01-07,119972.250084,114383.779102,121585.874807,119972.250084,119972.250084,0.0,0.0,0.0,0.0,...,628.110181,628.110181,628.110181,-2426.631936,-2426.631936,-2426.631936,0.0,0.0,0.0,118173.72833
4,2021-01-08,119915.126842,114900.333289,122219.840135,119915.126842,119915.126842,0.0,0.0,0.0,0.0,...,843.850338,843.850338,843.850338,-2415.282049,-2415.282049,-2415.282049,0.0,0.0,0.0,118343.695131


In [138]:
plot_plotly(m, forecast)

In [139]:
plot_components_plotly(m, forecast)

In [140]:
# Extrair as colunas relevantes dos DataFrames
forecast_cols = ['ds', 'yhat']
valores_reais_cols = ['ds', 'y']

forecast = forecast[forecast_cols]
valores_reais = train_data[valores_reais_cols]

# Mesclar os DataFrames nas colunas 'ds' para comparar previsões e valores reais
resultados = pd.merge(forecast, valores_reais, on='ds', how='inner')

# Calcular o erro percentual absoluto para cada ponto de dados
resultados['erro_percentual_absoluto'] = np.abs((resultados['y'] - resultados['yhat']) / resultados['y']) * 100

# Calcular o MAPE
mape = np.mean(resultados['erro_percentual_absoluto'])

print(f"MAPE: {mape:.2f}%")

MAPE: 1.96%


In [141]:
df_cv = cross_validation(m, initial='365 days', period='30 days', horizon = '7 days')

Seasonality has period of 365.25 days which is larger than initial window. Consider increasing initial.
  0%|          | 0/25 [00:00<?, ?it/s]

22:08:43 - cmdstanpy - INFO - Chain [1] start processing
22:08:43 - cmdstanpy - INFO - Chain [1] done processing
  4%|▍         | 1/25 [00:00<00:03,  6.60it/s]22:08:44 - cmdstanpy - INFO - Chain [1] start processing
22:08:44 - cmdstanpy - INFO - Chain [1] done processing
  8%|▊         | 2/25 [00:00<00:03,  6.54it/s]22:08:44 - cmdstanpy - INFO - Chain [1] start processing
22:08:44 - cmdstanpy - INFO - Chain [1] done processing
 12%|█▏        | 3/25 [00:00<00:03,  5.88it/s]22:08:44 - cmdstanpy - INFO - Chain [1] start processing
22:08:44 - cmdstanpy - INFO - Chain [1] done processing
 16%|█▌        | 4/25 [00:00<00:03,  6.15it/s]22:08:44 - cmdstanpy - INFO - Chain [1] start processing
22:08:44 - cmdstanpy - INFO - Chain [1] done processing
 20%|██        | 5/25 [00:00<00:03,  5.36it/s]22:08:44 - cmdstanpy - INFO - Chain [1] start processing
22:08:44 - cmdstanpy - INFO - Chain [1] done processing
 24%|██▍       | 6/25 [00:01<00:03,  5.44it/s]22:08:44 - cmdstanpy - INFO - Chain [1] start 

In [142]:
df_cv.tail()

Unnamed: 0,ds,yhat,yhat_lower,yhat_upper,y,cutoff
84,2023-12-19,123052.343174,119537.719106,126388.489916,131851.0,2023-12-13
85,2024-01-16,134603.755214,131386.646099,137765.095677,129294.0,2024-01-12
86,2024-01-17,134713.524404,131079.300483,138043.477092,128524.0,2024-01-12
87,2024-01-18,135103.649644,131547.892813,138543.685782,127316.0,2024-01-12
88,2024-01-19,135628.548296,132303.64909,139046.047815,127636.0,2024-01-12


In [143]:
from prophet.diagnostics import performance_metrics
df_p = performance_metrics(df_cv)
df_p

Unnamed: 0,horizon,mse,rmse,mae,mape,mdape,smape,coverage
0,1 days,11989510.0,3462.587714,2806.985706,0.024729,0.02059,0.02499,0.571429
1,2 days,9323285.0,3053.405512,2315.398158,0.020145,0.017589,0.020238,0.6
2,3 days,22088410.0,4699.830352,3465.168753,0.030259,0.020237,0.030644,0.461538
3,4 days,38222690.0,6182.450348,5122.857861,0.044439,0.041067,0.044979,0.272727
4,5 days,39776100.0,6306.829624,5392.084782,0.047343,0.048159,0.046845,0.363636
5,6 days,42532550.0,6521.698265,5638.71915,0.049591,0.06183,0.049879,0.357143
6,7 days,62201690.0,7886.804659,6490.904069,0.056948,0.047953,0.05774,0.272727


Os resultados mostram o seguinte:

MAPE: Varia de cerca de 3% para um horizonte de 1 dia até cerca de 4.8% para um horizonte de 7 dias. Esses valores indicam que as previsões são relativamente precisas, com erros percentuais aumentando ligeiramente à medida que o horizonte de previsão se estende.
Cobertura: A cobertura do intervalo de previsão parece diminuir com horizontes de previsão mais longos, o que é esperado, pois previsões mais distantes tendem a ser menos precisas.
Os valores do MAPE na faixa de 3% a 5% são geralmente considerados bons para muitas aplicações de séries temporais, especialmente em domínios como previsões de mercado de ações, onde a incerteza é inerente e difícil de prever.

A cobertura do intervalo de previsão (a proporção de pontos de dados futuros reais que caem dentro do intervalo de previsão) parece estar em torno de 40% para um horizonte de 6 dias e cai para 0% no 7º dia. Isso pode indicar que os intervalos de confiança podem ser muito estreitos ou que o modelo está se tornando menos confiável à medida que tenta prever mais adiante no futuro.