In [130]:
import pandas as pd
import numpy as np
import yfinance as yf
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
from prophet import Prophet
from prophet.plot import plot_plotly, plot_components_plotly
from prophet.diagnostics import cross_validation
import math

In [131]:

end_data = datetime.today().strftime('%Y-%m-%d')
df = yf.download("^BVSP", start="2021-01-01", end=end_data, progress=False)
df.reset_index(inplace=True)
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2021-01-04,119024.0,120354.0,118062.0,118558.0,118558.0,8741400
1,2021-01-05,118835.0,119790.0,116756.0,119223.0,119223.0,9257100
2,2021-01-06,119377.0,120924.0,118917.0,119851.0,119851.0,11638200
3,2021-01-07,119103.0,121983.0,119101.0,121956.0,121956.0,11774800
4,2021-01-08,122387.0,125324.0,122386.0,125077.0,125077.0,11085800
...,...,...,...,...,...,...,...
756,2024-01-17,129293.0,129296.0,128312.0,128524.0,128524.0,9952500
757,2024-01-18,128524.0,129047.0,127316.0,127316.0,127316.0,12460800
758,2024-01-19,127319.0,127820.0,126533.0,127636.0,127636.0,11956900
759,2024-01-22,127636.0,127843.0,125876.0,126602.0,126602.0,9509100


### Carregando o dataframe e preparando para trabalhar com o Prophet:

In [132]:
df = df[['Date', 'Close']]
df.rename(columns={'Date':'ds','Close':'y'},inplace=True)
df['ds'] = pd.to_datetime(df['ds'], format='%d.%m.%Y')
df.head()

Unnamed: 0,ds,y
0,2021-01-04,118558.0
1,2021-01-05,119223.0
2,2021-01-06,119851.0
3,2021-01-07,121956.0
4,2021-01-08,125077.0


In [133]:
df.count()

ds    761
y     761
dtype: int64

## Inserindo os feriados importantes:

In [134]:
import holidays

years = list(range(2021, 2026))

# Capturando feriados dos EUA e da NYSE
us_holidays = holidays.country_holidays('US', years=years)
nyse_holidays = holidays.financial_holidays('NYSE', years=years)

br_holidays = holidays.country_holidays('BR', years=years)

# Capturando os feriados do estado de São Paulo
sp_holidays = holidays.Brazil(state='SP', years=years)

us_holidays_df = pd.DataFrame(list(us_holidays.items()), columns=['ds', 'holiday'])
nyse_holidays_df = pd.DataFrame(list(nyse_holidays.items()), columns=['ds', 'holiday'])
br_holidays_df = pd.DataFrame(list(br_holidays.items()), columns=['ds', 'holiday'])
sp_holidays_df = pd.DataFrame(list(sp_holidays.items()), columns=['ds', 'holiday'])

total_holidays = pd.concat([us_holidays_df, nyse_holidays_df, br_holidays_df, sp_holidays_df]).drop_duplicates().reset_index(drop=True)
total_holidays['ds'] = pd.to_datetime(total_holidays['ds'])

total_holidays.count()

ds         123
holiday    123
dtype: int64

## Separando os dados em treino e teste

In [135]:
df.shape[0]

761

In [136]:
train_data = df.sample(frac=0.8, random_state=0)
test_data = df.drop(train_data.index)
print(f'training data size : {train_data.shape}')
print(f'testing data size : {test_data.shape}')

training data size : (609, 2)
testing data size : (152, 2)


## Treinando o Modelo

In [137]:
m = Prophet(holidays=total_holidays)
m.fit(train_data)
future = m.make_future_dataframe(len(test_data))
forecast = m.predict(future)
forecast.head()

23:41:04 - cmdstanpy - INFO - Chain [1] start processing


23:41:05 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,Christmas Day,Christmas Day_lower,Christmas Day_upper,Christmas Day (observed),...,weekly,weekly_lower,weekly_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2021-01-04,120103.227418,115515.339061,122336.346558,120103.227418,120103.227418,0.0,0.0,0.0,0.0,...,546.591305,546.591305,546.591305,-1848.014469,-1848.014469,-1848.014469,0.0,0.0,0.0,118801.804254
1,2021-01-05,120051.05414,115795.247314,122329.834768,120051.05414,120051.05414,0.0,0.0,0.0,0.0,...,734.45894,734.45894,734.45894,-1880.287896,-1880.287896,-1880.287896,0.0,0.0,0.0,118905.225184
2,2021-01-06,119998.880862,115440.455712,122174.030152,119998.880862,119998.880862,0.0,0.0,0.0,0.0,...,615.897339,615.897339,615.897339,-1912.97004,-1912.97004,-1912.97004,0.0,0.0,0.0,118701.808161
3,2021-01-07,119946.707584,115383.791816,121972.728766,119946.707584,119946.707584,0.0,0.0,0.0,0.0,...,567.637399,567.637399,567.637399,-1943.839084,-1943.839084,-1943.839084,0.0,0.0,0.0,118570.505899
4,2021-01-08,119894.534306,115352.673947,121945.073084,119894.534306,119894.534306,0.0,0.0,0.0,0.0,...,797.815432,797.815432,797.815432,-1970.655613,-1970.655613,-1970.655613,0.0,0.0,0.0,118721.694125


In [138]:
plot_plotly(m, forecast)

In [139]:
plot_components_plotly(m, forecast)

In [140]:
# Extrair as colunas relevantes dos DataFrames
forecast_cols = ['ds', 'yhat']
valores_reais_cols = ['ds', 'y']

forecast = forecast[forecast_cols]
valores_reais = train_data[valores_reais_cols]

# Mesclar os DataFrames nas colunas 'ds' para comparar previsões e valores reais
resultados = pd.merge(forecast, valores_reais, on='ds', how='inner')

# Calcular o erro percentual absoluto para cada ponto de dados
resultados['erro_percentual_absoluto'] = np.abs((resultados['y'] - resultados['yhat']) / resultados['y']) * 100

# Calcular o MAPE
mape = np.mean(resultados['erro_percentual_absoluto'])

print(f"MAPE: {mape:.2f}%")

MAPE: 1.89%


In [141]:
df_cv = cross_validation(m, initial='365 days', period='30 days', horizon = '7 days')

Seasonality has period of 365.25 days which is larger than initial window. Consider increasing initial.
  0%|          | 0/25 [00:00<?, ?it/s]

23:41:05 - cmdstanpy - INFO - Chain [1] start processing
23:41:05 - cmdstanpy - INFO - Chain [1] done processing
  4%|▍         | 1/25 [00:00<00:03,  7.36it/s]23:41:05 - cmdstanpy - INFO - Chain [1] start processing
23:41:05 - cmdstanpy - INFO - Chain [1] done processing
  8%|▊         | 2/25 [00:00<00:03,  6.89it/s]23:41:05 - cmdstanpy - INFO - Chain [1] start processing
23:41:05 - cmdstanpy - INFO - Chain [1] done processing
 12%|█▏        | 3/25 [00:00<00:03,  6.44it/s]23:41:05 - cmdstanpy - INFO - Chain [1] start processing
23:41:05 - cmdstanpy - INFO - Chain [1] done processing
 16%|█▌        | 4/25 [00:00<00:03,  6.46it/s]23:41:06 - cmdstanpy - INFO - Chain [1] start processing
23:41:06 - cmdstanpy - INFO - Chain [1] done processing
 20%|██        | 5/25 [00:00<00:03,  6.06it/s]23:41:06 - cmdstanpy - INFO - Chain [1] start processing
23:41:06 - cmdstanpy - INFO - Chain [1] done processing
 24%|██▍       | 6/25 [00:00<00:03,  6.15it/s]23:41:06 - cmdstanpy - INFO - Chain [1] start 

In [142]:
df_cv.tail()

Unnamed: 0,ds,yhat,yhat_lower,yhat_upper,y,cutoff
89,2024-01-17,134012.147926,130645.978927,137381.532868,128524.0,2024-01-16
90,2024-01-18,134304.03069,130807.19646,137922.446943,127316.0,2024-01-16
91,2024-01-19,134863.779386,131491.324088,138514.517103,127636.0,2024-01-16
92,2024-01-22,135644.242112,132171.36013,138975.986303,126602.0,2024-01-16
93,2024-01-23,136151.286038,132756.568518,139555.948107,128263.0,2024-01-16


In [143]:
from prophet.diagnostics import performance_metrics
df_p = performance_metrics(df_cv)
df_p

Unnamed: 0,horizon,mse,rmse,mae,mape,mdape,smape,coverage
0,1 days,30005770.0,5477.752491,4574.69171,0.040249,0.042701,0.039753,0.454545
1,2 days,30829230.0,5552.407745,4698.235469,0.041508,0.039922,0.041664,0.285714
2,3 days,49808440.0,7057.509274,5750.281949,0.050039,0.045045,0.050615,0.333333
3,4 days,29323790.0,5415.144924,3816.859572,0.03363,0.020581,0.034582,0.533333
4,5 days,16677010.0,4083.749021,2802.814837,0.0241,0.01725,0.02444,0.692308
5,6 days,31255420.0,5590.654899,4457.207508,0.038791,0.037492,0.038772,0.285714
6,7 days,48771210.0,6983.638523,5272.748272,0.047807,0.035579,0.048071,0.333333


Os resultados mostram o seguinte:

MAPE: Varia de cerca de 3% para um horizonte de 1 dia até cerca de 4.8% para um horizonte de 7 dias. Esses valores indicam que as previsões são relativamente precisas, com erros percentuais aumentando ligeiramente à medida que o horizonte de previsão se estende.
Cobertura: A cobertura do intervalo de previsão parece diminuir com horizontes de previsão mais longos, o que é esperado, pois previsões mais distantes tendem a ser menos precisas.
Os valores do MAPE na faixa de 3% a 5% são geralmente considerados bons para muitas aplicações de séries temporais, especialmente em domínios como previsões de mercado de ações, onde a incerteza é inerente e difícil de prever.

A cobertura do intervalo de previsão (a proporção de pontos de dados futuros reais que caem dentro do intervalo de previsão) parece estar em torno de 40% para um horizonte de 6 dias e cai para 0% no 7º dia. Isso pode indicar que os intervalos de confiança podem ser muito estreitos ou que o modelo está se tornando menos confiável à medida que tenta prever mais adiante no futuro.