In [None]:
!pip install python-bcb pandas seaborn sklearn yfinance xgboost optuna scikit-learn xgboost

In [None]:
import pandas as pd
import seaborn as sns
import yfinance as yf
from bcb import sgs
from sklearn.feature_selection import mutual_info_classif, chi2
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import xgboost as xgb
import optuna
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import train_test_split
import warnings


In [None]:
# Date window shared by the downloads below: passed as `start`/`end` to both
# `yf.download` and `sgs.get`.
DATA_INICIAL = '2019-04-09'
DATA_FIM = '2025-10-01'

In [None]:
# Market series fetched from Yahoo Finance; each ticker becomes one column of closes.
tickers = ['^GSPC', '^VIX', '^TNX',  'BZ=F', 'BRL=X', 'IRFM11.SA']
dataframes = []  # not used in this cell; kept in case another cell references it
closes = []
for ticker in tickers:
    dado = yf.download(
        ticker,
        start=DATA_INICIAL,
        end=DATA_FIM,
        auto_adjust=True
    )

    if dado.empty:
        # Do not skip silently: a missing column changes the column count that the
        # positional rename of the feature table relies on.
        print(f"Aviso: nenhum dado retornado para {ticker}")
        continue

    # yfinance may return (field, ticker) MultiIndex columns; flatten to the field
    # level so that 'Close' can be selected and renamed to a plain ticker label.
    if isinstance(dado.columns, pd.MultiIndex):
        dado.columns = dado.columns.get_level_values(0)

    closes.append(dado[['Close']].rename(columns={'Close': ticker}))

if not closes:
    # pd.concat([]) would fail with a much less helpful message
    raise ValueError("Nenhum ticker retornou dados do Yahoo Finance")

# One row per date, one column per ticker (outer alignment on the date index).
df_final = pd.concat(closes, axis=1)
print(df_final)

In [None]:
# Macro series from the BCB SGS API. Passing {label: code} lets python-bcb name the
# column directly, so the result does not depend on whether the default column label
# is the int code or its string form (a rename keyed on the int can silently no-op).
# NOTE(review): SGS 433 appears to be the monthly IPCA variation rather than a
# 12-month expectation (the later rename calls it `ipcaMensal`) — confirm the label.
ipca_exp = sgs.get({'IPCA_Expectativa_12m': 433}, start=DATA_INICIAL, end=DATA_FIM)

selic = sgs.get({'Selic_Meta': 432}, start=DATA_INICIAL, end=DATA_FIM)

ibcbr = sgs.get({'IBC_Br': 24363}, start=DATA_INICIAL, end=DATA_FIM)

ptax = sgs.get({'Dolar_PTAX': 1}, start=DATA_INICIAL, end=DATA_FIM)

macro_bcb = {
    'IPCA_EXP': ipca_exp,
    'SELIC': selic,
    'IBC_BR': ibcbr,
    'PTAX': ptax
}

# Align the four series on date (outer join), then carry the last observed value
# forward so that rows where a series has no observation reuse its previous value.
macro_df = pd.concat(list(macro_bcb.values()), axis=1).ffill()
macro_df.head()

In [None]:
# Place the BCB macro series and the Yahoo Finance closes side by side, aligned on
# the date index (rows present in only one of the frames are kept, with NaNs).
join = [macro_df, df_final]
externo = pd.concat(objs=join, axis='columns')
print(externo)

In [None]:
# BCB Olinda OData endpoint: annual market expectations for IPCA published on or
# after DATA_INICIAL, ordered by publication date.
url_focus = (
    "https://olinda.bcb.gov.br/olinda/servico/Expectativas/versao/v1/odata/"
    "ExpectativasMercadoAnuais?$top=100000"
    f"&$filter=Indicador%20eq%20'IPCA'%20and%20Data%20ge%20'{DATA_INICIAL}'"
    "&$format=json"
    "&$orderby=Data%20asc"
)

# Only the network call is guarded, and the original error is chained so the real
# cause (HTTP failure, malformed JSON, ...) stays visible in the traceback.
try:
    resposta = pd.read_json(url_focus)
except Exception as exc:
    raise RuntimeError(f"Falha ao baixar as expectativas Focus: {exc}") from exc

Focus = pd.DataFrame(resposta['value'].tolist())
if Focus.empty:
    raise ValueError("A API Focus não retornou nenhum registro para o filtro informado")

Focus['Data'] = pd.to_datetime(Focus['Data'])
Focus['DataReferencia'] = Focus['DataReferencia'].astype(int)
Focus['Ano_Divulgacao'] = Focus['Data'].dt.year

# Keep only forecasts whose reference year equals the year they were published in.
Focus = Focus[Focus['DataReferencia'] == Focus['Ano_Divulgacao']].copy()

Focus = Focus.set_index('Data').sort_index()
Focus = Focus[['Mediana']].rename(columns={'Mediana': 'IPCA_Expectativa_AnoCorrente'})

print(Focus.tail())

In [None]:
# Collapse rows that share the same date index into a single row (their mean),
# keeping the result in chronological order.
Focus = Focus.groupby(level=0).mean().sort_index()
print(Focus)

In [None]:
# Keep only the dates present in both frames, then sample the rows whose index
# falls on a Friday (dayofweek == 4) to obtain a weekly table.
FeatureBase = externo.merge(Focus, how='inner', left_index=True, right_index=True)
sextas = FeatureBase.index.dayofweek == 4
FeatureSemanal = FeatureBase.loc[sextas].copy()
print(FeatureSemanal)

In [None]:
# Weekly sentiment feature built from the `score` column of FinBert.csv.
finbert = pd.read_csv('FinBert.csv')
finbert['data'] = pd.to_datetime(finbert['data'])

# Total score per date, in chronological order.
Diario = (
    finbert
    .groupby('data')['score']
    .sum()
    .sort_index()
    .to_frame(name='scoreTotal')
)

# Trailing 7-day mean of the daily totals; one observation in the window is enough.
Diario['scoreSemanal'] = Diario['scoreTotal'].rolling('7D', min_periods=1).mean()

# Sample the trailing mean on Fridays only (dayofweek == 4).
eh_sexta = Diario.index.dayofweek == 4
FeatureB = Diario.loc[eh_sexta, ['scoreSemanal']].copy()

print(FeatureB.tail())

In [None]:
# Join the weekly sentiment score with the weekly macro/market table, keeping only
# the dates that exist in both; the sentiment column comes first.
Features_B = FeatureB.merge(FeatureSemanal, how='inner', left_index=True, right_index=True)
print(Features_B)

In [None]:
# Count missing values per column and show only the columns that have any.
nans = Features_B.isna().sum()
print(nans.loc[nans.gt(0)])

In [None]:
# Fill gaps with the previous row's value, then drop the rows that still have a
# NaN (those with no earlier value to copy from).
Features_B = Features_B.ffill().dropna()

total_nans = Features_B.isna().sum().sum()
print(f"Total de NaNs: {total_nans}")

In [None]:
# New column labels, applied by POSITION: the list must have exactly one entry per
# existing column, in the current column order, or pandas raises a length mismatch.
nomes_colunas = [
    'scoreSemanal',
    'ipcaMensal',
    'selicMeta',
    'ibcBrActivity',
    'dolarPtax',
    'sp500Index',
    'vixIndex',
    'treasuryYield10y',
    'brentOilPrice',
    'usdBrlExchange',
    'irfm11FixedIncome',
    'IPCA_Ano_Atual'
]
Features_B.columns = nomes_colunas

# Display the resulting labels as the cell output.
Features_B.columns

In [None]:
# Print the weekly feature table for a visual check.
print(Features_B)

Pré-processamento antes do XGBoost

In [None]:
# Reference dates: every per-asset frame is restricted to the dates in BaseA.csv.
padrao = pd.read_csv('./dados/BaseA.csv')
padrao['Data'] = pd.to_datetime(padrao['Data'])
alinhamento = padrao['Data'].values

# The merge below joins on the index, so it must be a DatetimeIndex.
if not isinstance(Features_B.index, pd.DatetimeIndex):
    Features_B.index = pd.to_datetime(Features_B.index)

Features_B = Features_B.ffill()
Features_B = Features_B.dropna()

ativos = pd.read_csv('ibovespa.csv', sep=';')
dataset = {}   # ticker -> weekly frame with features, 'AlvoRetorno' and 'Alvo'
falhas = {}    # ticker -> reason it was skipped (reported after the loop)
tickers = ativos['ticker']

print(f"Iniciando processamento para {len(tickers)} ativos...")

for ticker in tickers:
    try:
        df = yf.download(
            ticker,
            start=DATA_INICIAL,
            end=DATA_FIM,
            auto_adjust=True,
            progress=False
        )

        if df.empty:
            falhas[ticker] = 'download vazio'
            continue

        # yfinance may return (field, ticker) MultiIndex columns; keep the field level.
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = df.columns.get_level_values(0)

        # Weekly sampling: Friday rows only.
        df = df[df.index.weekday == 4]

        var = pd.merge(df, Features_B, left_index=True, right_index=True, how='inner')

        # Target: return from this row's close to the NEXT row's close, and its sign.
        var['AlvoRetorno'] = var['Close'].shift(-1) / var['Close'] - 1
        var['Alvo'] = np.where(var['AlvoRetorno'] > 0, 1, 0)

        # Raw price/volume columns are not used as model inputs.
        var = var.drop(columns=['Open', 'High', 'Low', 'Close', 'Volume'], errors='ignore')

        # The last row has no "next close", so its return is NaN and it is dropped here.
        var = var.dropna(subset=['Alvo', 'AlvoRetorno'])
        var['Alvo'] = var['Alvo'].astype(int)

        var_final = var[var.index.isin(alinhamento)].copy()

        if not var_final.empty:
            dataset[ticker] = var_final
        else:
            falhas[ticker] = 'sem datas em comum com a base de referência'

    except Exception as e:
        # Best effort per asset: keep processing the others, but record why this
        # one was skipped instead of swallowing the error.
        falhas[ticker] = repr(e)

print(f"{len(dataset)} ativos processados, {len(falhas)} ignorados")
for nome, motivo in falhas.items():
    print(f"  [{nome}] ignorado: {motivo}")

In [None]:
import pandas as pd

# Dates shared by both models; every asset frame is restricted to them.
ref = pd.read_csv('./dados/Interseccao.csv')
DatasEmp = pd.to_datetime(ref['Data']).values

# Apply the filter to EVERY asset. Testing a single leftover loop variable from a
# previous cell would filter at most one ticker and leave the rest untouched.
# Iterate over a snapshot of the keys because entries may be removed below.
for tick in list(dataset):
    filtrado = dataset[tick].loc[dataset[tick].index.isin(DatasEmp)].copy()
    if filtrado.empty:
        # An empty frame cannot be used for training; drop it, as the dataset
        # construction does for assets without any aligned date.
        print(f"[{tick}] sem datas na interseção; ativo removido")
        del dataset[tick]
    else:
        dataset[tick] = filtrado

In [None]:
import numpy as np

# For each asset, drop every feature whose absolute correlation with an EARLIER
# column exceeds 0.95. The two target columns and `scoreSemanal` are left out of
# the check, so they are never dropped.
for tick, df in dataset.items():
    ignoradas = {'Alvo', 'AlvoRetorno', 'scoreSemanal'}
    featuresCol = [nome for nome in df.columns if nome not in ignoradas]

    corr_abs = df[featuresCol].corr().abs()

    # Strict upper triangle only: each pair is inspected once and the diagonal
    # (self-correlation) is ignored.
    acima_diagonal = np.triu(np.ones(corr_abs.shape, dtype=bool), k=1)
    upper = corr_abs.where(acima_diagonal)

    drop = [coluna for coluna in upper.columns if (upper[coluna] > 0.95).any()]

    if drop:
        print(f"[{tick}] Removendo {len(drop)} features redundantes: {drop}")

        # Re-binding an existing key does not change the dict's size, so this is
        # safe while iterating over dataset.items().
        dataset[tick] = df.drop(columns=drop)
print("\nTratamento de correlação concluido")

Teste de datas iguais em ambos os modelos

In [None]:
# Show the last 10 rows for PETR4.SA to check the final dates of the aligned frame.
# (`tail(-10)` would instead return every row except the first 10.)
print(dataset['PETR4.SA'].tail(10))

In [None]:
optuna.logging.set_verbosity(optuna.logging.WARNING)
warnings.filterwarnings('ignore')

SEED = 42        # shared by the Optuna sampler and XGBoost so reruns are repeatable
N_TRIALS = 100   # hyper-parameter trials per asset

# Settings that are not tuned; used both during the search and for the final fit so
# the two are guaranteed to train the same kind of model.
PARAMS_FIXOS = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'random_state': SEED,
    'n_jobs': -1
}

modelos = {}   # ticker -> XGBClassifier fitted on that asset's development window

for ticker, df in dataset.items():
    print(f"\n--- Treinando: {ticker} ---")

    features = [c for c in df.columns if c not in ['Alvo', 'AlvoRetorno']]
    X = df[features]
    y = df['Alvo']

    # Chronological split (no shuffling): first 80% of rows for development, last
    # 20% held out. The hold-out is not used in this cell.
    split = int(len(df) * 0.8)

    X_dev = X.iloc[:split]
    y_dev = y.iloc[:split]

    X_backtest = X.iloc[split:]
    y_backtest = y.iloc[split:]

    # Inner chronological split of the development window (75% train / 25%
    # validation). It does not depend on the trial, so it is computed once.
    cutoff = int(len(X_dev) * 0.75)
    X_train_opt, X_val_opt = X_dev.iloc[:cutoff], X_dev.iloc[cutoff:]
    y_train_opt, y_val_opt = y_dev.iloc[:cutoff], y_dev.iloc[cutoff:]

    # A binary classifier needs both classes to train on and at least one row to
    # validate on; skip assets that are too short instead of failing mid-loop.
    if y_train_opt.nunique() < 2 or len(y_val_opt) == 0:
        print(f"Dados insuficientes para {ticker}; ativo ignorado")
        continue

    def objective(trial):
        """Fit one candidate configuration and return its validation precision."""
        params = {
            **PARAMS_FIXOS,
            'n_estimators': trial.suggest_int('n_estimators', 50, 500),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
            'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        }

        model = xgb.XGBClassifier(**params)
        model.fit(X_train_opt, y_train_opt)

        preds = model.predict(X_val_opt)

        # zero_division=0: a model that never predicts the positive class scores 0
        return precision_score(y_val_opt, preds, zero_division=0)

    # Seeded sampler: without it each run explores different hyper-parameters and
    # the selected model changes from run to run.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=SEED)
    )

    study.optimize(objective, n_trials=N_TRIALS)

    print(f"Melhor Score: {study.best_value:.2%}")
    print(f"Melhores Parâmetros: {study.best_params}")

    # Refit on the whole development window with the winning hyper-parameters plus
    # the same fixed settings used during the search.
    best_params = {**study.best_params, **PARAMS_FIXOS}

    final_model = xgb.XGBClassifier(**best_params)
    final_model.fit(X_dev, y_dev)

    modelos[ticker] = final_model
