In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import lightgbm as lgb
from scipy import stats
from sklearn.preprocessing import LabelEncoder

# 1. Carregar e preparar os dados
def load_data(filepath):
    df = pd.read_csv(filepath, parse_dates=['Unnamed: 0'], index_col='Unnamed: 0')
    df.rename(columns={'setor': 'sector'}, inplace=True)
    df = df.sort_index()
    return df

# 2. Feature Engineering sem TA-Lib
def create_features(df):
    """Build per-ticker technical features and the 1-year forward-return target.

    Every rolling/shifted statistic is computed separately for each ticker
    and written back aligned row-by-row with the input, so frames that
    interleave several tickers (e.g. sorted by date) are handled correctly.
    Rows are assumed to be in chronological order within each ticker.

    Args:
        df: DataFrame indexed by date with at least the columns ``ticker``,
            ``sector``, ``close``, ``adjusted_close``, ``volume`` and
            ``dividend_amount``. The input is not modified.

    Returns:
        A new DataFrame with the feature columns and ``target`` added, with
        every row that contains a missing (or infinite) value removed.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    def per_ticker(column, func):
        # transform() returns a result aligned with df's own row order.
        # (groupby().rolling().values would come back in ticker-major order
        # and silently land on the wrong rows.)
        return df.groupby('ticker')[column].transform(func)

    # Sector as an integer code: labels are sorted, codes run 0..n-1.
    df['sector_encoded'] = pd.factorize(df['sector'], sort=True)[0]

    # Returns over 1 day, 1 week, 1 month, 3 months and 1 year.
    df['retorno_1D'] = df.groupby('ticker')['close'].pct_change()
    for window in [5, 21, 63, 252]:
        df[f'retorno_{window}D'] = df.groupby('ticker')['close'].pct_change(window)

    # Volatility of daily returns.
    df['volatilidade_21D'] = per_ticker('retorno_1D', lambda s: s.rolling(21).std())

    # Volume relative to its own 21-day average.
    df['volume_avg_21D'] = per_ticker('volume', lambda s: s.rolling(21).mean())
    df['volume_spike'] = df['volume'] / df['volume_avg_21D']

    # Simple moving averages (TA-Lib replacement).
    df['SMA_50'] = per_ticker('close', lambda s: s.rolling(50).mean())
    df['SMA_200'] = per_ticker('close', lambda s: s.rolling(200).mean())

    # RSI (simplified: plain rolling means instead of Wilder smoothing).
    def calculate_rsi(series, window=14):
        delta = series.diff()
        gain = delta.where(delta > 0, 0)
        loss = -delta.where(delta < 0, 0)

        avg_gain = gain.rolling(window).mean()
        avg_loss = loss.rolling(window).mean()

        rs = avg_gain / avg_loss
        return 100 - (100 / (1 + rs))

    df['RSI_14'] = per_ticker('close', calculate_rsi)

    # Bollinger Bands, computed per ticker so windows never mix tickers.
    bb_window, bb_num_std = 20, 2
    bb_mid = per_ticker('close', lambda s: s.rolling(bb_window).mean())
    bb_std = per_ticker('close', lambda s: s.rolling(bb_window).std())
    df['BB_upper'] = bb_mid + (bb_std * bb_num_std)
    df['BB_lower'] = bb_mid - (bb_std * bb_num_std)

    # Dividends.
    df['dividend_yield'] = df['dividend_amount'] / df['close']
    df['dividend_payment'] = (df['dividend_amount'] > 0).astype(int)

    # Cross-sectional sector averages for each date.
    df['sector_avg_retorno_21D'] = df.groupby(['sector', df.index])['retorno_21D'].transform('mean')
    df['sector_avg_volume'] = df.groupby(['sector', df.index])['volume'].transform('mean')

    # Target: return over the next 252 rows (about one trading year).
    df['target'] = per_ticker('adjusted_close', lambda x: x.shift(-252) / x - 1)

    # Ratios with a zero denominator yield +/-inf, which dropna() keeps;
    # turn them into NaN so those rows are discarded as well.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].replace([np.inf, -np.inf], np.nan)

    return df.dropna()

# 3. Tratamento de Outliers
def remove_outliers(df, z_threshold=3, min_volume=10000):
    """Drop rows with extreme feature values or insufficient liquidity.

    Numeric columns (except ``target``, ``split_coefficient`` and
    ``sector_encoded``) are processed one after another: for each column,
    rows whose absolute z-score is not below ``z_threshold`` are removed,
    and the next column's z-scores are computed on the rows that remain.
    Afterwards rows with ``volume`` not above ``min_volume`` are removed.

    Args:
        df: DataFrame containing a ``volume`` column.
        z_threshold: Absolute z-score at or above which a row is discarded.
        min_volume: Rows must have ``volume`` strictly greater than this.

    Returns:
        The filtered DataFrame with any remaining NaN rows dropped.
    """
    excluded = {'target', 'split_coefficient', 'sector_encoded'}
    numeric_cols = [
        col for col in df.select_dtypes(include=[np.number]).columns
        if col not in excluded
    ]

    for col in numeric_cols:
        if df.empty:
            break
        values = df[col].to_numpy(dtype=float)
        # A constant column (or one whose spread is undefined) has NaN
        # z-scores; comparing NaN with the threshold is always False and
        # would throw away every row, so such columns are skipped.
        with np.errstate(invalid='ignore'):
            spread = values.std()
        if not spread > 0:
            continue
        z_scores = np.abs(stats.zscore(values))
        df = df[z_scores < z_threshold]

    # Remove days with very low volume (insufficient liquidity).
    df = df[df['volume'] > min_volume]

    return df.dropna()

# 4. Treinamento do Modelo
def train_model(df, n_splits=5, gap=0):
    """Train LightGBM regressors with walk-forward cross-validation.

    One booster is trained per ``TimeSeriesSplit`` fold, using that fold's
    held-out rows both for early stopping and for the reported metrics
    (so the metrics are measured on data that influenced the stopping
    point). Average metrics and gain-based feature importances are printed.

    Args:
        df: Feature frame containing the feature columns listed below and a
            ``target`` column.
        n_splits: Number of walk-forward folds.
        gap: Number of rows to leave out between each training block and its
            test block. ``target`` looks 252 rows ahead per ticker, so a
            non-zero gap can be used to keep training labels from
            overlapping the test period.

    Returns:
        A tuple ``(model, features, metrics)``: the booster trained on the
        last fold, the list of feature names, and a DataFrame with one row
        of MAE/RMSE/R2 per fold.
    """
    features = ['retorno_1D', 'retorno_5D', 'retorno_21D', 'retorno_63D',
                'volatilidade_21D', 'volume_avg_21D', 'volume_spike',
                'RSI_14', 'SMA_50', 'SMA_200', 'BB_upper', 'BB_lower',
                'dividend_yield', 'dividend_payment',
                'sector_encoded', 'sector_avg_retorno_21D', 'sector_avg_volume']

    # TimeSeriesSplit cuts by row position, so rows must be in index order.
    # A stable sort keeps the relative order of rows sharing an index value
    # and is a no-op for an already sorted frame.
    df = df.sort_index(kind='mergesort')

    X = df[features]
    y = df['target']

    # Walk-forward cross-validation.
    tscv = TimeSeriesSplit(n_splits=n_splits, gap=gap)

    # Model configuration.
    params = {
        'objective': 'mae',
        'num_iterations': 1000,
        'learning_rate': 0.01,
        'max_depth': 7,
        'num_leaves': 31,
        'random_state': 42,
        'verbosity': -1
    }

    metrics = []
    fold_importances = []  # one frame per fold, concatenated once at the end
    models = []  # every fold's booster; only the last one is returned

    for fold, (train_idx, test_idx) in enumerate(tscv.split(X), start=1):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_test, label=y_test)

        model = lgb.train(
            params,
            train_data,
            valid_sets=[valid_data],
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),
                lgb.log_evaluation(period=100)
            ]
        )
        models.append(model)

        # Evaluation on the held-out fold.
        preds = model.predict(X_test)
        metrics.append({
            'Fold': fold,
            'MAE': mean_absolute_error(y_test, preds),
            'RMSE': np.sqrt(mean_squared_error(y_test, preds)),
            'R2': r2_score(y_test, preds)
        })

        # Gain-based feature importance for this fold.
        fold_importances.append(pd.DataFrame({
            'feature': features,
            'importance': model.feature_importance(importance_type='gain'),
            'fold': fold
        }))

    metrics_df = pd.DataFrame(metrics)
    feature_importances = pd.concat(fold_importances, ignore_index=True)

    # Average metrics across folds.
    avg_metrics = metrics_df.mean()
    print("\nMétricas médias de validação:")
    print(f"MAE: {avg_metrics['MAE']:.4f}")
    print(f"RMSE: {avg_metrics['RMSE']:.4f}")
    print(f"R²: {avg_metrics['R2']:.4f}")

    # Mean importance of each feature across folds.
    mean_importance = feature_importances.groupby('feature')['importance'].mean().sort_values(ascending=False)
    print("\nImportância das features:")
    print(mean_importance.head(15))

    return models[-1], features, metrics_df  # booster from the last fold

# Pipeline completo ajustado
if __name__ == "__main__":
    # Load the consolidated price history and derive the model features.
    df = create_features(load_data('datasets/acoes_consolidadas.csv'))
    print(f"Total de registros após feature engineering: {len(df)}")
    print(f"Setores disponíveis: {df['sector'].unique()}")

    # Filter extreme values and illiquid days.
    df = remove_outliers(df)
    print(f"Total de registros após remoção de outliers: {len(df)}")

    # Cross-validated training; the returned model is a LightGBM Booster.
    model, features, metrics_df = train_model(df)

    # Persist the booster, the per-fold metrics and the feature list.
    model.save_model('modelo_lgbm_sem_talib.txt')
    metrics_df.to_csv('metricas_validacao.csv', index=False)
    pd.Series(features).to_csv('features_utilizadas.csv', index=False)

    # Final summary, one item per line.
    print(
        "\nModelo treinado com sucesso sem TA-Lib!",
        "Arquivos gerados:",
        "- modelo_lgbm_sem_talib.txt",
        "- metricas_validacao.csv",
        "- features_utilizadas.csv",
        sep="\n",
    )

Total de registros após feature engineering: 54529
Setores disponíveis: ['Construção' 'Industrial' 'Siderurgia' 'Alimentos' 'Energia' 'Varejo'
 'Financeiro' 'Mineração' 'Consumo' 'Turismo' 'Celulose' 'Saúde']
Total de registros após remoção de outliers: 44473
Training until validation scores don't improve for 50 rounds
[100]	valid_0's l1: 0.479005
Early stopping, best iteration is:
[91]	valid_0's l1: 0.478933
Training until validation scores don't improve for 50 rounds
[100]	valid_0's l1: 0.313573
Early stopping, best iteration is:
[81]	valid_0's l1: 0.313143
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[30]	valid_0's l1: 0.499935
Training until validation scores don't improve for 50 rounds
[100]	valid_0's l1: 0.354372
Early stopping, best iteration is:
[69]	valid_0's l1: 0.35192
Training until validation scores don't improve for 50 rounds
[100]	valid_0's l1: 0.245889
Early stopping, best iteration is:
[121]	valid_0's l1: 0.245712

Mét