In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ta.momentum import RSIIndicator
from ta.trend import MACD
from xgboost import XGBRegressor, plot_importance
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# 1. Load the AAPL price history and discard rows with missing values
df = pd.read_csv("aapl_data.csv")
df = df.dropna()

# 2. Technical indicators (kept deliberately small to limit overfitting).
# The close series and the moving averages are computed once and reused.
close = df["Close"]
ma_10 = close.rolling(10).mean()
ma_50 = close.rolling(50).mean()
ma_200 = close.rolling(200).mean()

df["Momentum_5"] = close - close.shift(5)
df["Volatility_5"] = close.rolling(5).std()
df["Return1d"] = close.pct_change()
df["MA_10"] = ma_10
df["MA_50"] = ma_50
df["MA_200"] = ma_200
df["MA_ratio"] = ma_10 / ma_50
df["Long_trend"] = (close - ma_200) / ma_200  # relative distance to the 200-day mean
df["Price_vs_High"] = close / df["High"].rolling(10).max()
df["Price_vs_Low"] = close / df["Low"].rolling(10).min()
df["Trend_day"] = (close - df["Open"]) / df["Open"]  # intraday move relative to the open
df["RSI"] = RSIIndicator(close).rsi()
macd = MACD(close)
df["MACD"] = macd.macd()
df["MACD_signal"] = macd.macd_signal()
df["MACD_diff"] = macd.macd_diff()

# Volume relative to its own 20-row rolling mean
df["Volume_norm"] = df["Volume"] / df["Volume"].rolling(20).mean()

# 3. Target: log-return between the current close and the close 5 rows ahead
df["Target"] = np.log(close.shift(-5)) - np.log(close)

# Rolling warm-up rows and the last 5 rows (no future close) contain NaN and are dropped
df = df.dropna()

# 4. Feature selection and chronological split
features = ["Momentum_5", "Volatility_5", "Return1d", "MA_ratio",
            "Price_vs_High", "Price_vs_Low", "Trend_day", "Long_trend",
            "RSI", "MACD_diff", "Volume_norm"]

X = df[features]
y = df["Target"]

# Split chronologically BEFORE any data-dependent filtering, so that nothing
# fitted below (outlier bounds, scaler) can see the test period.
split_date = int(len(X) * 0.8)
X_train, X_test = X.iloc[:split_date], X.iloc[split_date:]
y_train, y_test = y.iloc[:split_date], y.iloc[split_date:]

# Outlier removal (1.5 * IQR rule on the target).
# The bounds are estimated on the training target only and only training rows
# are removed: extreme future returns cannot be known in advance, so dropping
# them from the test set would make the reported test metrics optimistic.
Q1 = y_train.quantile(0.25)
Q3 = y_train.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
mask = (y_train >= lower_bound) & (y_train <= upper_bound)
X_train = X_train[mask]
y_train = y_train[mask]

# Keep X / y in sync with the rows that are actually used downstream
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])
print(f"📊 Données après suppression des outliers: {len(X)} observations")

# Feature standardisation: statistics are fitted on the training rows and
# re-applied unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index
)

# Pairwise correlation of the (scaled) training features, computed once and
# reused for both the heatmap and the pruning step.
corr_matrix = X_train_scaled.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", square=True, linewidths=0.5)
plt.title("🔍 Corrélation entre les variables explicatives", fontsize=14)
plt.tight_layout()
plt.show()

# Prune correlated features with a strict threshold.
# Only the strict upper triangle is inspected, so each pair is looked at once
# and it is always the later-listed feature of a pair that gets dropped.
corr_threshold = 0.35
corr_matrix_abs = corr_matrix.abs()
strict_upper = np.triu(np.ones(corr_matrix_abs.shape), k=1).astype(bool)
upper_tri = corr_matrix_abs.where(strict_upper)
to_drop = [
    column
    for column in upper_tri.columns
    if (upper_tri[column] > corr_threshold).any()
]
print("🚫 Variables supprimées pour forte corrélation :", to_drop)

X_train_final = X_train_scaled.drop(columns=to_drop)
X_test_final = X_test_scaled.drop(columns=to_drop)

print(f"🎯 Nombre final de features: {X_train_final.shape[1]}")

# 5. Hyperparameters tuned to limit overfitting
# (the "from X" notes record the previous values given by the original author)
optimized_params = {
    "n_estimators": 200,        # Lowered from 300
    "learning_rate": 0.05,      # Raised from 0.01
    "max_depth": 3,             # Changed from 2 to 3 (slightly more, still controlled, complexity)
    "subsample": 0.8,           # Raised from 0.7
    "colsample_bytree": 0.8,    # Raised from 0.7
    "colsample_bylevel": 0.8,   # Newly added parameter
    "reg_lambda": 10.0,         # Raised from 2.0 (stronger L2 regularisation)
    "reg_alpha": 5.0,           # Raised from 1.0 (stronger L1 regularisation)
    "min_child_weight": 3,      # Newly added to curb overfitting
    "gamma": 0.1,               # Newly added (minimum split loss)
    "random_state": 42,
    "objective": "reg:squarederror",
    "eval_metric": "rmse"
}

# 6. Time-ordered cross-validation on the training window only
tscv = TimeSeriesSplit(n_splits=8)  # Raised from 5 to 8
# NOTE: despite its name, cv_scores collects the *training* RMSE of each fold;
# the validation RMSE goes into val_scores.
cv_scores = []
val_scores = []

print("\n🔁 Validation croisée temporelle:")
for i, (train_idx, val_idx) in enumerate(tscv.split(X_train_final)):
    X_tr, X_val = X_train_final.iloc[train_idx], X_train_final.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # A fresh model per fold; the validation fold is listed last in eval_set
    # and early stopping is enabled with a patience of 50 rounds.
    model_cv = XGBRegressor(**optimized_params, early_stopping_rounds=50)
    model_cv.fit(
        X_tr, y_tr,
        eval_set=[(X_tr, y_tr), (X_val, y_val)],
        verbose=False
    )

    # Predictions on both sides of the fold
    train_preds = model_cv.predict(X_tr)
    val_preds = model_cv.predict(X_val)

    # RMSE on the fold's training and validation parts
    train_rmse = np.sqrt(mean_squared_error(y_tr, train_preds))
    val_rmse = np.sqrt(mean_squared_error(y_val, val_preds))

    cv_scores.append(train_rmse)
    val_scores.append(val_rmse)

    print(f"Split {i+1} - Train RMSE: {train_rmse:.5f}, Val RMSE: {val_rmse:.5f}, "
          f"Écart: {val_rmse - train_rmse:.5f}")

# Fold averages; the gap (validation minus train) is printed as the overfitting indicator
print(f"\n📊 RMSE moyen Train: {np.mean(cv_scores):.5f}")
print(f"📊 RMSE moyen Validation: {np.mean(val_scores):.5f}")
print(f"📊 Écart moyen (surapprentissage): {np.mean(val_scores) - np.mean(cv_scores):.5f}")

# 7. Final fit on the whole training window.
# No early_stopping_rounds is passed here, so every configured boosting round
# is trained; eval_set is only used to log the train/test RMSE per round.
final_model = XGBRegressor(**optimized_params)
final_model.fit(
    X_train_final, y_train,
    eval_set=[(X_train_final, y_train), (X_test_final, y_test)],
    verbose=True
)

# Report the actual number of boosted rounds (the original interpolated the
# estimator object itself, which printed its repr instead of a count).
print(f"\n🎯 Nombre d'arbres utilisés: {final_model.get_booster().num_boosted_rounds()}")

# 8. Prediction and reconstruction of the D+5 price
y_pred_delta = final_model.predict(X_test_final)

# Anchor every forecast on the close of its OWN row. Rows may have been
# removed from X / y (outlier filtering) while df still contains them, so the
# positional slice df["Close"].iloc[-len(y_test):] does not necessarily line
# up with y_test; selecting by index label always does.
close_today = df.loc[y_test.index, "Close"].values
y_pred_price = close_today * np.exp(y_pred_delta)
y_true_price = close_today * np.exp(y_test.values)

# 9. Detailed evaluation
# On reconstructed prices
mae_price = mean_absolute_error(y_true_price, y_pred_price)
rmse_price = np.sqrt(mean_squared_error(y_true_price, y_pred_price))
r2_price = r2_score(y_true_price, y_pred_price)

# On log-returns (the quantity the model is actually trained on)
mae_log = mean_absolute_error(y_test, y_pred_delta)
rmse_log = np.sqrt(mean_squared_error(y_test, y_pred_delta))
r2_log = r2_score(y_test, y_pred_delta)

# Same metrics on the training rows, to expose overfitting
y_pred_train_delta = final_model.predict(X_train_final)
mae_train_log = mean_absolute_error(y_train, y_pred_train_delta)
rmse_train_log = np.sqrt(mean_squared_error(y_train, y_pred_train_delta))
r2_train_log = r2_score(y_train, y_pred_train_delta)

print("\n" + "="*50)
print("📈 RÉSULTATS FINAUX")
print("="*50)
print(f"🏋️  TRAIN - RMSE: {rmse_train_log:.5f}, MAE: {mae_train_log:.5f}, R²: {r2_train_log:.5f}")
print(f"🧪 TEST  - RMSE: {rmse_log:.5f}, MAE: {mae_log:.5f}, R²: {r2_log:.5f}")
print(f"📊 ÉCART (surapprentissage): {rmse_log - rmse_train_log:.5f}")

print(f"\n💰 Sur les prix:")
print(f"📊 MAE  : ${mae_price:.2f}")
print(f"📉 RMSE : ${rmse_price:.2f}")
print(f"📈 R²   : {r2_price:.4f}")

# 10. Diagnostic figure: a 2x2 grid of test-set plots
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Top-left: reconstructed true vs predicted D+5 price, in test-row order
axes[0,0].plot(y_true_price, label="Prix Réel J+5", linewidth=2, alpha=0.8)
axes[0,0].plot(y_pred_price, label="Prévision J+5", linestyle='--', alpha=0.8)
axes[0,0].set_title("📈 Prédiction du prix AAPL à J+5")
axes[0,0].set_xlabel("Observation")
axes[0,0].set_ylabel("Prix ($)")
axes[0,0].legend()
axes[0,0].grid(True, alpha=0.3)

# Top-right: predicted vs true price scatter; the red dashed line is y = x
axes[0,1].scatter(y_true_price, y_pred_price, alpha=0.6, s=20)
axes[0,1].plot([y_true_price.min(), y_true_price.max()],
               [y_true_price.min(), y_true_price.max()], 'r--', lw=2)
axes[0,1].set_xlabel("Prix Réel ($)")
axes[0,1].set_ylabel("Prix Prédit ($)")
axes[0,1].set_title(f"📊 Corrélation Prix (R² = {r2_price:.3f})")
axes[0,1].grid(True, alpha=0.3)

# Bottom-left: true vs predicted log-returns (the model's native target)
axes[1,0].plot(y_test.values, label="Log-return Réel", linewidth=2, alpha=0.8)
axes[1,0].plot(y_pred_delta, label="Log-return Prédit", linestyle='--', alpha=0.8)
axes[1,0].set_title("📈 Prédiction des Log-returns")
axes[1,0].set_xlabel("Observation")
axes[1,0].set_ylabel("Log-return")
axes[1,0].legend()
axes[1,0].grid(True, alpha=0.3)

# Bottom-right: residuals (true minus predicted log-return) against predictions
residuals = y_test.values - y_pred_delta
axes[1,1].scatter(y_pred_delta, residuals, alpha=0.6, s=20)
axes[1,1].axhline(y=0, color='r', linestyle='--')
axes[1,1].set_xlabel("Prédictions")
axes[1,1].set_ylabel("Résidus")
axes[1,1].set_title("📊 Analyse des résidus")
axes[1,1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


# 12. Naive baseline: "the price in 5 rows equals today's close".
# The anchor closes are the very ones used to build y_true_price, so they are
# reused instead of being sliced out of df a second time.
close_today_test = close_today
y_naive_price = close_today_test
mae_naive = mean_absolute_error(y_true_price, y_naive_price)
rmse_naive = np.sqrt(mean_squared_error(y_true_price, y_naive_price))
r2_naive = r2_score(y_true_price, y_naive_price)

print("\n🧊 BASELINE NAÏVE (prix constant)")
print(f"📉 RMSE : ${rmse_naive:.2f}")
print(f"📊 MAE  : ${mae_naive:.2f}")
print(f"📈 R²   : {r2_naive:.4f}")

# 13. Generalisation metrics: relative error reduction versus the baseline
rmse_gain_pct = (rmse_naive - rmse_price) / rmse_naive * 100
mae_gain_pct = (mae_naive - mae_price) / mae_naive * 100

print("\n🎯 MÉTRIQUES DE GÉNÉRALISATION")
print(f"📊 Amélioration vs Baseline (RMSE): {rmse_gain_pct:.1f}%")
print(f"📊 Amélioration vs Baseline (MAE): {mae_gain_pct:.1f}%")

# Overfitting verdict from the test-minus-train RMSE gap on log-returns
overfit_gap = rmse_log - rmse_train_log
if overfit_gap < 0.001:
    print("✅ Pas de surapprentissage détecté")
elif overfit_gap < 0.005:
    print("⚠️  Léger surapprentissage")
else:
    print("❌ Surapprentissage détecté - ajuster les hyperparamètres")

In [2]:
!pip install ta
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ta.momentum import RSIIndicator
from ta.trend import MACD
from xgboost import XGBRegressor, plot_importance
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import RobustScaler  # Plus robuste que StandardScaler
from scipy.stats import zscore
from scipy.stats import median_abs_deviation
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

def detect_outliers_mad(data, threshold=3):
    """Flag outliers in 1-D data using the MAD-based modified z-score.

    The modified z-score is ``0.6745 * (x - median) / MAD``; a value is
    flagged when its absolute score exceeds ``threshold``.

    Args:
        data: 1-D array-like of numbers. NaN entries are ignored when the
            median and the MAD are estimated and are never flagged.
        threshold: Cut-off applied to the absolute modified z-score.

    Returns:
        Boolean ``numpy.ndarray`` with the same shape as ``data``; ``True``
        marks an outlier. When the MAD is zero or undefined (empty or all-NaN
        input, or more than half of the values identical) there is no usable
        robust scale, so nothing is flagged instead of dividing by zero.
    """
    values = np.asarray(data, dtype=float)
    mad = median_abs_deviation(values, nan_policy='omit')
    if not np.isfinite(mad) or mad == 0:
        return np.zeros(values.shape, dtype=bool)

    # nanmedian keeps the centre consistent with the NaN-omitting MAD above;
    # a plain np.median would turn every score into NaN as soon as one NaN
    # is present, silently disabling the detection.
    median = np.nanmedian(values)
    modified_z_scores = 0.6745 * (values - median) / mad
    return np.abs(modified_z_scores) > threshold

def winsorize_series(series, lower_percentile=5, upper_percentile=95):
    """Clip values to the given lower/upper percentiles of the data.

    Args:
        series: Array-like or pandas Series of numbers; may contain NaN.
        lower_percentile: Percentile (0-100) used as the lower clipping bound.
        upper_percentile: Percentile (0-100) used as the upper clipping bound.

    Returns:
        The result of ``np.clip`` on ``series`` with the two bounds; NaN
        entries stay NaN.
    """
    # NaN-aware percentiles: with plain np.percentile a single NaN (e.g. the
    # warm-up rows of a rolling or shifted feature) makes both bounds NaN, so
    # nothing is actually clipped.
    lower_bound, upper_bound = np.nanpercentile(
        series, [lower_percentile, upper_percentile]
    )
    return np.clip(series, lower_bound, upper_bound)

def robust_feature_engineering(df):
    """Add bounded / winsorised technical indicators to ``df``.

    Reads the ``Close``, ``Open``, ``High``, ``Low`` and ``Volume`` columns,
    writes the new feature columns into ``df`` itself and returns that same
    DataFrame.
    """
    close = df["Close"]
    open_ = df["Open"]

    # Basic price features, each passed through winsorize_series
    df["Momentum_5"] = winsorize_series(close - close.shift(5))
    df["Volatility_5"] = winsorize_series(close.rolling(5).std())
    df["Return1d"] = winsorize_series(close.pct_change())

    # Moving averages over 10, 50 and 200 rows
    for window in (10, 50, 200):
        df[f"MA_{window}"] = close.rolling(window).mean()

    # Short/medium average ratio, hard-limited to [0.5, 2.0]
    ma_ratio = df["MA_10"] / df["MA_50"]
    df["MA_ratio"] = np.clip(ma_ratio, 0.5, 2.0)

    # Relative distance to the 200-row average, limited to +/-50%
    long_trend = (close - df["MA_200"]) / df["MA_200"]
    df["Long_trend"] = np.clip(long_trend, -0.5, 0.5)

    # Position of the close inside the recent 10-row high/low range, bounded
    recent_high = df["High"].rolling(10).max()
    recent_low = df["Low"].rolling(10).min()
    df["Price_vs_High"] = np.clip(close / recent_high, 0.7, 1.0)
    df["Price_vs_Low"] = np.clip(close / recent_low, 1.0, 1.5)

    # Intraday move relative to the open, limited to +/-10%
    df["Trend_day"] = np.clip((close - open_) / open_, -0.1, 0.1)

    # RSI; undefined (NaN) values are replaced by the neutral level 50
    df["RSI"] = RSIIndicator(close).rsi().fillna(50)

    # MACD histogram; NaN replaced by 0 before winsorising
    df["MACD_diff"] = winsorize_series(MACD(close).macd_diff().fillna(0))

    # Volume relative to its 20-row rolling median, limited to [0.1, 5.0]
    volume_ma = df["Volume"].rolling(20).median()
    df["Volume_norm"] = np.clip(df["Volume"] / volume_ma, 0.1, 5.0)

    return df

# 1. Load the data and apply the robust feature engineering
df = pd.read_csv("aapl_data.csv").dropna()
df = robust_feature_engineering(df)

# 2. Target: 5-row-ahead log-return, limited to [-0.3, 0.3]
df["Target"] = np.log(df["Close"].shift(-5)) - np.log(df["Close"])
df["Target"] = np.clip(df["Target"], -0.3, 0.3)
df = df.dropna()

print(f"📊 Données après preprocessing: {len(df)} observations")

# 3. Feature selection
robust_features = ["Momentum_5", "Volatility_5", "Return1d", "MA_ratio",
                   "Price_vs_High", "Price_vs_Low", "Trend_day", "Long_trend",
                   "RSI", "MACD_diff", "Volume_norm"]

X = df[robust_features]
y = df["Target"]

# 4. Chronological split, done BEFORE any outlier filtering so that the
# medians / MADs below are never estimated on the test period.
split_date = int(len(X) * 0.8)
X_train, X_test = X.iloc[:split_date], X.iloc[split_date:]
y_train, y_test = y.iloc[:split_date], y.iloc[split_date:]

# 5. MAD-based outlier detection, fitted on and applied to training rows only.
# Test rows are kept untouched: removing "outlier" days from the evaluation
# set would require knowing the future and inflates the reported test scores.
print("\n🔍 Détection des outliers avec MAD:")
outlier_mask = np.zeros(len(y_train), dtype=bool)

for col in X_train.columns:
    col_outliers = detect_outliers_mad(X_train[col].values)
    outlier_mask |= col_outliers
    print(f"  {col}: {col_outliers.sum()} outliers détectés")

target_outliers = detect_outliers_mad(y_train.values)
outlier_mask |= target_outliers
print(f"  Target: {target_outliers.sum()} outliers détectés")

print(f"📊 Total outliers: {outlier_mask.sum()}/{len(y_train)} ({outlier_mask.sum()/len(y_train)*100:.1f}%)")

# Drop the flagged training rows; X_clean / y_clean describe everything that
# is used downstream (filtered train + full test).
X_train = X_train[~outlier_mask]
y_train = y_train[~outlier_mask]
X_clean = pd.concat([X_train, X_test])
y_clean = pd.concat([y_train, y_test])
print(f"📊 Données finales: {len(X_clean)} observations")

# 6. Robust scaling: statistics fitted on the training rows, re-applied to test
robust_scaler = RobustScaler()
X_train_scaled = pd.DataFrame(
    robust_scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index
)
X_test_scaled = pd.DataFrame(
    robust_scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index
)

# 7. Correlation analysis used for the final feature selection.
# corr_matrix holds ABSOLUTE correlations; upper_tri keeps only the strict
# upper triangle so every pair appears once (row = earlier-listed feature).
corr_matrix = X_train_scaled.corr().abs()
strict_upper = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper_tri = corr_matrix.where(strict_upper)

# Heatmap of the absolute correlations
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", square=True, linewidths=0.5)
plt.title("🔍 Corrélation entre les variables explicatives", fontsize=14)
plt.tight_layout()
plt.show()

print(f"\n🔍 Matrice de corrélation:")
print(corr_matrix.round(2))

# Pairs above 0.7 are listed; each entry is (column feature, row feature, |corr|)
corr_threshold = 0.7
highly_correlated_pairs = [
    (col, feat, upper_tri.loc[feat, col])
    for col in upper_tri.columns
    for feat in upper_tri.index[upper_tri[col] > corr_threshold]
]

# Strongest pairs first
highly_correlated_pairs.sort(key=lambda entry: entry[2], reverse=True)
print(f"\n🔍 Paires très corrélées (>{corr_threshold}):")
for col_feat, row_feat, pair_corr in highly_correlated_pairs[:5]:  # top 5 only
    print(f"  {col_feat} - {row_feat}: {pair_corr:.3f}")

# Conservative pruning: a feature is dropped only when its pair exceeds 0.8
# and more than 8 features would remain beforehand; at most 2 are dropped.
to_drop = []
n_features = len(X_train_scaled.columns)
for _, row_feat, pair_corr in highly_correlated_pairs:
    droppable = n_features - len(to_drop) > 8 and pair_corr > 0.8
    if droppable and row_feat not in to_drop:  # skip features already dropped
        to_drop.append(row_feat)
    if len(to_drop) >= 2:
        break

print(f"\n🚫 Variables supprimées pour forte corrélation (>{0.8}): {to_drop}")

if to_drop:
    X_train_final = X_train_scaled.drop(columns=to_drop)
    X_test_final = X_test_scaled.drop(columns=to_drop)
else:
    X_train_final = X_train_scaled
    X_test_final = X_test_scaled
print(f"🎯 Features finales ({len(X_train_final.columns)}): {list(X_train_final.columns)}")

# 8. Balanced hyperparameters (regularised, but loose enough for the model to learn)
ultra_robust_params = {
    "n_estimators": 300,        # Upper bound on boosting rounds
    "learning_rate": 0.05,      # Moderate learning rate
    "max_depth": 4,             # Controlled but sufficient tree complexity
    "subsample": 0.8,           # Row subsampling per tree
    "colsample_bytree": 0.8,    # Feature subsampling per tree
    "colsample_bylevel": 0.8,
    "reg_lambda": 5.0,          # Moderate L2 regularisation (previous value judged too strong)
    "reg_alpha": 2.0,           # Moderate L1 regularisation (previous value judged too strong)
    "min_child_weight": 3,      # Lowered to allow more splits
    "gamma": 0.05,              # Lowered to allow more splits
    "max_delta_step": 0,        # 0 = no limit
    "random_state": 42,
    "objective": "reg:squarederror",
    "eval_metric": ["rmse", "mae"]  # Both metrics are logged during training
}
# 9. Robust time-series cross-validation (TimeSeriesSplit folds; no bootstrapping is actually performed)
def robust_cross_validation(X, y, params, n_splits=5):
    """Run a time-ordered cross-validation and summarise the fold metrics.

    For every fold of a ``TimeSeriesSplit`` (validation size fixed to a tenth
    of ``X``) an ``XGBRegressor`` is fitted with early stopping (patience 30)
    monitored on the validation part, then scored on that same part.

    Args:
        X: Feature DataFrame, rows in chronological order.
        y: Target Series aligned with ``X``.
        params: Keyword arguments forwarded to ``XGBRegressor``.
        n_splits: Number of folds.

    Returns:
        Dict with the mean and standard deviation across folds of MAE, RMSE
        and R² (keys ``mae_mean``, ``mae_std``, ``rmse_mean``, ``rmse_std``,
        ``r2_mean``, ``r2_std``).
    """
    splitter = TimeSeriesSplit(n_splits=n_splits, test_size=len(X)//10)
    fold_mae, fold_rmse, fold_r2 = [], [], []

    print(f"🔁 Validation croisée avec {n_splits} splits:")

    for fold_no, (fit_rows, hold_rows) in enumerate(splitter.split(X), start=1):
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_hold, y_hold = X.iloc[hold_rows], y.iloc[hold_rows]

        print(f"  Split {fold_no}: Train={len(X_fit)}, Val={len(X_hold)}")

        regressor = XGBRegressor(**params, early_stopping_rounds=30)
        regressor.fit(X_fit, y_fit, eval_set=[(X_hold, y_hold)], verbose=False)
        y_hat = regressor.predict(X_hold)

        mae = mean_absolute_error(y_hold, y_hat)
        rmse = np.sqrt(mean_squared_error(y_hold, y_hat))
        r2 = r2_score(y_hold, y_hat)

        fold_mae.append(mae)
        fold_rmse.append(rmse)
        fold_r2.append(r2)

        print(f"    MAE: {mae:.5f}, RMSE: {rmse:.5f}, R²: {r2:.5f}")

    summary = {
        'mae_mean': np.mean(fold_mae),
        'mae_std': np.std(fold_mae),
        'rmse_mean': np.mean(fold_rmse),
        'rmse_std': np.std(fold_rmse),
        'r2_mean': np.mean(fold_r2),
        'r2_std': np.std(fold_r2)
    }
    return summary

print("\n🔁 Validation croisée robuste en cours...")
cv_results = robust_cross_validation(X_train_final, y_train, ultra_robust_params)

print(f"\n📊 RÉSULTATS VALIDATION CROISÉE:")
print(f"MAE  : {cv_results['mae_mean']:.5f} ± {cv_results['mae_std']:.5f}")
print(f"RMSE : {cv_results['rmse_mean']:.5f} ± {cv_results['rmse_std']:.5f}")
print(f"R²   : {cv_results['r2_mean']:.5f} ± {cv_results['r2_std']:.5f}")

# 10. Final model training with monitoring
print(f"\n🎯 Entraînement du modèle final avec {len(X_train_final.columns)} features")
print(f"📊 Taille train: {len(X_train_final)}, Taille test: {len(X_test_final)}")

# Sanity check of the target ranges before fitting
print(f"📊 Plage cible train: [{y_train.min():.4f}, {y_train.max():.4f}]")
print(f"📊 Plage cible test: [{y_test.min():.4f}, {y_test.max():.4f}]")

# Early stopping is driven by the LAST entry of eval_set. Using the test set
# there would pick the number of trees on the very data the model is later
# scored on. The most recent 15% of the training window is held out as the
# early-stopping set instead; the test set stays unseen until evaluation.
es_split = int(len(X_train_final) * 0.85)
X_fit, X_es = X_train_final.iloc[:es_split], X_train_final.iloc[es_split:]
y_fit, y_es = y_train.iloc[:es_split], y_train.iloc[es_split:]

final_model = XGBRegressor(**ultra_robust_params, early_stopping_rounds=50)
final_model.fit(
    X_fit, y_fit,
    eval_set=[(X_fit, y_fit), (X_es, y_es)],
    verbose=10  # log every 10 rounds for diagnosis
)

# best_iteration is a 0-based round index, hence the +1 for a tree count
print(f"\n🎯 Modèle entraîné avec {final_model.best_iteration + 1} arbres")
print(f"🎯 Meilleur score: {final_model.best_score:.5f}")

# 11. Final evaluation
y_pred_train = final_model.predict(X_train_final)
y_pred_test = final_model.predict(X_test_final)

# Metrics on log-returns
train_mae = mean_absolute_error(y_train, y_pred_train)
test_mae = mean_absolute_error(y_test, y_pred_test)
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)

# 12. Robustness check on reconstructed prices.
# Rows were removed from the modelling data (outlier filtering) while df still
# holds them, so positional slices of df["Close"] do not line up with
# y_train / y_test. Selecting by index label pairs every return with the
# close of its own row.
close_train = df.loc[y_train.index, "Close"].values
close_test = df.loc[y_test.index, "Close"].values

price_pred_train = close_train * np.exp(y_pred_train)
price_true_train = close_train * np.exp(y_train.values)
price_pred_test = close_test * np.exp(y_pred_test)
price_true_test = close_test * np.exp(y_test.values)

price_mae_train = mean_absolute_error(price_true_train, price_pred_train)
price_mae_test = mean_absolute_error(price_true_test, price_pred_test)

print("\n" + "="*60)
print("🛡️  ÉVALUATION DE ROBUSTESSE")
print("="*60)
print(f"📊 TRAIN - MAE: {train_mae:.5f}, RMSE: {train_rmse:.5f}, R²: {train_r2:.5f}")
print(f"🧪 TEST  - MAE: {test_mae:.5f}, RMSE: {test_rmse:.5f}, R²: {test_r2:.5f}")
print(f"📈 ÉCART MAE (robustesse): {abs(test_mae - train_mae):.5f}")
print(f"📈 ÉCART RMSE (robustesse): {abs(test_rmse - train_rmse):.5f}")

print(f"\n💰 PRIX - Train MAE: ${price_mae_train:.2f}, Test MAE: ${price_mae_test:.2f}")
print(f"💰 ÉCART PRIX: ${abs(price_mae_test - price_mae_train):.2f}")

# 13. Robustness indicators (1.0 = perfect):
#   robustness_score - one minus the relative train/test MAE gap
#   stability_score  - one minus the coefficient of variation of the CV MAE
robustness_score = 1 - abs(test_mae - train_mae) / train_mae
stability_score = 1 - cv_results['mae_std'] / cv_results['mae_mean']

print(f"\n🎯 SCORES DE ROBUSTESSE:")
print(f"Robustesse Train/Test: {robustness_score:.3f} (1.0 = parfait)")
print(f"Stabilité CV: {stability_score:.3f} (1.0 = parfait)")

if robustness_score > 0.95 and stability_score > 0.9:
    print("✅ MODÈLE TRÈS ROBUSTE")
elif robustness_score > 0.9 and stability_score > 0.8:
    print("✅ MODÈLE ROBUSTE")
elif robustness_score > 0.8 and stability_score > 0.7:
    print("⚠️  MODÈLE MOYENNEMENT ROBUSTE")
else:
    print("❌ MODÈLE PEU ROBUSTE - Révision nécessaire")

# 14. Robustness figure: a 2x3 grid of diagnostic plots
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# [0,0] Predicted vs true log-returns for train and test; red dashed line is y = x
axes[0,0].scatter(y_train, y_pred_train, alpha=0.5, s=20, label=f'Train (MAE={train_mae:.4f})')
axes[0,0].scatter(y_test, y_pred_test, alpha=0.5, s=20, label=f'Test (MAE={test_mae:.4f})')
axes[0,0].plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], 'r--', lw=2)
axes[0,0].set_xlabel("Valeurs Réelles")
axes[0,0].set_ylabel("Prédictions")
axes[0,0].set_title("🎯 Robustesse Train vs Test")
axes[0,0].legend()
axes[0,0].grid(True, alpha=0.3)

# [0,1] Test residuals (true minus predicted) against predictions, to spot patterns
residuals_test = y_test.values - y_pred_test
axes[0,1].scatter(y_pred_test, residuals_test, alpha=0.6, s=20)
axes[0,1].axhline(y=0, color='r', linestyle='--')
axes[0,1].set_xlabel("Prédictions")
axes[0,1].set_ylabel("Résidus")
axes[0,1].set_title("📊 Analyse des Résidus")
axes[0,1].grid(True, alpha=0.3)

# [0,2] Histogram (density) of the test residuals
axes[0,2].hist(residuals_test, bins=30, alpha=0.7, density=True)
axes[0,2].axvline(x=0, color='r', linestyle='--')
axes[0,2].set_xlabel("Résidus")
axes[0,2].set_ylabel("Densité")
axes[0,2].set_title("📈 Distribution des Erreurs")
axes[0,2].grid(True, alpha=0.3)

# [1,0] Reconstructed true vs predicted prices on the test rows
axes[1,0].plot(price_true_test, label="Prix Réel", linewidth=2, alpha=0.8)
axes[1,0].plot(price_pred_test, label="Prix Prédit", linestyle='--', alpha=0.8)
axes[1,0].set_title("💰 Prédiction des Prix")
axes[1,0].set_xlabel("Observation")
axes[1,0].set_ylabel("Prix ($)")
axes[1,0].legend()
axes[1,0].grid(True, alpha=0.3)

# [1,1] Feature importances reported by the fitted model (at most the 10 largest,
# sorted ascending so the most important bar is drawn at the top)
feature_importance = final_model.feature_importances_
feature_names = X_train_final.columns
sorted_idx = np.argsort(feature_importance)[-10:]
axes[1,1].barh(range(len(sorted_idx)), feature_importance[sorted_idx])
axes[1,1].set_yticks(range(len(sorted_idx)))
axes[1,1].set_yticklabels([feature_names[i] for i in sorted_idx])
axes[1,1].set_title("🎯 Importance des Features")
axes[1,1].grid(True, alpha=0.3)

# [1,2] Absolute test error in row order, with its mean as a reference line
error_abs = np.abs(residuals_test)
axes[1,2].plot(error_abs, alpha=0.7)
axes[1,2].axhline(y=np.mean(error_abs), color='r', linestyle='--',
                  label=f'MAE moyen: {np.mean(error_abs):.4f}')
axes[1,2].set_xlabel("Observation")
axes[1,2].set_ylabel("Erreur Absolue")
axes[1,2].set_title("📉 Stabilité Temporelle")
axes[1,2].legend()
axes[1,2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 15. Stress test: add Gaussian noise to the (scaled) test features and
# measure how much the test MAE degrades.
print("\n🧪 TEST DE STRESS - Ajout de bruit gaussien")
noise_levels = [0.01, 0.02, 0.05, 0.1]
stress_results = []

# Seeded generator so the stress figures are reproducible from run to run,
# in line with the fixed random_state used for the model itself.
rng = np.random.default_rng(42)

for noise in noise_levels:
    X_test_noisy = X_test_final + rng.normal(0, noise, X_test_final.shape)
    y_pred_noisy = final_model.predict(X_test_noisy)
    mae_noisy = mean_absolute_error(y_test, y_pred_noisy)
    stress_results.append(mae_noisy)
    print(f"Bruit {noise:.2f}: MAE = {mae_noisy:.5f} (dégradation: {((mae_noisy-test_mae)/test_mae*100):+.1f}%)")

print(f"\n🛡️  RÉSISTANCE AU BRUIT: Dégradation moyenne {np.mean([(s-test_mae)/test_mae*100 for s in stress_results]):.1f}%")
import os

# Create the 'model' directory if it does not exist yet
os.makedirs("model", exist_ok=True)

import joblib

# Persist the fitted model.
# NOTE(review): the file is written by joblib despite the ".xgb" extension,
# so it has to be read back with joblib.load.
joblib.dump(final_model, "model/final_model.xgb")

# Persist the fitted scaler so new data can be transformed the same way
joblib.dump(robust_scaler, "model/scaler.joblib")

Collecting ta
  Downloading ta-0.11.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: ta
  Building wheel for ta (setup.py) ... [?25ldone
[?25h  Created wheel for ta: filename=ta-0.11.0-py3-none-any.whl size=29482 sha256=5efb52531fcd13f4ddf1eba7f8f90beabb4ba5b774a8f64459ef88796d51a7ea
  Stored in directory: /home/pc/.cache/pip/wheels/5c/a1/5f/c6b85a7d9452057be4ce68a8e45d77ba34234a6d46581777c6
Successfully built ta
Installing collected packages: ta
Successfully installed ta-0.11.0


ModuleNotFoundError: No module named 'xgboost'