In [1]:
import pandas as pd
import numpy as np
import yfinance as yf

# --- 1. TÉLÉCHARGEMENT DES DONNÉES (XPH ETF, 2018-2022) ---

# Ticker corrigé pour l'ETF Pharma/Biotech
PHARMA_ETF_TICKER = "XPH"
START_DATE = "2018-01-01"
END_DATE = "2022-12-31"

# auto_adjust=False est conservé pour garantir la présence de 'Adj Close'
df = yf.download(PHARMA_ETF_TICKER, start=START_DATE, end=END_DATE, auto_adjust=False)


# --- 2. FONCTION D'INGÉNIERIE DES CARACTÉRISTIQUES (Corrigée) ---

def create_price_features(df: pd.DataFrame, target_shift: int = 15, delta: float = 0.003) -> tuple:
    """
    Calcule les caractéristiques financières (X) et la cible binaire (Y) pour
    un modèle de classification ML classique, basé sur une prévision à 15 jours.
    """
    df = df.copy()

    # --- 1. Génération de la Cible Y ---
    df['Future_Return'] = (df['Adj Close'].shift(-target_shift) / df['Adj Close']) - 1
    df['Target_Y'] = np.where(df['Future_Return'] > delta, 1,
                       np.where(df['Future_Return'] < -delta, 0, np.nan))


    # --- 2. Caractéristiques de Momentum ---
    for window in [1, 5, 20]:
        df[f'Return_{window}d'] = df['Adj Close'].pct_change(window)

    df['SMA_10d'] = df['Adj Close'].rolling(window=10).mean()
    df['SMA_50d'] = df['Adj Close'].rolling(window=50).mean()
    df['MA_Crossover_Ratio'] = df['SMA_10d'] / df['SMA_50d']


    # --- 3. Caractéristiques de Volatilité ---

    df['Vol_20d'] = df['Return_1d'].rolling(window=20).std()

    # Calcul du True Range (TR) en utilisant np.maximum.reduce pour une meilleure robustesse
    high_minus_low = df['High'] - df['Low']
    high_minus_prev_close_abs = (df['High'] - df['Adj Close'].shift(1)).abs()
    low_minus_prev_close_abs = (df['Low'] - df['Adj Close'].shift(1)).abs()

    df['True_Range'] = np.maximum.reduce([high_minus_low,
                                          high_minus_prev_close_abs,
                                          low_minus_prev_close_abs])

    df['ATR_14d'] = df['True_Range'].rolling(window=14).mean()


    # --- 4. Caractéristiques de Volume ---
    df['Vol_Avg_5d'] = df['Volume'].rolling(window=5).mean()
    df['Vol_Delta'] = df['Volume'].pct_change(1)

    # --- 5. Nettoyage Final & Isolation X/Y ---
    df = df.dropna()

    # X (Features) : toutes les colonnes sauf les colonnes de prix brutes et les cibles intermédiaires
    X = df.drop(columns=['Future_Return', 'Target_Y', 'True_Range', 'Adj Close', 'High', 'Low', 'Open', 'Close'])
    Y = df['Target_Y']

    return X, Y


# --- 3. EXÉCUTION ET VALIDATION ---

X_features, Y_target = create_price_features(df)

print("Features (X) Head:")
print(X_features.head())
print("\nTarget (Y) Value Counts (Vérification du déséquilibre des classes) :")
print(Y_target.value_counts(normalize=True))

[*********************100%***********************]  1 of 1 completed

Features (X) Head:
Price       Volume Return_1d Return_5d Return_20d    SMA_10d    SMA_50d  \
Ticker         XPH                                                        
Date                                                                      
2018-03-14   23300 -0.001373  0.022488   0.045509  39.619617  40.460964   
2018-03-15   28400 -0.010997  0.000000   0.011718  39.794781  40.444370   
2018-03-16  147800  0.007161 -0.006191   0.005065  39.882284  40.427390   
2018-03-19   29700 -0.014956 -0.022842  -0.015655  39.899694  40.397683   
2018-03-20   58200 -0.011913 -0.031832  -0.014471  39.882052  40.360044   

Price      MA_Crossover_Ratio   Vol_20d   ATR_14d Vol_Avg_5d Vol_Delta  
Ticker                                                                  
Date                                                                    
2018-03-14           0.979206  0.014391  3.761681    47020.0 -0.776392  
2018-03-15           0.983939  0.013897  3.728631    48620.0  0.218884  
2018-03-16     


