In [2]:
import pandas as pd
import numpy as np
import yfinance as yf

# --- 1. DOWNLOAD DATA (JETS ETF, 2018-2022) ---

PHARMA_ETF_TICKER = "JETS"
START_DATE = "2018-01-01"
END_DATE = "2022-12-31"

df = yf.download(PHARMA_ETF_TICKER, start=START_DATE, end=END_DATE, auto_adjust=False)


# --- 2. FEATURE ENGINEERING FUNCTION (Corrected) ---

def create_price_features(df: pd.DataFrame, target_shift: int = 15, delta: float = 0.003) -> tuple:
    df = df.copy()

    # --- 1. Target Y Generation ---
    df['Future_Return'] = (df['Adj Close'].shift(-target_shift) / df['Adj Close']) - 1
    df['Target_Y'] = np.where(df['Future_Return'] > delta, 1,
                       np.where(df['Future_Return'] < -delta, 0, np.nan))


    # --- 2. Momentum Features ---
    for window in [1, 5, 20]:
        df[f'Return_{window}d'] = df['Adj Close'].pct_change(window)

    df['SMA_10d'] = df['Adj Close'].rolling(window=10).mean()
    df['SMA_50d'] = df['Adj Close'].rolling(window=50).mean()
    df['MA_Crossover_Ratio'] = df['SMA_10d'] / df['SMA_50d']


    # --- 3. Volatility Features ---

    df['Vol_20d'] = df['Return_1d'].rolling(window=20).std()

    # Average True Range (ATR) - Correction simple et robuste
    # Calcul des trois composantes du True Range
    high_minus_low = df['High'] - df['Low']
    high_minus_prev_close_abs = (df['High'] - df['Adj Close'].shift(1)).abs()
    low_minus_prev_close_abs = (df['Low'] - df['Adj Close'].shift(1)).abs()

    # Utiliser np.maximum.accumulate pour trouver le maximum entre les trois Séries
    # C'est une méthode plus pythonique pour comparer plusieurs Séries sans passer par un DataFrame temporaire
    df['True_Range'] = np.maximum.reduce([high_minus_low,
                                          high_minus_prev_close_abs,
                                          low_minus_prev_close_abs])

    df['ATR_14d'] = df['True_Range'].rolling(window=14).mean()


    # --- 4. Volume Features ---
    df['Vol_Avg_5d'] = df['Volume'].rolling(window=5).mean()
    df['Vol_Delta'] = df['Volume'].pct_change(1)

    # --- 5. Final Cleaning & Feature/Target Isolation ---
    df = df.dropna()

    X = df.drop(columns=['Future_Return', 'Target_Y', 'True_Range', 'Adj Close', 'High', 'Low', 'Open', 'Close'])
    Y = df['Target_Y']

    return X, Y


# --- 3. EXECUTION AND VALIDATION ---

X_features, Y_target = create_price_features(df)

print("Features (X) Head:")
print(X_features.head())
print("\nTarget (Y) Value Counts (Check for class imbalance):")
print(Y_target.value_counts(normalize=True))

[*********************100%***********************]  1 of 1 completed

Features (X) Head:
Price      Volume Return_1d Return_5d Return_20d    SMA_10d    SMA_50d  \
Ticker       JETS                                                        
Date                                                                     
2018-03-14  27200 -0.013601  0.025200   0.064115  32.010972  31.811746   
2018-03-15   6700 -0.002098  0.012162   0.051817  32.125997  31.816425   
2018-03-16  21800  0.010814  0.006581   0.050250  32.275140  31.835901   
2018-03-19  32000 -0.007429 -0.004471   0.043750  32.390164  31.849549   
2018-03-20   8000  0.004191 -0.008279   0.054717  32.506164  31.865340   

Price      MA_Crossover_Ratio   Vol_20d   ATR_14d Vol_Avg_5d Vol_Delta  
Ticker                                                                  
Date                                                                    
2018-03-14           1.006263  0.010603  1.162553    98200.0 -0.238095  
2018-03-15           1.009730  0.010553  1.147091    93940.0 -0.753676  
2018-03-16           1.


