Stage 9: Feature Engineering - Project Implementation. Portfolio Risk Management System. This script implements feature engineering for the project based on EDA insights. It creates meaningful features to improve model performance.

In [None]:
# Standard-library modules used for path setup.
import sys
import os

# Extend the import search path with '../src' before the local ``utils``
# import below (presumably where ``utils`` lives -- confirm against the repo).
sys.path.append('../src')

# Data handling, plotting and statistics libraries, plus the project's utils.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import utils
import warnings

# Suppress all warnings for the rest of the run.
warnings.filterwarnings('ignore')

print("🔧 Stage 9: Feature Engineering - Portfolio Risk Management")

In [None]:
def _add_base_features(symbol_frame):
    """Attach return, range, moving-average and volatility columns to one symbol's rows.

    ``symbol_frame`` is modified in place and returned. Columns are added in a
    fixed order: returns, price range, SMA / volume-MA pairs, then volatilities.
    """
    close = symbol_frame['close']

    # Base features
    symbol_frame['daily_return'] = close.pct_change()
    symbol_frame['log_return'] = np.log(close / close.shift(1))
    symbol_frame['price_range'] = (symbol_frame['high'] - symbol_frame['low']) / close

    # Moving averages for feature engineering
    for span in (5, 10, 20, 50):
        symbol_frame[f'sma_{span}'] = close.rolling(span).mean()
        symbol_frame[f'volume_ma_{span}'] = symbol_frame['volume'].rolling(span).mean()

    # Volatility measures: rolling standard deviation of the daily return
    for span in (5, 10, 20):
        symbol_frame[f'volatility_{span}'] = symbol_frame['daily_return'].rolling(span).std()

    return symbol_frame


def load_project_data():
    """Fetch raw price data and build the base dataset for feature engineering.

    Calls ``utils.fetch_multiple_stocks`` for a fixed list of five symbols,
    computes the base features per symbol (in date order), concatenates the
    per-symbol frames and drops every row containing a missing value.

    Returns:
        The prepared DataFrame, or ``None`` when the fetch returned an empty
        frame.
    """
    symbols = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'TSLA']
    print(f"Loading data for feature engineering: {symbols}")

    raw_data = utils.fetch_multiple_stocks(symbols, prefer_alphavantage=False, period='2y')

    if raw_data.empty:
        print("❌ Failed to load data")
        return None

    # Basic preprocessing: one date-sorted, feature-enriched frame per symbol.
    per_symbol_frames = [
        _add_base_features(
            raw_data[raw_data['symbol'] == ticker].copy().sort_values('date')
        )
        for ticker in symbols
    ]

    # Rolling windows leave NaNs at the start of every symbol; drop those rows.
    df = pd.concat(per_symbol_frames, ignore_index=True).dropna()

    print(f"✅ Base data prepared: {df.shape}")
    return df

In [None]:
def create_momentum_features(df):
    """Add momentum indicators to every symbol's rows.

    For each symbol (rows sorted by date) this adds price / return momentum
    over 5, 10 and 20 rows, RSI over 14 and 30 rows, MACD with its signal line
    and histogram, and the relative distance of the close from the 20- and
    50-row simple moving averages.

    Args:
        df: Frame with at least ``symbol``, ``date``, ``close``,
            ``daily_return``, ``sma_20`` and ``sma_50`` columns.

    Returns:
        A new DataFrame (index reset) holding the input columns plus the
        momentum columns.
    """
    print("\n📈 Creating Momentum Features")

    per_symbol = []
    for ticker in df['symbol'].unique():
        rows = df[df['symbol'] == ticker].copy().sort_values('date')
        close = rows['close']

        # Price momentum features
        for horizon in (5, 10, 20):
            rows[f'price_momentum_{horizon}'] = close.pct_change(horizon)
            rows[f'return_momentum_{horizon}'] = rows['daily_return'].rolling(horizon).mean()

        # Relative strength features
        for rsi_window in (14, 30):
            rows[f'rsi_{rsi_window}'] = calculate_rsi(close, rsi_window)

        # MACD features
        macd_line, signal_line = calculate_macd(close)
        rows['macd'] = macd_line
        rows['macd_signal'] = signal_line
        rows['macd_histogram'] = rows['macd'] - rows['macd_signal']

        # Price position features: distance from the moving average, as a fraction of it
        for sma_window in (20, 50):
            sma = rows[f'sma_{sma_window}']
            rows[f'price_vs_sma{sma_window}'] = (close - sma) / sma

        per_symbol.append(rows)

    df_momentum = pd.concat(per_symbol, ignore_index=True)

    # Report every column whose name marks it as a momentum feature.
    markers = ['momentum', 'rsi', 'macd', 'price_vs']
    momentum_cols = [
        name for name in df_momentum.columns
        if any(marker in name for marker in markers)
    ]

    print(f"Created {len(momentum_cols)} momentum features:")
    for name in momentum_cols:
        print(f"  - {name}")

    return df_momentum

In [None]:
def create_volatility_features(df):
    """Add volatility and risk indicators to every symbol's rows.

    For each symbol (rows sorted by date) this adds short/long volatility
    ratios, volatility-adjusted returns, Bollinger-style bands built from the
    20-row SMA and return volatility, and the average true range with its
    ratio to the daily price range.

    Args:
        df: Frame with at least ``symbol``, ``date``, ``high``, ``low``,
            ``close``, ``daily_return``, ``price_range``, ``sma_20`` and
            ``volatility_5`` / ``volatility_10`` / ``volatility_20`` columns.

    Returns:
        A new DataFrame (index reset) holding the input columns plus the
        volatility columns.
    """
    print("\n📊 Creating Volatility Features")

    per_symbol = []
    for ticker in df['symbol'].unique():
        rows = df[df['symbol'] == ticker].copy().sort_values('date')
        long_vol = rows['volatility_20']

        # Volatility ratios
        rows['vol_ratio_5_20'] = rows['volatility_5'] / long_vol
        rows['vol_ratio_10_20'] = rows['volatility_10'] / long_vol

        # Volatility-adjusted returns (Sharpe-like)
        rows['vol_adj_return_5'] = rows['daily_return'] / rows['volatility_5']
        rows['vol_adj_return_20'] = rows['daily_return'] / long_vol

        # Bollinger Band features: the band half-width scales return volatility
        # by the moving average to express it in price units.
        half_width = 2 * long_vol * rows['sma_20']
        rows['bb_upper'] = rows['sma_20'] + half_width
        rows['bb_lower'] = rows['sma_20'] - half_width
        band_span = rows['bb_upper'] - rows['bb_lower']
        rows['bb_position'] = (rows['close'] - rows['bb_lower']) / band_span

        # Average True Range (ATR)
        rows['atr'] = calculate_atr(rows)
        rows['atr_ratio'] = rows['price_range'] / rows['atr']

        per_symbol.append(rows)

    df_volatility = pd.concat(per_symbol, ignore_index=True)

    # Report every column whose name marks it as a volatility feature.
    markers = ['vol_', 'bb_', 'atr']
    volatility_cols = [
        name for name in df_volatility.columns
        if any(marker in name for marker in markers)
    ]

    print(f"Created {len(volatility_cols)} volatility features:")
    for name in volatility_cols:
        print(f"  - {name}")

    return df_volatility

In [None]:
def create_volume_features(df):
    """Add volume-based indicators to every symbol's rows.

    For each symbol (rows sorted by date) this adds volume-to-moving-average
    ratios, volume momentum, two price/volume interaction features, and
    on-balance volume with its 10-row mean and the gap between the two.

    Args:
        df: Frame with at least ``symbol``, ``date``, ``close``, ``volume``,
            ``daily_return`` and ``volume_ma_5`` / ``volume_ma_10`` /
            ``volume_ma_20`` columns.

    Returns:
        A new DataFrame (index reset) holding the input columns plus the
        volume columns.
    """
    print("\n📦 Creating Volume Features")

    per_symbol = []
    for ticker in df['symbol'].unique():
        rows = df[df['symbol'] == ticker].copy().sort_values('date')
        traded = rows['volume']

        # Volume ratios
        for span in (5, 10, 20):
            rows[f'volume_ratio_{span}'] = traded / rows[f'volume_ma_{span}']

        # Volume momentum
        for horizon in (5, 10):
            rows[f'volume_momentum_{horizon}'] = traded.pct_change(horizon)

        # Price-Volume features
        relative_volume = rows['volume_ratio_20']
        rows['pv_trend'] = rows['daily_return'] * relative_volume
        rows['volume_price_correlation'] = (
            rows['daily_return'].rolling(20).corr(relative_volume)
        )

        # On-Balance Volume (OBV)
        rows['obv'] = calculate_obv(rows)
        rows['obv_ma_10'] = rows['obv'].rolling(10).mean()
        rows['obv_signal'] = rows['obv'] - rows['obv_ma_10']

        per_symbol.append(rows)

    df_volume = pd.concat(per_symbol, ignore_index=True)

    # The 'volume_' marker also matches the pre-existing volume_ma_* columns,
    # so the reported count includes them.
    markers = ['volume_', 'pv_', 'obv']
    volume_cols = [
        name for name in df_volume.columns
        if any(marker in name for marker in markers)
    ]

    print(f"Created {len(volume_cols)} volume features:")
    for name in volume_cols:
        print(f"  - {name}")

    return df_volume

In [None]:
def create_cross_asset_features(df):
    """Add market-wide and relative-to-market features.

    The "market" is the set of symbols present in ``df``: for every date the
    mean / std / min / max of ``daily_return``, the summed ``volume`` and the
    mean ``volatility_20`` are computed (rounded to 6 decimals) and merged
    back onto each row. Per symbol, the function then derives:

    * ``relative_return`` / ``relative_volatility``: symbol minus market.
    * ``beta_20``: 20-row rolling beta, i.e. rolling covariance of the
      symbol's return with the market return divided by the rolling variance
      of the market return. Windows where the market variance is zero
      yield NaN rather than +/-inf.
    * ``market_stress``: 1 when the market return std exceeds its own
      50-row rolling 80th percentile, else 0.
    * ``market_direction``: 1 when the market mean return is positive, else 0.

    Args:
        df: Frame with at least ``symbol``, ``date``, ``daily_return``,
            ``volume`` and ``volatility_20`` columns.

    Returns:
        A new DataFrame (index reset) holding the input columns plus the
        market-wide and relative columns.
    """
    print("\n🌐 Creating Cross-Asset Features")

    # Market-wide features. Named aggregation ties each output column to its
    # source explicitly instead of relying on positional renaming.
    daily_market = (
        df.groupby('date')
        .agg(
            market_return_mean=('daily_return', 'mean'),
            market_return_std=('daily_return', 'std'),
            market_return_min=('daily_return', 'min'),
            market_return_max=('daily_return', 'max'),
            total_volume=('volume', 'sum'),
            market_volatility=('volatility_20', 'mean'),
        )
        .round(6)
        .reset_index()
    )

    # Merge back to main dataset
    df_cross = df.merge(daily_market, on='date', how='left')

    # Relative performance features
    cross_features = []
    for symbol in df_cross['symbol'].unique():
        # Sort by date so the rolling windows below are chronological, the
        # same way the other feature builders in this file prepare their rows.
        symbol_data = df_cross[df_cross['symbol'] == symbol].copy().sort_values('date')
        market_return = symbol_data['market_return_mean']

        # Relative to market
        symbol_data['relative_return'] = symbol_data['daily_return'] - market_return
        symbol_data['relative_volatility'] = (
            symbol_data['volatility_20'] - symbol_data['market_volatility']
        )

        # Beta = Cov(asset, market) / Var(market). A plain correlation would
        # ignore the relative scale of the asset's moves.
        rolling_cov = symbol_data['daily_return'].rolling(20).cov(market_return)
        rolling_var = market_return.rolling(20).var().replace(0, np.nan)
        symbol_data['beta_20'] = rolling_cov / rolling_var

        # Market regime features
        stress_threshold = symbol_data['market_return_std'].rolling(50).quantile(0.8)
        symbol_data['market_stress'] = (
            symbol_data['market_return_std'] > stress_threshold
        ).astype(int)
        symbol_data['market_direction'] = (market_return > 0).astype(int)

        cross_features.append(symbol_data)

    df_cross_final = pd.concat(cross_features, ignore_index=True)

    cross_cols = [col for col in df_cross_final.columns if any(x in col for x in
                  ['market_', 'relative_', 'beta_'])]

    print(f"Created {len(cross_cols)} cross-asset features:")
    for col in cross_cols:
        print(f"  - {col}")

    return df_cross_final

In [None]:
def create_lag_features(df):
    """Add lagged and rolling-window copies of key variables, per symbol.

    For each symbol (rows sorted by date):

    * ``daily_return``, ``volatility_20``, ``volume_ratio_20`` and ``rsi_14``
      are shifted by 1, 2, 3 and 5 rows into ``<var>_lag_<n>`` columns.
    * ``daily_return`` and ``volatility_20`` get rolling mean / std over 3, 7
      and 14 rows in ``<var>_roll_mean_<w>`` / ``<var>_roll_std_<w>`` columns.

    Variables absent from ``df`` are silently skipped.

    Args:
        df: Frame with at least ``symbol`` and ``date`` columns.

    Returns:
        A new DataFrame (index reset) holding the input columns plus the lag
        and rolling columns.
    """
    print("\n⏰ Creating Lag Features")

    # Key variables to lag, and the subset that also gets rolling statistics.
    lagged_variables = ('daily_return', 'volatility_20', 'volume_ratio_20', 'rsi_14')
    rolled_variables = ('daily_return', 'volatility_20')

    per_symbol = []
    for ticker in df['symbol'].unique():
        rows = df[df['symbol'] == ticker].copy().sort_values('date')

        for name in lagged_variables:
            if name not in rows.columns:
                continue
            for steps in (1, 2, 3, 5):
                rows[f'{name}_lag_{steps}'] = rows[name].shift(steps)

        # Rolling features
        for name in rolled_variables:
            if name not in rows.columns:
                continue
            for span in (3, 7, 14):
                windowed = rows[name].rolling(span)
                rows[f'{name}_roll_mean_{span}'] = windowed.mean()
                rows[f'{name}_roll_std_{span}'] = windowed.std()

        per_symbol.append(rows)

    df_lag = pd.concat(per_symbol, ignore_index=True)

    lag_cols = [
        name for name in df_lag.columns
        if '_lag_' in name or '_roll_' in name
    ]

    print(f"Created {len(lag_cols)} lag features:")
    # Show only the first 10 names to keep the log short.
    for name in lag_cols[:10]:
        print(f"  - {name}")
    hidden = len(lag_cols) - 10
    if hidden > 0:
        print(f"  ... and {hidden} more")

    return df_lag

## Helper functions

In [None]:
def calculate_rsi(prices, window=14):
    """Compute the Relative Strength Index from a price series.

    Gains and losses are averaged with a simple rolling mean over ``window``
    rows (not Wilder's exponential smoothing).

    Args:
        prices: pandas Series of prices.
        window: Number of rows in the averaging window.

    Returns:
        Series of RSI values aligned with ``prices``; the first ``window - 1``
        entries are NaN.
    """
    change = prices.diff()

    # Split the changes into their upward and (sign-flipped) downward parts;
    # everything that does not match, including the leading NaN, becomes 0.
    upward = change.where(change > 0, 0)
    downward = -change.where(change < 0, 0)

    average_gain = upward.rolling(window=window).mean()
    average_loss = downward.rolling(window=window).mean()

    relative_strength = average_gain / average_loss
    return 100 - (100 / (1 + relative_strength))

In [None]:
def calculate_macd(prices, fast=12, slow=26, signal=9):
    """Compute the MACD line and its signal line.

    Args:
        prices: pandas Series of prices.
        fast: Span of the fast exponential moving average.
        slow: Span of the slow exponential moving average.
        signal: Span of the exponential moving average applied to the MACD line.

    Returns:
        Tuple ``(macd, macd_signal)`` of Series aligned with ``prices``.
    """
    def _ema(series, span):
        # pandas' default exponentially weighted mean for the given span
        return series.ewm(span=span).mean()

    macd_line = _ema(prices, fast) - _ema(prices, slow)
    signal_line = _ema(macd_line, signal)
    return macd_line, signal_line

In [None]:
def calculate_atr(df, window=14):
    """Compute the Average True Range.

    The true range of a row is the largest of: high minus low, the absolute
    gap between the high and the previous close, and the absolute gap between
    the low and the previous close. The first row has no previous close, so
    its true range is NaN.

    Args:
        df: Frame with ``high``, ``low`` and ``close`` columns.
        window: Number of rows in the simple rolling mean.

    Returns:
        Series of ATR values aligned with ``df``.
    """
    previous_close = df['close'].shift()

    intraday_span = df['high'] - df['low']
    gap_from_high = np.abs(df['high'] - previous_close)
    gap_from_low = np.abs(df['low'] - previous_close)

    # np.maximum propagates NaN, which keeps the first row undefined.
    widest_gap = np.maximum(gap_from_high, gap_from_low)
    true_range = np.maximum(intraday_span, widest_gap)

    return true_range.rolling(window=window).mean()

In [None]:
def calculate_obv(df):
    """Compute On-Balance Volume.

    Each row contributes ``+volume`` when the close is above the previous
    close, ``-volume`` when it is below, and 0 otherwise (including the first
    row, which has no previous close). The contributions are cumulatively
    summed.

    Args:
        df: Frame with ``close`` and ``volume`` columns.

    Returns:
        Series of cumulative on-balance volume sharing ``df``'s index.
    """
    previous_close = df['close'].shift()
    volume = df['volume'].to_numpy()

    closed_higher = (df['close'] > previous_close).to_numpy()
    closed_lower = (df['close'] < previous_close).to_numpy()

    signed_volume = np.select(
        [closed_higher, closed_lower],
        [volume, -volume],
        default=0,
    )
    return pd.Series(signed_volume, index=df.index).cumsum()

In [None]:
def feature_selection_analysis(df):
    """Rank engineered features by their correlation with the next-row return.

    The target is each symbol's ``daily_return`` shifted back by one row
    (i.e. the following row's return); rows without a target are dropped.
    Every numeric, non-boolean column outside the base OHLCV columns is
    correlated with that target, and features with an undefined (NaN)
    correlation are omitted. The top 20 are printed, and a heatmap of the
    pairwise correlations among the top 15 is shown when there is at least
    one ranked feature.

    Args:
        df: Frame with at least ``symbol`` and ``daily_return`` columns.

    Returns:
        DataFrame with columns ``feature``, ``correlation`` (absolute value)
        and ``correlation_raw``, sorted by ``correlation`` descending. The
        frame is empty, but still has those columns, when no feature yields
        a finite correlation.
    """
    print("\n🎯 Feature Selection Analysis")

    # Get all engineered features
    base_cols = ['symbol', 'date', 'open', 'high', 'low', 'close', 'volume']
    feature_cols = [col for col in df.columns if col not in base_cols]

    print(f"Total engineered features: {len(feature_cols)}")

    # Create target variable
    df_analysis = df.copy()
    df_analysis['target_return'] = df_analysis.groupby('symbol')['daily_return'].shift(-1)

    # Remove rows with missing target
    df_analysis = df_analysis.dropna(subset=['target_return'])

    # Calculate correlations with target
    target = df_analysis['target_return']
    feature_correlations = []
    for col in feature_cols:
        values = df_analysis[col]
        # Accept every numeric width (int32, float32, ...), not only the
        # 64-bit dtypes; boolean columns stay excluded.
        if not pd.api.types.is_numeric_dtype(values) or pd.api.types.is_bool_dtype(values):
            continue
        corr = values.corr(target)
        if not np.isnan(corr):
            feature_correlations.append({
                'feature': col,
                'correlation': abs(corr),
                'correlation_raw': corr
            })

    # Sort by absolute correlation. Naming the columns up front keeps
    # sort_values from raising KeyError when nothing was collected.
    corr_df = pd.DataFrame(
        feature_correlations,
        columns=['feature', 'correlation', 'correlation_raw'],
    ).sort_values('correlation', ascending=False)

    print("\nTop 20 Features by Target Correlation:")
    print(corr_df.head(20).round(4))

    # Feature correlation matrix (top features)
    top_features = corr_df.head(15)['feature'].tolist()
    if top_features:
        feature_corr_matrix = df_analysis[top_features].corr()

        plt.figure(figsize=(12, 10))
        sns.heatmap(feature_corr_matrix, annot=True, cmap='coolwarm', center=0,
                    square=True, fmt='.2f', cbar_kws={'label': 'Correlation'})
        plt.title('Top Features Correlation Matrix')
        plt.tight_layout()
        plt.show()

    return corr_df

In [None]:
def validate_features(df):
    """Report missing values, infinite values and zero-variance columns.

    Prints a short summary of each check.

    Args:
        df: Frame to inspect.

    Returns:
        Tuple ``(missing_features, inf_features)``: two Series mapping column
        name to the count of missing / infinite values, restricted to columns
        where that count is positive. Infinite values are only counted in
        numeric columns.
    """
    print("\n✅ Feature Validation")

    # Check for missing values
    null_counts = df.isnull().sum()
    missing_features = null_counts[null_counts > 0]

    if len(missing_features) == 0:
        print("✅ No missing values in engineered features")
    else:
        print(f"Features with missing values: {len(missing_features)}")
        print(missing_features.head(10))

    # Check for infinite values
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    numeric_frame = df[numeric_cols]
    inf_counts = numeric_frame.apply(lambda column: np.isinf(column).sum())
    inf_features = inf_counts[inf_counts > 0]

    if len(inf_features) == 0:
        print("✅ No infinite values in engineered features")
    else:
        print(f"Features with infinite values: {len(inf_features)}")
        print(inf_features.head(10))

    # Feature statistics
    feature_stats = numeric_frame.describe()
    zero_variance_count = (feature_stats.loc['std'] == 0).sum()
    print(f"\nFeature statistics summary:")
    print(f"Total numeric features: {len(numeric_cols)}")
    print(f"Features with zero variance: {zero_variance_count}")

    return missing_features, inf_features

In [None]:
def main():
    """Run the full Stage 9 feature-engineering pipeline.

    Loads the base dataset, applies each feature builder in sequence, drops
    sparse rows, runs the correlation analysis and validation, saves the
    engineered dataset and the correlation table through
    ``utils.save_with_timestamp``, and prints a summary. Returns early,
    without output files, when the base data could not be loaded.
    """
    # Load base data
    df = load_project_data()
    if df is None:
        return

    # Create feature sets; each builder receives the previous builder's output.
    print("\n🔧 Creating Feature Sets...")
    feature_builders = (
        create_momentum_features,
        create_volatility_features,
        create_volume_features,
        create_cross_asset_features,
        create_lag_features,
    )
    for build in feature_builders:
        df = build(df)

    # Remove rows with too many missing values:
    # keep rows with at least 70% non-null values.
    minimum_non_null = len(df.columns) * 0.7
    df = df.dropna(thresh=minimum_non_null)

    print(f"\n📊 Final dataset shape: {df.shape}")

    # Feature analysis
    corr_df = feature_selection_analysis(df)
    missing_features, inf_features = validate_features(df)

    # Save engineered dataset
    output_path = utils.save_with_timestamp(
        df=df,
        prefix="engineered_features_dataset",
        source="project_stage9",
        ext="csv"
    )

    # Save feature correlation analysis
    corr_output_path = utils.save_with_timestamp(
        df=corr_df,
        prefix="feature_correlations",
        source="project_stage9",
        ext="csv"
    )

    print(f"\n💾 Engineered dataset saved to: {output_path}")
    print(f"💾 Feature correlations saved to: {corr_output_path}")

    # Feature summary
    base_cols = ['symbol', 'date', 'open', 'high', 'low', 'close', 'volume']
    engineered_cols = [name for name in df.columns if name not in base_cols]

    summary_lines = (
        "\n✅ Stage 9: Feature Engineering Complete",
        "Key deliverables:",
        f"- {len(engineered_cols)} engineered features created",
        "- Momentum, volatility, volume, and cross-asset features",
        "- Lag features for time series modeling",
        "- Feature correlation analysis and validation",
        "- Ready for modeling pipeline integration",
    )
    for line in summary_lines:
        print(line)


if __name__ == "__main__":
    main()