# Advanced Feature Engineering for Multimodal Sentiment Analysis

This notebook performs advanced feature engineering on the preprocessed multimodal data, including feature selection, transformation, and creation of composite features for improved sentiment classification.

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

# Load preprocessed data
try:
    multimodal_features = pd.read_csv('../data/multimodal_features.csv')
    sentiment_labels_df = pd.read_csv('../data/sentiment_labels.csv')
    
    # Extract sentiment column (first column if no header)
    if 'sentiment' in sentiment_labels_df.columns:
        sentiment_labels = sentiment_labels_df['sentiment']
    else:
        sentiment_labels = sentiment_labels_df.iloc[:, 0]
    
    print("✅ Preprocessed data loaded successfully")
    print(f"Features shape: {multimodal_features.shape}")
    print(f"Labels shape: {sentiment_labels.shape}")
    print(f"Label distribution: {sentiment_labels.value_counts().sort_index()}")
    
except FileNotFoundError:
    print("❌ Preprocessed data not found. Please run 01_preprocessing.ipynb first")
    multimodal_features = pd.DataFrame()
    sentiment_labels = pd.Series()
except Exception as e:
    print(f"❌ Error loading data: {str(e)}")
    multimodal_features = pd.DataFrame()
    sentiment_labels = pd.Series()

✅ Preprocessed data loaded successfully
Features shape: (38, 60)
Labels shape: (38,)
Label distribution: Unnamed: 0
0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
20    1
21    1
22    1
23    1
24    1
25    1
26    1
27    1
28    1
29    1
30    1
31    1
32    1
33    1
34    1
35    1
36    1
37    1
Name: count, dtype: int64


## 1. Data Exploration and Quality Assessment

In [4]:
if not multimodal_features.empty:
    print("=== FEATURE DATA EXPLORATION ===")
    print(f"Dataset shape: {multimodal_features.shape}")
    print(f"Memory usage: {multimodal_features.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    
    # Check for missing values
    missing_counts = multimodal_features.isnull().sum()
    missing_features = missing_counts[missing_counts > 0]
    
    if len(missing_features) > 0:
        print(f"\nMissing values found in {len(missing_features)} features:")
        print(missing_features.head(10))
    else:
        print("\n✅ No missing values found")
    
    # Feature type analysis
    numeric_features = multimodal_features.select_dtypes(include=[np.number]).columns
    print(f"\nFeature types:")
    print(f"  Numeric features: {len(numeric_features)}")
    
    # Identify modalities by feature prefixes
    modalities = {}
    for col in numeric_features:
        if col != 'participant_id':
            # Identify modality based on common patterns
            if any(band in col.lower() for band in ['delta', 'theta', 'alpha', 'beta', 'gamma', 'engagement']):
                modalities.setdefault('EEG', []).append(col)
            elif any(gsr in col.lower() for gsr in ['gsr', 'conductance', 'peaks', 'recovery']):
                modalities.setdefault('GSR', []).append(col)
            elif any(au in col.lower() for au in ['au', 'happiness', 'sadness', 'anger', 'valence_au', 'arousal_au']):
                modalities.setdefault('AU', []).append(col)
            elif col.startswith('sr_'):
                modalities.setdefault('Self-Report', []).append(col)
            else:
                modalities.setdefault('Other', []).append(col)
    
    print(f"\nFeatures by modality:")
    for modality, features in modalities.items():
        print(f"  {modality}: {len(features)} features")
        print(f"    Examples: {features[:3]}")
    
    # Basic statistics
    print(f"\nFeature statistics:")
    print(multimodal_features.describe().round(4))

else:
    print("No feature data available. Please run preprocessing first.")

No feature data available. Please run preprocessing first.


## 2. Feature Distribution Analysis

In [5]:
if not multimodal_features.empty:
    # Select a subset of features for visualization
    numeric_features = multimodal_features.select_dtypes(include=[np.number]).columns
    feature_subset = [col for col in numeric_features if col != 'participant_id'][:16]  # First 16 features
    
    # Plot feature distributions
    fig, axes = plt.subplots(4, 4, figsize=(16, 12))
    axes = axes.ravel()
    
    for i, feature in enumerate(feature_subset):
        if i < len(axes):
            multimodal_features[feature].hist(bins=20, ax=axes[i], alpha=0.7)
            axes[i].set_title(f'{feature}', fontsize=10)
            axes[i].set_xlabel('Value')
            axes[i].set_ylabel('Frequency')
    
    # Hide unused subplots
    for i in range(len(feature_subset), len(axes)):
        axes[i].set_visible(False)
    
    plt.tight_layout()
    plt.suptitle('Feature Distributions', fontsize=14, y=1.02)
    plt.show()
    
    # Check for zero variance features
    zero_var_features = []
    for col in numeric_features:
        if col != 'participant_id' and multimodal_features[col].var() == 0:
            zero_var_features.append(col)
    
    if zero_var_features:
        print(f"⚠️  Found {len(zero_var_features)} zero-variance features:")
        print(zero_var_features)
    else:
        print("✅ No zero-variance features found")
    
    # Check for high correlation features
    correlation_matrix = multimodal_features[numeric_features].corr().abs()
    upper_triangle = correlation_matrix.where(
        np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
    )
    
    high_corr_pairs = []
    for col in upper_triangle.columns:
        for idx in upper_triangle.index:
            if upper_triangle.loc[idx, col] > 0.95:  # 95% correlation threshold
                high_corr_pairs.append((idx, col, upper_triangle.loc[idx, col]))
    
    if high_corr_pairs:
        print(f"\n⚠️  Found {len(high_corr_pairs)} highly correlated feature pairs (>95%):")
        for feat1, feat2, corr in high_corr_pairs[:10]:  # Show first 10
            print(f"  {feat1} - {feat2}: {corr:.3f}")
    else:
        print("\n✅ No highly correlated features found")

else:
    print("No feature data available for distribution analysis.")

No feature data available for distribution analysis.


## 3. Advanced Feature Engineering

In [6]:
def create_composite_features(data):
    """Create composite features from multimodal data"""
    enhanced_data = data.copy()
    
    print("Creating composite features...")
    
    # EEG composite features
    eeg_cols = [col for col in data.columns if any(band in col.lower() for band in 
               ['delta', 'theta', 'alpha', 'beta', 'gamma'])]
    
    if eeg_cols:
        print(f"  Processing {len(eeg_cols)} EEG features")
        
        # Frequency band groupings
        delta_cols = [col for col in eeg_cols if 'delta' in col.lower()]
        theta_cols = [col for col in eeg_cols if 'theta' in col.lower()]
        alpha_cols = [col for col in eeg_cols if 'alpha' in col.lower()]
        beta_cols = [col for col in eeg_cols if 'beta' in col.lower()]
        gamma_cols = [col for col in eeg_cols if 'gamma' in col.lower()]
        
        # Create band ratios (important for cognitive states)
        if alpha_cols and beta_cols:
            alpha_mean = data[alpha_cols].mean(axis=1)
            beta_mean = data[beta_cols].mean(axis=1)
            enhanced_data['alpha_beta_ratio'] = alpha_mean / (beta_mean + 1e-8)
        
        if theta_cols and alpha_cols:
            theta_mean = data[theta_cols].mean(axis=1)
            alpha_mean = data[alpha_cols].mean(axis=1)
            enhanced_data['theta_alpha_ratio'] = theta_mean / (alpha_mean + 1e-8)
        
        # Overall EEG activation
        enhanced_data['eeg_total_power'] = data[eeg_cols].sum(axis=1)
        enhanced_data['eeg_mean_power'] = data[eeg_cols].mean(axis=1)
        
        # EEG complexity (coefficient of variation)
        enhanced_data['eeg_complexity'] = data[eeg_cols].std(axis=1) / (data[eeg_cols].mean(axis=1) + 1e-8)
    
    # GSR composite features
    gsr_cols = [col for col in data.columns if any(gsr in col.lower() for gsr in 
               ['gsr', 'conductance', 'peaks', 'recovery'])]
    
    if gsr_cols:
        print(f"  Processing {len(gsr_cols)} GSR features")
        
        # GSR arousal indicators
        enhanced_data['gsr_arousal_index'] = data[gsr_cols].mean(axis=1)
        
        # GSR variability
        enhanced_data['gsr_variability'] = data[gsr_cols].std(axis=1)
        
        # Peak-related features
        peak_cols = [col for col in gsr_cols if 'peak' in col.lower()]
        if peak_cols:
            enhanced_data['gsr_peak_intensity'] = data[peak_cols].sum(axis=1)
    
    # Facial AU composite features
    au_cols = [col for col in data.columns if any(au in col.lower() for au in 
              ['au', 'happiness', 'sadness', 'anger', 'surprise', 'disgust', 'fear'])]
    
    if au_cols:
        print(f"  Processing {len(au_cols)} AU features")
        
        # Emotional intensity
        emotion_cols = [col for col in au_cols if any(emotion in col.lower() for emotion in 
                       ['happiness', 'sadness', 'anger', 'surprise', 'disgust', 'fear'])]
        
        if emotion_cols:
            enhanced_data['emotional_intensity'] = data[emotion_cols].abs().mean(axis=1)
            
            # Positive vs negative emotion balance
            positive_emotions = [col for col in emotion_cols if 'happiness' in col.lower()]
            negative_emotions = [col for col in emotion_cols if any(neg in col.lower() for neg in 
                               ['sadness', 'anger', 'disgust', 'fear'])]
            
            if positive_emotions and negative_emotions:
                pos_mean = data[positive_emotions].mean(axis=1)
                neg_mean = data[negative_emotions].mean(axis=1)
                enhanced_data['emotion_valence_balance'] = pos_mean - neg_mean
        
        # Facial expressiveness
        enhanced_data['facial_expressiveness'] = data[au_cols].std(axis=1)
    
    # Self-report composite features
    sr_cols = [col for col in data.columns if col.startswith('sr_')]
    
    if sr_cols:
        print(f"  Processing {len(sr_cols)} self-report features")
        enhanced_data['sr_mean_rating'] = data[sr_cols].mean(axis=1)
        enhanced_data['sr_rating_variability'] = data[sr_cols].std(axis=1)
    
    # Cross-modal interaction features
    print("  Creating cross-modal interactions...")
    
    # EEG-GSR interaction (cognitive-physiological coupling)
    if eeg_cols and gsr_cols:
        eeg_activation = data[eeg_cols].mean(axis=1)
        gsr_activation = data[gsr_cols].mean(axis=1)
        enhanced_data['eeg_gsr_coupling'] = eeg_activation * gsr_activation
        enhanced_data['cognitive_arousal_ratio'] = eeg_activation / (gsr_activation + 1e-8)
    
    # AU-GSR interaction (emotional-physiological coupling)
    if au_cols and gsr_cols:
        au_intensity = data[au_cols].mean(axis=1)
        gsr_activation = data[gsr_cols].mean(axis=1)
        enhanced_data['emotion_arousal_coupling'] = au_intensity * gsr_activation
    
    print(f"  Created {enhanced_data.shape[1] - data.shape[1]} new composite features")
    
    return enhanced_data

# Apply feature engineering
if not multimodal_features.empty:
    engineered_features = create_composite_features(multimodal_features)
    print(f"\nEnhanced dataset shape: {engineered_features.shape}")
    print(f"Original features: {multimodal_features.shape[1]}")
    print(f"New features added: {engineered_features.shape[1] - multimodal_features.shape[1]}")
else:
    engineered_features = pd.DataFrame()
    print("No data available for feature engineering")

No data available for feature engineering


## 4. Feature Selection and Dimensionality Reduction

In [7]:
if not engineered_features.empty and not sentiment_labels.empty:
    # Prepare features for selection
    X = engineered_features.drop('participant_id', axis=1, errors='ignore')
    y = sentiment_labels
    
    print("=== FEATURE SELECTION ANALYSIS ===")
    print(f"Starting with {X.shape[1]} features")
    
    # Remove zero variance features
    from sklearn.feature_selection import VarianceThreshold
    variance_selector = VarianceThreshold(threshold=0.01)  # Remove features with very low variance
    X_variance_filtered = variance_selector.fit_transform(X)
    selected_variance_features = X.columns[variance_selector.get_support()]
    
    print(f"After variance filtering: {len(selected_variance_features)} features")
    
    # Statistical feature selection
    if len(y.unique()) > 1:  # Check if we have multiple classes
        # F-statistic based selection
        k_best = min(50, len(selected_variance_features))  # Select top 50 or all available
        f_selector = SelectKBest(score_func=f_classif, k=k_best)
        
        X_f_selected = f_selector.fit_transform(X_variance_filtered, y)
        f_scores = f_selector.scores_
        selected_f_features = selected_variance_features[f_selector.get_support()]
        
        print(f"After F-statistic selection (top {k_best}): {len(selected_f_features)} features")
        
        # Display top features
        feature_scores = list(zip(selected_f_features, f_scores[f_selector.get_support()]))
        feature_scores.sort(key=lambda x: x[1], reverse=True)
        
        print(f"\nTop 15 features by F-statistic:")
        for i, (feature, score) in enumerate(feature_scores[:15]):
            print(f"{i+1:2d}. {feature:35s} - {score:.2f}")
        
        # Mutual information based selection
        mi_selector = SelectKBest(score_func=mutual_info_classif, k=k_best)
        X_mi_selected = mi_selector.fit_transform(X_variance_filtered, y)
        mi_scores = mi_selector.scores_
        selected_mi_features = selected_variance_features[mi_selector.get_support()]
        
        print(f"After mutual information selection (top {k_best}): {len(selected_mi_features)} features")
        
        # Tree-based feature importance
        rf_selector = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        rf_selector.fit(X_variance_filtered, y)
        
        feature_importance = rf_selector.feature_importances_
        importance_pairs = list(zip(selected_variance_features, feature_importance))
        importance_pairs.sort(key=lambda x: x[1], reverse=True)
        
        print(f"\nTop 15 features by Random Forest importance:")
        for i, (feature, importance) in enumerate(importance_pairs[:15]):
            print(f"{i+1:2d}. {feature:35s} - {importance:.4f}")
        
        # Combine selection methods (intersection of top features)
        top_f_features = set([f[0] for f in feature_scores[:30]])
        top_mi_features = set([f[0] for f in zip(selected_mi_features, mi_scores)])
        top_rf_features = set([f[0] for f in importance_pairs[:30]])
        
        # Features that appear in multiple selection methods
        consensus_features = list(top_f_features & top_rf_features)
        print(f"\nConsensus features (F-stat + RF): {len(consensus_features)}")
        
        if len(consensus_features) < 20:  # If too few consensus features, use top F-stat features
            final_selected_features = [f[0] for f in feature_scores[:25]]
        else:
            final_selected_features = consensus_features[:25]  # Top 25 consensus features
        
        print(f"Final selected features: {len(final_selected_features)}")
        
        # Create final feature matrix
        X_selected = X[final_selected_features]
        
        print(f"\nFinal feature selection complete:")
        print(f"  Original features: {X.shape[1]}")
        print(f"  Selected features: {X_selected.shape[1]}")
        print(f"  Reduction: {(1 - X_selected.shape[1]/X.shape[1])*100:.1f}%")
        
    else:
        print("Cannot perform feature selection: only one class found in labels")
        X_selected = X
        final_selected_features = list(X.columns)

else:
    print("Cannot perform feature selection: missing data or labels")
    X_selected = pd.DataFrame()
    final_selected_features = []

Cannot perform feature selection: missing data or labels


## 5. Feature Scaling and Normalization

In [8]:
if not X_selected.empty:
    print("=== FEATURE SCALING ANALYSIS ===")
    
    # Analyze feature scales before normalization
    print("Feature scale analysis:")
    scale_stats = pd.DataFrame({
        'min': X_selected.min(),
        'max': X_selected.max(),
        'mean': X_selected.mean(),
        'std': X_selected.std()
    })
    
    print(f"Scale ranges:")
    print(f"  Min value across features: {scale_stats['min'].min():.4f}")
    print(f"  Max value across features: {scale_stats['max'].max():.4f}")
    print(f"  Mean std deviation: {scale_stats['std'].mean():.4f}")
    
    # Features with very different scales
    large_scale_features = scale_stats[scale_stats['max'] > 100]['max'].sort_values(ascending=False)
    if len(large_scale_features) > 0:
        print(f"\nFeatures with large scales (>100):")
        print(large_scale_features.head(10))
    
    small_scale_features = scale_stats[scale_stats['max'] < 0.1]['max'].sort_values(ascending=True)
    if len(small_scale_features) > 0:
        print(f"\nFeatures with small scales (<0.1):")
        print(small_scale_features.head(10))
    
    # Apply different scaling methods
    scalers = {
        'StandardScaler': StandardScaler(),
        'RobustScaler': RobustScaler(),
        'MinMaxScaler': MinMaxScaler()
    }
    
    scaled_datasets = {}
    
    for scaler_name, scaler in scalers.items():
        X_scaled = scaler.fit_transform(X_selected)
        X_scaled_df = pd.DataFrame(X_scaled, columns=X_selected.columns, index=X_selected.index)
        scaled_datasets[scaler_name] = X_scaled_df
        
        print(f"\n{scaler_name} results:")
        print(f"  Mean: {X_scaled_df.mean().mean():.4f}")
        print(f"  Std: {X_scaled_df.std().mean():.4f}")
        print(f"  Min: {X_scaled_df.min().min():.4f}")
        print(f"  Max: {X_scaled_df.max().max():.4f}")
    
    # Visualize scaling effects
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    
    # Original data
    X_selected.iloc[:, :5].boxplot(ax=axes[0, 0])
    axes[0, 0].set_title('Original Features (first 5)')
    axes[0, 0].tick_params(axis='x', rotation=45)
    
    # StandardScaler
    scaled_datasets['StandardScaler'].iloc[:, :5].boxplot(ax=axes[0, 1])
    axes[0, 1].set_title('StandardScaler (first 5)')
    axes[0, 1].tick_params(axis='x', rotation=45)
    
    # RobustScaler
    scaled_datasets['RobustScaler'].iloc[:, :5].boxplot(ax=axes[1, 0])
    axes[1, 0].set_title('RobustScaler (first 5)')
    axes[1, 0].tick_params(axis='x', rotation=45)
    
    # MinMaxScaler
    scaled_datasets['MinMaxScaler'].iloc[:, :5].boxplot(ax=axes[1, 1])
    axes[1, 1].set_title('MinMaxScaler (first 5)')
    axes[1, 1].tick_params(axis='x', rotation=45)
    
    plt.tight_layout()
    plt.show()
    
    # Choose StandardScaler as default (good for most ML algorithms)
    X_final = scaled_datasets['StandardScaler']
    print(f"\nUsing StandardScaler for final dataset")
    print(f"Final processed dataset shape: {X_final.shape}")

else:
    print("No selected features available for scaling")
    X_final = pd.DataFrame()

No selected features available for scaling


## 6. Principal Component Analysis (Optional)

In [9]:
if not X_final.empty:
    print("=== PRINCIPAL COMPONENT ANALYSIS ===")
    
    # Determine optimal number of components
    n_components_max = min(15, X_final.shape[1], X_final.shape[0])
    
    pca_full = PCA()
    pca_full.fit(X_final)
    
    # Plot explained variance ratio
    plt.figure(figsize=(12, 5))
    
    plt.subplot(1, 2, 1)
    plt.plot(range(1, len(pca_full.explained_variance_ratio_[:20]) + 1), 
             pca_full.explained_variance_ratio_[:20], 'bo-')
    plt.xlabel('Principal Component')
    plt.ylabel('Explained Variance Ratio')
    plt.title('PCA Explained Variance by Component')
    plt.grid(True, alpha=0.3)
    
    plt.subplot(1, 2, 2)
    cumsum_variance = np.cumsum(pca_full.explained_variance_ratio_)
    plt.plot(range(1, len(cumsum_variance[:20]) + 1), cumsum_variance[:20], 'ro-')
    plt.axhline(y=0.95, color='k', linestyle='--', alpha=0.7, label='95% variance')
    plt.axhline(y=0.90, color='gray', linestyle='--', alpha=0.7, label='90% variance')
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.title('Cumulative Explained Variance')
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    # Find number of components for 90% and 95% variance
    n_components_90 = np.argmax(cumsum_variance >= 0.90) + 1
    n_components_95 = np.argmax(cumsum_variance >= 0.95) + 1
    
    print(f"Components needed for 90% variance: {n_components_90}")
    print(f"Components needed for 95% variance: {n_components_95}")
    print(f"Total features: {X_final.shape[1]}")
    
    # Create PCA-transformed dataset (optional - for dimensionality reduction)
    n_components_final = min(n_components_90, 15)  # Use 90% variance or max 15 components
    
    if n_components_final < X_final.shape[1]:
        pca_final = PCA(n_components=n_components_final)
        X_pca = pca_final.fit_transform(X_final)
        
        # Create PCA feature names
        pca_feature_names = [f'PC{i+1}' for i in range(n_components_final)]
        X_pca_df = pd.DataFrame(X_pca, columns=pca_feature_names, index=X_final.index)
        
        print(f"\nPCA transformation complete:")
        print(f"  Original features: {X_final.shape[1]}")
        print(f"  PCA components: {X_pca_df.shape[1]}")
        print(f"  Variance retained: {cumsum_variance[n_components_final-1]:.3f}")
        print(f"  Dimensionality reduction: {(1 - X_pca_df.shape[1]/X_final.shape[1])*100:.1f}%")
        
        # Show component loadings for first few components
        print(f"\nTop feature loadings for first 3 components:")
        feature_names = X_final.columns
        
        for i in range(min(3, n_components_final)):
            print(f"\nPC{i+1} (explains {pca_final.explained_variance_ratio_[i]:.3f} variance):")
            loadings = list(zip(feature_names, np.abs(pca_final.components_[i])))
            loadings.sort(key=lambda x: x[1], reverse=True)
            
            for j, (feature, loading) in enumerate(loadings[:5]):
                print(f"  {j+1}. {feature:30s}: {loading:.3f}")
    
    else:
        print("PCA not beneficial - too few features")
        X_pca_df = X_final
        pca_final = None

else:
    print("No data available for PCA analysis")
    X_pca_df = pd.DataFrame()
    pca_final = None

No data available for PCA analysis


## 7. Final Dataset Export and Summary

In [10]:
# Export processed datasets
export_datasets = {}

if not X_final.empty:
    export_datasets['engineered_features'] = X_final
    print("=== FEATURE ENGINEERING SUMMARY ===")
    print()
    
    # Add participant_id back if it exists
    if 'participant_id' in engineered_features.columns:
        X_final_with_id = X_final.copy()
        X_final_with_id['participant_id'] = engineered_features['participant_id']
        export_datasets['engineered_features_with_id'] = X_final_with_id
    
    print(f"Feature Engineering Pipeline Results:")
    print(f"  Original raw features: {multimodal_features.shape[1] if not multimodal_features.empty else 0}")
    print(f"  After composite feature creation: {engineered_features.shape[1] if not engineered_features.empty else 0}")
    print(f"  After feature selection: {X_selected.shape[1] if not X_selected.empty else 0}")
    print(f"  Final processed features: {X_final.shape[1]}")
    print(f"  Total samples: {X_final.shape[0]}")
    print()
    
    # Feature categories in final dataset
    final_features = list(X_final.columns)
    feature_categories = {
        'EEG': [f for f in final_features if any(x in f.lower() for x in ['delta', 'theta', 'alpha', 'beta', 'gamma', 'eeg'])],
        'GSR': [f for f in final_features if any(x in f.lower() for x in ['gsr', 'conductance', 'peaks', 'arousal'])],
        'Facial AU': [f for f in final_features if any(x in f.lower() for x in ['au', 'happiness', 'sadness', 'anger', 'facial', 'emotion'])],
        'Self-Report': [f for f in final_features if f.startswith('sr_')],
        'Composite': [f for f in final_features if any(x in f.lower() for x in ['ratio', 'coupling', 'balance', 'intensity', 'complexity'])]
    }
    
    print("Final feature breakdown by category:")
    for category, features in feature_categories.items():
        if features:
            print(f"  {category}: {len(features)} features")
            print(f"    Examples: {features[:3]}")
    print()
    
    # Export main dataset
    output_path = '../data/engineered_features.csv'
    X_final_with_id.to_csv(output_path, index=False) if 'X_final_with_id' in locals() else X_final.to_csv(output_path, index=False)
    print(f"✅ Exported engineered features to: {output_path}")
    
    # Export PCA dataset if available
    if not X_pca_df.empty and X_pca_df.shape[1] != X_final.shape[1]:
        pca_output_path = '../data/pca_features.csv'
        
        if 'participant_id' in engineered_features.columns:
            X_pca_with_id = X_pca_df.copy()
            X_pca_with_id['participant_id'] = engineered_features['participant_id']
            X_pca_with_id.to_csv(pca_output_path, index=False)
        else:
            X_pca_df.to_csv(pca_output_path, index=False)
        
        print(f"✅ Exported PCA features to: {pca_output_path}")
    
    # Export feature metadata
    feature_metadata = {
        'selected_features': final_selected_features if 'final_selected_features' in locals() else list(X_final.columns),
        'feature_categories': feature_categories,
        'scaling_method': 'StandardScaler',
        'pca_components': n_components_final if pca_final is not None else None,
        'total_samples': X_final.shape[0],
        'total_features': X_final.shape[1]
    }
    
    import json
    metadata_path = '../data/feature_metadata.json'
    with open(metadata_path, 'w') as f:
        json.dump(feature_metadata, f, indent=2)
    print(f"✅ Exported feature metadata to: {metadata_path}")

else:
    print("❌ No processed features available for export")

print()
print("NEXT STEPS:")
print("1. Run 03_modeling_baseline_sentiment.ipynb for baseline model training")
print("2. Run 04_modeling_fusion.ipynb for multimodal fusion approaches")
print("3. Consider the PCA dataset for high-dimensional modeling approaches")
print("4. Review feature_metadata.json for detailed feature information")
print()
print("Feature engineering completed successfully! 🎉")

❌ No processed features available for export

NEXT STEPS:
1. Run 03_modeling_baseline_sentiment.ipynb for baseline model training
2. Run 04_modeling_fusion.ipynb for multimodal fusion approaches
3. Consider the PCA dataset for high-dimensional modeling approaches
4. Review feature_metadata.json for detailed feature information

Feature engineering completed successfully! 🎉
