# Market Predictor: Feature Engineering

This notebook focuses on generating and analyzing features from our collected data:
1. Technical Features
2. Sentiment Features
3. Macroeconomic Features
4. Feature Selection and Analysis

## Setup and Configuration

In [None]:
import os
import sys
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Add project root to path
sys.path.append('..')

# Import project modules
from src.features import (
    TechnicalFeatures,
    SentimentFeatures,
    MacroFeatures,
    FeatureGenerator
)
from src.utils import setup_project_logger
from config import Config, load_validated_config

# Plotting settings
plt.style.use('seaborn')
%matplotlib inline
sns.set_theme(style="whitegrid")

# Setup logging
logger = setup_project_logger('feature_engineering')

## 1. Load Data and Configuration

Load the processed data produced by the previous notebook, along with the validated configuration that supplies the feature parameters.

In [None]:
# Load configuration
config = load_validated_config('config/parameters.yaml')

# Load processed data (required inputs: a failure here should stop the notebook)
market_data = pd.read_parquet('data/processed/market_data.parquet')
macro_data = pd.read_parquet('data/processed/macro_data.parquet')

# Load news and social data if available (optional inputs).
# Each entry: (file-name prefix, candidate sources, dict that receives the frames).
news_data = {}
social_data = {}

optional_sources = (
    ('news', ['newsapi', 'alphavantage', 'finnhub'], news_data),
    ('social', ['reddit', 'twitter'], social_data),
)

for kind, sources, loaded in optional_sources:
    for source in sources:
        file_path = f'data/processed/{kind}_{source}.parquet'
        # The existence check is deliberately outside the try block so that a
        # programming error (for example a missing import) fails loudly instead
        # of being logged as a harmless "could not load" warning.
        if not os.path.exists(file_path):
            continue
        # Errors are handled per file: one unreadable source must not prevent
        # the remaining sources from being loaded.
        try:
            loaded[source] = pd.read_parquet(file_path)
        except Exception as e:
            logger.warning(f"Error loading data source {file_path}: {e}")

# Display data information
print("Data Ranges:")
print(f"Market Data: {market_data.index.min()} to {market_data.index.max()}")
print(f"Macro Data: {macro_data.index.min()} to {macro_data.index.max()}")
print(f"News sources loaded: {sorted(news_data)}")
print(f"Social sources loaded: {sorted(social_data)}")

## 2. Technical Feature Generation

Generate technical indicators and analyze their distributions and relationships:
- Price-based features
- Volume indicators
- Momentum indicators
- Volatility measures
- Pattern recognition features

In [None]:
# Initialize technical feature generator
tech_features = TechnicalFeatures(config)

# Generate technical features
technical_features = tech_features.calculate_all_features(market_data)

# Display feature information.
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
print("\nTechnical Features Overview:")
technical_features.info()

# Plot key feature distributions
key_features = [
    'Returns', 'Daily_Volatility',
    'RSI_14', 'MFI',
    'ATR', 'OBV'
]

# Only work with the requested features that were actually generated, so the
# histogram grid and the correlation matrix below agree on the same columns
# and a missing indicator does not raise a KeyError.
available_key_features = [f for f in key_features if f in technical_features.columns]
missing_key_features = [f for f in key_features if f not in technical_features.columns]
if missing_key_features:
    logger.warning(f"Key technical features not generated: {missing_key_features}")

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

for ax, feature in zip(axes, available_key_features):
    sns.histplot(data=technical_features, x=feature, ax=ax)
    ax.set_title(f'{feature} Distribution')

# Hide panels that have nothing to show
for ax in axes[len(available_key_features):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

# Calculate feature correlations
correlation_matrix = technical_features[available_key_features].corr()

# A heatmap needs at least one column to draw
if not correlation_matrix.empty:
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix,
                annot=True,
                cmap='coolwarm',
                center=0,
                fmt='.2f')
    plt.title('Technical Feature Correlations')
    plt.show()

# Log feature generation completion
logger.info(f"Generated {len(technical_features.columns)} technical features")

## 3. Sentiment Feature Generation

Process and combine sentiment data from multiple sources:
- News sentiment analysis
- Social media sentiment
- Aggregated sentiment indicators

In [None]:
# Initialize sentiment feature generator
sentiment_features = SentimentFeatures(config)


def _sentiment_features_by_source(data_by_source):
    """Build one feature frame from a mapping of source name -> raw data.

    Each non-empty source is run through the sentiment feature generator, its
    columns are prefixed with the source name to avoid name clashes, and the
    result is left-joined onto the market data index.

    Returns a DataFrame indexed like ``market_data``; it has no columns when
    ``data_by_source`` is empty or every source is empty.
    """
    combined = pd.DataFrame(index=market_data.index)
    for source, data in data_by_source.items():
        if data.empty:
            continue
        source_features = sentiment_features.calculate_all_features(data)
        combined = combined.join(source_features.add_prefix(f'{source}_'))
    return combined


# Generate sentiment features from news and social data
news_features = _sentiment_features_by_source(news_data)
social_features = _sentiment_features_by_source(social_data)

# Combine all sentiment features
all_sentiment_features = pd.concat([news_features, social_features], axis=1)

# Plot sentiment trends (5-period rolling mean of every sentiment score column)
score_columns = all_sentiment_features.filter(like='sentiment_score').columns
if len(score_columns) > 0:
    fig, ax = plt.subplots(figsize=(15, 6))
    for column in score_columns:
        all_sentiment_features[column].rolling(window=5).mean().plot(ax=ax, label=column)
    ax.set_title('Sentiment Scores Over Time')
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()
else:
    logger.warning("No sentiment score columns available; skipping sentiment trend plot")

# Correlation of every sentiment feature with returns.
# dtype=float keeps the column numeric even when there are no sentiment features.
sentiment_correlation = pd.DataFrame(
    {
        'correlation_with_returns': [
            all_sentiment_features[col].corr(technical_features['Returns'])
            for col in all_sentiment_features.columns
        ]
    },
    index=all_sentiment_features.columns,
    dtype=float,
)

# Plot sentiment correlation with returns.
# Plotting an empty (or all-NaN) series raises, so only draw when there is data.
plottable_correlations = sentiment_correlation['correlation_with_returns'].dropna()
if not plottable_correlations.empty:
    plt.figure(figsize=(10, 6))
    plottable_correlations.sort_values().plot(kind='bar')
    plt.title('Sentiment Feature Correlations with Returns')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
else:
    logger.warning("No sentiment correlations to plot")

# Log sentiment feature generation completion
logger.info(f"Generated {len(all_sentiment_features.columns)} sentiment features")

## 4. Macroeconomic Feature Generation

Generate features from macroeconomic indicators:
- Economic indicators and their derivatives
- Interest rate features
- Growth indicators
- Volatility measures

In [None]:
# Initialize macro feature generator
macro_features = MacroFeatures(config)

# Generate macro features
macro_features_df = macro_features.calculate_all_features(macro_data, market_data)

# Display feature information.
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
print("\nMacroeconomic Features Overview:")
macro_features_df.info()

# Plot key macro indicators trends
key_macro_features = [
    'GDP_YoY', 'CPI_YoY',
    'UNRATE_Change', 'FEDFUNDS_Change',
    'Economic_Activity_Index', 'Financial_Conditions_Index'
]
available_macro_features = [f for f in key_macro_features if f in macro_features_df.columns]

fig, axes = plt.subplots(3, 2, figsize=(15, 12))
axes = axes.ravel()

for ax, feature in zip(axes, available_macro_features):
    macro_features_df[feature].plot(ax=ax)
    ax.set_title(f'{feature} Over Time')
    ax.grid(True)

# Hide panels that have nothing to show
for ax in axes[len(available_macro_features):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

# Calculate correlations with market returns.
# dtype=float keeps the column numeric even when no macro features exist.
macro_correlations = pd.DataFrame(
    {
        'correlation_with_returns': [
            macro_features_df[col].corr(technical_features['Returns'])
            for col in macro_features_df.columns
        ]
    },
    index=macro_features_df.columns,
    dtype=float,
)

# Plot top correlations.
# Undefined (NaN) correlations are dropped first so they cannot take up
# slots among the ten bars, and an empty result is skipped instead of raising.
top_macro_correlations = (
    macro_correlations['correlation_with_returns']
    .dropna()
    .sort_values(ascending=False)
    .head(10)
)
if not top_macro_correlations.empty:
    plt.figure(figsize=(12, 6))
    top_macro_correlations.plot(kind='bar')
    plt.title('Top 10 Macro Features - Correlation with Returns')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
else:
    logger.warning("No macro correlations to plot")

# Log macro feature generation completion
logger.info(f"Generated {len(macro_features_df.columns)} macroeconomic features")

## 5. Feature Combination and Selection

Combine all features and perform feature selection:
- Feature aggregation
- Feature importance analysis
- Correlation analysis
- Dimensionality reduction

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Initialize feature generator
feature_gen = FeatureGenerator(config)

# Combine all features (column-wise; rows are aligned on the index)
all_features = pd.concat([
    technical_features,
    all_sentiment_features,
    macro_features_df
], axis=1)

# Remove any duplicate columns
all_features = all_features.loc[:, ~all_features.columns.duplicated()]

# Prepare target variable (next day returns).
# Infinite returns are treated as missing so they are dropped with the NaNs.
returns = technical_features['Returns'].replace([np.inf, -np.inf], np.nan)
target = returns.shift(-1).dropna()

# Build the model input. The combined frame routinely contains missing values
# (indicator warm-up periods, days without sentiment data, lower-frequency
# macro series), which must be handled before fitting the forest:
#   1. keep numeric columns only (the scaler cannot handle anything else),
#   2. turn +/-inf into NaN,
#   3. drop columns with no usable value at all,
#   4. forward-fill so each row only uses information available up to that row
#      (assumes the index is in chronological order - TODO confirm).
features_for_importance = (
    all_features.loc[target.index]
    .select_dtypes(include=[np.number])
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=1, how='all')
    .ffill()
)

if features_for_importance.empty:
    raise ValueError("No usable rows/columns available for feature importance analysis")

# Scale features. StandardScaler ignores NaN when fitting and keeps it in the
# output; the remaining gaps (values before a column's first observation) are
# then filled with 0, which is the column mean after standardisation.
scaler = StandardScaler()
scaled_features = pd.DataFrame(
    scaler.fit_transform(features_for_importance),
    columns=features_for_importance.columns,
    index=features_for_importance.index
).fillna(0.0)

# Train Random Forest for feature importance
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(scaled_features, target)

# Create feature importance DataFrame
feature_importance = pd.DataFrame({
    'feature': features_for_importance.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

# Plot top 20 feature importance
plt.figure(figsize=(12, 6))
sns.barplot(x='importance', y='feature', data=feature_importance.head(20))
plt.title('Top 20 Most Important Features')
plt.tight_layout()
plt.show()

# Calculate feature correlations
correlation_matrix = scaled_features.corr()

# Plot correlation heatmap for top features
plt.figure(figsize=(12, 10))
top_features = feature_importance['feature'].head(15).tolist()
sns.heatmap(
    correlation_matrix.loc[top_features, top_features],
    annot=True,
    cmap='coolwarm',
    center=0,
    fmt='.2f'
)
plt.title('Correlation Matrix of Top Features')
plt.tight_layout()
plt.show()

# Log feature selection completion
logger.info(f"Selected {len(top_features)} top features for modeling")

## 6. Save Processed Features and Next Steps

Save the engineered features, their importance scores, and the run metadata for use in model development.

In [None]:
# Persist everything the modelling notebook needs: the full feature matrix,
# the importance ranking, the selected subset and a small metadata summary.
import json
import os

# Make sure the output folder is there before any file is written
os.makedirs('data/features', exist_ok=True)

# Full feature matrix
all_features.to_parquet('data/features/all_features.parquet')

# Importance ranking produced by the random forest
feature_importance.to_csv('data/features/feature_importance.csv')

# Subset restricted to the top-ranked columns
selected_features = all_features[top_features]
selected_features.to_parquet('data/features/selected_features.parquet')

# Summary counts plus the time this run was generated
feature_metadata = dict(
    total_features=all_features.shape[1],
    selected_features=len(top_features),
    technical_features=technical_features.shape[1],
    sentiment_features=all_sentiment_features.shape[1],
    macro_features=macro_features_df.shape[1],
    feature_generation_date=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
)

with open('data/features/feature_metadata.json', 'w') as f:
    f.write(json.dumps(feature_metadata, indent=4))

# Record that the notebook ran to the end
logger.info('Feature engineering completed and saved')

## Next Steps

1. Review the generated features and their importance scores
2. Consider:
   - Adding more derived features
   - Fine-tuning feature parameters
   - Implementing feature selection thresholds
3. Proceed to `03_model_development.ipynb` for model training

Key Files Generated:
- `data/features/all_features.parquet`: Complete feature set
- `data/features/selected_features.parquet`: Top selected features
- `data/features/feature_importance.csv`: Feature importance scores
- `data/features/feature_metadata.json`: Feature generation metadata