# Stage 06: Financial Data Preprocessing Pipeline

This notebook creates a comprehensive preprocessing pipeline for financial data, including cleaning, feature engineering, and risk metrics calculation.

## Objectives
- Clean and validate financial data
- Calculate returns and risk metrics
- Create technical indicators (moving averages, rolling volatility, intraday price ranges)
- Prepare data for modeling

In [None]:
import sys
import os
sys.path.append('../src')

import pandas as pd
import numpy as np
from datetime import datetime
import utils
import cleaning
import matplotlib.pyplot as plt
import seaborn as sns

# Banner printed after the imports above, marking the start of this stage's output.
print("🔧 Financial Data Preprocessing Pipeline")

## 1. Load Sample Financial Data

In [None]:
# Tickers analysed in this stage; the fetch helper lives in the project's utils module.
symbols = ["AAPL", "MSFT", "GOOGL"]
print(f"Fetching data for: {symbols}")

# NOTE(review): period="3mo" presumably requests about three months of history -
# confirm against utils.fetch_multiple_stocks.
raw_data = utils.fetch_multiple_stocks(symbols=symbols, prefer_alphavantage=False, period="3mo")

# An empty frame is treated as a failed fetch; otherwise show a quick overview.
if raw_data.empty:
    print("❌ Failed to load data")
else:
    print(f"✅ Raw data loaded: {raw_data.shape}")
    date_column = raw_data['date']
    print(f"Date range: {date_column.min()} to {date_column.max()}")
    print(raw_data.head())

## 2. Data Cleaning and Validation

In [None]:
# Always bind `cleaned_data` so later cells can test `cleaned_data.empty`
# instead of failing with a NameError when the load step returned nothing.
cleaned_data = pd.DataFrame()

if raw_data.empty:
    print("⚠️ No raw data available - skipping cleaning")
else:
    # Clean the data using the project's existing cleaning helpers
    print("🧹 Cleaning financial data...")

    # Fill missing values with the median for the price columns.
    # NOTE(review): the helper receives the combined multi-symbol frame, so the
    # median is presumably pooled across symbols - confirm this is intended.
    price_columns = ['open', 'high', 'low', 'close']
    cleaned_data = cleaning.fill_missing_median(raw_data, columns=price_columns)

    # Drop rows with excessive missing data
    cleaned_data = cleaning.drop_missing(cleaned_data, threshold=0.5)

    # Add a standardised copy of volume next to the raw column (raw values are kept)
    if 'volume' in cleaned_data.columns:
        volume_normalized, scaler = cleaning.normalize_data(
            cleaned_data,
            columns=['volume'],
            method='standard',
        )
        cleaned_data['volume_normalized'] = volume_normalized['volume']

    print(f"✅ Data cleaned: {cleaned_data.shape}")

    # Summarise what the cleaning step changed relative to the raw frame
    cleaning_report = cleaning.generate_cleaning_report(raw_data, cleaned_data)
    print("\n📊 Cleaning Report:")
    for key, value in cleaning_report.items():
        print(f"  {key}: {value}")

## 3. Financial Feature Engineering

In [None]:
def create_financial_features(df):
    """Add per-symbol return, trend, volatility and price-range features.

    The input is not modified; a copy with eight extra columns is returned:
    ``daily_return``, ``log_return``, ``ma_5``, ``ma_20``, ``volatility_10``,
    ``volatility_20``, ``high_low_pct`` and ``open_close_pct``.

    Returns, moving averages and rolling volatility are computed separately
    within each ``symbol`` group, so values never leak across symbols even
    when rows of different symbols are interleaved. Rows are processed in
    their existing order - sort chronologically beforehand if needed.

    Args:
        df: DataFrame with ``symbol``, ``open``, ``high``, ``low`` and
            ``close`` columns.

    Returns:
        A new DataFrame with the same index and row order as ``df`` plus the
        feature columns. An empty input yields an empty frame that still
        carries the feature columns, so the output schema is stable.
    """
    feature_columns = (
        'daily_return', 'log_return',
        'ma_5', 'ma_20',
        'volatility_10', 'volatility_20',
        'high_low_pct', 'open_close_pct',
    )
    feature_df = df.copy()

    # Nothing to compute: still expose the feature columns for downstream code.
    if feature_df.empty:
        for column in feature_columns:
            feature_df[column] = pd.Series(index=feature_df.index, dtype='float64')
        return feature_df

    # One grouping pass replaces the per-symbol mask/copy/write-back loop.
    close_by_symbol = feature_df.groupby('symbol', sort=False)['close']

    # Returns relative to the previous row of the same symbol
    feature_df['daily_return'] = close_by_symbol.transform(
        lambda prices: prices.pct_change()
    )
    feature_df['log_return'] = np.log(feature_df['close'] / close_by_symbol.shift(1))

    # Moving averages of the close (NaN until the window is full)
    feature_df['ma_5'] = close_by_symbol.transform(
        lambda prices: prices.rolling(window=5).mean()
    )
    feature_df['ma_20'] = close_by_symbol.transform(
        lambda prices: prices.rolling(window=20).mean()
    )

    # Rolling standard deviation of the daily returns computed above
    return_by_symbol = feature_df.groupby('symbol', sort=False)['daily_return']
    feature_df['volatility_10'] = return_by_symbol.transform(
        lambda returns: returns.rolling(window=10).std()
    )
    feature_df['volatility_20'] = return_by_symbol.transform(
        lambda returns: returns.rolling(window=20).std()
    )

    # Row-wise price ranges need no grouping
    feature_df['high_low_pct'] = (feature_df['high'] - feature_df['low']) / feature_df['low']
    feature_df['open_close_pct'] = (feature_df['close'] - feature_df['open']) / feature_df['open']

    return feature_df

# Always bind `featured_data` so the risk and save cells can test
# `featured_data.empty` instead of raising NameError when cleaning was skipped.
featured_data = pd.DataFrame()

if not cleaned_data.empty:
    print("🔧 Creating financial features...")
    featured_data = create_financial_features(cleaned_data)

    print(f"✅ Features created: {featured_data.shape}")
    print(f"New columns: {[col for col in featured_data.columns if col not in cleaned_data.columns]}")

    # Show a sample of the key features; columns absent from the frame
    # (e.g. no 'date' column) are skipped rather than raising KeyError.
    feature_cols = ['symbol', 'date', 'close', 'daily_return', 'volatility_10', 'ma_5', 'ma_20']
    preview_cols = [col for col in feature_cols if col in featured_data.columns]
    print("\nSample features:")
    print(featured_data[preview_cols].head(10))

## 4. Risk Metrics Calculation

In [None]:
def calculate_risk_metrics(df, trading_days=252, min_observations=10):
    """Summarise the daily-return distribution of each symbol.

    Args:
        df: DataFrame with ``symbol`` and ``daily_return`` columns. Missing
            returns are dropped before any statistic is computed.
        trading_days: Number of periods used to annualise the mean return
            (multiplied) and the volatility (multiplied by its square root).
        min_observations: A symbol is reported only when it has strictly more
            than this many non-missing returns.

    Returns:
        DataFrame with one row per qualifying symbol, in order of first
        appearance, and the columns ``symbol``, ``mean_return``,
        ``volatility``, ``annualized_return``, ``annualized_volatility``,
        ``sharpe_ratio``, ``max_return``, ``min_return``, ``var_95``,
        ``skewness`` and ``kurtosis``. The columns are present even when no
        symbol qualifies, so callers can index them on an empty result.
    """
    metric_columns = [
        'symbol', 'mean_return', 'volatility',
        'annualized_return', 'annualized_volatility', 'sharpe_ratio',
        'max_return', 'min_return', 'var_95', 'skewness', 'kurtosis',
    ]
    annualization_factor = np.sqrt(trading_days)
    risk_metrics = []

    # sort=False keeps the symbols in their order of first appearance.
    for symbol, symbol_returns in df.groupby('symbol', sort=False)['daily_return']:
        returns = symbol_returns.dropna()
        if len(returns) <= min_observations:  # Need sufficient data
            continue

        # Compute the moments once and reuse them below.
        mean_return = returns.mean()
        volatility = returns.std()
        annualized_return = mean_return * trading_days
        annualized_volatility = volatility * annualization_factor

        risk_metrics.append({
            'symbol': symbol,
            'mean_return': mean_return,
            'volatility': volatility,
            'annualized_return': annualized_return,
            'annualized_volatility': annualized_volatility,
            # No risk-free rate is subtracted; zero-variance returns give 0.
            'sharpe_ratio': annualized_return / annualized_volatility if volatility > 0 else 0,
            'max_return': returns.max(),
            'min_return': returns.min(),
            # 5th percentile of daily returns (historical 95% VaR, as a signed return)
            'var_95': returns.quantile(0.05),
            'skewness': returns.skew(),
            'kurtosis': returns.kurtosis(),
        })

    return pd.DataFrame(risk_metrics, columns=metric_columns)

if not featured_data.empty:
    print("📊 Calculating risk metrics...")
    risk_summary = calculate_risk_metrics(featured_data)

    if risk_summary.empty:
        # No symbol had enough return history; plotting would fail on the
        # missing metric columns, so report and skip the charts.
        print("⚠️ Not enough return history to compute risk metrics")
    else:
        print("\n📈 Risk Metrics Summary:")
        print(risk_summary.round(4))

        # Visualize risk metrics: one bar chart per metric on a 2x2 grid
        fig, axes = plt.subplots(2, 2, figsize=(12, 8))

        # (axis, metric column, chart title, y-axis label)
        panels = [
            (axes[0, 0], 'annualized_return', 'Annualized Returns', 'Return'),
            (axes[0, 1], 'annualized_volatility', 'Annualized Volatility', 'Volatility'),
            (axes[1, 0], 'sharpe_ratio', 'Sharpe Ratio', 'Sharpe Ratio'),
            (axes[1, 1], 'var_95', 'Value at Risk (95%)', 'VaR'),
        ]
        for ax, metric, title, ylabel in panels:
            ax.bar(risk_summary['symbol'], risk_summary[metric])
            ax.set_title(title)
            ax.set_ylabel(ylabel)

        plt.tight_layout()
        plt.show()

## 5. Save Processed Data

In [None]:
if not featured_data.empty:
    # Both outputs are written with the same source tag and file extension.
    shared_save_args = {"source": "processed", "ext": "csv"}

    # Engineered feature table
    features_path = utils.save_with_timestamp(
        df=featured_data, prefix="financial_features", **shared_save_args
    )

    # Per-symbol risk summary
    risk_path = utils.save_with_timestamp(
        df=risk_summary, prefix="risk_metrics", **shared_save_args
    )

    # Report the paths returned by the save helper
    print(f"💾 Features saved to: {features_path}")
    print(f"💾 Risk metrics saved to: {risk_path}")

## 6. Summary

In [None]:
# Static recap of the stage; printed regardless of how the earlier cells fared.
print("\n🎯 Stage 06 Summary:")
for summary_line in (
    "✅ Data cleaning completed",
    "✅ Financial features engineered",
    "✅ Risk metrics calculated",
    "✅ Processed data saved",
):
    print(summary_line)

# Pointers to the follow-up stages.
print("\n📋 Next Steps:")
for next_step_line in (
    "- Stage 07: Build risk analysis models",
    "- Stage 08: Create portfolio optimization",
    "- Stage 09: Deploy and monitor system",
):
    print(next_step_line)