# Portfolio Forecasting - Task 1: Data Exploration (Fixed)

Complete implementation of Business Task 1 from agent.md:
- Fetch data for TSLA, BND, and SPY (2015-07-01 to 2025-07-31)
- Data cleaning and preprocessing
- EDA with visualizations
- Stationarity testing (Augmented Dickey-Fuller test)
- Risk metrics calculation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
import sys
from statsmodels.tsa.stattools import adfuller

sys.path.append('../src')
from data.yfinance_client import YFinanceClient
from data.preprocessor import DataPreprocessor
from data.data_validator import DataValidator
from analysis.feature_engineer import FeatureEngineer

# Notebook-wide display settings: ignore all warnings and apply one plot
# style and default figure size to every chart.
warnings.simplefilter('ignore')
plt.style.use('seaborn-v0_8')
plt.rcParams.update({'figure.figsize': (12, 8)})

print("✅ Setup complete")

In [None]:
# Analysis universe and the requested date window.
SYMBOLS = ['TSLA', 'BND', 'SPY']
START_DATE, END_DATE = '2015-07-01', '2025-07-31'

# Pipeline components reused by the cells below. The client is created with
# caching switched on and cache_expiry_hours=24.
client = YFinanceClient(use_cache=True, cache_expiry_hours=24)
validator = DataValidator()
preprocessor = DataPreprocessor()
feature_engineer = FeatureEngineer()

print(f"📊 Fetching {', '.join(SYMBOLS)} from {START_DATE} to {END_DATE}")

In [None]:
# Download the raw table for every symbol over the requested window.
raw_data = client.fetch_data(SYMBOLS, START_DATE, END_DATE)

# When the fetched table has no 'Adj Close' column, fall back to 'Close'.
adj_close_missing = 'Adj Close' not in raw_data.columns
if adj_close_missing:
    raw_data['Adj Close'] = raw_data['Close']
    print("✅ Added Adj Close column")

# Quick sanity report: table size and the span of dates actually returned.
date_column = raw_data['Date']
print(f"📈 Data shape: {raw_data.shape}")
print(f"📅 Date range: {date_column.min()} to {date_column.max()}")
raw_data.head()

In [None]:
# Validate the raw table first, then hand it to the preprocessor.
validation_result = validator.validate_data(raw_data)
if validation_result.is_valid:
    validation_label = '✅ PASSED'
else:
    validation_label = '⚠️ ISSUES'
print(f"Validation: {validation_label}")

clean_data = preprocessor.preprocess_data(raw_data)
missing_total = clean_data.isnull().sum().sum()  # NaN count over all columns
print(f"✅ Cleaned data: {clean_data.shape}")
print(f"Missing values: {missing_total}")

In [None]:
# Feature engineering: apply each step in order, feeding the output of one
# step into the next, starting from the cleaned table.
feature_steps = (
    feature_engineer.calculate_returns,
    feature_engineer.calculate_volatility,
    feature_engineer.calculate_rolling_statistics,
)
features_data = clean_data
for transform in feature_steps:
    features_data = transform(features_data)

print(f"✅ Features calculated: {features_data.shape}")

# Columns present after feature engineering but absent from the cleaned table.
new_cols = [name for name in features_data.columns if name not in clean_data.columns]
print(f"New columns: {new_cols[:5]}...")  # Show first 5

features_data[['Date', 'Symbol', 'Close', 'Daily_Return']].head()

In [None]:
# EDA: closing prices, normalized prices, return distributions and
# 30-day volatility, drawn on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Price Analysis', fontsize=16)

colors = {'TSLA': 'red', 'BND': 'blue', 'SPY': 'green'}

# Filter and date-sort each symbol once instead of once per panel.
symbol_frames = {}
for symbol in SYMBOLS:
    frame = features_data[features_data['Symbol'] == symbol].sort_values('Date')
    if frame.empty:
        # Normalization needs a first price; .iloc[0] on an empty frame
        # would raise IndexError, so skip the symbol instead.
        print(f"⚠️ No rows for {symbol}; skipping its plots")
        continue
    symbol_frames[symbol] = frame

# ravel() walks the 2x2 grid row by row: [0, 0], [0, 1], [1, 0], [1, 1].
ax1, ax2, ax3, ax4 = axes.ravel()

for symbol, frame in symbol_frames.items():
    color = colors[symbol]

    # Closing prices
    ax1.plot(frame['Date'], frame['Close'], label=symbol, color=color)

    # Normalized prices: rebase every series to 100 at its first observation
    normalized = (frame['Close'] / frame['Close'].iloc[0]) * 100
    ax2.plot(frame['Date'], normalized, label=symbol, color=color)

    # Daily return distribution
    returns = frame['Daily_Return'].dropna()
    if not returns.empty:
        ax3.hist(returns, bins=50, alpha=0.6, label=symbol, color=color, density=True)

    # Volatility (only when the feature step produced the column)
    if 'Volatility_30d' in frame.columns:
        ax4.plot(frame['Date'], frame['Volatility_30d'], label=symbol, color=color)

panel_titles = (
    'Closing Prices',
    'Normalized Prices (Base=100)',
    'Return Distributions',
    '30-Day Volatility',
)
for ax, title in zip((ax1, ax2, ax3, ax4), panel_titles):
    ax.set_title(title)
    # Only draw a legend when the panel actually has labelled artists.
    handles, _ = ax.get_legend_handles_labels()
    if handles:
        ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Stationarity testing
def adf_test(series: pd.Series, name: str, alpha: float = 0.05) -> bool:
    """Run the Augmented Dickey-Fuller test on a series and print a short report.

    Args:
        series: Series to test; missing values are dropped before testing.
        name: Label printed as the heading of the report.
        alpha: Significance level. The series is reported as stationary when
            the test's p-value is at or below this value. Defaults to 0.05.

    Returns:
        True when the p-value is at or below ``alpha``, False otherwise.
    """
    # adfuller returns a tuple whose first two items are the test statistic
    # and the p-value; the remaining items are not used here.
    adf_statistic, p_value = adfuller(series.dropna())[:2]

    print(f"\n{name}:")
    print(f"  ADF Statistic: {adf_statistic:.6f}")
    print(f"  p-value: {p_value:.6f}")

    is_stationary = bool(p_value <= alpha)
    print(f"  {'✅ Stationary' if is_stationary else '❌ Non-stationary'}")
    return is_stationary

print("🔍 Stationarity Testing (ADF Test)")
stationarity_results = {}

for symbol in SYMBOLS:
    # The ADF test depends on the time order of the observations, so sort by
    # date explicitly rather than relying on the row order of features_data.
    data = features_data[features_data['Symbol'] == symbol].sort_values('Date')
    print(f"\n📊 {symbol}:")

    # Dict values are evaluated top to bottom, so prices are tested (and
    # reported) before returns.
    stationarity_results[symbol] = {
        'prices': adf_test(data['Close'], f"{symbol} Prices"),
        'returns': adf_test(data['Daily_Return'], f"{symbol} Returns"),
    }

print("\n📋 Summary:")
for symbol, results in stationarity_results.items():
    p_status = '✅' if results['prices'] else '❌'
    r_status = '✅' if results['returns'] else '❌'
    print(f"{symbol}: Prices {p_status}, Returns {r_status}")

In [None]:
# Risk metrics calculation
print("📊 Risk Metrics")

TRADING_DAYS = 252     # annualization factor applied to daily statistics
RISK_FREE_RATE = 0.02  # annual rate subtracted from the return in the Sharpe ratio

risk_metrics = {}        # symbol -> formatted strings for display
risk_return_points = {}  # symbol -> (annual volatility, annual return) for plotting

for symbol in SYMBOLS:
    # Sort by date so the running maximum used for drawdown follows time
    # order regardless of how features_data happens to be ordered.
    data = features_data[features_data['Symbol'] == symbol].sort_values('Date')
    returns = data['Daily_Return'].dropna()
    if returns.empty:
        # No usable returns: leave the symbol out of the table and the plot.
        continue

    annual_return = returns.mean() * TRADING_DAYS
    annual_vol = returns.std() * np.sqrt(TRADING_DAYS)
    # A zero (or NaN) volatility makes the ratio undefined; report 0 instead.
    sharpe = (annual_return - RISK_FREE_RATE) / annual_vol if annual_vol > 0 else 0
    # 5th percentile of daily returns, i.e. the one-day 95% historical VaR.
    var_95 = np.percentile(returns, 5)
    # Largest decline of the close relative to its running maximum.
    max_dd = (data['Close'] / data['Close'].cummax() - 1).min()

    risk_metrics[symbol] = {
        'Annual Return': f"{annual_return:.2%}",
        'Annual Volatility': f"{annual_vol:.2%}",
        'Sharpe Ratio': f"{sharpe:.3f}",
        'VaR (95%)': f"{var_95:.2%}",
        'Max Drawdown': f"{max_dd:.2%}",
    }
    risk_return_points[symbol] = (annual_vol, annual_return)

risk_df = pd.DataFrame(risk_metrics).T
print("\n📋 Risk Metrics Summary:")
print(risk_df)

# Risk-return plot, reusing the values computed above instead of
# recalculating them per symbol.
plt.figure(figsize=(10, 6))
for symbol, (annual_vol, annual_return) in risk_return_points.items():
    plt.scatter(annual_vol, annual_return, s=100, label=symbol,
                color=colors[symbol], alpha=0.7)
    plt.annotate(symbol, (annual_vol, annual_return),
                 xytext=(5, 5), textcoords='offset points')

plt.xlabel('Annual Volatility')
plt.ylabel('Annual Return')
plt.title('Risk-Return Profile')
if risk_return_points:
    plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Task 1 Summary: recap of the values produced by the cells above.
print("📝 TASK 1 COMPLETE - SUMMARY")
print("=" * 50)

print("\n🎯 Data Collection:")
print(f"  • Records processed: {len(features_data):,}")
print(f"  • Date range: {START_DATE} to {END_DATE}")
print(f"  • Assets: {', '.join(SYMBOLS)}")

print("\n🧹 Data Quality:")
print(f"  • Validation: {'✅ Passed' if validation_result.is_valid else '⚠️ Issues handled'}")
print(f"  • Missing values: {clean_data.isnull().sum().sum()} (processed)")
print(f"  • Features created: {len(new_cols)}")

print("\n📊 Stationarity (for ARIMA):")
non_stationary = [s for s, r in stationarity_results.items() if not r['prices']]
if non_stationary:
    print(f"  • Price series need differencing: {', '.join(non_stationary)}")
else:
    print("  • All price series stationary")

# Report the return series from the actual ADF results rather than
# asserting stationarity unconditionally.
non_stationary_returns = [s for s, r in stationarity_results.items() if not r['returns']]
if non_stationary_returns:
    print(f"  • Return series not stationary: {', '.join(non_stationary_returns)}")
else:
    print("  • Returns generally stationary ✅")

print("\n💰 Risk Insights:")
for symbol in SYMBOLS:
    # A symbol is absent from risk_metrics when it had no usable returns;
    # a direct index lookup would raise KeyError in that case.
    metrics = risk_metrics.get(symbol)
    if metrics is None:
        print(f"  • {symbol}: no return data available")
        continue
    print(f"  • {symbol}: {metrics['Annual Return']} return, {metrics['Annual Volatility']} vol, {metrics['Sharpe Ratio']} Sharpe")

# NOTE: the findings below are fixed text, not derived from the metrics above.
print("\n🎯 Key Findings:")
print("  • TSLA: High volatility, suitable for forecasting")
print("  • BND: Low volatility, stable bond ETF")
print("  • SPY: Balanced risk-return profile")
print("  • Data ready for ARIMA/LSTM modeling")

print("\n✅ READY FOR TASK 2: Time Series Forecasting Models")