# Smart Beta Portfolio - Data Exploration

This notebook demonstrates the data collection and initial exploration for the smart beta portfolio strategy.

In [None]:
# Import required libraries
import sys
import os
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Import our data fetcher
from data_collection.data_fetcher import DataFetcher

# Plot styling: matplotlib's 'default' style is applied first, then the
# seaborn "husl" palette (same order as before, so the palette is set last).
plt.style.use('default')
sns.set_palette("husl")

# Set the pandas display options for column count, overall width and
# per-cell width to None.
for display_option in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(display_option, None)

## 1. Initialize Data Fetcher

Let's start by initializing our data fetcher with a shorter time period — roughly the most recent five years — for testing.

In [None]:
# Initialize with roughly the most recent 5 years (5 * 365 days) for testing.
LOOKBACK_DAYS = 5 * 365

# Read the clock once so both bounds are derived from the same instant.
# Two separate datetime.now() calls could land on different calendar days
# if the cell happens to run across midnight.
run_time = datetime.now()
end_date = run_time.strftime('%Y-%m-%d')
start_date = (run_time - timedelta(days=LOOKBACK_DAYS)).strftime('%Y-%m-%d')

print(f"Fetching data from {start_date} to {end_date}")

# Both dates are passed as 'YYYY-MM-DD' strings.
fetcher = DataFetcher(start_date=start_date, end_date=end_date)

## 2. Fetch Stock Universe

Get the S&P 500 stock universe.

In [None]:
# Ask the fetcher for the S&P 500 universe and preview the first 20 entries
tickers = fetcher.get_sp500_universe()
ticker_preview = tickers[:20]
print(f"Retrieved {len(tickers)} tickers")
print(f"First 20 tickers: {ticker_preview}")

## 3. Fetch Stock Price Data

Let's fetch price data for a subset of stocks (the first 50 tickers) to test the functionality.

In [None]:
# Restrict the request to the first 50 tickers while testing
test_tickers = tickers[:50]

# Fetch prices for that subset and report the table's dimensions
prices = fetcher.get_stock_data(test_tickers)
print(f"Fetched price data shape: {prices.shape}")

# First and last index labels give the covered date range
first_date = prices.index[0]
last_date = prices.index[-1]
print(f"Date range: {first_date} to {last_date}")

# Keep prices.head() as the final expression so the notebook renders it
print("\nFirst 5 rows of price data:")
prices.head()

## 4. Calculate Returns

Calculate daily returns from the price data.

In [None]:
# Derive returns from the price table
returns = fetcher.calculate_returns(prices)
print(f"Returns data shape: {returns.shape}")

# Average, across columns, of each column's mean and standard deviation.
# The 252 factor below assumes daily observations — TODO confirm against
# what DataFetcher.calculate_returns produces.
avg_daily_return = returns.mean().mean()
avg_daily_vol = returns.std().mean()

print("\nReturn statistics:")
print(f"Mean daily return: {avg_daily_return:.4f}")
print(f"Daily volatility: {avg_daily_vol:.4f}")
print(f"Annualized return: {avg_daily_return * 252:.4f}")
print(f"Annualized volatility: {avg_daily_vol * np.sqrt(252):.4f}")

# Pairwise correlations of the first 10 columns, drawn as an annotated heatmap
print("\nCorrelation matrix (first 10 stocks):")
corr_matrix = returns.iloc[:, :10].corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Stock Return Correlations')
plt.show()

## 5. Fetch Benchmark Data

Get benchmark (SPY) data for comparison.

In [None]:
# Fetch the benchmark and compute its returns.
# NOTE(review): the to_frame()['SPY'] round trip only works if the object
# returned by get_benchmark_data() is named 'SPY' — confirm in DataFetcher.
benchmark = fetcher.get_benchmark_data()
benchmark_frame = benchmark.to_frame()
benchmark_returns = fetcher.calculate_returns(benchmark_frame['SPY'])

print(f"Benchmark data shape: {benchmark.shape}")
print(f"Benchmark annualized return: {benchmark_returns.mean() * 252:.4f}")
print(f"Benchmark annualized volatility: {benchmark_returns.std() * np.sqrt(252):.4f}")

# Growth of 1 unit invested: running product of (1 + return)
cumulative_returns = (1 + benchmark_returns).cumprod()

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(cumulative_returns.index, cumulative_returns.values)
ax.set_title('SPY Cumulative Returns')
ax.set_xlabel('Date')
ax.set_ylabel('Cumulative Return')
ax.grid(True)
plt.show()

## 6. Fetch Fama-French Factors

Download factor data from Ken French's library.

In [None]:
# Fetch Fama-French factors
factors = fetcher.get_fama_french_factors()

if factors.empty:
    print("Could not fetch Fama-French factor data")
else:
    print(f"Factor data shape: {factors.shape}")
    print(f"Factors available: {list(factors.columns)}")

    # Annualise with a factor of 12.
    # NOTE(review): this assumes monthly observations — confirm in DataFetcher.
    annual_mean = factors.mean() * 12
    annual_vol = factors.std() * np.sqrt(12)

    print("\nFactor Statistics (annualized):")
    factor_stats = pd.DataFrame({
        'Mean Return': annual_mean,
        'Volatility': annual_vol,
        'Sharpe Ratio': annual_mean / annual_vol,
    })
    print(factor_stats)

    # Compound each factor column.
    # NOTE(review): (1 + factors) assumes decimal returns, not percentages — confirm.
    cumulative_factors = (1 + factors).cumprod()

    fig, ax = plt.subplots(figsize=(12, 8))
    for factor_name, factor_growth in cumulative_factors.items():
        ax.plot(cumulative_factors.index, factor_growth, label=factor_name)

    ax.set_title('Fama-French Factor Cumulative Returns')
    ax.set_xlabel('Date')
    ax.set_ylabel('Cumulative Return')
    ax.legend()
    ax.grid(True)
    plt.show()

## 7. Fetch Macroeconomic Data

Try to fetch macro data from FRED (requires API key).

In [None]:
# Fetch macro data
macro_data = fetcher.get_macro_data()

if macro_data.empty:
    print("Could not fetch macro data (FRED API key may be required)")
else:
    print(f"Macro data shape: {macro_data.shape}")
    print(f"Macro variables: {list(macro_data.columns)}")

    # Show the most recent rows
    print("\nRecent macro data:")
    print(macro_data.tail())

    # One entry per panel: (column, subplot slot, title, y-axis label, transform).
    # A transform of None means the column is plotted as-is.
    # NOTE(review): pct_change(12) is a year-over-year change only if rows are
    # monthly — confirm the frequency of the frame DataFetcher returns.
    macro_panels = [
        ('T10Y2Y', 1, '10Y-2Y Treasury Spread', 'Spread (%)', None),
        ('UNRATE', 2, 'Unemployment Rate', 'Rate (%)', None),
        ('CPIAUCSL', 3, 'CPI Inflation (YoY)', 'Inflation (%)',
         lambda cpi_level: cpi_level.pct_change(12) * 100),
        ('DGS10', 4, '10-Year Treasury Rate', 'Rate (%)', None),
    ]

    plt.figure(figsize=(15, 10))

    for macro_col, slot, panel_title, y_label, transform in macro_panels:
        # Panels whose column is missing are skipped; their slot stays empty
        if macro_col not in macro_data.columns:
            continue
        plt.subplot(2, 2, slot)
        panel_values = macro_data[macro_col]
        if transform is not None:
            panel_values = transform(panel_values)
        plt.plot(macro_data.index, panel_values)
        plt.title(panel_title)
        plt.ylabel(y_label)
        plt.grid(True)

    plt.tight_layout()
    plt.show()

## 8. Data Quality Check

Perform basic data quality checks.

In [None]:
# Count missing values per column and list up to 10 columns that have any
missing_data = returns.isnull().sum()
has_missing = missing_data > 0
print("Missing values per stock:")
print(missing_data[has_missing].head(10))

# Flag returns beyond +/-50% as potential data issues
extreme_mask = (returns < -0.5) | (returns > 0.5)
extreme_returns = returns[extreme_mask]
extreme_total = extreme_returns.count().sum()
print(f"\nNumber of extreme returns (>50% or <-50%): {extreme_total}")

# Two side-by-side histograms
fig, (ax_single, ax_means) = plt.subplots(1, 2, figsize=(12, 4))

# Left: return distribution of the first column
first_column = returns.columns[0]
returns.iloc[:, 0].hist(bins=50, alpha=0.7, ax=ax_single)
ax_single.set_title(f'Return Distribution - {first_column}')
ax_single.set_xlabel('Daily Return')
ax_single.set_ylabel('Frequency')

# Right: distribution of the per-column mean returns
returns.mean().hist(bins=30, alpha=0.7, ax=ax_means)
ax_means.set_title('Distribution of Mean Returns Across Stocks')
ax_means.set_xlabel('Mean Daily Return')
ax_means.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## 9. Save Sample Data

Save our sample data for use in other notebooks.

In [None]:
# Datasets that are always written, in this order
always_saved = [
    (prices, 'sample_prices.csv'),
    (returns, 'sample_returns.csv'),
    (benchmark.to_frame(), 'sample_benchmark.csv'),
]
for dataset, filename in always_saved:
    fetcher.save_data(dataset, filename)

# Factor and macro data are written only when the fetch returned rows
saved_if_present = [
    (factors, 'sample_factors.csv'),
    (macro_data, 'sample_macro.csv'),
]
for dataset, filename in saved_if_present:
    if not dataset.empty:
        fetcher.save_data(dataset, filename)

print("Sample data saved successfully!")

## Summary

This notebook demonstrated the data collection capabilities of our smart beta portfolio system:

1. **Stock Universe**: Successfully retrieved S&P 500 ticker symbols
2. **Price Data**: Fetched adjusted close prices with proper error handling
3. **Returns**: Calculated daily returns with basic statistics
4. **Benchmark**: Retrieved SPY data for performance comparison
5. **Factors**: Attempted to fetch Fama-French factor data
6. **Macro Data**: Attempted to fetch economic indicators from FRED
7. **Quality Control**: Performed basic data quality checks

Next steps:
- Set up FRED API key for macro data access
- Implement factor construction algorithms
- Develop machine learning models for factor timing
- Build portfolio optimization framework