# Data Collection for Economic Downturn Detection

This notebook collects all the economic data needed for the recession prediction model. It pulls data from multiple sources and combines it into a single, unified dataset for analysis.

## Data Sources

1. **Federal Reserve Economic Data (FRED)**: Core economic indicators like GDP, unemployment, inflation
2. **National Bureau of Economic Research (NBER)**: Official recession dates and periods
3. **University of Michigan**: Consumer sentiment surveys and expectations
4. **Conference Board**: Consumer confidence index
5. **Business Sentiment Indicators**: Manufacturing PMI, business optimism, CEO confidence

## Data Coverage

**Data Cutoff Date**: May 2024

We collect data from January 1970 through May 2024, covering 8 recession periods and multiple economic cycles. This gives us enough historical data for model training while including recent economic conditions.

## Requirements

- FRED API key (set in .env file as FRED_API_KEY)
- Internet connection for data fetching
- Sufficient disk space for data storage

In [None]:
# Import required libraries
import os
import sys
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import logging
from dotenv import load_dotenv
from fredapi import Fred
import warnings
warnings.filterwarnings('ignore')

# Add the src directory to the path
sys.path.append('../src')

# Import the econ_downturn package
from econ_downturn import (
    get_fred_data, get_nber_data, get_all_data,
    setup_logger, load_environment
)

# Module-level logger; the fetch helpers defined later in this notebook log through it.
logger = setup_logger('data_collection')

# Load environment settings (presumably the .env file holding FRED_API_KEY --
# confirm against econ_downturn.load_environment).
load_environment()

# Report readiness together with the directory that relative paths resolve against.
print(
    "Data collection notebook initialized successfully!",
    f"Current working directory: {os.getcwd()}",
    sep="\n",
)

## 1. FRED API Setup and Validation

Let's check that the FRED API key is set up correctly and test the connection.

In [None]:
# Check FRED API key
fred_api_key = os.getenv('FRED_API_KEY')

if not fred_api_key:
    # Nothing below can run without a key, so stop with instructions.
    print("FRED API key not found!")
    print("Please set the FRED_API_KEY environment variable in your .env file.")
    print("You can get a free API key from: https://fred.stlouisfed.org/")
    sys.exit(1)
else:
    print("FRED API key found")

# Test FRED API connection
try:
    fred = Fred(api_key=fred_api_key)
    # FRED returns observations oldest-first by default, so `limit=1` on its
    # own yields the first observation on record. Request descending order so
    # the single row returned really is the most recent one.
    test_data = fred.get_series('UNRATE', sort_order='desc', limit=1)
    print("FRED API connection successful")
    print(f"Latest unemployment rate: {test_data.iloc[-1]:.1f}% ({test_data.index[-1].strftime('%Y-%m')})")
except Exception as e:
    # Any failure here (bad key, network, empty response) means the rest of
    # the notebook cannot fetch data, so report it and stop.
    print(f"FRED API connection failed: {e}")
    sys.exit(1)

## 2. Define Data Collection Parameters

Set the date range and output directories for data collection.

In [None]:
# Collection window; END_DATE is the data cutoff date.
START_DATE, END_DATE = '1970-01-01', '2024-05-31'

# On-disk layout: one sub-directory per source plus one for processed output.
DATA_DIR = '../data'
(
    FRED_DIR,
    NBER_DIR,
    UMICH_DIR,
    CONF_BOARD_DIR,
    BUSINESS_DIR,
    PROCESSED_DIR,
) = (
    os.path.join(DATA_DIR, subdir)
    for subdir in ('fred', 'nber', 'umich', 'conf_board', 'business', 'processed')
)

# Make sure every target directory exists (no-op when already present).
for directory in (FRED_DIR, NBER_DIR, UMICH_DIR, CONF_BOARD_DIR, BUSINESS_DIR, PROCESSED_DIR):
    os.makedirs(directory, exist_ok=True)

print(f"Data collection period: {START_DATE} to {END_DATE}")
print(f"Output directories created in: {DATA_DIR}")

## 3. Fetch FRED Economic Indicators

Get the main economic indicators from the Federal Reserve Economic Data (FRED) database.

In [None]:
print("Fetching FRED economic indicators...")
print("This may take a few minutes depending on your internet connection.")

# Delegate the download to the packaged helper, passing FRED_DIR as its
# output directory.
fred_data = get_fred_data(
    api_key=fred_api_key,
    start_date=START_DATE,
    end_date=END_DATE,
    output_dir=FRED_DIR,
)

# A None result is treated as a fatal failure for this run.
if fred_data is None:
    print("Failed to fetch FRED data")
    sys.exit(1)

# Brief report on what came back.
print("FRED data collected successfully!")
print(f"   Shape: {fred_data.shape}")
print(f"   Date range: {fred_data.index.min()} to {fred_data.index.max()}")
print(f"   Indicators: {list(fred_data.columns)}")

## 4. Fetch NBER Recession Data

Get official recession dates from the National Bureau of Economic Research.

In [None]:
print("Fetching NBER recession data...")

# Delegate to the packaged helper, passing NBER_DIR as its output directory.
nber_data = get_nber_data(
    start_date=START_DATE,
    end_date=END_DATE,
    output_dir=NBER_DIR,
)

# A None result is treated as a fatal failure for this run.
if nber_data is None:
    print("Failed to fetch NBER data")
    sys.exit(1)

# The 'recession' column is summed as a count of flagged rows and compared
# against 0 for the complement (assumes a 0/1 monthly indicator -- confirm
# against get_nber_data).
print("NBER recession data collected successfully!")
print(f"   Shape: {nber_data.shape}")
print(f"   Date range: {nber_data.index.min()} to {nber_data.index.max()}")
print(f"   Recession periods: {nber_data['recession'].sum()} months")
print(f"   Non-recession periods: {(nber_data['recession'] == 0).sum()} months")

## 5. Fetch University of Michigan Consumer Sentiment Data

Get consumer sentiment data from the University of Michigan via FRED.

In [None]:
def fetch_umich_data(api_key, start_date='1970-01-01', end_date=None, output_dir='../data/umich'):
    """
    Fetch University of Michigan Consumer Sentiment data from FRED.

    Each indicator is requested independently. A series that fails to
    download is logged and skipped, so the remaining indicators are still
    saved and returned.

    Parameters
    ----------
    api_key : str
        FRED API key
    start_date : str
        Start date in 'YYYY-MM-DD' format
    end_date : str
        End date in 'YYYY-MM-DD' format, defaults to current date
    output_dir : str
        Directory to save the CSV files (created if missing)

    Returns
    -------
    pandas.DataFrame or None
        Column-wise concatenation of every indicator that was fetched
        successfully (one column per indicator), or None when no series
        could be fetched.
    """
    if end_date is None:
        end_date = datetime.now().strftime('%Y-%m-%d')

    # UMich sentiment indicators from FRED (column name -> FRED series ID).
    # NOTE(review): confirm that every ID below exists on FRED; a missing
    # series is only logged as an error and then skipped.
    umich_indicators = {
        'SENTIMENT': 'UMCSENT',     # University of Michigan: Consumer Sentiment
        'CURRENT': 'UMCURRENT',     # University of Michigan: Current Economic Conditions
        'EXPECTED': 'UMEXPECT',     # University of Michigan: Consumer Expectations
        'INFLATION_1Y': 'MICH1YR',  # University of Michigan: Inflation Expectation (1-Year)
        'INFLATION_5Y': 'MICH5YR'   # University of Michigan: Inflation Expectation (5-Year)
    }

    fred = Fred(api_key=api_key)

    # Keep each frame keyed by its own indicator name. Pairing a plain list of
    # frames with the full key list would shift the names as soon as one
    # download fails, writing data under the wrong file name.
    fetched = {}

    for name, series_id in umich_indicators.items():
        try:
            logger.info(f"Fetching {name} (Series ID: {series_id})")
            series = fred.get_series(series_id, start_date, end_date)
            fetched[name] = pd.DataFrame({name: series})
            logger.info(f"Successfully fetched {name} with {len(series)} observations")
        except Exception as e:
            # Best-effort: one bad series must not abort the others.
            logger.error(f"Error fetching {name}: {e}")

    if not fetched:
        logger.error("No UMich data was successfully fetched")
        return None

    # Merge all DataFrames side by side on their date index
    merged_data = pd.concat(list(fetched.values()), axis=1)

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Save individual series, one CSV per successfully fetched indicator
    for name, df in fetched.items():
        output_path = os.path.join(output_dir, f"{name.lower()}.csv")
        df.to_csv(output_path)
        logger.info(f"Saved {name} to {output_path}")

    # Save merged data
    merged_path = os.path.join(output_dir, "all_sentiment.csv")
    merged_data.to_csv(merged_path)
    logger.info(f"Saved merged UMich data to {merged_path}")

    return merged_data

print("Fetching University of Michigan Consumer Sentiment data...")

# Run the UMich helper over the notebook-wide collection window.
umich_data = fetch_umich_data(
    api_key=fred_api_key,
    start_date=START_DATE,
    end_date=END_DATE,
    output_dir=UMICH_DIR,
)

# The helper returns None when no series could be fetched; treat that as fatal.
if umich_data is None:
    print("Failed to fetch UMich data")
    sys.exit(1)

print("UMich sentiment data collected successfully!")
print(f"   Shape: {umich_data.shape}")
print(f"   Date range: {umich_data.index.min()} to {umich_data.index.max()}")
print(f"   Indicators: {list(umich_data.columns)}")

## 6. Fetch Conference Board Consumer Confidence Index

Get the Conference Board Consumer Confidence Index from FRED.

In [None]:
def fetch_conference_board_data(api_key, start_date='1970-01-01', end_date=None, output_dir='../data/conf_board'):
    """
    Fetch Conference Board Consumer Confidence Index data from FRED.

    Best-effort: any failure while fetching or saving is logged and an
    empty DataFrame is returned instead of raising.

    Parameters
    ----------
    api_key : str
        FRED API key
    start_date : str
        Start date in 'YYYY-MM-DD' format
    end_date : str
        End date in 'YYYY-MM-DD' format, defaults to current date
    output_dir : str
        Directory to save the CSV files (created if missing)

    Returns
    -------
    pandas.DataFrame
        Single-column ('CONF_BOARD') DataFrame indexed by 'date', or an
        empty DataFrame when the fetch or save failed
    """
    if end_date is None:
        end_date = datetime.now().strftime('%Y-%m-%d')

    # FRED series ID used for the consumer confidence column.
    # NOTE(review): this ID looks like the OECD consumer confidence indicator
    # for the United States rather than the Conference Board's own index --
    # confirm on fred.stlouisfed.org before relying on the label.
    series_id = 'CSCICP03USM665S'

    try:
        fred = Fred(api_key=api_key)
        logger.info(f"Fetching Conference Board Consumer Confidence Index (Series ID: {series_id})")

        # Fetch data from FRED
        data = fred.get_series(series_id, start_date, end_date)

        # Name the column explicitly. Passing a Series together with
        # `columns=` to the DataFrame constructor only works while the Series
        # is unnamed; a named Series would silently lose its data.
        df = data.to_frame(name='CONF_BOARD')
        df.index.name = 'date'

        logger.info(f"Fetched Conference Board data with shape: {df.shape}")

        # Create output directory and save to CSV
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, 'consumer_confidence.csv')
        df.to_csv(output_path)
        logger.info(f"Saved Conference Board data to {output_path}")

        return df

    except Exception as e:
        # Optional source: callers check `.empty` and continue without it.
        logger.error(f"Error fetching Conference Board data: {e}")
        return pd.DataFrame()

print("Fetching Conference Board Consumer Confidence Index...")

# Run the helper over the notebook-wide collection window.
conf_board_data = fetch_conference_board_data(
    api_key=fred_api_key,
    start_date=START_DATE,
    end_date=END_DATE,
    output_dir=CONF_BOARD_DIR,
)

# This source is optional: an empty frame is reported and the run goes on.
if conf_board_data.empty:
    print("Conference Board data collection failed, but continuing...")
else:
    print("Conference Board data collected successfully!")
    print(f"   Shape: {conf_board_data.shape}")
    print(f"   Date range: {conf_board_data.index.min()} to {conf_board_data.index.max()}")

## 7. Fetch Business Sentiment Indicators

Get various business sentiment indicators from FRED.

In [None]:
def fetch_business_sentiment_data(api_key, start_date='1970-01-01', end_date=None, output_dir='../data/business'):
    """
    Fetch business sentiment indicators from FRED.

    Each indicator is requested independently. A series that cannot be
    fetched is logged as a warning and skipped.

    Parameters
    ----------
    api_key : str
        FRED API key
    start_date : str
        Start date in 'YYYY-MM-DD' format
    end_date : str
        End date in 'YYYY-MM-DD' format, defaults to current date
    output_dir : str
        Directory to save the CSV files (created if missing)

    Returns
    -------
    pandas.DataFrame
        One column per successfully fetched indicator, indexed by 'date';
        an empty DataFrame when nothing could be fetched
    """
    if end_date is None:
        end_date = datetime.now().strftime('%Y-%m-%d')

    # FRED series IDs for business sentiment indicators (column name -> ID).
    # NOTE(review): several IDs do not appear to match their intended labels
    # and should be verified on fred.stlouisfed.org before these columns are
    # trusted. For example, MANEMP looks like "All Employees, Manufacturing"
    # (an employment level, not a PMI), and USPHCI looks like the Philadelphia
    # Fed coincident activity index rather than the Business Outlook Survey.
    series_ids = {
        'ISM_PMI': 'MANEMP',           # intended: ISM Manufacturing PMI
        'ISM_NONMFG': 'NMFBAI',        # intended: ISM Non-Manufacturing Index
        'BUS_OPTIMISM': 'NFCIBUSOPX',  # intended: NFIB Small Business Optimism Index
        'CEO_CONFIDENCE': 'CEOCONF',   # intended: CEO Confidence Index
        'PHILLY_FED': 'USPHCI'         # intended: Philadelphia Fed Business Outlook Survey
    }

    fred = Fred(api_key=api_key)
    data_frames = []

    for name, series_id in series_ids.items():
        try:
            logger.info(f"Fetching {name} (Series ID: {series_id})")
            # Fetch data from FRED
            data = fred.get_series(series_id, start_date, end_date)

            # Name the column explicitly. Passing a Series together with
            # `columns=` to the DataFrame constructor only works while the
            # Series is unnamed; a named Series would silently lose its data.
            df = data.to_frame(name=name)
            df.index.name = 'date'

            data_frames.append(df)
            logger.info(f"Fetched {name} data with shape: {df.shape}")

        except Exception as e:
            # Best-effort: these indicators are optional extras.
            logger.warning(f"Could not fetch {name}: {e}")

    if not data_frames:
        logger.warning("No business sentiment data fetched")
        return pd.DataFrame()

    # Merge all DataFrames side by side on their date index
    merged_df = pd.concat(data_frames, axis=1)

    logger.info(f"Merged business sentiment data with shape: {merged_df.shape}")

    # Create output directory and save to CSV
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, 'business_sentiment.csv')
    merged_df.to_csv(output_path)
    logger.info(f"Saved business sentiment data to {output_path}")

    return merged_df

print("Fetching business sentiment indicators...")

# Run the helper over the notebook-wide collection window.
business_data = fetch_business_sentiment_data(
    api_key=fred_api_key,
    start_date=START_DATE,
    end_date=END_DATE,
    output_dir=BUSINESS_DIR,
)

# This source is optional: an empty frame is reported and the run goes on.
if business_data.empty:
    print("Business sentiment data collection failed, but continuing...")
else:
    print("Business sentiment data collected successfully!")
    print(f"   Shape: {business_data.shape}")
    print(f"   Date range: {business_data.index.min()} to {business_data.index.max()}")
    print(f"   Indicators: {list(business_data.columns)}")

## 8. Integrate All Data Sources

Combine all the data sources we've collected into one dataset for analysis.

In [None]:
print("Integrating all data sources...")

# Start with the core data from get_all_data function
try:
    integrated_data = get_all_data()
    print(f"Core data loaded: {integrated_data.shape}")
except Exception as e:
    print(f"Failed to load core data: {e}")
    # If get_all_data fails, manually combine the frames fetched earlier in
    # this notebook, skipping any that are None.
    data_sources = [
        frame for frame in (fred_data, nber_data, umich_data) if frame is not None
    ]

    if data_sources:
        integrated_data = pd.concat(data_sources, axis=1)
        print(f"Manually integrated core data: {integrated_data.shape}")
    else:
        print("No data sources available for integration")
        sys.exit(1)

# Add Conference Board data if available
if not conf_board_data.empty:
    integrated_data = pd.concat([integrated_data, conf_board_data], axis=1)
    print(f"Added Conference Board data: {integrated_data.shape}")

# Add business sentiment data if available
if not business_data.empty:
    integrated_data = pd.concat([integrated_data, business_data], axis=1)
    print(f"Added business sentiment data: {integrated_data.shape}")

# Handle missing values with forward fill then backward fill.
# `fillna(method=...)` is deprecated in pandas and this notebook silences
# warnings, so use the dedicated methods, which perform the same fill.
# NOTE(review): the backward fill copies later observations into leading gaps
# (for example, series that start after 1970). For a prediction model that may
# introduce look-ahead -- confirm this is acceptable downstream.
integrated_data = integrated_data.ffill().bfill()

# Save the integrated dataset
integrated_path = os.path.join(PROCESSED_DIR, 'integrated_data.csv')
integrated_data.to_csv(integrated_path)

print(f"\nFinal integrated dataset:")
print(f"   Shape: {integrated_data.shape}")
print(f"   Date range: {integrated_data.index.min()} to {integrated_data.index.max()}")
print(f"   Saved to: {integrated_path}")
print(f"   Columns: {list(integrated_data.columns)}")

## 9. Data Collection Summary

Summary of all the data we've collected.

In [None]:
# Banner for the closing report.
print("\n" + "=" * 60)
print("DATA COLLECTION SUMMARY")
print("=" * 60)

# Headline figures are taken from the integrated frame.
print(f"\nCollection Period: {START_DATE} to {END_DATE}")
print(f"Total Data Points: {len(integrated_data)} time periods")
print(f"Total Indicators: {len(integrated_data.columns)} variables")

print("\nData Sources Collected:")

# FRED: None marks a failed fetch.
if fred_data is None:
    print("   FRED Economic Indicators: Failed")
else:
    print(f"   FRED Economic Indicators: {fred_data.shape[1]} indicators")
    print("      - GDP, Unemployment, CPI, Fed Funds Rate, etc.")

# NBER: report the share of rows counted by the 'recession' column.
if nber_data is None:
    print("   NBER Recession Data: Failed")
else:
    total_months = len(nber_data)
    recession_months = nber_data['recession'].sum()
    recession_pct = (recession_months / total_months) * 100
    print(f"   NBER Recession Data: {recession_months}/{total_months} recession months ({recession_pct:.1f}%)")

# UMich: None marks a failed fetch.
if umich_data is None:
    print("   UMich Consumer Sentiment: Failed")
else:
    print(f"   UMich Consumer Sentiment: {umich_data.shape[1]} indicators")
    print("      - Consumer Sentiment, Current Conditions, Expectations, etc.")

# Conference Board: optional source, signalled by an empty frame.
if conf_board_data.empty:
    print("   Conference Board Consumer Confidence: Not available")
else:
    print("   Conference Board Consumer Confidence: 1 indicator")

# Business sentiment: optional source, signalled by an empty frame.
if business_data.empty:
    print("   Business Sentiment Indicators: Not available")
else:
    print(f"   Business Sentiment Indicators: {business_data.shape[1]} indicators")
    print("      - ISM PMI, NFIB Optimism, CEO Confidence, etc.")

# Where the outputs were written.
print("\nData Storage:")
print(f"   Raw data saved in: {DATA_DIR}/[source]/")
print(f"   Integrated data saved in: {integrated_path}")

# Closing pointer to the next notebook.
print("\nData collection completed successfully!")
print("   The integrated dataset is ready for feature engineering and analysis.")
print("   Next step: Run notebook 01_data_exploration.ipynb")