# Data Acquisition and Ingestion

In [6]:
# main.py
import sys
import os
from dotenv import load_dotenv

# Add parent directory to path for imports
sys.path.append("..")

from src.utils import (
    fetch_yfinance, scrape_sp500_table, fetch_alphavantage,
    validate_dataframe, save_with_timestamp, fetch_stock_data,
    get_summary_stats
)
from src.config import load_env, get_key

def main():
    """Run the data-acquisition pipeline end to end.

    Steps, in order:

    1. Load environment variables from a ``.env`` file and warn when
       ``ALPHAVANTAGE_API_KEY`` is not set.
    2. Fetch MSFT price data through ``fetch_stock_data`` (Alpha Vantage
       preferred), validate it, save it and print summary statistics.
    3. Scrape the S&P 500 companies table, validate it, save it and print
       a short sample.

    Progress and results are reported on stdout; nothing is returned.

    Raises:
        Exception: Any error raised by a pipeline step is reported on
            stdout and then re-raised unchanged.
    """
    try:
        # Load environment variables from .env.  ``src.config.load_env`` /
        # ``get_key`` (imported above) are an alternative way to load
        # settings; only one approach should be used.
        load_dotenv()

        # The key is only checked here so a missing key is visible early.
        # NOTE(review): presumably fetch_stock_data reads the key from the
        # environment itself and falls back to yfinance -- confirm.
        alpha_key = os.getenv("ALPHAVANTAGE_API_KEY")
        if not alpha_key:
            print("⚠️ ALPHAVANTAGE_API_KEY not found in environment")

        print("🚀 Starting data pipeline...")

        # 1. API Pull: MSFT stock data with Alpha Vantage primary, yfinance fallback
        print("\n📊 Fetching MSFT data (Alpha Vantage primary, yfinance fallback)...")
        df_api = fetch_stock_data("MSFT", prefer_alphavantage=True)

        if df_api.empty:
            # Previously an empty result was skipped without any message.
            print("⚠️ No MSFT data returned; skipping validation and save")
        else:
            if 'data_source' in df_api.columns:
                data_source = df_api['data_source'].iloc[0]
            else:
                data_source = 'unknown'
            print(f"✅ API data fetched successfully from {data_source}")
            print(f"Columns: {df_api.columns.tolist()}")
            if 'date' in df_api.columns:
                print(f"Date range: {df_api['date'].min()} to {df_api['date'].max()}")

            # Validate and save
            validate_dataframe(df_api, required_cols=["date", "open", "close", "high", "low"])
            api_path = save_with_timestamp(df_api, prefix="stock_api", source=f"{data_source}_msft")
            print(f"💾 Saved API data to: {api_path}")

            # Show summary statistics.  "volume" is not among the validated
            # columns, so only summarise the columns that are present
            # instead of risking a KeyError after the data has been saved.
            summary_cols = [
                col for col in ('open', 'high', 'low', 'close', 'volume')
                if col in df_api.columns
            ]
            if summary_cols:
                print("\n📈 MSFT Summary Statistics:")
                print(get_summary_stats(df_api[summary_cols]))

        # 2. Web Scraping: S&P500 companies
        print("\n🌐 Scraping S&P500 companies from Wikipedia...")
        df_sp500 = scrape_sp500_table()

        if df_sp500.empty:
            print("⚠️ No S&P500 data scraped; skipping validation and save")
        else:
            print("✅ S&P500 data scraped successfully")
            print(f"Found {len(df_sp500)} companies")

            # Validate and save
            validate_dataframe(df_sp500, required_cols=["Symbol", "Security"])
            scrape_path = save_with_timestamp(df_sp500, prefix="sp500_companies", source="wikipedia")
            print(f"💾 Saved scrape data to: {scrape_path}")

            # Show sample data.  "GICS Sector" is not among the validated
            # columns, so it is displayed only when the table provides it.
            sample_cols = [
                col for col in ('Symbol', 'Security', 'GICS Sector')
                if col in df_sp500.columns
            ]
            if sample_cols:
                print("\n🏢 Sample S&P500 Companies:")
                print(df_sp500[sample_cols].head())

        print("\n✅ Data pipeline completed successfully!")

    except Exception as e:
        # Top-level boundary: report the failure, then let it propagate.
        print(f"❌ Pipeline failed with error: {e}")
        raise

# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

🚀 Starting data pipeline...

📊 Fetching MSFT data (Alpha Vantage primary, yfinance fallback)...
📊 Fetching MSFT from Alpha Vantage (TIME_SERIES_DAILY)
✅ Fetched 100 records from Alpha Vantage for MSFT
✅ API data fetched successfully from alphavantage
Columns: ['date', 'open', 'high', 'low', 'close', 'volume', 'symbol', 'fetch_timestamp', 'data_source']
Date range: 2025-03-27 00:00:00 to 2025-08-19 00:00:00

🔍 Validating DataFrame...
✅ Validation passed!
   Shape: (100, 9)
   Columns: ['date', 'open', 'high', 'low', 'close', 'volume', 'symbol', 'fetch_timestamp', 'data_source']
✅ No missing data found
   Memory usage: 0.02 MB
💾 Saved 100 rows to ./data\alphavantage_msft\stock_api_20250820_155458.csv
💾 Saved API data to: ./data\alphavantage_msft\stock_api_20250820_155458.csv

📈 MSFT Summary Statistics:
             open        high         low     close        volume
count  100.000000  100.000000  100.000000  100.0000  1.000000e+02
mean   459.722350  463.950975  455.798957  460.0438  2.1

  dfs = pd.read_html(response.text)


✅ Scraped 503 S&P500 companies
✅ S&P500 data scraped successfully
Found 503 companies

🔍 Validating DataFrame...
✅ Validation passed!
   Shape: (503, 10)
   Columns: ['Symbol', 'Security', 'GICS Sector', 'GICS Sub-Industry', 'Headquarters Location', 'Date added', 'CIK', 'Founded', 'scrape_date', 'source_url']
✅ No missing data found
   Memory usage: 0.28 MB
💾 Saved 503 rows to ./data\wikipedia\sp500_companies_20250820_155458.csv
💾 Saved scrape data to: ./data\wikipedia\sp500_companies_20250820_155458.csv

🏢 Sample S&P500 Companies:
  Symbol             Security             GICS Sector
0    MMM                   3M             Industrials
1    AOS          A. O. Smith             Industrials
2    ABT  Abbott Laboratories             Health Care
3   ABBV               AbbVie             Health Care
4    ACN            Accenture  Information Technology

✅ Data pipeline completed successfully!
