# AAPL Data Loading Test with S3 Stock Client

This notebook demonstrates how to use `S3StockDataClient` to load Apple (AAPL) stock data from our Parquet storage on S3.

## Setup and Configuration

In [1]:
import sys
import os
import pandas as pd
import logging
from datetime import datetime, date

# # Add the project root to Python path
# project_root = os.path.dirname(os.getcwd())
# if project_root not in sys.path:
#     sys.path.insert(0, project_root)

# Import the S3 client
from clients.s3_stock_client import S3StockDataClient

# Configure logging
# Root logger at INFO level, so INFO records from libraries are shown in cell
# output (e.g. the botocore credentials message seen after client creation).
logging.basicConfig(level=logging.INFO)
# Logger named after this module; none of the cells shown here reference it.
logger = logging.getLogger(__name__)

# Marker so the reader can tell the setup cell ran to completion.
print("Setup complete!")

Setup complete!


## Initialize S3 Stock Client

Configure the client to connect to our S3 bucket containing the processed stock data.

In [2]:
# Build the S3 client from an explicit settings mapping so every connection
# parameter is visible (and tweakable) in one place.
client_settings = {
    "bucket": "anawatp-us-stocks",
    "base_prefix": "parquet",
    # Match the region from our ETL job
    "aws_region": "us-west-2",
    # Disable caching for testing
    "cache_enabled": False,
}
client = S3StockDataClient(**client_settings)

# Echo what the client resolved, as a quick sanity check of the settings.
print(f"S3 Client initialized for bucket: {client.config['bucket']}")
print(f"Base path: {client.base_path}")

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


S3 Client initialized for bucket: anawatp-us-stocks
Base path: s3://anawatp-us-stocks/parquet


## Health Check

Verify that we can connect to S3 and access the data.

In [3]:
# Ask the client for its health report and print it field by field.
health = client.health_check()

print("=== Health Check Results ===")
# (label shown to the reader, key looked up in the health report)
for label, key in (
    ("Status", "status"),
    ("S3 Connection", "s3_connection"),
    ("Bucket Accessible", "bucket_accessible"),
    ("Partitions Found", "partitions_found"),
):
    print(f"{label}: {health[key]}")

# An empty/falsy 'issues' entry means the check came back clean.
if not health['issues']:
    print("\n✅ No issues found!")
else:
    print("\nIssues found:")
    for issue in health['issues']:
        print(f"  - {issue}")

KeyboardInterrupt: 

## Explore Available Data

Let's see what years and tickers are available in our dataset.

In [None]:
# Discover which years the client can see. On any failure, report it and fall
# back to an empty list so the following cells still have `available_years`.
try:
    available_years = client.get_available_years()
    print(f"Available years: {available_years}")
except Exception as err:
    print(f"Error getting years: {err}")
    available_years = []

In [None]:
# List the tickers of the most recent year and report whether AAPL is among
# them. Always leaves `latest_year` and `tickers` defined for later cells.
if not available_years:
    print("No years available to check tickers")
    latest_year = None
    tickers = []
else:
    latest_year = max(available_years)
    print(f"\nChecking tickers for {latest_year}...")

    try:
        tickers = client.get_available_tickers(year=latest_year)
        print(f"Found {len(tickers)} tickers in {latest_year}")
        print(f"First 10 tickers: {tickers[:10]}")

        # Check if AAPL is available
        aapl_message = (
            "\n✅ AAPL data is available!"
            if "AAPL" in tickers
            else "\n❌ AAPL data not found in available tickers"
        )
        print(aapl_message)
    except Exception as err:
        print(f"Error getting tickers: {err}")
        tickers = []

## List AAPL Partitions

Let's see which AAPL data partitions are available across all years by calling `list_partitions` with only a ticker (no year), then compare against a single-year listing and a lowercase ticker.

In [ ]:
# List AAPL partitions three ways: across all years (ticker only), for the
# latest year only, and with a lowercase ticker. Each check runs in its own
# try/except so a failure in a later check cannot discard the partitions that
# an earlier check already loaded.


def _print_partitions(partitions):
    """Print one summary line (year, files, path) for each partition dict."""
    for partition in partitions:
        print(f"  Year: {partition['year']}, Files: {partition['files']}, Path: {partition['path']}")


# 1) Ticker-only listing across all years.
print("=== Testing ticker-only list_partitions feature ===")
aapl_partitions = []  # stays empty only if the listing call itself fails
try:
    aapl_partitions = client.list_partitions(ticker="AAPL")
    print(f"Found {len(aapl_partitions)} AAPL partitions across all years:")
    _print_partitions(aapl_partitions)
except Exception as e:
    print(f"Error listing AAPL partitions: {e}")

# 2) Same ticker restricted to the latest year, for comparison.
#    `latest_year` comes from the ticker-discovery cell above; it is read
#    inside the try so a missing name is reported instead of aborting the cell.
try:
    if latest_year:
        print(f"\n=== AAPL partitions for {latest_year} only ===")
        aapl_latest_year = client.list_partitions(year=latest_year, ticker="AAPL")
        print(f"Found {len(aapl_latest_year)} AAPL partitions for {latest_year}:")
        _print_partitions(aapl_latest_year)
except Exception as e:
    print(f"Error listing AAPL partitions for the latest year: {e}")

# 3) Lowercase ticker: the count is compared with the uppercase result rather
#    than leaving the comparison to the reader.
print("\n=== Testing case insensitive ticker (aapl lowercase) ===")
try:
    aapl_lower = client.list_partitions(ticker="aapl")
    print(f"Found {len(aapl_lower)} partitions for 'aapl' (should be same as 'AAPL')")
    if len(aapl_lower) == len(aapl_partitions):
        print("  ✅ Lowercase ticker returned the same number of partitions")
    else:
        print(f"  ❌ Partition count differs: 'AAPL' returned {len(aapl_partitions)}")
except Exception as e:
    print(f"Error listing partitions for lowercase ticker: {e}")

## Load AAPL Data

Now let's load some AAPL data and examine its structure.

In [None]:
# Load one full year of AAPL rows (the latest year found earlier). Whatever
# happens, `aapl_data` ends up defined: a DataFrame on success, None otherwise.
if not (latest_year and "AAPL" in tickers):
    print("Cannot load AAPL data - not available or no years found")
    aapl_data = None
else:
    try:
        print(f"Loading AAPL data for {latest_year}...")

        aapl_data = client.get_data(tickers="AAPL", years=latest_year)

        print(f"\n✅ Successfully loaded AAPL data!")
        print(f"Shape: {aapl_data.shape}")
        print(f"Columns: {list(aapl_data.columns)}")

        # Span covered by the loaded rows, taken from the window start column.
        window_starts = aapl_data['window_start_et']
        print(f"Date range: {window_starts.min()} to {window_starts.max()}")

    except Exception as err:
        print(f"Error loading AAPL data: {err}")
        aapl_data = None

## Examine Data Structure

Let's look at the first few rows and basic statistics.

In [None]:
# Show a preview, the column dtypes and summary statistics of the loaded
# frame, or say so when there is nothing to show.
if aapl_data is None or aapl_data.empty:
    print("No data available to examine")
else:
    print("=== First 5 rows ===")
    display(aapl_data.head())

    print("\n=== Data Types ===")
    print(aapl_data.dtypes)

    print("\n=== Basic Statistics ===")
    display(aapl_data.describe())

## Data Quality Checks

Let's perform some basic data quality checks.

In [None]:
# Data quality checks on the loaded AAPL frame: null values, OHLC price
# consistency, regular trading hours, and weekday-only rows.
#
# All helper values (parsed timestamps, masks) live in local Series; this cell
# never adds columns to `aapl_data`, so re-running it or any earlier cell
# shows the same frame.
if aapl_data is not None and not aapl_data.empty:
    print("=== Data Quality Checks ===")

    # Check for null values
    null_counts = aapl_data.isnull().sum()
    print("Null values per column:")
    for col, count in null_counts.items():
        if count > 0:
            print(f"  {col}: {count}")
    if null_counts.sum() == 0:
        print("  ✅ No null values found")

    # Check price consistency: 'high' must not be below low/open/close and
    # 'low' must not be above open/close.
    price_issues = aapl_data[
        (aapl_data['high'] < aapl_data['low']) |
        (aapl_data['high'] < aapl_data['open']) |
        (aapl_data['high'] < aapl_data['close']) |
        (aapl_data['low'] > aapl_data['open']) |
        (aapl_data['low'] > aapl_data['close'])
    ]

    if len(price_issues) == 0:
        print("  ✅ Price data consistency checks passed")
    else:
        print(f"  ❌ Found {len(price_issues)} rows with price inconsistencies")

    # Parse the window start timestamps once; both the trading-hours and the
    # weekday checks reuse this Series.
    window_start = pd.to_datetime(aapl_data['window_start_et'])

    # Check trading hours (should be 9:30 AM - 4:00 PM ET).
    # A row is in session when its start is >= 09:30 and < 16:00, expressed
    # here as minutes since midnight. Unparseable timestamps count as outside.
    minutes_of_day = window_start.dt.hour * 60 + window_start.dt.minute
    in_session = (minutes_of_day >= 9 * 60 + 30) & (minutes_of_day < 16 * 60)
    outside_hours = aapl_data[~in_session]

    if len(outside_hours) == 0:
        print("  ✅ All data is within trading hours (9:30 AM - 4:00 PM ET)")
    else:
        hours_outside = sorted(
            window_start.dt.hour[~in_session].dropna().astype(int).unique().tolist()
        )
        print(f"  ❌ Found {len(outside_hours)} rows outside trading hours")
        print(f"      Hours found: {hours_outside}")

    # Check weekdays only
    is_weekend = window_start.dt.dayofweek.isin([5, 6])  # Saturday=5, Sunday=6
    weekend_data = aapl_data[is_weekend]

    if len(weekend_data) == 0:
        print("  ✅ All data is from weekdays only")
    else:
        print(f"  ❌ Found {len(weekend_data)} rows from weekends")
else:
    print("No data available for quality checks")

## Test Date Range Filtering

Let's test loading AAPL data for a specific date range.

In [None]:
# Date-range filter test: request only the first calendar week of the latest
# year and report what came back.
if not (latest_year and "AAPL" in tickers):
    print("Cannot test date filtering - AAPL data not available")
else:
    try:
        # Load AAPL data for first week of the year
        start_date, end_date = f"{latest_year}-01-01", f"{latest_year}-01-07"

        print(f"Loading AAPL data from {start_date} to {end_date}...")

        aapl_week = client.get_data(
            tickers="AAPL",
            years=latest_year,
            start_date=start_date,
            end_date=end_date,
        )

        print(f"\n✅ Successfully loaded AAPL data for date range!")
        print(f"Shape: {aapl_week.shape}")

        if aapl_week.empty:
            print("No data found for the specified date range")
        else:
            week_starts = aapl_week['window_start_et']
            print(f"Actual date range: {week_starts.min()} to {week_starts.max()}")
            display(aapl_week.head())

    except Exception as err:
        print(f"Error loading AAPL data with date filter: {err}")

## Test Streaming Data

Let's test the streaming functionality for large datasets.

In [None]:
# Streaming smoke test: pull at most three chunks from the stream, describe
# the first one, and report how many chunks and rows were seen.
if not (latest_year and "AAPL" in tickers):
    print("Cannot test streaming - AAPL data not available")
else:
    try:
        print(f"Testing streaming AAPL data for {latest_year}...")

        # Pre-set so both counters are defined even if the stream is empty.
        chunk_count = 0
        total_rows = 0

        for chunk_count, chunk in enumerate(
            client.stream_data(
                tickers="AAPL",
                years=latest_year,
                chunk_size=1000,
            ),
            start=1,
        ):
            total_rows += len(chunk)

            if chunk_count == 1:
                print(f"First chunk shape: {chunk.shape}")
                print(f"First chunk columns: {list(chunk.columns)}")

            # Only process first few chunks for demo
            if chunk_count >= 3:
                break

        print(f"\n✅ Streaming test completed!")
        print(f"Processed {chunk_count} chunks with {total_rows} total rows")

    except Exception as err:
        print(f"Error testing streaming: {err}")

## Summary

This notebook demonstrated the key features of the S3StockDataClient:

1. **Health checking** - Verify S3 connectivity and data availability
2. **Data exploration** - List available years, tickers, and partitions  
3. **Data loading** - Load specific ticker data with filtering
4. **Data quality** - Verify data consistency and trading hours filtering
5. **Date filtering** - Load data for specific date ranges
6. **Streaming** - Handle large datasets with chunked processing

The client provides a robust interface for accessing our processed stock data, which is stored on S3 in Parquet format and partitioned by year and ticker.