# Local Spark with S3 Integration (Fixed)

This notebook demonstrates how to:
1. Load AWS credentials from Environment Variables (Docker) or local JSON file
2. Configure a local Spark session for S3 access
3. Read sample clickstream data from S3
4. Validate the connection and explore the data

**Prerequisites:**
- Java 8 or 11 installed (`java -version`)
- PySpark installed (`pip install pyspark`)
- AWS credentials (Env Vars or file)

## 1. Setup and Configuration

In [None]:
# Install required packages if needed
# !pip install pyspark boto3

In [None]:
import json
import os
from pathlib import Path
from typing import Optional

# Configuration - adjust these values for your environment.
# Default location of the JSON credentials file (inside the user's home directory).
CREDENTIALS_FILE = Path.home().joinpath('.aws', 'aws_credentials.json')
# Alternative: use a path relative to this notebook
# CREDENTIALS_FILE = Path('./aws_credentials.json')

# S3 location of the sample data - adjust for your buckets.
S3_DATA_KEY = 'clickstream/sample_clickstream.json'
# The bronze bucket name is derived from SCRIPT_BUCKET by swapping the
# '-scripts-' segment for '-bronze-'; a placeholder is used when it is unset.
S3_BRONZE_BUCKET = os.getenv('SCRIPT_BUCKET', 'your-bronze-bucket-name').replace('-scripts-', '-bronze-')

print(f"Credentials file: {CREDENTIALS_FILE}")
print(f"S3 path: s3://{S3_BRONZE_BUCKET}/{S3_DATA_KEY}")

## 2. Load AWS Credentials

Prioritizes Environment Variables (injected by Docker) over local files.

In [None]:
def load_aws_credentials(credentials_path: Path) -> Optional[dict]:
    """
    Load AWS credentials from Environment Variables or JSON file.
    Priority: Env Vars > JSON File

    Args:
        credentials_path: Location of the JSON credentials file. It is only
            consulted when AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY are
            not both set (and non-empty) in the environment.

    Returns:
        - From the environment: a dict with the keys 'aws_access_key_id',
          'aws_secret_access_key', 'region' (AWS_DEFAULT_REGION, falling back
          to 'eu-north-1') and 'session_token' (None when AWS_SESSION_TOKEN
          is not set).
        - From the file: the parsed JSON object, returned as-is.
        - None when neither source is available, so the caller can fall back
          to the default credential chain.

    Raises:
        json.JSONDecodeError: If the credentials file is not valid JSON.
        ValueError: If the credentials file does not contain a JSON object.
    """
    # 1. Check Environment Variables (Docker injection)
    access_key = os.environ.get('AWS_ACCESS_KEY_ID')
    secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
    if access_key and secret_key:
        print("‚úÖ Loaded credentials from Environment Variables")
        return {
            'aws_access_key_id': access_key,
            'aws_secret_access_key': secret_key,
            'region': os.environ.get('AWS_DEFAULT_REGION', 'eu-north-1'),
            'session_token': os.environ.get('AWS_SESSION_TOKEN'),
        }

    # 2. Check JSON File
    try:
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(credentials_path, 'r', encoding='utf-8') as f:
            creds = json.load(f)
    except FileNotFoundError:
        # If not found and no env vars, return None (let Spark try default chain)
        print(f"‚ö†Ô∏è  Credentials file not found: {credentials_path}")
        print("   Will attempt to use default chain (e.g. ~/.aws/credentials if mounted)")
        return None

    # Anything other than a JSON object cannot be used as a key/value
    # credential mapping, so fail here with a clear message.
    if not isinstance(creds, dict):
        raise ValueError(
            f"Credentials file {credentials_path} must contain a JSON object, "
            f"got {type(creds).__name__}"
        )

    print("‚úÖ Loaded credentials from JSON file")
    return creds

# Load credentials once when this cell runs. The result is None when neither
# the environment variables nor the JSON file provide credentials.
aws_creds = load_aws_credentials(CREDENTIALS_FILE)

## 3. Create Spark Session with S3 Configuration

This configures Spark to use the Hadoop AWS library for S3 access.

In [None]:
from pyspark.sql import SparkSession

def create_spark_session_with_s3(credentials: Optional[dict] = None) -> SparkSession:
    """
    Create a local Spark session configured for S3 access.

    Args:
        credentials: Dictionary with AWS credentials (optional if using IAM role or env vars).
            When given and non-empty it must contain 'aws_access_key_id' and
            'aws_secret_access_key'; 'session_token' and 'region' are applied
            only when present and non-empty. When None or empty, the AWS
            default credential provider chain is configured instead.

    Returns:
        Configured SparkSession

    Raises:
        KeyError: If a non-empty credentials dict lacks a required key.

    Note:
        The session is obtained with getOrCreate(). If a Spark session is
        already running in this process, that session is returned, and
        settings such as spark.jars.packages may not take effect until the
        existing session is stopped (see the PySpark builder documentation).
    """
    # Collect every setting first, then apply them to the builder in one place.
    options = {
        "spark.driver.memory": "2g",
        "spark.sql.shuffle.partitions": "4",
        # hadoop-aws and the AWS SDK bundle are resolved when the JVM starts.
        "spark.jars.packages": (
            "org.apache.hadoop:hadoop-aws:3.3.4,"
            "com.amazonaws:aws-java-sdk-bundle:1.12.262"
        ),
        "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
        # S3A timeouts; hadoop-aws 3.3.x documents these values as milliseconds.
        "spark.hadoop.fs.s3a.connection.timeout": "10000",
        "spark.hadoop.fs.s3a.connection.establish.timeout": "5000",
    }

    # Add credentials configuration if provided
    if credentials:
        required_keys = ("aws_access_key_id", "aws_secret_access_key")
        missing = [key for key in required_keys if key not in credentials]
        if missing:
            # Report the missing keys here rather than raising a bare KeyError
            # from the middle of the builder setup.
            raise KeyError(
                "credentials is missing required key(s): " + ", ".join(missing)
            )

        options["spark.hadoop.fs.s3a.access.key"] = credentials["aws_access_key_id"]
        options["spark.hadoop.fs.s3a.secret.key"] = credentials["aws_secret_access_key"]

        if credentials.get("session_token"):
            # Temporary (STS) credentials need the session token and the
            # matching provider class.
            options["spark.hadoop.fs.s3a.session.token"] = credentials["session_token"]
            options["spark.hadoop.fs.s3a.aws.credentials.provider"] = (
                "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider"
            )

        if credentials.get("region"):
            options["spark.hadoop.fs.s3a.endpoint"] = (
                f"s3.{credentials['region']}.amazonaws.com"
            )
    else:
        # Use default credential provider chain (env vars, ~/.aws/credentials, IAM role)
        options["spark.hadoop.fs.s3a.aws.credentials.provider"] = (
            "com.amazonaws.auth.DefaultAWSCredentialsProviderChain"
        )

    builder = SparkSession.builder.appName("LocalS3SparkSession").master("local[*]")
    for key, value in options.items():
        builder = builder.config(key, value)

    spark = builder.getOrCreate()

    print("‚úÖ Spark session created")
    print(f"   Version: {spark.version}")
    print(f"   App ID: {spark.sparkContext.applicationId}")

    return spark

# Create Spark session. When aws_creds is None (or empty), the session is
# configured to use the AWS default credential provider chain.
spark = create_spark_session_with_s3(aws_creds)

## 4. Read Data

In [None]:
# Build the object URI with the s3a:// scheme, matching the fs.s3a.* settings
# configured on the Spark session.
s3_path = f"s3a://{S3_BRONZE_BUCKET}/{S3_DATA_KEY}"
print(f"üìñ Reading from: {s3_path}")

# The read, the count and the preview all stay inside the try block, so a
# failure in any of them is reported by the handler below instead of raising
# out of the cell.
try:
    df = spark.read.json(s3_path)
    # count() and show() are Spark actions; each one runs a job over the data.
    print(f"‚úÖ Successfully read {df.count()} records")
    df.show()
except Exception as e:
    # Broad catch is deliberate at this notebook boundary: print the error
    # and let the notebook continue.
    print(f"‚ùå Failed to read from S3: {e}")