# 🌡️ 온습도 관측 데이터 분석 - AWS S3 Enhanced Version
(Temperature & Humidity Sensor Data Analysis - AWS S3 Integration)

**WebEx 강의 실습 - Scikit-Learn Data Splitting + AWS S3 Data Pipeline**

## 📋 **학습 목표:**
1. **AWS S3에서 온습도 센서 데이터** 로드 및 탐색
2. **Boto3를 활용한 클라우드 데이터 파이프라인** 구축
3. **Scikit-Learn train_test_split** 활용한 데이터 분할
4. **Stratified Splitting** 으로 데이터 분포 유지
5. **Time Series Splitting** 으로 시계열 데이터 처리
6. **데이터 분할 품질 검증** 및 모델 학습
7. **결과를 S3로 업로드** 하는 완전한 클라우드 워크플로우

## 🎯 **핵심 개념:**
- **AWS S3 Data Pipeline** - 클라우드 기반 데이터 수집 및 저장
- **Train/Validation/Test Split** - 모델 학습/검증/평가용 데이터 분리
- **Stratified Sampling** - 클래스 비율 유지한 분할
- **Time Series Cross-Validation** - 시간 순서 고려한 검증
- **Data Leakage 방지** - 미래 정보가 과거로 유출되지 않도록 방지
- **Cloud-to-Cloud Workflow** - 입력부터 출력까지 완전한 클라우드 파이프라인

## 📚 **Section 1: Import Required Libraries + AWS SDK Setup**
필요한 라이브러리들과 AWS SDK를 import하고 환경을 설정합니다.

In [None]:
# 🔐 AWS Credentials Setup and Verification
import os
import subprocess
import json

# Section banner for the credential-detection cell: title, then a 50-char rule.
print("🔍 AWS Credentials Detection:", "=" * 50, sep="\n")

# Method 1: Check for IAM Role (EC2/ECS/Lambda environment)
def check_iam_role():
    """Detect whether this process is running under an IAM role.

    Probes the EC2 instance metadata endpoint (IMDSv2 first, falling back to
    an unauthenticated IMDSv1 request if anything in the v2 flow fails) and,
    if neither yields a role, looks for the AWS_ROLE_ARN /
    AWS_WEB_IDENTITY_TOKEN_FILE environment variables.

    Returns:
        tuple: ``(True, role_name)`` when a role is found, where ``role_name``
        is the text returned by the metadata endpoint or the literal
        ``"Service Role"`` for the environment-variable case; otherwise
        ``(False, None)``.
    """
    import json
    import urllib.request

    metadata_base = 'http://169.254.169.254/latest'
    roles_url = f'{metadata_base}/meta-data/iam/security-credentials/'

    def _fetch(target, timeout=2):
        """Read one metadata endpoint and always release the connection."""
        with urllib.request.urlopen(target, timeout=timeout) as response:
            return response.read().decode('utf-8')

    try:
        # EC2 Instance Metadata Service v2 (IMDSv2)
        try:
            # Get session token first
            token = _fetch(urllib.request.Request(
                f'{metadata_base}/api/token',
                headers={'X-aws-ec2-metadata-token-ttl-seconds': '21600'},
                method='PUT',
            ))
            token_header = {'X-aws-ec2-metadata-token': token}

            # Get IAM role info
            role_name = _fetch(
                urllib.request.Request(roles_url, headers=token_header)
            ).strip()

            if role_name:
                print(f"✅ IAM Role detected: {role_name}")

                # Get role details
                creds_data = json.loads(_fetch(
                    urllib.request.Request(roles_url + role_name, headers=token_header)
                ))

                # Show only a prefix of the key ID: cell output is saved with
                # the notebook, and the other cells mask it the same way.
                access_key_id = creds_data.get('AccessKeyId')
                shown_key = f"{access_key_id[:8]}***[HIDDEN]***" if access_key_id else 'N/A'
                print(f"🔑 Access Key: {shown_key}")
                print(f"⏰ Expires: {creds_data.get('Expiration', 'N/A')}")
                return True, role_name

        except Exception:
            # Fallback to IMDSv1
            role_name = _fetch(roles_url).strip()
            if role_name:
                print(f"✅ IAM Role detected (IMDSv1): {role_name}")
                return True, role_name

    except Exception:
        # Best-effort probe: off EC2 the endpoint is unreachable, which simply
        # means "no instance role" rather than an error worth reporting.
        pass

    # Check for other IAM role indicators
    if os.getenv('AWS_ROLE_ARN') or os.getenv('AWS_WEB_IDENTITY_TOKEN_FILE'):
        print("✅ IAM Role detected (ECS/EKS/Lambda environment)")
        return True, "Service Role"

    return False, None

# Check IAM Role first
iam_role_available, role_name = check_iam_role()

# Method 2: Check AWS CLI configuration
# Runs `aws configure list`; a zero exit status counts as "configured". The
# access_key line of its output is scanned for the text 'iam-role'.
try:
    result = subprocess.run(
        ['aws', 'configure', 'list'],
        capture_output=True,
        text=True,
        timeout=10,
    )
except (subprocess.TimeoutExpired, FileNotFoundError):
    print("⚠️ AWS CLI not found or not responding")
    aws_cli_configured = False
else:
    if result.returncode != 0:
        print("⚠️ AWS CLI not configured properly")
        aws_cli_configured = False
    else:
        print("✅ AWS CLI configured successfully!")
        print("📋 AWS CLI Configuration:")

        # Parse AWS CLI output to check source
        cli_lines = result.stdout.strip().split('\n')
        for line in cli_lines:
            if 'access_key' not in line:
                continue
            if 'iam-role' in line:
                print("🎯 AWS CLI using IAM role credentials!")
            else:
                print(f"🔧 {line}")

        aws_cli_configured = True

# Method 3: Check environment variables
aws_access_key = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_key = os.getenv('AWS_SECRET_ACCESS_KEY')
aws_region = os.getenv('AWS_DEFAULT_REGION', 'us-east-1')

print(f"\n🔧 Environment Variables:")
if aws_access_key:
    # Cell output is stored in the notebook file, so credential material is
    # never echoed: the key ID is cut to a short prefix and no part of the
    # secret (not even its tail) is shown.
    print(f"🔑 AWS_ACCESS_KEY_ID: {aws_access_key[:8]}***[HIDDEN]***")
    print(f"🔐 AWS_SECRET_ACCESS_KEY: {'***[HIDDEN]***' if aws_secret_key else 'NOT SET'}")
    print(f"🌍 AWS_DEFAULT_REGION: {aws_region}")
    env_vars_configured = True
else:
    print("❌ No AWS environment variables found")
    env_vars_configured = False

# Method 4: Try to load from .env file (optional)
# python-dotenv is optional; when it is missing the cell just reports that and
# relies on the other credential sources.
env_file_configured = False
try:
    from dotenv import load_dotenv
    if os.path.exists('.env'):
        load_dotenv()
        print(f"\n📄 .env file found and loaded")
        # Re-check environment variables after loading .env
        aws_access_key = os.getenv('AWS_ACCESS_KEY_ID')
        if aws_access_key:
            env_file_configured = True
            # Show only a prefix of the key ID; cell output is saved with the
            # notebook and should not carry the full identifier.
            print(f"🔑 Loaded AWS_ACCESS_KEY_ID from .env: {aws_access_key[:8]}***[HIDDEN]***")
    else:
        print(f"\n📄 No .env file found (this is OK if AWS CLI or IAM role is configured)")
except ImportError:
    print(f"\n📄 python-dotenv not installed (this is OK if AWS CLI or IAM role is configured)")

# Summary and Test Connection
# One line per credential source, then a single flag that is true when at
# least one source was detected.
print("\n🎯 Credentials Summary:")
print("✅ IAM Role: {} {}".format(iam_role_available, f"({role_name})" if role_name else ""))
print(
    f"✅ AWS CLI Configured: {aws_cli_configured}",
    f"✅ Environment Variables: {env_vars_configured}",
    f"✅ .env File: {env_file_configured}",
    sep="\n",
)

credentials_available = any(
    (iam_role_available, aws_cli_configured, env_vars_configured, env_file_configured)
)

# Smoke-test S3 access when any credential source was detected; otherwise list
# the ways credentials can be supplied. Nothing is raised from this section:
# every failure is reported with a printed message.
if credentials_available:
    print(f"\n🚀 Testing S3 Connection...")
    
    # Test S3 access
    try:
        # Imported lazily so a missing boto3 is reported by the generic
        # handler at the bottom instead of aborting the cell.
        import boto3
        from botocore.exceptions import NoCredentialsError, ClientError
        
        s3_client = boto3.client('s3')
        
        # Test basic S3 access
        # The response itself is not inspected; the call only has to succeed.
        response = s3_client.list_buckets()
        print("✅ S3 connection successful!")
        
        # Check if elbee-ai bucket is accessible
        try:
            s3_client.head_bucket(Bucket='elbee-ai')
            print("✅ elbee-ai bucket accessible!")
            
            # Test your IAM role permissions
            if iam_role_available:
                print(f"\n🎯 Testing IAM Role Permissions ({role_name}):")
                
                # Test ListBucket permission
                try:
                    # MaxKeys=5 keeps this a cheap probe rather than a full listing.
                    response = s3_client.list_objects_v2(
                        Bucket='elbee-ai', 
                        Prefix='project-data/',
                        MaxKeys=5
                    )
                    print("✅ s3:ListBucket permission working")
                    
                    if 'Contents' in response:
                        print(f"📁 Files in elbee-ai/project-data/:")
                        for obj in response['Contents']:
                            file_size_mb = obj['Size'] / (1024 * 1024)
                            print(f"  📄 {obj['Key']} ({file_size_mb:.2f} MB)")
                            
                        # Test GetObject permission on first file
                        # head_object is used as the probe, so no object data
                        # is downloaded.
                        first_file = response['Contents'][0]['Key']
                        try:
                            s3_client.head_object(Bucket='elbee-ai', Key=first_file)
                            print("✅ s3:GetObject permission working")
                        except ClientError as e:
                            print(f"❌ s3:GetObject permission issue: {e}")
                    else:
                        print(f"📁 project-data/ folder is empty or doesn't exist")
                        
                except ClientError as e:
                    error_code = e.response['Error']['Code']
                    if error_code == 'AccessDenied':
                        print("❌ s3:ListBucket permission denied - check IAM policy")
                    else:
                        print(f"❌ ListBucket error: {error_code}")
            else:
                # List files for non-IAM role authentication
                # Same listing as the IAM branch, without the per-permission checks.
                try:
                    response = s3_client.list_objects_v2(
                        Bucket='elbee-ai', 
                        Prefix='project-data/',
                        MaxKeys=5
                    )
                    if 'Contents' in response:
                        print(f"📁 Files in elbee-ai/project-data/:")
                        for obj in response['Contents']:
                            file_size_mb = obj['Size'] / (1024 * 1024)
                            print(f"  📄 {obj['Key']} ({file_size_mb:.2f} MB)")
                    else:
                        print(f"📁 project-data/ folder is empty or doesn't exist")
                except ClientError as e:
                    print(f"⚠️ Could not list files in project-data/: {e}")
                
        except ClientError as e:
            # NOTE(review): the codes compared here are HTTP status strings
            # ('403'/'404'), unlike the named 'AccessDenied' code checked for
            # the listing call above — presumably what head_bucket reports;
            # confirm against the boto3 documentation.
            error_code = e.response['Error']['Code']
            if error_code == '403':
                print("❌ Access denied to elbee-ai bucket - check permissions")
            elif error_code == '404':
                print("❌ elbee-ai bucket not found")
            else:
                print(f"❌ Error accessing elbee-ai bucket: {error_code}")
        
    except NoCredentialsError:
        print("❌ AWS credentials not properly configured")
    except Exception as e:
        # Catch-all for this diagnostic cell (import failures, network errors, ...).
        print(f"❌ Error testing S3 connection: {e}")
        
else:
    print(f"\n❌ No AWS credentials found!")
    print(f"💡 Options:")
    print(f"   1. Use IAM role (recommended for EC2/ECS/Lambda)")
    print(f"   2. Configure AWS CLI: aws configure")
    print(f"   3. Set environment variables")
    print(f"   4. Create .env file")

# Closing status lines: working directory, which authentication style was
# detected, and the go-ahead message.
print("\n📁 Working Directory: " + os.getcwd())

print(
    "🎯 Using IAM Role authentication - Most secure! 🔒"
    if iam_role_available
    else "🎯 Using traditional credential authentication"
)

print("🎯 Ready to proceed with S3 data loading!")

In [3]:
# 🚀 Quick AWS Setup Test
# Run this cell first to verify everything is working

# Imported here so the cell is self-contained: it is meant to run before any
# other cell, so it cannot rely on an earlier cell having imported os.
import os

print("🔍 Quick AWS Setup Verification")
print("=" * 40)

try:
    import boto3
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    print("✅ All required libraries installed")
    
    # Test AWS credentials
    try:
        s3 = boto3.client('s3')
        sts = boto3.client('sts')
        
        # Get identity
        identity = sts.get_caller_identity()
        print(f"✅ AWS Identity: {identity.get('Arn', 'Unknown')}")
        
        # Test S3 access
        buckets = s3.list_buckets()
        print(f"✅ S3 Access: Found {len(buckets['Buckets'])} buckets")
        
        # Test target bucket
        s3.head_bucket(Bucket='elbee-ai')
        print("✅ elbee-ai bucket accessible")
        
        print("\n🎉 READY TO GO!")
        print("✅ All systems operational")
        print("✅ You can now run the rest of the notebook")
        
    except Exception as aws_error:
        # Any AWS-side failure (credentials, network, permissions) lands here
        # and is turned into setup instructions instead of a traceback.
        print(f"❌ AWS Error: {aws_error}")
        print("\n💡 To fix AWS issues:")
        print("1. Export credentials: export AWS_ACCESS_KEY_ID='your_key'")
        print("2. Export secret: export AWS_SECRET_ACCESS_KEY='your_secret'")
        print("3. Export region: export AWS_DEFAULT_REGION='us-east-1'")
        print("4. Re-run this cell")
        
except ImportError as import_error:
    print(f"❌ Import Error: {import_error}")
    print("💡 Install missing packages: pip install boto3 pandas numpy matplotlib")

print(f"\n📁 Current directory: {os.getcwd()}")
print(f"🎯 Notebook ready for temperature/humidity analysis!")

🔍 Quick AWS Setup Verification
✅ All required libraries installed
✅ AWS Identity: arn:aws:iam::819556863188:root
✅ S3 Access: Found 1 buckets
✅ elbee-ai bucket accessible

🎉 READY TO GO!
✅ All systems operational
✅ You can now run the rest of the notebook

📁 Current directory: Unknown
🎯 Notebook ready for temperature/humidity analysis!


In [2]:
# 📦 Install Required Packages
# Run this cell first to install necessary packages

import subprocess
import sys

def install_package(package):
    """Run ``pip install <package>`` with the interpreter running this kernel.

    Returns:
        bool: True when pip exits successfully, False when it exits non-zero
        (the failure is printed, not raised).
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    try:
        subprocess.check_call(pip_command)
    except subprocess.CalledProcessError as e:
        print(f"❌ Failed to install {package}: {e}")
        return False
    print(f"✅ {package} installed successfully")
    return True

print("📦 Installing required packages for AWS S3 integration...")
print("=" * 50)

# List of required packages
required_packages = [
    "boto3",
    "python-dotenv",
    "pandas",
    "numpy", 
    "matplotlib",
    "seaborn",
    "scikit-learn"
]

# Install each package in turn, remembering the ones pip could not install so
# the manual-install hint below names exactly those packages.
installed_count = 0
failed_packages = []
for package in required_packages:
    if install_package(package):
        installed_count += 1
    else:
        failed_packages.append(package)

print(f"\n📊 Installation Summary:")
print(f"✅ Successfully installed: {installed_count}/{len(required_packages)} packages")

if not failed_packages:
    print("🎉 All packages installed! You can proceed to the next cell.")
else:
    print("⚠️ Some packages failed to install. You may need to install them manually.")
    print(f"💡 Try running: pip install {' '.join(failed_packages)}")

print("\n🔄 Please restart the notebook kernel after installation if needed.")

📦 Installing required packages for AWS S3 integration...
Collecting boto3
  Using cached boto3-1.40.64-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore<1.41.0,>=1.40.64 (from boto3)
  Using cached botocore-1.40.64-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Using cached jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.15.0,>=0.14.0 (from boto3)
  Using cached s3transfer-0.14.0-py3-none-any.whl.metadata (1.7 kB)
Using cached boto3-1.40.64-py3-none-any.whl (139 kB)
Using cached botocore-1.40.64-py3-none-any.whl (14.1 MB)
Using cached jmespath-1.0.1-py3-none-any.whl (20 kB)
Using cached s3transfer-0.14.0-py3-none-any.whl (85 kB)
Installing collected packages: jmespath, botocore, s3transfer, boto3
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/4[0m [boto3]32m1/4[0m [botocore]
[1A[2KSuccessfully installed boto3-1.40.64 botocore-1.40.64 jmespath-1.0.1 s3transfer-0.14.0
✅ boto3 installed successfully
C

In [None]:
# 🔧 Advanced AWS Environment Diagnostic
# Run this cell to diagnose AWS configuration issues

print("🔍 Advanced AWS Environment Diagnostic", "=" * 50, sep="\n")

import os
import sys
import subprocess
import platform

# 1. Check Python environment
# Reports the interpreter behind this kernel and the directory it runs in;
# the trailing empty item produces the blank separator line.
print(
    "🐍 Python Environment:",
    f"   Python version: {sys.version}",
    f"   Python executable: {sys.executable}",
    f"   Current working directory: {os.getcwd()}",
    "",
    sep="\n",
)

# 2. Check if running on Windows host vs WSL/container - Fixed for Windows compatibility
# system_platform is kept at module level; the troubleshooting section of this
# cell reads it again.
system_platform = platform.system()
print(
    "💻 System Environment:",
    f"   Platform: {sys.platform}",
    f"   OS name: {os.name}",
    f"   System: {system_platform}",
    sep="\n",
)

if os.name != 'nt' and system_platform != 'Windows':
    print("   🐧 Running on Unix-like system")

    # Safe WSL detection for Linux systems: look for "microsoft" in `uname -a`.
    try:
        platform_info = subprocess.run(
            ['uname', '-a'],
            capture_output=True,
            text=True,
            timeout=5,
        )
        uname_ok = platform_info.returncode == 0
        if uname_ok and 'microsoft' in platform_info.stdout.lower():
            print("   🐧 WSL (Windows Subsystem for Linux) detected")
    except Exception:
        pass  # Ignore if uname fails
else:
    print("   🪟 Running on Windows")

    # Check if in WSL by looking for WSL-specific environment variables
    is_wsl = any(marker in os.environ for marker in ('WSL_DISTRO_NAME', 'WSL_INTEROP'))
    if is_wsl:
        print("   🐧 WSL detected within Windows")
print()

# 3. Check AWS CLI configuration
# Locates the `aws` executable, then reports its version and the output of
# `aws configure list`. Every failure is printed rather than raised.
print("⚙️ AWS CLI Configuration:")
try:
    # shutil.which performs the PATH lookup in-process on every platform, so
    # no `where`/`which` subprocess is needed and this section does not rely
    # on a platform variable set by another section.
    import shutil

    aws_executable = shutil.which('aws')

    if aws_executable:
        print(f"   ✅ AWS CLI found at: {aws_executable}")

        # Run the resolved path so the version and configuration checks use
        # the same executable that was just reported.
        # Check AWS version
        aws_version = subprocess.run([aws_executable, '--version'], capture_output=True, text=True, timeout=10)
        if aws_version.returncode == 0:
            print(f"   ✅ AWS CLI version: {aws_version.stdout.strip()}")

        # Check AWS configuration
        aws_config = subprocess.run([aws_executable, 'configure', 'list'], capture_output=True, text=True, timeout=10)
        if aws_config.returncode == 0:
            print("   ✅ AWS CLI configuration:")
            for line in aws_config.stdout.split('\n'):
                if line.strip():
                    print(f"      {line}")
        else:
            print("   ❌ AWS CLI not configured")
            print(f"   Error output: {aws_config.stderr}")
    else:
        print("   ❌ AWS CLI not found in PATH")

except FileNotFoundError:
    print("   ❌ AWS CLI command not found")
except subprocess.TimeoutExpired:
    print("   ❌ AWS CLI command timed out")
except Exception as e:
    print(f"   ❌ Error checking AWS CLI: {e}")
print()

# 4. Check environment variables
# Reports which AWS-related variables are set. Credential values are never
# echoed in full because cell output is stored in the notebook file: the
# secret is hidden entirely and the access key ID is cut to a short prefix,
# matching the masking used by the boto3 section of this cell.
print("🌍 Environment Variables:")
aws_env_vars = ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_DEFAULT_REGION', 'AWS_REGION', 'AWS_PROFILE']
for var in aws_env_vars:
    value = os.environ.get(var)
    if not value:
        print(f"   ❌ {var}: Not set")
    elif 'SECRET' in var:
        print(f"   ✅ {var}: ***[HIDDEN]***")
    elif var == 'AWS_ACCESS_KEY_ID':
        print(f"   ✅ {var}: {value[:8]}***[HIDDEN]***")
    else:
        print(f"   ✅ {var}: {value}")
print()

# 5. Check boto3 configuration
# Reports whether boto3 can be imported, whether its default session resolves
# credentials (only a prefix of the key ID is shown), and the session region.
print("🔄 Boto3 Configuration:")
try:
    import boto3
    print("   ✅ boto3 imported successfully")

    # Check current session
    session = boto3.Session()
    credentials = session.get_credentials()

    if not credentials:
        print("   ❌ No credentials found")
    else:
        print("   ✅ Credentials found")
        print(f"      Access Key: {credentials.access_key[:8]}***[HIDDEN]***")
        # The source label is printed only when the object exposes one.
        if hasattr(credentials, 'method'):
            print(f"      Method: {credentials.method}")

    # Check region
    region = session.region_name
    print(f"   ✅ Region: {region}" if region else "   ❌ No region configured")

except ImportError:
    print("   ❌ boto3 not installed")
except Exception as e:
    print(f"   ❌ boto3 error: {e}")
print()

# 6. Test AWS connectivity
# Calls STS, then S3, then checks the course bucket. A bucket failure is
# reported inline; any other failure prints the troubleshooting guide, whose
# first step depends on the detected platform.
print("🌐 AWS Connectivity Test:")
try:
    import boto3

    # Test STS (most basic AWS service)
    print("   Testing STS (Identity service)...")
    sts = boto3.client('sts')
    identity = sts.get_caller_identity()
    print(
        f"   ✅ Identity: {identity.get('Arn', 'Unknown')}",
        f"   ✅ Account: {identity.get('Account', 'Unknown')}",
        f"   ✅ User ID: {identity.get('UserId', 'Unknown')}",
        sep="\n",
    )

    # Test S3 access
    print("   Testing S3 access...")
    s3 = boto3.client('s3')
    buckets = s3.list_buckets()
    print(f"   ✅ S3 Access: Found {len(buckets['Buckets'])} buckets")

    # Test specific bucket
    print("   Testing elbee-ai bucket...")
    try:
        s3.head_bucket(Bucket='elbee-ai')
    except Exception as bucket_error:
        print(f"   ❌ elbee-ai bucket error: {bucket_error}")
    else:
        print("   ✅ elbee-ai bucket accessible")

    print("\n🎉 AWS CONNECTION SUCCESSFUL!")

except Exception as e:
    print(
        f"   ❌ AWS connection failed: {e}",
        f"   Error type: {type(e).__name__}",
        "\n🔧 Troubleshooting Steps:",
        sep="\n",
    )

    if system_platform != 'Windows':
        print(
            "1. If on Unix/Linux system:",
            "   - Open terminal",
            "   - Run: aws configure",
            "   - Enter your AWS Access Key ID and Secret",
            "   - Set region to: us-east-1",
            sep="\n",
        )
    else:
        print(
            "1. Since you're on Windows:",
            "   - Open Command Prompt or PowerShell as Administrator",
            "   - Run: aws configure",
            "   - Enter your AWS Access Key ID and Secret",
            "   - Set region to: us-east-1",
            "",
            "2. Alternative for Jupyter:",
            "   - Set environment variables in a cell:",
            "   import os",
            "   os.environ['AWS_ACCESS_KEY_ID'] = 'your_key_here'",
            "   os.environ['AWS_SECRET_ACCESS_KEY'] = 'your_secret_here'",
            "   os.environ['AWS_DEFAULT_REGION'] = 'us-east-1'",
            sep="\n",
        )

    print(
        "",
        "3. If credentials are set but still failing:",
        "   - Restart Jupyter server",
        "   - Restart kernel: Kernel > Restart Kernel",
        "   - Check if credentials are correctly typed",
        sep="\n",
    )

print("\n" + "=" * 50, "🎯 Diagnostic Complete!", sep="\n")

In [None]:
# 🛠️ AWS Credentials Quick Fix
# Run this cell if the diagnostic shows missing credentials

print("🛠️ AWS Credentials Quick Fix", "=" * 40, sep="\n")

import os

# Option 1: Set credentials directly (temporary fix)
print(
    "Option 1: Set credentials directly in this session",
    "⚠️ WARNING: Only use this for testing, not for production!",
    "",
    sep="\n",
)

# Get credentials from user input or environment
access_key = os.environ.get('AWS_ACCESS_KEY_ID')
secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY')

# When no key ID is present, print the available setup methods. The '#' lines
# are printed as a template for the reader; they are not executed.
if not access_key:
    print(
        "💡 AWS Access Key ID not found in environment",
        "   Please set it using one of these methods:",
        "   1. aws configure",
        "   2. export AWS_ACCESS_KEY_ID='your-access-key'",
        "   3. Uncomment and edit the lines below:",
        "",
        "# Uncomment and fill in your credentials if needed:",
        "# os.environ['AWS_ACCESS_KEY_ID'] = 'your-access-key-here'",
        "# os.environ['AWS_SECRET_ACCESS_KEY'] = 'your-secret-key-here'",
        "# os.environ['AWS_DEFAULT_REGION'] = 'us-east-1'",
        sep="\n",
    )

# Check if credentials are now set
# Only a prefix of the key ID is shown; the secret is never shown.
aws_key = os.environ.get('AWS_ACCESS_KEY_ID')
print(
    f"✅ AWS_ACCESS_KEY_ID: {aws_key[:8]}***[HIDDEN]***"
    if aws_key
    else "❌ AWS_ACCESS_KEY_ID: Not set"
)

aws_secret = os.environ.get('AWS_SECRET_ACCESS_KEY')
print(
    "✅ AWS_SECRET_ACCESS_KEY: ***[HIDDEN]***"
    if aws_secret
    else "❌ AWS_SECRET_ACCESS_KEY: Not set"
)

aws_region = os.environ.get('AWS_DEFAULT_REGION')
print(
    f"✅ AWS_DEFAULT_REGION: {aws_region}"
    if aws_region
    else "❌ AWS_DEFAULT_REGION: Not set"
)

print(
    "",
    "💡 Recommended setup methods:",
    "1. AWS CLI configuration:",
    "   - Open Command Prompt or Terminal",
    "   - Run: aws configure",
    "   - Enter your credentials when prompted",
    "2. Environment variables (temporary):",
    "   - Windows: Use setup_aws_credentials.ps1",
    "   - Linux/WSL: Use source export_aws_credentials.sh",
    "3. Restart Jupyter server after AWS CLI configuration",
    sep="\n",
)

# Test if credentials work now
if not (aws_key and aws_secret):
    print("\n⏭️ Set credentials above and re-run this cell to test")
else:
    print("\n🧪 Testing credentials...")
    try:
        import boto3
        sts = boto3.client('sts')
        identity = sts.get_caller_identity()
        print(f"✅ SUCCESS! Identity: {identity.get('Arn', 'Unknown')}")
    except Exception as e:
        print(f"❌ Still having issues: {e}")

In [None]:
# 🚀 Generate AWS Export Scripts for Different Environments
# This cell creates export scripts for various shell environments

import os
import subprocess
import platform
from pathlib import Path

print("🚀 AWS Credentials Export Script Generator")
print("=" * 50)

# Get AWS credentials from environment or AWS CLI
AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', '')
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY', '')
AWS_DEFAULT_REGION = os.environ.get('AWS_DEFAULT_REGION', 'us-east-1')

# Set to True when boto3 resolves credentials that carry a session token
# (temporary credentials). Those must not be copied into os.environ as a bare
# key/secret pair: without the token they are rejected.
has_session_token = False

# Try to get credentials from AWS CLI if not in environment
if not AWS_ACCESS_KEY_ID:
    try:
        import boto3
        session = boto3.Session()
        credentials = session.get_credentials()
        if credentials:
            # Read key, secret and token as one consistent snapshot.
            frozen_credentials = credentials.get_frozen_credentials()
            AWS_ACCESS_KEY_ID = frozen_credentials.access_key
            AWS_SECRET_ACCESS_KEY = frozen_credentials.secret_key
            has_session_token = bool(frozen_credentials.token)
            print("📋 Retrieved credentials from AWS CLI configuration")
        else:
            print("⚠️ No credentials found in AWS CLI configuration")
    except Exception as e:
        print(f"⚠️ Could not retrieve AWS CLI credentials: {e}")

# Check current environment - Fixed for Windows compatibility
current_os = os.name
system_platform = platform.system()
is_wsl = False

# Safe way to check for WSL without using uname on Windows
try:
    if system_platform == "Linux":
        # Only run uname on Linux systems
        platform_info = subprocess.run(['uname', '-a'], capture_output=True, text=True, errors='ignore')
        is_wsl = 'microsoft' in platform_info.stdout.lower() if platform_info.returncode == 0 else False
    elif system_platform == "Windows":
        # Check for WSL environment variables on Windows
        is_wsl = 'WSL_DISTRO_NAME' in os.environ or 'WSL_INTEROP' in os.environ
except Exception:
    # If any command fails, assume native environment
    pass

print(f"💻 Environment Detection:")
print(f"   OS Type: {current_os}")
print(f"   Platform: {system_platform}")
print(f"   WSL Status: {'WSL' if is_wsl else 'Native'}")

# Get secret key if not set
if not AWS_SECRET_ACCESS_KEY:
    print("\n🔐 AWS Secret Key Required:")
    print("⚠️  No AWS credentials found in environment or AWS CLI")
    print("💡 Please set up credentials using one of these methods:")
    print("   1. Run: aws configure")
    print("   2. Use the credential setup scripts")
    print("   3. Set environment variables manually")

    # Use placeholder for template generation
    AWS_ACCESS_KEY_ID = AWS_ACCESS_KEY_ID or "YOUR_ACCESS_KEY_HERE"
    AWS_SECRET_ACCESS_KEY = "YOUR_SECRET_KEY_HERE"

print(f"\n📋 Current AWS Configuration:")
if AWS_ACCESS_KEY_ID and AWS_ACCESS_KEY_ID != "YOUR_ACCESS_KEY_HERE":
    print(f"✅ AWS_ACCESS_KEY_ID: {AWS_ACCESS_KEY_ID[:8]}***")
else:
    print(f"❌ AWS_ACCESS_KEY_ID: Not configured")

if AWS_SECRET_ACCESS_KEY and AWS_SECRET_ACCESS_KEY != "YOUR_SECRET_KEY_HERE":
    print(f"✅ AWS_SECRET_ACCESS_KEY: ***[CONFIGURED]***")
else:
    print(f"❌ AWS_SECRET_ACCESS_KEY: Not configured")

print(f"✅ AWS_DEFAULT_REGION: {AWS_DEFAULT_REGION}")

# Generate export commands for different environments
# The real secret is deliberately NOT interpolated into the printed commands:
# cell output is stored inside the notebook file, so echoing it would hand the
# key to anyone the notebook is shared with. The template placeholder is
# printed instead and the reader substitutes the secret when pasting.
secret_for_display = "YOUR_SECRET_KEY_HERE"

print(f"\n📋 Export Commands for Different Environments:")
print("=" * 50)
print("🔒 The secret key is never printed - replace YOUR_SECRET_KEY_HERE when pasting")

print(f"\n🪟 Windows Command Prompt:")
print(f"set AWS_ACCESS_KEY_ID={AWS_ACCESS_KEY_ID}")
print(f"set AWS_SECRET_ACCESS_KEY={secret_for_display}")
print(f"set AWS_DEFAULT_REGION={AWS_DEFAULT_REGION}")

print(f"\n💙 PowerShell:")
print(f"$env:AWS_ACCESS_KEY_ID = '{AWS_ACCESS_KEY_ID}'")
print(f"$env:AWS_SECRET_ACCESS_KEY = '{secret_for_display}'")
print(f"$env:AWS_DEFAULT_REGION = '{AWS_DEFAULT_REGION}'")

print(f"\n🐧 Bash/WSL/Linux:")
print(f"export AWS_ACCESS_KEY_ID='{AWS_ACCESS_KEY_ID}'")
print(f"export AWS_SECRET_ACCESS_KEY='{secret_for_display}'")
print(f"export AWS_DEFAULT_REGION='{AWS_DEFAULT_REGION}'")

print(f"\n🐍 Python/Jupyter (this session):")
print(f"os.environ['AWS_ACCESS_KEY_ID'] = '{AWS_ACCESS_KEY_ID}'")
print(f"os.environ['AWS_SECRET_ACCESS_KEY'] = '{secret_for_display}'")
print(f"os.environ['AWS_DEFAULT_REGION'] = '{AWS_DEFAULT_REGION}'")

# Set for current Python session if credentials are valid
credentials_resolved = bool(
    AWS_ACCESS_KEY_ID and AWS_ACCESS_KEY_ID != "YOUR_ACCESS_KEY_HERE" and
    AWS_SECRET_ACCESS_KEY and AWS_SECRET_ACCESS_KEY != "YOUR_SECRET_KEY_HERE"
)
if credentials_resolved and has_session_token:
    # boto3 already resolves these temporary credentials by itself; exporting
    # only the key/secret pair would shadow them with an unusable copy.
    print(f"\n⚠️ Skipping Python session setup - temporary credentials detected")
    print(f"💡 boto3 already resolves them (including the session token) on its own")
elif credentials_resolved:
    os.environ['AWS_ACCESS_KEY_ID'] = AWS_ACCESS_KEY_ID
    os.environ['AWS_SECRET_ACCESS_KEY'] = AWS_SECRET_ACCESS_KEY
    os.environ['AWS_DEFAULT_REGION'] = AWS_DEFAULT_REGION
    print(f"\n✅ Environment variables set for current Python session!")
else:
    print(f"\n⚠️ Skipping Python session setup - credentials not configured")
    print(f"💡 Please run 'aws configure' or set credentials manually")

# Check available export scripts
# These names are looked up relative to the current working directory.
script_files = [
    'export_aws_credentials.ps1',
    'export_aws_credentials.bat', 
    'export_aws_credentials.sh'
]

print("\n📂 Available Export Scripts:")
for script in script_files:
    script_path = Path(script)
    print(
        f"✅ {script} - Ready to use"
        if script_path.exists()
        else f"❌ {script} - Not found"
    )

# Usage hints; the wording differs depending on the detected platform.
print("\n💡 How to Use the Export Scripts:")
if system_platform != "Windows":
    print(
        "1. 🪟 Windows Command Prompt: Run export_aws_credentials.bat",
        "2. 💙 PowerShell: Run ./export_aws_credentials.ps1",
        "3. 🐧 Bash/WSL: Run source export_aws_credentials.sh",
        "4. 🐍 Jupyter: Run the Python commands above in a cell",
        sep="\n",
    )
else:
    print(
        "🪟 Since you're on Windows:",
        "1. Command Prompt: Run export_aws_credentials.bat",
        "2. PowerShell: Run ./export_aws_credentials.ps1",
        "3. For WSL: Run source export_aws_credentials.sh (inside WSL)",
        "4. Jupyter (current): Run the Python commands above",
        sep="\n",
    )

# Test current environment
# Re-reads the three variables from os.environ, reports each one (key ID cut
# to a prefix, secret hidden) and, when key and secret are both present,
# verifies them with an STS identity call.
print("\n🧪 Testing Current Environment:")
current_access_key = os.environ.get('AWS_ACCESS_KEY_ID')
current_secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
current_region = os.environ.get('AWS_DEFAULT_REGION')

print(
    f"✅ AWS_ACCESS_KEY_ID: {current_access_key[:8]}***"
    if current_access_key
    else "❌ AWS_ACCESS_KEY_ID: Not set"
)
print(
    "✅ AWS_SECRET_ACCESS_KEY: ***[HIDDEN]***"
    if current_secret_key
    else "❌ AWS_SECRET_ACCESS_KEY: Not set"
)
print(
    f"✅ AWS_DEFAULT_REGION: {current_region}"
    if current_region
    else "❌ AWS_DEFAULT_REGION: Not set"
)

if not (current_access_key and current_secret_key):
    print("\n⚠️ AWS credentials need to be set up")
else:
    print("\n🎉 AWS credentials are configured for this session!")
    try:
        import boto3
        sts = boto3.client('sts')
        identity = sts.get_caller_identity()
        print(f"✅ AWS Identity verified: {identity.get('Arn', 'Unknown')}")
    except Exception as e:
        print(f"⚠️ AWS test failed: {e}")

print(
    "\n🎯 Next Steps:",
    "1. Configure AWS credentials using 'aws configure'",
    "2. Re-run this cell to generate proper export commands",
    "3. Use one of the export scripts for persistent setup",
    "4. Run the diagnostic cell to verify setup",
    sep="\n",
)

In [None]:
# 📤 S3 Data Preparation (Optional)
# This cell creates and uploads sample data if it doesn't exist in S3

def create_and_upload_sample_data(bucket='elbee-ai',
                                  key='project-data/온습도_관측_데이터.csv'):
    """Generate synthetic hourly temperature/humidity data and upload it to S3 as CSV.

    The data covers 2024-01-01 00:00 through 2024-12-31 00:00 at hourly
    resolution and is reproducible (NumPy's global RNG is seeded with 42).

    Args:
        bucket: Name of the destination S3 bucket.
        key: Object key the CSV is written to.

    Returns:
        True if the object was uploaded, False if anything in the upload step
        failed (boto3 missing, no credentials, access denied, ...). The error
        is printed rather than raised.
    """
    import numpy as np
    import pandas as pd
    from datetime import datetime

    print("📤 Preparing sample data for S3...")

    # Create realistic temperature/humidity data
    np.random.seed(42)
    start_date = datetime(2024, 1, 1)
    end_date = datetime(2024, 12, 31)
    # Lower-case 'h' is the hourly alias; upper-case 'H' is deprecated since pandas 2.2.
    date_range = pd.date_range(start=start_date, end=end_date, freq='h')

    n_samples = len(date_range)
    # Plain ndarrays keep all of the arithmetic below in NumPy.
    day_of_year = date_range.dayofyear.to_numpy()
    hour_of_day = date_range.hour.to_numpy()

    # Temperature = seasonal sine + daily sine + Gaussian noise
    base_temp = 15 + 10 * np.sin(2 * np.pi * day_of_year / 365.25)
    daily_temp = 5 * np.sin(2 * np.pi * hour_of_day / 24)
    noise_temp = np.random.normal(0, 2, n_samples)
    temperature = base_temp + daily_temp + noise_temp

    # Relative humidity: seasonal term, inverse temperature term, noise; clipped to 10-95 %
    base_humidity = 60 + 20 * np.sin(2 * np.pi * (day_of_year + 90) / 365.25)
    temp_humidity = -0.5 * (temperature - 20)
    noise_humidity = np.random.normal(0, 5, n_samples)
    humidity = np.clip(base_humidity + temp_humidity + noise_humidity, 10, 95)

    # Magnus formula gives the actual vapour pressure in hPa ...
    vapor_pressure = (humidity / 100) * 6.112 * np.exp(17.67 * temperature / (243.5 + temperature))
    # ... which the ideal-gas law converts to absolute humidity in g/m³.
    absolute_humidity = 216.74 * vapor_pressure / (273.15 + temperature)

    # Create DataFrame
    df = pd.DataFrame({
        'timestamp': date_range,
        'temperature': np.round(temperature, 1),
        'humidity': np.round(humidity, 1),
        'absolute_humidity': np.round(absolute_humidity, 2),
        'location': np.random.choice(['실내', '실외'], n_samples, p=[0.6, 0.4])
    })

    # Serialize to CSV text; the bytes are encoded explicitly at upload time
    csv_content = df.to_csv(index=False)

    # Upload to S3
    try:
        # Imported here so a missing SDK is reported through the False return
        # value like every other upload failure.
        import boto3

        s3_client = boto3.client('s3')
        s3_client.put_object(
            Bucket=bucket,
            Key=key,
            Body=csv_content.encode('utf-8'),
            ContentType='text/csv'
        )
        print(f"✅ Sample data uploaded to s3://{bucket}/{key}")
        print(f"📊 Data shape: {df.shape}")
        print(f"📅 Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
        return True

    except Exception as e:
        print(f"❌ Failed to upload sample data: {e}")
        return False

# Check whether the data file already exists in S3; create and upload sample
# data only when S3 positively reports that the object is missing.
import boto3
from botocore.exceptions import ClientError

try:
    s3_client = boto3.client('s3')
    s3_client.head_object(Bucket='elbee-ai', Key='project-data/온습도_관측_데이터.csv')
    print("✅ Data file already exists in S3!")

except ClientError as e:
    # HEAD responses carry no error body, so a missing key surfaces as a
    # generic ClientError with code "404" rather than a NoSuchKey exception.
    error_code = e.response.get('Error', {}).get('Code', '')
    if error_code in ('404', 'NoSuchKey', 'NotFound'):
        print("⚠️ Data file not found in S3. Creating sample data...")
        success = create_and_upload_sample_data()
        if success:
            print("🎯 Ready to proceed with data analysis!")
        else:
            print("⚠️ Will proceed with local simulation data")
    else:
        # e.g. 403: the object may well exist, so do not overwrite it with
        # generated sample data.
        print(f"⚠️ Could not check S3 file: {e}")

except Exception as e:
    print(f"⚠️ Could not check S3 file: {e}")
    print("💡 Will attempt to create sample data...")
    create_and_upload_sample_data()

In [None]:
# 🔧 필수 라이브러리 Import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# ☁️ AWS 관련 라이브러리
import boto3
from botocore.exceptions import ClientError, NoCredentialsError
import io
import os
from urllib.parse import urlparse

# Scikit-Learn 데이터 분할 관련 모듈들
from sklearn.model_selection import (
    train_test_split,      # 기본 train/test 분할
    StratifiedShuffleSplit, # 계층화 분할
    TimeSeriesSplit,       # 시계열 분할
    cross_val_score,       # 교차검증
    validation_curve       # 검증 곡선
)

# 모델링 관련
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder

# 🔤 한글 폰트 설정 (matplotlib) - 개선된 버전
import matplotlib.font_manager as fm
import platform

def setup_korean_fonts():
    """Pick an installed Korean-capable font and configure matplotlib to use it.

    Candidate fonts depend on the operating system and are tried in priority
    order. If none is installed, matplotlib falls back to 'DejaVu Sans'.
    In both cases 'axes.unicode_minus' is disabled.

    Returns:
        The name of the font family that was applied.
    """

    print("🔤 한글 폰트 설정 중...")

    # Per-OS Korean font candidates, highest priority first
    if platform.system() == 'Windows':
        korean_fonts = ['Gulim', 'Malgun Gothic', 'Microsoft YaHei', 'SimHei']
    elif platform.system() == 'Darwin':  # macOS
        korean_fonts = ['Gulim', 'Apple SD Gothic Neo', 'AppleGothic']
    else:  # Linux
        # 'NanumGothic' is the family name commonly registered by the Nanum font packages.
        korean_fonts = ['Gulim', 'Noto Sans CJK KR', 'Nanum Gothic', 'NanumGothic', 'DejaVu Sans']

    # Collect the family names matplotlib knows about
    available_fonts = [f.name for f in fm.fontManager.ttflist]
    print(f"📝 시스템 폰트 수: {len(available_fonts)}개")

    # Set for O(1) membership tests while scanning the candidates
    installed = set(available_fonts)
    selected_font = next((font for font in korean_fonts if font in installed), None)

    if selected_font:
        print(f"✅ 선택된 한글 폰트: {selected_font}")
        plt.rcParams['font.family'] = selected_font
        plt.rcParams['axes.unicode_minus'] = False
        print(f"✅ 한글 폰트 설정 완료: {selected_font}")
    else:
        print("⚠️ 한글 폰트를 찾을 수 없습니다. 기본 설정을 사용합니다.")
        selected_font = 'DejaVu Sans'
        plt.rcParams['font.family'] = selected_font
        plt.rcParams['axes.unicode_minus'] = False

    return selected_font

# Apply the Korean font configuration
font_name = setup_korean_fonts()

# Summarize the loaded library versions and the selected font
print("\n".join([
    "✅ 모든 라이브러리가 성공적으로 로드되었습니다!",
    f"📊 Pandas Version: {pd.__version__}",
    f"🔢 NumPy Version: {np.__version__}",
    f"☁️ Boto3 Version: {boto3.__version__}",
    "🤖 Scikit-Learn 분할 모듈들이 준비되었습니다!",
    "📈 Matplotlib & Seaborn 시각화 준비완료!",
    f"🔤 한글 폰트: {font_name}",
    "\n🎯 AWS S3 기반 온습도 센서 데이터 분석을 시작하겠습니다!",
]))

## ☁️ **Section 2: AWS S3 Configuration and Data Loading**
AWS S3 설정 및 온습도 센서 데이터를 클라우드에서 로드합니다.

In [None]:
# ☁️ AWS S3 client configuration
print("☁️ AWS S3 클라이언트를 설정하고 있습니다...")

# Bucket, key prefix and file name of the sensor CSV
S3_BUCKET = 'elbee-ai'
S3_PREFIX = 'project-data/'
DATA_FILE_NAME = '온습도_관측_데이터.csv'  # same as the original file name
# Full object key = prefix + file name
S3_DATA_KEY = S3_PREFIX + DATA_FILE_NAME

def setup_s3_client():
    """Create an S3 client and verify that the configured bucket is reachable.

    Reads the module-level S3_BUCKET and S3_PREFIX settings, checks the bucket
    with head_bucket and prints up to 10 objects found under the prefix.

    Returns:
        The boto3 S3 client on success, or None when the bucket check fails,
        no credentials are found, or any other error occurs (a message is
        printed in each case).
    """
    try:
        # Credentials are resolved by boto3 itself; the sources mentioned for
        # this notebook are environment variables, the AWS CLI configuration
        # (~/.aws/credentials) and an IAM role when running on EC2/Lambda.
        client = boto3.client('s3')

        try:
            # Connectivity test: does the bucket exist and may we access it?
            client.head_bucket(Bucket=S3_BUCKET)
            print(f"✅ S3 버킷 '{S3_BUCKET}' 연결 성공!")

            # List a few objects under the prefix
            listing = client.list_objects_v2(
                Bucket=S3_BUCKET,
                Prefix=S3_PREFIX,
                MaxKeys=10
            )

            if 'Contents' not in listing:
                print(f"📁 '{S3_PREFIX}' 경로에 파일이 없습니다.")
            else:
                print(f"📁 '{S3_PREFIX}' 경로의 파일 목록:")
                for entry in listing['Contents']:
                    size_mb = entry['Size'] / (1024 * 1024)
                    print(f"  📄 {entry['Key']} ({size_mb:.2f} MB)")

        except ClientError as e:
            code = e.response['Error']['Code']
            if code != '404':
                print(f"❌ S3 버킷 액세스 오류: {code}")
            else:
                print(f"❌ S3 버킷 '{S3_BUCKET}'을 찾을 수 없습니다.")
            return None

        return client

    except NoCredentialsError:
        for hint in (
            "❌ AWS 자격 증명을 찾을 수 없습니다.",
            "💡 다음 중 하나를 설정하세요:",
            "   1. 환경변수: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY",
            "   2. AWS CLI: aws configure",
            "   3. IAM 역할 (클라우드 환경)",
        ):
            print(hint)
        return None
    except Exception as e:
        print(f"❌ S3 클라이언트 설정 중 오류: {e}")
        return None

def load_data_from_s3(s3_client, bucket, key):
    """Download a CSV object from S3 and parse it into a DataFrame.

    The payload is parsed as UTF-8, CP949 and EUC-KR in that order. If every
    attempt fails with a decoding error, the bytes are decoded as UTF-8 with
    undecodable bytes dropped and parsed from the resulting text.

    Args:
        s3_client: Object exposing ``get_object(Bucket=..., Key=...)`` whose
            result has a readable ``'Body'`` (a boto3 S3 client).
        bucket: Bucket name.
        key: Object key of the CSV file.

    Returns:
        The parsed DataFrame, or None if the download or parsing failed
        (the error is printed).
    """
    try:
        print(f"📥 S3에서 데이터를 다운로드 중: s3://{bucket}/{key}")

        # Download the object and read the raw bytes once
        response = s3_client.get_object(Bucket=bucket, Key=key)
        csv_content = response['Body'].read()

        # Try the encodings commonly used for Korean CSV files
        df = None
        for encoding in ('utf-8', 'cp949', 'euc-kr'):
            try:
                df = pd.read_csv(io.BytesIO(csv_content), encoding=encoding)
            except UnicodeDecodeError:
                continue
            print(f"✅ 인코딩 '{encoding}'으로 데이터 로드 성공!")
            break

        if df is None:
            print("❌ 모든 인코딩 시도 실패. UTF-8로 강제 로드합니다.")
            # read_csv has no ``errors=`` keyword (passing it raises TypeError),
            # so decode leniently ourselves and parse the resulting text.
            lossy_text = csv_content.decode('utf-8', errors='ignore')
            df = pd.read_csv(io.StringIO(lossy_text))

        print(f"✅ S3 데이터 로드 완료! 크기: {df.shape}")
        return df

    except ClientError as e:
        error_code = e.response['Error']['Code']
        if error_code == 'NoSuchKey':
            print(f"❌ 파일을 찾을 수 없습니다: s3://{bucket}/{key}")
        else:
            print(f"❌ S3 다운로드 오류: {error_code}")
        return None
    except Exception as e:
        print(f"❌ 데이터 로드 중 오류: {e}")
        return None

def create_simulation_data():
    """Generate synthetic hourly temperature/humidity data as a local fallback.

    Used when the S3 data cannot be loaded. The series runs from
    2024-01-01 00:00 through 2024-12-31 00:00 at hourly resolution and is
    reproducible (NumPy's global RNG is seeded with 42).

    Returns:
        DataFrame with columns timestamp, temperature, humidity,
        absolute_humidity, condition, location, day_of_week, hour, month
        and season.
    """
    print("🔄 시뮬레이션 온습도 데이터를 생성 중...")

    # Time range (hourly samples)
    np.random.seed(42)  # fixed seed for reproducible output
    start_date = datetime(2024, 1, 1)
    end_date = datetime(2024, 12, 31)
    # Lower-case 'h' is the hourly alias; upper-case 'H' is deprecated since pandas 2.2.
    date_range = pd.date_range(start=start_date, end=end_date, freq='h')

    n_samples = len(date_range)

    # Plain ndarrays keep all of the arithmetic below in NumPy.
    day_of_year = date_range.dayofyear.to_numpy()
    hour_of_day = date_range.hour.to_numpy()

    # Temperature = seasonal sine + daily sine + Gaussian noise
    base_temp = 15 + 10 * np.sin(2 * np.pi * day_of_year / 365.25)  # seasonal pattern
    daily_temp = 5 * np.sin(2 * np.pi * hour_of_day / 24)  # daily pattern
    noise_temp = np.random.normal(0, 2, n_samples)  # noise
    temperature = base_temp + daily_temp + noise_temp

    # Relative humidity: seasonal term + inverse temperature term + noise, clipped to 10-95 %
    base_humidity = 60 + 20 * np.sin(2 * np.pi * (day_of_year + 90) / 365.25)  # seasonal pattern
    temp_humidity = -0.5 * (temperature - 20)  # falls as temperature rises
    noise_humidity = np.random.normal(0, 5, n_samples)  # noise
    humidity = np.clip(base_humidity + temp_humidity + noise_humidity, 10, 95)

    # Magnus formula gives the actual vapour pressure in hPa ...
    vapor_pressure = (humidity / 100) * 6.112 * np.exp(17.67 * temperature / (243.5 + temperature))
    # ... which the ideal-gas law converts to absolute humidity in g/m³.
    absolute_humidity = 216.74 * vapor_pressure / (273.15 + temperature)

    def classify(temp, hum):
        """Label one reading from its temperature band and humidity threshold."""
        if temp < 5:
            return "추위" if hum < 60 else "습한추위"
        if temp < 20:
            return "서늘함" if hum < 60 else "습한서늘함"
        if temp < 30:
            return "적정" if hum < 70 else "습함"
        return "더위" if hum < 80 else "무더위"

    # Weather condition label per sample (based on the unrounded values)
    conditions = [classify(temp, hum) for temp, hum in zip(temperature, humidity)]

    # Sensor location (indoor / outdoor), drawn 60 % / 40 %
    sensor_locations = np.random.choice(['실내', '실외'], n_samples, p=[0.6, 0.4])

    # Assemble the frame
    # NOTE(review): 'season' buckets calendar quarters (months 1-3 -> '겨울', ...),
    # not meteorological seasons - confirm this is intended.
    df = pd.DataFrame({
        'timestamp': date_range,
        'temperature': np.round(temperature, 1),
        'humidity': np.round(humidity, 1),
        'absolute_humidity': np.round(absolute_humidity, 2),
        'condition': conditions,
        'location': sensor_locations,
        'day_of_week': date_range.day_name(),
        'hour': date_range.hour,
        'month': date_range.month,
        'season': pd.cut(date_range.month,
                         bins=[0, 3, 6, 9, 12],
                         labels=['겨울', '봄', '여름', '가을'])
    })

    print(f"✅ 시뮬레이션 데이터 생성 완료! 크기: {df.shape}")
    return df

# Set up the S3 client, then load the sensor data (S3 first, simulation as fallback)
s3_client = setup_s3_client()

# Only attempt the download when a client could be created
sensor_data = load_data_from_s3(s3_client, S3_BUCKET, S3_DATA_KEY) if s3_client else None

if sensor_data is not None:
    print("\n✅ S3에서 실제 온습도 데이터를 성공적으로 로드했습니다!")
else:
    # S3 load failed: fall back to generated data
    print("\n⚠️ S3에서 데이터를 로드할 수 없어 시뮬레이션 데이터를 사용합니다.")
    sensor_data = create_simulation_data()

## 🔄 **Section 3: Data Preprocessing and Standardization**
S3에서 로드한 데이터를 전처리하고 표준화합니다.

In [None]:
# 🔄 S3 data preprocessing and standardization
print("🔄 S3 데이터 전처리 및 표준화를 시작합니다...")


def _first_present(columns, candidates):
    """Return the first name in `candidates` that is in `columns`, else None."""
    return next((name for name in candidates if name in columns), None)


def _classify_condition(temp, hum):
    """Label one reading from its temperature band and humidity threshold."""
    if pd.isna(temp) or pd.isna(hum):
        return '알수없음'
    if temp < 5:
        return "추위" if hum < 60 else "습한추위"
    if temp < 20:
        return "서늘함" if hum < 60 else "습한서늘함"
    if temp < 30:
        return "적정" if hum < 70 else "습함"
    return "더위" if hum < 80 else "무더위"


def standardize_s3_data(df):
    """Convert a loaded sensor DataFrame into the notebook's standard layout.

    Works on a copy of `df`. Known aliases of the timestamp, temperature,
    humidity and absolute-humidity columns are copied to the canonical names;
    missing pieces are derived or generated as described in the inline steps.

    Args:
        df: Raw DataFrame as loaded from S3 (or generated locally).

    Returns:
        A new DataFrame with the canonical columns added; `df` is not modified.
    """
    print(f"📋 원본 데이터 정보: {df.shape}, 컬럼: {list(df.columns)}")

    standardized_df = df.copy()

    # 1. Timestamp column
    timestamp_col = _first_present(
        standardized_df.columns,
        ['timestamp', 'datetime', 'date', 'time', 'Date', 'Time'])

    if timestamp_col:
        print(f"📅 타임스탬프 컬럼 '{timestamp_col}' 발견")
        # Convert anything that is not already datetime; checking only for
        # dtype 'object' misses other string dtypes and later breaks `.dt`.
        if not pd.api.types.is_datetime64_any_dtype(standardized_df[timestamp_col]):
            standardized_df[timestamp_col] = pd.to_datetime(standardized_df[timestamp_col])
        if timestamp_col != 'timestamp':
            standardized_df['timestamp'] = standardized_df[timestamp_col]
    else:
        print("📅 타임스탬프 컬럼이 없어 인덱스 기반으로 생성합니다.")
        # Lower-case 'h' is the hourly alias; upper-case 'H' is deprecated since pandas 2.2.
        standardized_df['timestamp'] = pd.date_range(start='2024-01-01', periods=len(standardized_df), freq='h')

    # 2. Temperature column
    temp_col = _first_present(
        standardized_df.columns,
        ['temperature', 'temp', 'T', 'Temperature', 'TEMP'])

    if temp_col and temp_col != 'temperature':
        standardized_df['temperature'] = standardized_df[temp_col]
        print(f"🌡️ 온도 컬럼 '{temp_col}' → 'temperature'로 매핑")
    elif not temp_col:
        # Heuristic fallback: treat the first numeric column as temperature
        print("⚠️ 온도 컬럼을 찾을 수 없습니다. 첫 번째 숫자 컬럼을 사용합니다.")
        numeric_cols = standardized_df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            standardized_df['temperature'] = standardized_df[numeric_cols[0]]

    # 3. Humidity column
    humidity_col = _first_present(
        standardized_df.columns,
        ['humidity', 'rh', 'RH', 'Humidity', 'HUMIDITY'])

    if humidity_col and humidity_col != 'humidity':
        standardized_df['humidity'] = standardized_df[humidity_col]
        print(f"💧 습도 컬럼 '{humidity_col}' → 'humidity'로 매핑")
    elif not humidity_col:
        # Heuristic fallback: treat the second numeric column as humidity
        print("⚠️ 습도 컬럼을 찾을 수 없습니다. 두 번째 숫자 컬럼을 사용합니다.")
        numeric_cols = standardized_df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 1:
            standardized_df['humidity'] = standardized_df[numeric_cols[1]]

    # 4. Absolute humidity column
    abs_humidity_col = _first_present(
        standardized_df.columns,
        ['absolute_humidity', 'ah', 'AH', 'AbsoluteHumidity'])

    if abs_humidity_col and abs_humidity_col != 'absolute_humidity':
        standardized_df['absolute_humidity'] = standardized_df[abs_humidity_col]
        print(f"💨 절대습도 컬럼 '{abs_humidity_col}' → 'absolute_humidity'로 매핑")
    elif not abs_humidity_col:
        # Not supplied: derive it from temperature and relative humidity
        if 'temperature' in standardized_df.columns and 'humidity' in standardized_df.columns:
            print("💨 절대습도를 온도와 상대습도로부터 계산합니다.")
            temp = standardized_df['temperature']
            rh = standardized_df['humidity']
            # Magnus formula gives the actual vapour pressure in hPa ...
            vapor_pressure = (rh / 100) * 6.112 * np.exp(17.67 * temp / (243.5 + temp))
            # ... which the ideal-gas law converts to absolute humidity in g/m³.
            standardized_df['absolute_humidity'] = 216.74 * vapor_pressure / (273.15 + temp)

    # 5. Time-derived features
    if 'timestamp' in standardized_df.columns:
        ts = standardized_df['timestamp']
        standardized_df['hour'] = ts.dt.hour
        standardized_df['day_of_week'] = ts.dt.day_name()
        standardized_df['month'] = ts.dt.month
        # NOTE(review): buckets calendar quarters (months 1-3 -> '겨울', ...),
        # not meteorological seasons - confirm this is intended.
        standardized_df['season'] = pd.cut(ts.dt.month,
                                           bins=[0, 3, 6, 9, 12],
                                           labels=['겨울', '봄', '여름', '가을'])
        print("📅 시간 기반 파생 변수 생성 완료")

    # 6. Weather condition label (needs both temperature and humidity)
    if 'temperature' in standardized_df.columns and 'humidity' in standardized_df.columns:
        standardized_df['condition'] = [
            _classify_condition(temp, hum)
            for temp, hum in zip(standardized_df['temperature'], standardized_df['humidity'])
        ]
        print("🌤️ 기상 조건 분류 완료")

    # 7. Sensor location (randomly generated 60/40 when absent)
    if 'location' not in standardized_df.columns:
        np.random.seed(42)
        standardized_df['location'] = np.random.choice(['실내', '실외'], len(standardized_df), p=[0.6, 0.4])
        print("📍 센서 위치 정보 생성 완료")

    print(f"✅ 데이터 표준화 완료! 최종 크기: {standardized_df.shape}")
    return standardized_df

# Run the standardization
sensor_data = standardize_s3_data(sensor_data)

# Summary of the standardized data
print(f"\n📊 표준화된 데이터 정보:")
print(f"📏 데이터 크기: {sensor_data.shape}")
print(f"📊 컬럼 목록: {list(sensor_data.columns)}")

if 'timestamp' in sensor_data.columns:
    print(f"📅 기간: {sensor_data['timestamp'].min()} ~ {sensor_data['timestamp'].max()}")
if 'temperature' in sensor_data.columns:
    print(f"🌡️ 온도 범위: {sensor_data['temperature'].min():.1f}°C ~ {sensor_data['temperature'].max():.1f}°C")
if 'humidity' in sensor_data.columns:
    print(f"💧 습도 범위: {sensor_data['humidity'].min():.1f}% ~ {sensor_data['humidity'].max():.1f}%")
if 'absolute_humidity' in sensor_data.columns:
    print(f"💨 절대습도 범위: {sensor_data['absolute_humidity'].min():.2f} ~ {sensor_data['absolute_humidity'].max():.2f} g/m³")

# Data structure
print(f"\n📊 데이터 구조:")
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
sensor_data.info()

print(f"\n📈 첫 10개 레코드:")
# `display` is provided by the IPython/Jupyter runtime
display(sensor_data.head(10))

## 🔍 **Section 4: Enhanced Data Exploration and Quality Assessment**
S3 데이터의 품질을 평가하고 탐색적 분석을 수행합니다.

In [None]:
# 🔍 S3 data quality assessment and exploratory analysis
print("🔍 S3 온습도 데이터 품질 평가 및 탐색적 분석을 시작합니다...")

# 1. Data quality assessment
print("\n📊 === 데이터 품질 평가 ===")

# Descriptive statistics of the numeric columns
numeric_cols = sensor_data.select_dtypes(include=[np.number]).columns
print(f"\n📈 숫자형 컬럼 기술통계량:")
print(sensor_data[numeric_cols].describe())

# Missing values per column (count and percentage)
print(f"\n❓ 결측값 확인:")
missing_values = sensor_data.isnull().sum()
missing_percent = (missing_values / len(sensor_data)) * 100
missing_df = pd.DataFrame({
    '결측값 개수': missing_values,
    '결측값 비율(%)': missing_percent
})
print(missing_df[missing_df['결측값 개수'] > 0])

if missing_values.sum() == 0:
    print("✅ 결측값이 없습니다!")

# Fully duplicated rows
duplicate_count = sensor_data.duplicated().sum()
print(f"\n🔄 중복 레코드: {duplicate_count}개 ({duplicate_count/len(sensor_data)*100:.1f}%)")

# 2. Distribution plots
print("\n📊 === 데이터 분포 시각화 ===")

if 'temperature' in sensor_data.columns and 'humidity' in sensor_data.columns:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('🌡️💧 온습도 데이터 분포 분석 (S3 소스)', fontsize=16, fontweight='bold')

    # Temperature histogram
    axes[0, 0].hist(sensor_data['temperature'], bins=50, alpha=0.7, color='orange', edgecolor='black')
    axes[0, 0].set_title('온도 분포')
    axes[0, 0].set_xlabel('온도 (°C)')
    axes[0, 0].set_ylabel('빈도')
    axes[0, 0].grid(True, alpha=0.3)

    # Humidity histogram
    axes[0, 1].hist(sensor_data['humidity'], bins=50, alpha=0.7, color='blue', edgecolor='black')
    axes[0, 1].set_title('습도 분포')
    axes[0, 1].set_xlabel('습도 (%)')
    axes[0, 1].set_ylabel('빈도')
    axes[0, 1].grid(True, alpha=0.3)

    # Temperature vs. humidity scatter
    axes[1, 0].scatter(sensor_data['temperature'], sensor_data['humidity'],
                      alpha=0.5, s=10, c='green')
    axes[1, 0].set_title('온도-습도 상관관계')
    axes[1, 0].set_xlabel('온도 (°C)')
    axes[1, 0].set_ylabel('습도 (%)')
    axes[1, 0].grid(True, alpha=0.3)

    # Weather condition shares
    if 'condition' in sensor_data.columns:
        condition_counts = sensor_data['condition'].value_counts()
        axes[1, 1].pie(condition_counts.values, labels=condition_counts.index, autopct='%1.1f%%')
        axes[1, 1].set_title('기상 조건 분포')

    plt.tight_layout()
    plt.show()

# 3. Time-series patterns
if 'timestamp' in sensor_data.columns:
    print("\n📅 === 시계열 패턴 분석 ===")

    fig, axes = plt.subplots(3, 1, figsize=(15, 12))
    fig.suptitle('📈 시계열 패턴 분석 (S3 데이터)', fontsize=16, fontweight='bold')

    # Temperature over time
    if 'temperature' in sensor_data.columns:
        axes[0].plot(sensor_data['timestamp'], sensor_data['temperature'],
                    alpha=0.7, color='red', linewidth=0.5)
        axes[0].set_title('시간별 온도 변화')
        axes[0].set_ylabel('온도 (°C)')
        axes[0].grid(True, alpha=0.3)

    # Humidity over time
    if 'humidity' in sensor_data.columns:
        axes[1].plot(sensor_data['timestamp'], sensor_data['humidity'],
                    alpha=0.7, color='blue', linewidth=0.5)
        axes[1].set_title('시간별 습도 변화')
        axes[1].set_ylabel('습도 (%)')
        axes[1].grid(True, alpha=0.3)

    # Average by hour of day
    if 'hour' in sensor_data.columns and 'temperature' in sensor_data.columns:
        # Aggregate only the columns that exist: selecting a missing
        # 'humidity' column from the groupby would raise KeyError before the
        # humidity check below could ever run.
        hourly_cols = [c for c in ('temperature', 'humidity') if c in sensor_data.columns]
        hourly_avg = sensor_data.groupby('hour')[hourly_cols].mean()
        axes[2].plot(hourly_avg.index, hourly_avg['temperature'],
                    'o-', color='red', label='온도', linewidth=2)
        if 'humidity' in hourly_avg.columns:
            # Humidity goes on a secondary y-axis sharing the hour axis
            ax2 = axes[2].twinx()
            ax2.plot(hourly_avg.index, hourly_avg['humidity'],
                    's-', color='blue', label='습도', linewidth=2)
            ax2.set_ylabel('습도 (%)', color='blue')
            ax2.tick_params(axis='y', labelcolor='blue')

        axes[2].set_title('시간대별 평균 온습도 패턴')
        axes[2].set_xlabel('시간 (0-23)')
        axes[2].set_ylabel('온도 (°C)', color='red')
        axes[2].tick_params(axis='y', labelcolor='red')
        axes[2].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# 4. Correlation analysis
print("\n🔗 === 상관관계 분석 ===")
correlation_matrix = sensor_data[numeric_cols].corr()
print("📊 숫자형 변수 간 상관관계:")
print(correlation_matrix.round(3))

# Correlation heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": .8})
plt.title('🔗 변수 간 상관관계 히트맵 (S3 데이터)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# 5. Data quality summary
print("\n✅ === S3 데이터 품질 요약 ===")
print(f"📏 전체 레코드 수: {len(sensor_data):,}개")
print(f"📊 전체 컬럼 수: {len(sensor_data.columns)}개")
print(f"🔢 숫자형 컬럼: {len(numeric_cols)}개")
print(f"❓ 결측값 비율: {(missing_values.sum() / (len(sensor_data) * len(sensor_data.columns)) * 100):.2f}%")
print(f"🔄 중복값 비율: {(duplicate_count / len(sensor_data) * 100):.2f}%")

if 'timestamp' in sensor_data.columns:
    time_span = sensor_data['timestamp'].max() - sensor_data['timestamp'].min()
    print(f"📅 데이터 기간: {time_span.days}일 ({time_span.total_seconds()/3600:.1f}시간)")

print("\n🎯 S3 데이터 탐색이 완료되었습니다. 이제 Scikit-Learn 분할 분석을 진행하겠습니다!")

## 📖 **Section 5: Scikit-Learn Data Splitting with AWS S3**
이제 S3에서 로드한 온습도 데이터를 사용하여 Scikit-Learn의 다양한 데이터 분할 기법을 실습합니다.

### 🎯 **학습 목표:**
1. **Basic Train/Test Split** - 기본적인 데이터 분할
2. **Stratified Split** - 클래스 비율을 유지하는 분할
3. **Time Series Split** - 시계열 데이터에 적합한 분할
4. **Cross-Validation** - 교차 검증을 통한 모델 성능 평가
5. **결과를 S3에 저장** - 분할된 데이터와 모델 결과를 클라우드에 저장

In [None]:
# 🔧 Cross-Platform AWS Environment Detection and Auto-Setup
# This cell automatically detects your environment and provides setup guidance

import os
import sys
import platform
import subprocess
from pathlib import Path

def detect_execution_environment():
    """Collect facts about the current runtime that matter for the AWS setup.

    Returns:
        dict with the keys platform_system, platform_machine, python_version,
        python_executable, working_directory, is_wsl, is_jupyter,
        aws_cli_available, aws_cli_path, conda_environment and venv_active.
    """
    import shutil

    env_info = {
        'platform_system': platform.system(),
        'platform_machine': platform.machine(),
        'python_version': sys.version,
        'python_executable': sys.executable,
        'working_directory': os.getcwd(),
        'is_wsl': False,
        'is_jupyter': False,
        'aws_cli_available': False,
        'aws_cli_path': None,
        'conda_environment': None,
        'venv_active': False
    }

    print("🔍 Environment Detection and Setup Guidance")
    print("=" * 50)

    # WSL: a Linux kernel whose version string mentions Microsoft, or a
    # Windows process that carries the WSL interop variables.
    if env_info['platform_system'] == 'Linux':
        try:
            with open('/proc/version', 'r') as f:
                if 'microsoft' in f.read().lower():
                    env_info['is_wsl'] = True
        except OSError:
            # /proc/version missing or unreadable: keep the default (False)
            pass
    elif env_info['platform_system'] == 'Windows':
        env_info['is_wsl'] = any(var in os.environ for var in ['WSL_DISTRO_NAME', 'WSL_INTEROP'])

    # Jupyter: one of the kernel packages has already been imported
    env_info['is_jupyter'] = any(module in sys.modules for module in ['ipykernel', 'IPython'])

    # Virtual environment: legacy virtualenv marker or differing base prefix
    env_info['venv_active'] = hasattr(sys, 'real_prefix') or (hasattr(sys, 'base_prefix') and sys.base_prefix != sys.prefix)

    # Conda environment name, if any
    if 'CONDA_DEFAULT_ENV' in os.environ:
        env_info['conda_environment'] = os.environ['CONDA_DEFAULT_ENV']

    # AWS CLI: shutil.which searches PATH on every platform and yields a
    # single path, with no `which`/`where` subprocess needed.
    aws_path = shutil.which('aws')
    if aws_path:
        env_info['aws_cli_available'] = True
        env_info['aws_cli_path'] = aws_path

    return env_info

def check_aws_credentials():
    """Report which AWS credentials are available to this process.

    Prints the state of AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and
    AWS_DEFAULT_REGION (the access key id is truncated, the secret is never
    shown) and whether a boto3 session can resolve credentials.

    Returns:
        True if all three environment variables are set, or if boto3 resolves
        credentials from any of its sources; False otherwise.
    """
    print("\n🔐 AWS Credentials Check:")
    print("-" * 25)

    # Environment variables
    aws_vars = ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_DEFAULT_REGION']
    env_creds = all(os.environ.get(var) for var in aws_vars)

    for var in aws_vars:
        value = os.environ.get(var)
        if not value:
            print(f"   ❌ {var}: Not set")
        elif 'SECRET' in var:
            print(f"   ✅ {var}: ***[HIDDEN]***")
        elif var == 'AWS_ACCESS_KEY_ID':
            # Show only a prefix so the full key id never lands in notebook output
            print(f"   ✅ {var}: {value[:8]}***")
        else:
            print(f"   ✅ {var}: {value}")

    # Credentials as boto3 resolves them
    boto3_creds = False
    try:
        import boto3
        session = boto3.Session()
        credentials = session.get_credentials()

        if credentials:
            boto3_creds = True
            print(f"   ✅ Boto3 session: Available")
            print(f"   ✅ Access key: {credentials.access_key[:8]}***")
        else:
            print(f"   ❌ Boto3 session: No credentials found")

    except ImportError:
        print(f"   ⚠️ Boto3 not installed")
    except Exception as e:
        print(f"   ❌ Boto3 error: {e}")

    # Credentials that boto3 can resolve are usable even when the environment
    # variables are not set, so they count as configured too.
    return env_creds or boto3_creds

def provide_setup_guidance(env_info, has_credentials):
    """Print an environment overview followed by numbered setup steps.

    Args:
        env_info: Mapping with the keys platform_system, python_executable,
            conda_environment, venv_active, is_jupyter, is_wsl,
            aws_cli_available and aws_cli_path.
        has_credentials: Whether AWS credentials were found; when False a
            credentials step is included.
    """
    on_windows = env_info['platform_system'] == 'Windows'

    # --- Environment overview -------------------------------------------
    print("\n🎯 Setup Guidance for Your Environment:")
    print("=" * 40)

    print(f"💻 Platform: {env_info['platform_system']}")
    print(f"🐍 Python: {env_info['python_executable']}")

    # A conda environment takes precedence over a plain virtualenv
    if env_info['conda_environment']:
        print(f"🐍 Conda Env: {env_info['conda_environment']}")
    elif env_info['venv_active']:
        print("🐍 Virtual Env: Active")

    jupyter_text = 'Yes' if env_info['is_jupyter'] else 'No'
    wsl_text = 'Yes' if env_info['is_wsl'] else 'No'
    cli_text = 'Available' if env_info['aws_cli_available'] else 'Not found'
    print(f"📓 Jupyter: {jupyter_text}")
    print(f"🐧 WSL: {wsl_text}")
    print(f"⚙️ AWS CLI: {cli_text}")

    if env_info['aws_cli_path']:
        print(f"📍 AWS CLI Path: {env_info['aws_cli_path']}")

    # --- Recommended steps ----------------------------------------------
    print("\n💡 Recommended Setup Steps:")
    print("-" * 30)

    # Each entry is (headline, detail lines); numbering is applied on output
    # so optional steps never leave gaps.
    steps = []

    if not env_info['aws_cli_available']:
        if on_windows:
            details = ["   Windows: Run install_aws_windows.bat",
                       "   Or: Download from https://aws.amazon.com/cli/"]
        else:
            details = ["   Linux/WSL: Run bash install_aws_linux.sh",
                       "   Or: curl https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip"]
        steps.append(("📥 Install AWS CLI:", details))

    if not has_credentials:
        if on_windows:
            details = ["   PowerShell: .\\setup_aws_credentials.ps1 -Persistent",
                       "   Command Prompt: aws configure"]
        else:
            details = ["   Terminal: aws configure",
                       "   Or: export AWS_ACCESS_KEY_ID='your-key'"]
        steps.append(("🔐 Configure AWS Credentials:", details))

    steps.append(("📦 Install Required Packages:",
                  ["   pip install boto3 pandas numpy matplotlib scikit-learn"]))
    steps.append(("🧪 Test Your Setup:",
                  ["   python test_aws_integration.py"]))

    for number, (headline, details) in enumerate(steps, start=1):
        print(f"{number}. {headline}")
        for detail in details:
            print(detail)

    # --- Documentation pointers -----------------------------------------
    print("\n📚 Documentation:")
    for pointer in (
        "   📖 Full Guide: AWS_S3_INTEGRATION_GUIDE.md",
        "   🔧 Windows Script: install_aws_windows.bat",
        "   🐧 Linux Script: install_aws_linux.sh",
        "   💙 PowerShell: setup_aws_credentials.ps1",
    ):
        print(pointer)

def check_required_packages():
    """Try to import each required package and print one status line per package.

    Returns:
        True when every package imported, False when at least one is missing
        (a combined ``pip install`` hint is printed in that case).
    """
    print("\n📦 Required Packages Check:")
    print("-" * 28)

    # (distribution name, short description), checked in this order
    catalog = (
        ('boto3', 'AWS SDK for Python'),
        ('pandas', 'Data manipulation and analysis'),
        ('numpy', 'Numerical computing'),
        ('matplotlib', 'Data visualization'),
        ('scikit-learn', 'Machine learning library'),
    )
    # Distributions whose import name differs from the pip name
    import_names = {'scikit-learn': 'sklearn'}

    absent = []
    for dist_name, summary in catalog:
        try:
            __import__(import_names.get(dist_name, dist_name))
        except ImportError:
            print(f"   ❌ {dist_name}: Not installed - {summary}")
            absent.append(dist_name)
        else:
            print(f"   ✅ {dist_name}: {summary}")

    if not absent:
        print("\n✅ All required packages are installed!")
        return True

    print("\n💡 Install missing packages:")
    print(f"   pip install {' '.join(absent)}")
    return False

def create_quick_setup_scripts():
    """Write the ``quick_aws_setup.py`` helper script into the current directory.

    First prints, for each known script name, whether a file of that name
    already exists. Then (re)writes ``quick_aws_setup.py``; an existing file
    of that name is overwritten.
    """
    print(f"\n📝 Creating Quick Setup Scripts:")
    print("-" * 35)

    # Check if scripts already exist
    # NOTE(review): only quick_aws_setup.py is actually written below; the
    # "Will be created" status printed for the .bat/.sh entries is not backed
    # by any code in this function.
    scripts = {
        'quick_aws_setup.py': 'Python setup script',
        'setup_env.bat': 'Windows batch script',
        'setup_env.sh': 'Linux/WSL shell script'
    }

    for script, description in scripts.items():
        script_path = Path(script)
        if script_path.exists():
            print(f"   ✅ {script}: Already exists - {description}")
        else:
            print(f"   📝 {script}: Will be created - {description}")

    # Python setup script
    python_script = '''#!/usr/bin/env python3
"""Quick AWS S3 setup verification and troubleshooting"""

import os
import sys

def quick_aws_test():
    print("🧪 Quick AWS S3 Integration Test")
    print("=" * 35)
    
    # Test imports
    try:
        import boto3
        print("✅ boto3 imported successfully")
    except ImportError:
        print("❌ boto3 not installed: pip install boto3")
        return False
    
    # Test credentials
    try:
        sts = boto3.client('sts')
        identity = sts.get_caller_identity()
        print(f"✅ AWS Identity: {identity.get('Arn', 'Unknown')}")
        
        # Test S3
        s3 = boto3.client('s3')
        buckets = s3.list_buckets()
        print(f"✅ S3 Access: {len(buckets['Buckets'])} buckets found")
        
        return True
        
    except Exception as e:
        print(f"❌ AWS Error: {e}")
        print("💡 Fix: Run 'aws configure' or set environment variables")
        return False

if __name__ == "__main__":
    success = quick_aws_test()
    sys.exit(0 if success else 1)
'''

    # The script contains emoji, so the encoding must be explicit: the
    # platform default (e.g. cp1252/cp949 on Windows) cannot encode them and
    # open() would raise UnicodeEncodeError on write.
    with open('quick_aws_setup.py', 'w', encoding='utf-8') as f:
        f.write(python_script)

    print(f"   ✅ quick_aws_setup.py created")
    print(f"\n💡 Run: python quick_aws_setup.py")

# Main execution
def main():
    """Run every environment check, print guidance and a summary.

    Returns:
        The readiness flag: truthy only when the AWS CLI was found,
        credentials were detected and all required packages import.
    """
    # Gather the individual results (each helper prints its own report)
    env_info = detect_execution_environment()
    has_credentials = check_aws_credentials()
    packages_ok = check_required_packages()

    # Guidance depends on the findings above; then write the helper script
    provide_setup_guidance(env_info, has_credentials)
    create_quick_setup_scripts()

    cli_ok = env_info['aws_cli_available']

    # Final status
    print("\n🎯 Environment Status Summary:")
    print("=" * 35)
    for row in (
        f"   Platform: {env_info['platform_system']}",
        f"   AWS CLI: {'✅ Available' if cli_ok else '❌ Missing'}",
        f"   Credentials: {'✅ Configured' if has_credentials else '❌ Missing'}",
        f"   Packages: {'✅ Complete' if packages_ok else '❌ Missing'}",
    ):
        print(row)

    all_ready = cli_ok and has_credentials and packages_ok

    if not all_ready:
        print("\n⚠️ Setup required before proceeding")
        print("💡 Follow the guidance above to complete setup")
    else:
        print("\n🎉 Your environment is ready for AWS S3 integration!")
        print("✅ You can proceed with the notebook analysis")

    return all_ready

# Run the environment detection when the cell executes and keep the readiness
# flag returned by main() in `environment_ready`.
environment_ready = main()