# Upload Specific Datasets and Model to S3

This notebook uploads the processed datasets and the trained model to an S3 bucket, creating the bucket first if it does not already exist.

**Security Note**: Credentials are read from environment variables (optionally loaded from a local `.env` file). Do NOT hardcode keys in this file.

In [1]:
import os
import boto3
from pathlib import Path
from dotenv import load_dotenv
from botocore.exceptions import ClientError

# Populate the environment from a .env file when one is present
load_dotenv()

# Fail fast unless both credential variables are set; their values are never echoed
if not all(os.getenv(var) for var in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")):
    raise EnvironmentError("❌ AWS credentials not found. Please set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables.")

print("✅ AWS Credentials detected.")

✅ AWS Credentials detected.


In [2]:
# ---- Config ----
# Target bucket; the name was updated to a unique one to avoid AccessDenied
bucket = "housing-ml-artifacts"
# Region is taken from AWS_DEFAULT_REGION when set, otherwise us-east-1
region = os.getenv("AWS_DEFAULT_REGION", "us-east-1")

# Project root is the parent of the working directory
# (assumes the notebook is run from the notebooks folder)
PROJECT_ROOT = Path("..").resolve()
local_data_dir = PROJECT_ROOT.joinpath("data", "processed")
local_model_dir = PROJECT_ROOT.joinpath("models")

# S3 client bound to the configured region
s3 = boto3.client(service_name="s3", region_name=region)

In [3]:
# ---- Ensure Bucket Exists ----
def create_bucket_if_not_exists(bucket_name, region=None):
    """Create the S3 bucket ``bucket_name`` unless it already exists.

    Existence is probed with ``head_bucket`` on the module-level ``s3``
    client. An error code of ``'404'`` is treated as "bucket missing" and
    triggers creation; any other ``ClientError`` is reported and re-raised.

    Args:
        bucket_name: Name of the bucket to check or create.
        region: Region to create the bucket in. When omitted (``None`` or
            empty), the region the ``s3`` client is configured for is used,
            so the request matches the endpoint it is sent to.

    Raises:
        ClientError: If the existence check fails with a code other than
            ``'404'``. Errors raised by ``create_bucket`` also propagate.
    """
    try:
        s3.head_bucket(Bucket=bucket_name)
    except ClientError as e:
        error_code = e.response['Error']['Code']
        if error_code != '404':
            print(f"❌ Error checking bucket: {e}")
            raise
    else:
        print(f"✅ Bucket '{bucket_name}' already exists.")
        return

    # Creation happens outside the ``except`` block so that a failure here is
    # reported on its own rather than chained to the expected 404.
    print(f"⚠️ Bucket '{bucket_name}' not found. Creating...")

    # Without an explicit region, follow the client's own region instead of
    # silently assuming us-east-1.
    target_region = region or s3.meta.region_name

    if not target_region or target_region == "us-east-1":
        # us-east-1 is the default location: no LocationConstraint is sent.
        s3.create_bucket(Bucket=bucket_name)
    else:
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': target_region}
        )
    print(f"✅ Bucket '{bucket_name}' created successfully.")

# Make sure the configured bucket is available before any upload is attempted
create_bucket_if_not_exists(bucket_name=bucket, region=region)

✅ Bucket 'housing-ml-artifacts' already exists.


In [4]:
# ---- Helper function ----
def upload_file(local_path: Path, s3_key: str):
    """Upload one local file to the configured bucket under ``s3_key``.

    Progress and outcome are reported with ``print``. A missing local file is
    reported and skipped; an ``Exception`` raised during the upload is caught
    and reported instead of being propagated. Nothing is returned.

    Args:
        local_path: Path of the file to upload.
        s3_key: Object key to store the file under in the bucket.
    """
    if local_path.exists():
        print(f"⬆️ Uploading {local_path.name} → s3://{bucket}/{s3_key}")
        source = str(local_path)
        try:
            s3.upload_file(source, bucket, s3_key)
            print("   ✅ Upload success")
        except Exception as upload_error:
            print(f"   ❌ Upload failed: {upload_error}")
    else:
        print(f"❌ File not found: {local_path}")

In [5]:
# ---- Upload required datasets and model ----
# (local file, S3 key) pairs, uploaded in the order listed
artifacts = [
    (local_data_dir / "feature_engineered_holdout.csv", "processed/feature_engineered_holdout.csv"),
    (local_data_dir / "cleaning_holdout.csv", "processed/cleaning_holdout.csv"),
    (local_data_dir / "feature_engineered_train.csv", "processed/feature_engineered_train.csv"),
    (local_model_dir / "xgb_best_model.pkl", "models/xgb_best_model.pkl"),
]

for artifact_path, artifact_key in artifacts:
    upload_file(artifact_path, artifact_key)

⬆️ Uploading feature_engineered_holdout.csv → s3://housing-ml-artifacts/processed/feature_engineered_holdout.csv
   ✅ Upload success
⬆️ Uploading cleaning_holdout.csv → s3://housing-ml-artifacts/processed/cleaning_holdout.csv
   ✅ Upload success
⬆️ Uploading feature_engineered_train.csv → s3://housing-ml-artifacts/processed/feature_engineered_train.csv
   ✅ Upload success
⬆️ Uploading xgb_best_model.pkl → s3://housing-ml-artifacts/models/xgb_best_model.pkl
   ✅ Upload success
