In [2]:
"""
Sanity check making sure I can write to the s3 bucket I created for this project.
"""

import boto3

s3 = boto3.client('s3')
bucket = 'sagemaker-bucket-ml-data-pipeline' 

# Test write
test_content = "Hello, ML Pipeline!"
s3.put_object(Bucket=bucket, Key='test.txt', Body=test_content)
print("✅ Write test successful")

# Test read
response = s3.get_object(Bucket=bucket, Key='test.txt')
content = response['Body'].read().decode('utf-8')
print(f"✅ Read test successful: {content}")

# Clean up test file
s3.delete_object(Bucket=bucket, Key='test.txt')
print("✅ Cleanup successful")

print("\nYour S3 access is working perfectly!")

✅ Write test successful
✅ Read test successful: Hello, ML Pipeline!
✅ Cleanup successful

Your S3 access is working perfectly!


In [3]:
import boto3

# S3 client and destination bucket for the raw dataset.
s3 = boto3.client('s3')
bucket = 'sagemaker-bucket-ml-data-pipeline'  # put your bucket here

# Upload raw Titanic data; `train.csv` is resolved relative to the
# notebook's working directory, so make sure the file is located there.
s3.upload_file(
    Filename='train.csv',
    Bucket=bucket,
    Key='ml-pipeline/raw/titanic.csv',
)

## **Creating roles for AWS Glue**

First, create the IAM role that the Glue job will assume.

1. **IAM Console → Roles → Create role**
2. **Trusted entity type:** AWS service
3. **Use case:** Glue (it auto-creates the trust policy!)
4. **Permissions:** 
   - Search and add: `AWSGlueServiceRole` (managed policy)
   - Click "Next"
5. **Role name:** `GlueETLRole`
6. **Create role**

7. **Then add S3 permissions:**
   - Find your new `GlueETLRole`
   - Click "Add permissions" → "Create inline policy"
   - Visual editor:
     - Service: S3
     - Actions: GetObject, PutObject, ListBucket, DeleteObject
     - Resources: 
       - Bucket: `sagemaker-bucket-ml-data-pipeline`
       - Object: `sagemaker-bucket-ml-data-pipeline/*` (This will auto-update to "*")
   - Name: `S3Access`
   - Click Create policy

## **Creating the AWS Glue job**

1. **AWS Console → Glue → ETL Jobs**
2. **Click "Create job"**
3. **Choose "Spark script editor"**
4. **Paste the contents of your `glue_etl.py` script into the editor**
5. **Set job details:**

 - Name: titanic-etl-job
 - IAM Role: `GlueETLRole` created above (or another existing role with S3 + Glue permissions)
 - Glue version: 4.0 (latest)
 - Language: Python 3

*Additional job params:*
 - Add parameter: --INPUT_PATH = s3://sagemaker-bucket-ml-data-pipeline/ml-pipeline/raw/titanic.csv
 - Add parameter: --OUTPUT_PATH = s3://sagemaker-bucket-ml-data-pipeline/ml-pipeline/processed/

6. Save and Run




In [7]:
import io

import boto3
import pandas as pd
import pyarrow.parquet as pq

# Verifying the data loaded correctly: list what the Glue job wrote under the
# processed prefix, then read one Parquet part file back into pandas.

s3 = boto3.client('s3')
bucket = 'sagemaker-bucket-ml-data-pipeline'
processed_prefix = 'ml-pipeline/processed/'

# List processed files. A single list_objects_v2 call returns at most 1000
# keys, so page through the results instead of trusting the first response.
paginator = s3.get_paginator('list_objects_v2')
processed_objects = [
    entry
    for page in paginator.paginate(Bucket=bucket, Prefix=processed_prefix)
    for entry in page.get('Contents', [])
]

print("Processed files created by Glue:")
for entry in processed_objects:
    size_mb = entry['Size'] / (1024 * 1024)
    print(f"  {entry['Key']} ({size_mb:.2f} MB)")

# Get the parquet file (there might be multiple parts)
parquet_files = [
    entry['Key'] for entry in processed_objects if entry['Key'].endswith('.parquet')
]

if parquet_files:
    # Read one of the Parquet files to verify
    parquet_object = s3.get_object(Bucket=bucket, Key=parquet_files[0])
    df = pd.read_parquet(io.BytesIO(parquet_object['Body'].read()))

    print("\n✅ Successfully read processed data!")
    print(f"Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    print("\nFirst few rows:")
    print(df.head())
else:
    # Without this branch a missing or empty output prefix looks the same as
    # a passing check, because nothing is printed at all.
    print(
        f"\n⚠️ No .parquet files found under s3://{bucket}/{processed_prefix} - "
        "check that the Glue job finished successfully."
    )

Processed files created by Glue:
  ml-pipeline/processed/part-00000-3fef16c9-ca7d-4e6a-98ce-1266b2282c16-c000.snappy.parquet (0.01 MB)

✅ Successfully read processed data!
Shape: (891, 8)
Columns: ['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize', 'IsAlone']

First few rows:
   Survived  Pclass  Sex   Age     Fare  Embarked  FamilySize  IsAlone
0         0       3    0  22.0   7.2500         2           1        0
1         1       1    1  38.0  71.2833         0           1        0
2         1       3    1  26.0   7.9250         2           0        1
3         1       1    1  35.0  53.1000         2           1        0
4         0       3    0  35.0   8.0500         2           0        1


In [9]:
import sagemaker
from sagemaker import image_uris
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput

# Session, bucket and execution role used by the training job below.
bucket = 'sagemaker-bucket-ml-data-pipeline'
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# Training reads the PROCESSED data written by the Glue job.
train_path = f's3://{bucket}/ml-pipeline/processed/'
print(f"Training data location: {train_path}")

# Resolve the XGBoost container image for the session's region.
container = image_uris.retrieve(
    framework='xgboost',
    region=sess.boto_region_name,
    version='1.5-1',
)

# Estimator: one ml.m5.xlarge instance, model artifacts go under models/.
model_output_path = f's3://{bucket}/ml-pipeline/models/'
xgb = Estimator(
    image_uri=container,
    role=role,
    sagemaker_session=sess,
    instance_type='ml.m5.xlarge',
    instance_count=1,
    output_path=model_output_path,
)

# Hyperparameters, collected in one mapping and applied in a single call.
xgb_hyperparameters = {
    'objective': 'binary:logistic',
    'num_round': 100,
    'max_depth': 5,
    'eta': 0.2,
    'eval_metric': 'auc',
}
xgb.set_hyperparameters(**xgb_hyperparameters)

# The 'train' channel points at the processed Parquet data.
train_input = TrainingInput(
    s3_data=train_path,
    content_type='application/x-parquet',  # Note: Parquet, not CSV!
)

print("Starting training...")
xgb.fit({'train': train_input})

print("\n✅ Training complete!")
print(f"Model location: {xgb.model_data}")

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-10-25-23-25-26-666


Training data location: s3://sagemaker-bucket-ml-data-pipeline/ml-pipeline/processed/
Starting training...
2025-10-25 23:25:28 Starting - Starting the training job...
2025-10-25 23:25:41 Starting - Preparing the instances for training...
2025-10-25 23:26:29 Downloading - Downloading the training image......
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-10-25 23:27:27.163 ip-10-0-95-85.us-west-2.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-10-25 23:27:27.191 ip-10-0-95-85.us-west-2.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-10-25:23:27:27:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-10-25:23:27:27:INFO] Failed to parse hyperparameter eval_metric value auc to Json.[0m
[34mReturning the value itself[0m
[34m[2025-10-25:23:27:27:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
