In [1]:
# Quick Titanic data prep (if you don't have it already)
import pandas as pd
from sklearn.model_selection import train_test_split
import os

import sagemaker
from sagemaker import image_uris
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput


import boto3
from datetime import datetime



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# Initialize SageMaker session
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Download Titanic dataset
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
df = pd.read_csv(url)

# Basic preprocessing
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
df = df.drop(['Name', 'Ticket', 'Cabin'], axis=1)
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# Prepare for XGBoost (label first, then features)
df_xgb = df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

# Split and save
train_data, test_data = train_test_split(df_xgb, test_size=0.2, random_state=42)

train_data.to_csv('train.csv', header=False, index=False)

# Upload to S3
train_path = f's3://{bucket}/titanic-data/train/train.csv'
boto3.Session().resource('s3').Bucket(bucket).Object('titanic-data/train/train.csv').upload_file('train.csv')

print(f"âœ… Training data uploaded to: {train_path}")

âœ… Training data uploaded to: s3://sagemaker-us-east-2-854757836160/titanic-data/train/train.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


In [3]:
print(f"Using bucket: {bucket}")
print(f"Using role: {role}")

# Get XGBoost container
container = image_uris.retrieve('xgboost', sess.boto_region_name, '1.5-1')

# Path to your training data (assuming you have it from Week 2)
train_path = f's3://{bucket}/titanic-data/train/'

print("\n" + "="*60)
print("TRAINING MODEL A (Conservative)")
print("="*60)

# Model A: Conservative hyperparameters
xgb_model_a = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=f's3://{bucket}/ab-test/model-a/',
    sagemaker_session=sess
)

# Conservative hyperparameters (faster, simpler)
xgb_model_a.set_hyperparameters(
    objective='binary:logistic',
    num_round=50,         # Fewer rounds
    max_depth=3,          # Shallower trees
    eta=0.3,              # Higher learning rate
    subsample=1.0         # Use all data
)

# Train Model A
xgb_model_a.fit({'train': TrainingInput(train_path, content_type='text/csv')})

print(f"\nâœ… Model A trained successfully!")
print(f"Model A artifact: {xgb_model_a.model_data}")

print("\n" + "="*60)
print("TRAINING MODEL B (Aggressive - Better Accuracy)")
print("="*60)

# Model B: Aggressive hyperparameters
xgb_model_b = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=f's3://{bucket}/ab-test/model-b/',
    sagemaker_session=sess
)

# Aggressive hyperparameters (more accurate, slower)
xgb_model_b.set_hyperparameters(
    objective='binary:logistic',
    num_round=100,        # More rounds
    max_depth=6,          # Deeper trees
    eta=0.1,              # Lower learning rate
    subsample=0.8         # Use 80% of data per tree
)

# Train Model B
xgb_model_b.fit({'train': TrainingInput(train_path, content_type='text/csv')})

print(f"\nâœ… Model B trained successfully!")
print(f"Model B artifact: {xgb_model_b.model_data}")

print("\n" + "="*60)
print("PHASE 1 COMPLETE!")
print("="*60)
print("\nBoth models trained and saved to S3:")
print(f"  Model A: {xgb_model_a.model_data}")
print(f"  Model B: {xgb_model_b.model_data}")
print("\nReady for Phase 2: Deployment!")

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-11-17-19-57-50-238


Using bucket: sagemaker-us-east-2-854757836160
Using role: arn:aws:iam::854757836160:role/service-role/AmazonSageMaker-ExecutionRole-20251026T175451

TRAINING MODEL A (Conservative)
2025-11-17 19:57:52 Starting - Starting the training job...
2025-11-17 19:58:25 Downloading - Downloading input data...
2025-11-17 19:58:50 Downloading - Downloading the training image......
2025-11-17 19:59:51 Training - Training image download completed. Training in progress.
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-11-17 19:59:47.040 ip-10-0-154-248.us-east-2.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-11-17 19:59:47.062 ip-10-0-154-248.us-east-2.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-11-17:19:59:47:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-11-17:19:59:47:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning 

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-11-17-20-00-37-185


Training seconds: 99
Billable seconds: 99

âœ… Model A trained successfully!
Model A artifact: s3://sagemaker-us-east-2-854757836160/ab-test/model-a/sagemaker-xgboost-2025-11-17-19-57-50-238/output/model.tar.gz

TRAINING MODEL B (Aggressive - Better Accuracy)
2025-11-17 20:00:37 Starting - Starting the training job...
2025-11-17 20:01:02 Starting - Preparing the instances for training...
2025-11-17 20:01:19 Downloading - Downloading input data...
2025-11-17 20:01:45 Downloading - Downloading the training image...
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-11-17 20:02:37.562 ip-10-0-78-198.us-east-2.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-11-17 20:02:37.585 ip-10-0-78-198.us-east-2.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-11-17:20:02:37:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-11-17:20:02:37:INFO] Failed to parse hyperparameter obje

**Multi-Variant Endpoint**

In [9]:
from sagemaker.model import Model
from sagemaker.predictor import Predictor
from datetime import datetime
import boto3
import time

print("\n" + "="*60)
print("PHASE 2: DEPLOY MULTI-VARIANT ENDPOINT")
print("="*60)

# Get SageMaker client
client = boto3.client('sagemaker')

# ============================================================
# First, check for and delete any existing endpoints
# ============================================================

print("\n" + "-"*60)
print("Checking for existing endpoints to clean up...")
print("-"*60)

try:
    response = client.list_endpoints(
        StatusEquals='InService',
        MaxResults=100
    )
    
    for endpoint in response['Endpoints']:
        if 'titanic-ab-test' in endpoint['EndpointName']:
            print(f"Found existing endpoint: {endpoint['EndpointName']}")
            print(f"  Deleting to free up resources...")
            client.delete_endpoint(EndpointName=endpoint['EndpointName'])
            print(f"  âœ… Deleted")
except Exception as e:
    print(f"Note: {e}")

print("\nWaiting 30 seconds for resources to be released...")
time.sleep(30)

# ============================================================
# Create and Register Both Models
# ============================================================

print("\n" + "-"*60)
print("Creating Model A and Model B")
print("-"*60)

# Create Model A
model_a_name = f'model-a-{datetime.now().strftime("%Y%m%d%H%M%S")}'
print(f"Creating Model A: {model_a_name}")

client.create_model(
    ModelName=model_a_name,
    PrimaryContainer={
        'Image': container,
        'ModelDataUrl': xgb_model_a.model_data
    },
    ExecutionRoleArn=role
)
print("âœ… Model A created successfully")

# Create Model B
model_b_name = f'model-b-{datetime.now().strftime("%Y%m%d%H%M%S")}'
print(f"Creating Model B: {model_b_name}")

client.create_model(
    ModelName=model_b_name,
    PrimaryContainer={
        'Image': container,
        'ModelDataUrl': xgb_model_b.model_data
    },
    ExecutionRoleArn=role
)
print("âœ… Model B created successfully")

# ============================================================
# Deploy Both Models with Traffic Split (using ml.t2.medium)
# ============================================================

print("\n" + "-"*60)
print("Deploying both models with 80/20 traffic split")
print("Using ml.t2.medium instances (within quota)")
print("-"*60)

# Create endpoint configuration
endpoint_config_name = f'ab-config-{datetime.now().strftime("%Y%m%d%H%M%S")}'
endpoint_name = f'titanic-ab-test-{datetime.now().strftime("%Y%m%d-%H%M%S")}'

print(f"Creating endpoint config: {endpoint_config_name}")

client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            'VariantName': 'VariantA',
            'ModelName': model_a_name,
            'InitialInstanceCount': 1,
            'InstanceType': 'ml.t2.medium',  # Smaller instance
            'InitialVariantWeight': 80
        },
        {
            'VariantName': 'VariantB',
            'ModelName': model_b_name,
            'InitialInstanceCount': 1,
            'InstanceType': 'ml.t2.medium',  # Smaller instance
            'InitialVariantWeight': 20
        }
    ]
)
print("âœ… Endpoint config created")

# Create endpoint
print(f"\nCreating endpoint: {endpoint_name}")
print("(This takes ~8-10 minutes...)")

client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name
)

# Wait for endpoint to be in service
print("Waiting for endpoint to be in service...")
waiter = client.get_waiter('endpoint_in_service')
waiter.wait(EndpointName=endpoint_name)

print(f"\nâœ… Endpoint deployed successfully!")
print(f"\n" + "="*60)
print("PHASE 2 COMPLETE!")
print("="*60)
print(f"\nEndpoint: {endpoint_name}")
print(f"Traffic split:")
print(f"  VariantA (Conservative): 80%")
print(f"  VariantB (Aggressive):   20%")
print(f"\nBoth variants are now live and receiving traffic!")

# Create predictor for Phase 3
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer

predictor = Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,
    serializer=CSVSerializer()
)

print(f"\nðŸ’¾ Save this endpoint name: {endpoint_name}")


PHASE 2: DEPLOY MULTI-VARIANT ENDPOINT

------------------------------------------------------------
Checking for existing endpoints to clean up...
------------------------------------------------------------
Found existing endpoint: titanic-ab-test-20251117-200652
  Deleting to free up resources...
  âœ… Deleted

Waiting 30 seconds for resources to be released...

------------------------------------------------------------
Creating Model A and Model B
------------------------------------------------------------
Creating Model A: model-a-20251117201745
âœ… Model A created successfully
Creating Model B: model-b-20251117201746
âœ… Model B created successfully

------------------------------------------------------------
Deploying both models with 80/20 traffic split
Using ml.t2.medium instances (within quota)
------------------------------------------------------------
Creating endpoint config: ab-config-20251117201747
âœ… Endpoint config created

Creating endpoint: titanic-ab-test-202