# ML Pipeline Orchestrator Notebook
This notebook orchestrates the complete ML pipeline using the SageMaker Python SDK.

In [None]:
import argparse
import os
import json
import time
import boto3
import sagemaker
from sagemaker.sklearn.estimator import SKLearn
import mlflow

# Banner written to the cell output to mark the start of the orchestration run.
print("🚀 Starting Complete ML Pipeline Orchestration with Real SageMaker SDK")

In [None]:
# Resolve project-level settings (region, shared S3 bucket, IAM role, MLflow
# ARN) that the remaining cells read as notebook globals.
from urllib.parse import urlparse

from sagemaker_studio import Project, ClientConfig


def _bucket_from_s3_uri(s3_uri):
    """Return the bucket name contained in *s3_uri*.

    An ``s3://bucket/any/number/of/prefixes/`` URI is parsed properly, so the
    result no longer depends on how many key-prefix segments follow the
    bucket. Values that are not ``s3://`` URIs fall back to the previous
    positional heuristic (second-to-last ``/``-separated segment) so inputs
    that worked before keep producing the same result.
    """
    parsed = urlparse(s3_uri)
    if parsed.scheme == "s3" and parsed.netloc:
        return parsed.netloc
    return s3_uri.rstrip('/').split('/')[-2]


# Region comes from the default boto3 session of the execution environment.
# It is resolved once and reused for the endpoint override and later cells.
region = boto3.Session().region_name

# The DataZone endpoint is overridden with a region-specific host.
# NOTE(review): "iceland-gamma" looks like a non-production endpoint - confirm
# this override is intended wherever the notebook runs.
client_config = ClientConfig(
    region=region,
    overrides={"datazone": {"endpoint_url": f"https://iceland-gamma.{region}.api.aws"}},
)
proj = Project(config=client_config)
print(f"✅ Region: {region}")

# Get S3 shared bucket from connection
s3_shared_conn = proj.connection('default.s3_shared')
bucket = _bucket_from_s3_uri(s3_shared_conn.data.s3_uri)
print(f"✅ S3 Bucket from connection: {bucket}")

# Get IAM role from connection
iam_conn = proj.connection('default.iam')
role = iam_conn.iam_role
print(f"✅ IAM Role from connection: {role}")

# MLflow tracking server ARN
# NOTE(review): this ARN is hard-coded to us-east-1 and a single account ID,
# independent of `region` resolved above - confirm that is intended.
mlflow_arn = "arn:aws:sagemaker:us-east-1:198737698272:mlflow-tracking-server/smus-integration-mlflow"
print(f"✅ MLflow ARN: {mlflow_arn}")


In [None]:
# Build synthetic training and inference datasets, then stage both in S3.
print("\n📊 Generating synthetic training and inference data...")

import pandas as pd
import numpy as np

# Both datasets share the same 20 feature column names.
feature_columns = [f'feature_{i}' for i in range(20)]


def _stage_csv(frame, local_path, s3_key):
    """Write *frame* to *local_path* as CSV (no index) and upload it to `bucket` under *s3_key*."""
    frame.to_csv(local_path, index=False)
    s3_client.upload_file(local_path, bucket, s3_key)


# Training set: 1000 rows of standard-normal features plus a 3-class target.
# The seed is set immediately before the draws so the data is reproducible.
np.random.seed(42)
X_train = np.random.randn(1000, 20)
y_train = np.random.choice([0, 1, 2], 1000)
train_df = pd.DataFrame(X_train, columns=feature_columns).assign(target=y_train)

# Inference set: 100 rows, features only (no target column).
np.random.seed(123)
X_test = np.random.randn(100, 20)
test_df = pd.DataFrame(X_test, columns=feature_columns)

# S3 client bound to the region resolved earlier in the notebook.
s3_client = boto3.client('s3', region_name=region)

# Stage the training data
train_local = '/tmp/training_data.csv'
train_s3_key = 'shared/ml/bundle/training-data/training_data.csv'
_stage_csv(train_df, train_local, train_s3_key)
print(f"✅ Uploaded training data to s3://{bucket}/{train_s3_key}")

# Stage the inference data
test_local = '/tmp/inference_data.csv'
test_s3_key = 'shared/ml/bundle/inference-data/inference_data.csv'
_stage_csv(test_df, test_local, test_s3_key)
print(f"✅ Uploaded inference data to s3://{bucket}/{test_s3_key}")

In [None]:
# Run configuration: model name, MLflow tracking target and SageMaker session.
model_name = "realistic-classifier-v1"

# Point MLflow at the tracking server only when an ARN is configured.
if not mlflow_arn:
    print("⚠️  MLflow tracking disabled (no connection)")
else:
    mlflow.set_tracking_uri(mlflow_arn)
    print(f"✅ MLflow tracking enabled: {mlflow_arn}")

# SageMaker session pinned to the notebook's region through an explicit boto3 session.
boto_session = boto3.Session(region_name=region)
session = sagemaker.Session(boto_session=boto_session)

# Emit the configuration summary as one block; the output is line-for-line
# the same as printing each entry separately.
summary_lines = (
    f"\n📋 Configuration:",
    f"  Region: {region}",
    f"  Bucket: {bucket}",
    f"  Role: {role}",
    f"  Model: {model_name}",
)
print("\n".join(summary_lines))

In [None]:
# Step 1: Training with Real SageMaker SDK
# Configures an SKLearn estimator from code and data already staged in the
# shared bucket, launches a training job, and reports the model artifact URI.
print("\n📈 Step 1: Real Model Training with SageMaker SDK")

# Build environment variables (only add MLflow if available)
env_vars = {}
if mlflow_arn:
    env_vars["MLFLOW_TRACKING_SERVER_ARN"] = mlflow_arn

# S3 locations: packaged training code, training data prefix, artifact output prefix.
source_dir = f's3://{bucket}/shared/ml/bundle/training-code/training-code.tar.gz'
training_data = f's3://{bucket}/shared/ml/bundle/training-data/'
output_path = f's3://{bucket}/shared/ml/output/model-artifacts/'

print(f"  Training code: {source_dir}")
print(f"  Training data: {training_data}")
print(f"  Output path: {output_path}")

sklearn_estimator = SKLearn(
    entry_point='sagemaker_training_script.py',
    source_dir=source_dir,
    framework_version='1.2-1',
    py_version='py3',
    instance_type='ml.m5.large',
    instance_count=1,
    role=role,
    output_path=output_path,
    environment=env_vars,
    # Reuse the session configured in the configuration cell so the job runs
    # with the same explicit region instead of an implicitly created default
    # session.
    sagemaker_session=session,
    hyperparameters={
        'n-estimators': 100,
        'max-depth': 10,
        'random-state': 42,
        'model-name': model_name,
    },
)

# A timestamp suffix keeps job names distinct across runs.
job_name = f"orchestrated-training-{int(time.time())}"
print(f"Starting real training job: {job_name}")

# This will create a real SageMaker training job
sklearn_estimator.fit(
    inputs={'training': training_data},
    job_name=job_name,
)

print(f"✅ Real training completed: {sklearn_estimator.model_data}")

In [None]:
# Step 2: Champion Model Tagging with Real MLflow
# Points the "champion" alias at the highest-numbered registered version of
# `model_name`. Best-effort: any failure is reported and the notebook continues.
print("\n🏆 Step 2: Real Champion Model Tagging")
champion_tagged = False

if mlflow_arn:
    try:
        client = mlflow.MlflowClient()

        # Look up every registered version of the model.
        versions = client.search_model_versions(f"name='{model_name}'")
        if not versions:
            print("⚠️  No model versions found, will be created by training job")
        else:
            # Version identifiers are compared numerically, not as strings.
            latest_version = max(versions, key=lambda mv: int(mv.version))
            client.set_registered_model_alias(
                name=model_name,
                alias="champion",
                version=latest_version.version,
            )
            print(f"✅ Real champion tagging: version {latest_version.version}")
            champion_tagged = True
    except Exception as err:
        print(f"⚠️  Champion tagging will happen after training job completes: {err}")
else:
    print("⚠️  Skipping champion tagging (no MLflow connection)")