# AWS SageMaker Customer Churn Prediction

This notebook demonstrates how to train and deploy a machine learning model for customer churn prediction using Amazon SageMaker.

## Overview
- Train a Random Forest model using SageMaker's SKLearn estimator
- Deploy the model to a SageMaker endpoint for real-time inference
- Handle both JSON and CSV input formats for predictions

## Prerequisites
- AWS account with SageMaker access
- Proper IAM roles configured
- S3 bucket for storing data and model artifacts

## 1. Setup and Imports

In [None]:
import boto3
import sagemaker
from sagemaker.sklearn import SKLearn, SKLearnModel
from sagemaker.inputs import TrainingInput
import pandas as pd
import numpy as np

# S3 client used by later cells to upload scripts and data
s3 = boto3.client('s3')

# SageMaker session, the notebook's execution role, and the session's default bucket
session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = session.default_bucket()

# Echo the resolved role and bucket so the run is easy to audit
print(f"SageMaker role: {role}")
print(f"Default S3 bucket: {bucket}")

## 2. Upload Data to S3 Bucket

Before training, we need to upload our datasets to S3 so SageMaker can access them during training.

# Upload the processed dataset to S3 so the training job can read it.
# The key sits under the data/processed/ prefix that the training cell
# passes to TrainingInput.
import os

local_data_path = '../data/customer_churn_processed.csv'
data_s3_key = 'data/processed/customer_churn_processed.csv'

if not os.path.exists(local_data_path):
    print(f"Error: {local_data_path} not found")
    print("Available files:", os.listdir('.'))
else:
    print(f"Found processed dataset: {local_data_path}")
    s3.upload_file(local_data_path, bucket, data_s3_key)
    print(f"Processed data uploaded to s3://{bucket}/{data_s3_key}")

## 3. Model Training with SageMaker

We'll use our existing training script and SageMaker's SKLearn estimator to train our Random Forest model using the data we uploaded to S3.

In [None]:
# Stage the training script in S3, then configure and run the training job
import os

# Confirm the training script is present before uploading it
training_script_path = 'training.py'
if os.path.exists(training_script_path):
    print(f"✅ Found training script: {training_script_path}")

    # Keep a copy of the script under the bucket's code/ prefix
    s3_key = 'code/training.py'
    s3.upload_file(training_script_path, bucket, s3_key)
    print(f"📤 Training script uploaded to s3://{bucket}/{s3_key}")
else:
    print(f"❌ Error: {training_script_path} not found in current directory")
    print("Available files:", os.listdir('.'))

# Estimator that runs training.py from the current directory;
# the hyperparameters are forwarded to the script
sklearn_estimator = SKLearn(
    role=role,
    entry_point='training.py',
    source_dir='.',
    framework_version='1.0-1',
    py_version='py3',
    instance_type='ml.m5.large',
    hyperparameters={
        'n-estimators': 100,
        'max-depth': 10,
        'min-samples-split': 2,
        'min-samples-leaf': 1,
        'random-state': 42,
    },
)

print("🤖 SKLearn estimator created successfully with enhanced training script")

# The 'train' channel reads CSV data from the processed-data prefix
train_input = TrainingInput(
    f's3://{bucket}/data/processed/',
    content_type='text/csv',
)

print(f"📊 Training data location: s3://{bucket}/data/processed/")
print("🚀 Starting training job...")

# Launch the training job with the 'train' channel
sklearn_estimator.fit({'train': train_input})
print("✅ Training job completed successfully!")

## 4. Model Deployment and Inference

Now we'll use our existing inference script and deploy the trained model to a SageMaker endpoint for real-time predictions. The inference script includes:

- Enhanced error handling and logging
- Support for both JSON and CSV input formats
- Detailed prediction results with confidence scores
- Health check functionality
- Batch prediction support

In [None]:
# Confirm the inference entry point is present before building the model
inference_script_path = 'inference.py'
if os.path.exists(inference_script_path):
    print(f"Found inference script: {inference_script_path}")
else:
    print(f"Error: {inference_script_path} not found in current directory")
    print("Available files:", os.listdir('.'))

# Wrap the trained artifacts together with inference.py as a deployable model
model = SKLearnModel(
    role=role,
    model_data=sklearn_estimator.model_data,
    source_dir='.',
    entry_point='inference.py',
    py_version='py3',
    framework_version='1.0-1',
)

print("SageMaker model created successfully with enhanced inference script")

# Stand up a real-time endpoint under a fixed name
endpoint_name = 'customer-churn-endpoint'
print(f"Deploying model to endpoint: {endpoint_name}")
print("This may take several minutes...")

try:
    predictor = model.deploy(
        endpoint_name=endpoint_name,
        instance_type='ml.t2.medium',
        initial_instance_count=1,
    )

    print(f"Model successfully deployed to endpoint: {endpoint_name}")
    print(f"Endpoint name: {predictor.endpoint_name}")

except Exception as deploy_error:
    # Report the failure instead of aborting the notebook run
    print(f"Deployment error: {deploy_error}")
    print("Note: If endpoint already exists, you may need to delete it first or use a different name")

## 5. Testing the Endpoint

Let's test our deployed endpoint with sample data using the uploaded processed dataset.

# Test the Endpoint with Correct Format
#
# Sends the first row of the processed dataset to the deployed endpoint.
# Three request formats are attempted in order (JSON serializer, pre-encoded
# JSON string, CSV); each fallback runs only if the previous attempt raised.
import json

from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import CSVSerializer, IdentitySerializer, JSONSerializer

print("🧪 Testing endpoint with correct format...")

# Load the data to understand structure
try:
    df = pd.read_csv('../data/customer_churn_processed.csv')
    if 'Churn' in df.columns:
        features_df = df.drop('Churn', axis=1)
        print(f"📊 Model expects {len(features_df.columns)} features")

        # First row as a dict, with numpy scalars unwrapped to plain Python
        # values so the JSON encoder accepts them
        sample_data = {
            key: value.item() if hasattr(value, 'item') else value
            for key, value in features_df.iloc[0].to_dict().items()
        }

        print(f"📝 Testing with {len(sample_data)} features...")

        # Test 1: JSON format (most reliable for SageMaker)
        print("\n🧪 Test 1: JSON with proper serialization")
        try:
            # Let the predictor encode the dict and decode the response
            predictor.serializer = JSONSerializer()
            predictor.deserializer = JSONDeserializer()

            result = predictor.predict(sample_data)
            print("✅ JSON test successful!")
            print(f"🎯 Prediction: {result}")

        except Exception as e:
            print(f"❌ JSON test failed: {e}")

            # Test 2: Manual JSON approach
            print("\n🧪 Test 2: Manual JSON formatting")
            try:
                json_data = json.dumps(sample_data)
                print(f"📤 Sending JSON: {json_data[:100]}...")  # Show first 100 chars

                # The payload is already a JSON string, so send it unchanged
                # rather than having JSONSerializer encode it a second time
                predictor.serializer = IdentitySerializer(content_type='application/json')
                result = predictor.predict(
                    json_data,
                    initial_args={'ContentType': 'application/json'}
                )
                print("✅ Manual JSON test successful!")
                print(f"🎯 Result: {result}")

            except Exception as e2:
                print(f"❌ Manual JSON failed: {e2}")

                # Test 3: Use SageMaker CSV serializer
                print("\n🧪 Test 3: CSV with SageMaker serializer")
                try:
                    predictor.serializer = CSVSerializer()
                    predictor.deserializer = JSONDeserializer()

                    # Convert to list of values (no headers)
                    values = list(sample_data.values())
                    result = predictor.predict([values])  # Wrap in list for CSV

                    print("✅ CSV serializer test successful!")
                    print(f"🎯 Result: {result}")

                except Exception as e3:
                    # Every format failed: dump a small sample to help debugging
                    print(f"❌ CSV serializer failed: {e3}")
                    print("\n🔍 Debug info:")
                    print(f"   - Sample data keys: {list(sample_data.keys())[:5]}...")
                    print(f"   - Sample data values: {list(sample_data.values())[:5]}...")
                    print(f"   - Data types: {[type(v).__name__ for v in list(sample_data.values())[:5]]}")
    else:
        print("❌ No 'Churn' column found in data")

except FileNotFoundError:
    print("❌ Training data file not found")
    print("💡 Make sure '../data/customer_churn_processed.csv' exists")
except Exception as e:
    print(f"❌ Error loading data: {e}")

print("\n🎉 Testing completed!")



In [None]:
# Debug: Understanding the CSV Parsing Issue
#
# Prints the feature layout of the processed dataset, then shows the lines of
# inference.py that parse header-less CSV so the two can be compared.

print("🔍 Diagnosing the CSV parsing issue...")

# The error shows: "Expected 4 fields in line 2, saw 12"
# This means your inference script expects CSV with only 4 columns
# But your data has many more columns

try:
    # Check your actual data structure
    df = pd.read_csv('../data/customer_churn_processed.csv')
    if 'Churn' in df.columns:
        features_df = df.drop('Churn', axis=1)
    else:
        # Bind features_df on this path too, so the summary below never
        # reads an undefined or stale variable
        print("❌ No 'Churn' column found in data; treating all columns as features")
        features_df = df

    feature_count = len(features_df.columns)
    print(f"📊 Your data has {feature_count} features")
    print(f"📋 First 10 features: {list(features_df.columns)[:10]}")

    # Show what CSV data looks like
    sample_row = features_df.iloc[0]
    csv_string = ','.join(str(val) for val in sample_row.values)
    print("\n📝 CSV string would be:")
    print(f"   {csv_string[:100]}...")  # First 100 chars
    print(f"   Total fields: {len(sample_row)}")

    # Check inference script CSV handling
    print("\n🔍 Checking inference.py CSV handling...")
    with open('inference.py', 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()

    # Show the first header-less read_csv call with two lines of context
    # on either side
    for i, line in enumerate(lines):
        if 'read_csv' in line and 'header=None' in line:
            print(f"   Line {i+1}: {line.strip()}")
            for j in range(max(0, i-2), min(len(lines), i+3)):
                if j != i:
                    print(f"   Line {j+1}: {lines[j].strip()}")
            break

    print("\n💡 SOLUTION:")
    print("   The issue is that your inference script's CSV parser")
    print(f"   expects a specific number of columns, but your data has {feature_count}")
    print("   We need to use JSON format instead, which is more flexible")

except Exception as e:
    print(f"❌ Error in diagnosis: {e}")

print("\n🎯 RECOMMENDATION: Use JSON format with proper SageMaker serializers")

## 6. Cleanup

Remember to delete the endpoint when you're done to avoid ongoing charges.

In [None]:
# Cleanup Resources
# Remember to delete the endpoint when you're done to avoid ongoing charges

import boto3

# Show which endpoint name the success messages in this cell will report
print(f"Current endpoint: {endpoint_name}")

# Option 1: Delete just the endpoint (keeps the model)
def delete_endpoint():
    """Delete the endpoint behind the notebook's ``predictor``.

    Any exception is caught and printed rather than raised.
    """
    try:
        predictor.delete_endpoint()
        success_message = f"Endpoint '{endpoint_name}' deleted successfully"
        print(success_message)
    except Exception as err:
        print(f"Error deleting endpoint: {err}")

# Option 2: Delete endpoint and model
def delete_endpoint_and_model():
    """Delete the model and then the endpoint behind the notebook's ``predictor``.

    The model is deleted first: ``Predictor.delete_model()`` looks up the model
    names through the endpoint configuration, which ``delete_endpoint()``
    removes by default, so the reverse order fails on the model step.

    Any exception is caught and printed rather than raised.
    """
    try:
        predictor.delete_model()
        predictor.delete_endpoint()
        print(f"Endpoint '{endpoint_name}' and model deleted successfully")
    except Exception as e:
        print(f"Error deleting endpoint and model: {e}")

# Option 3: List all endpoints to see what's running
def list_endpoints():
    """Print the name and status of every SageMaker endpoint visible to the client.

    Results are read through a paginator so endpoints beyond the first
    response page are included. Any exception, including a failure to create
    the client, is caught and printed rather than raised.
    """
    try:
        sm_client = boto3.client('sagemaker')

        # ListEndpoints is paged; walk every page instead of reading only the first
        endpoints = []
        for page in sm_client.get_paginator('list_endpoints').paginate():
            endpoints.extend(page['Endpoints'])

        if endpoints:
            print("Active endpoints:")
            for ep in endpoints:
                print(f"- {ep['EndpointName']} (Status: {ep['EndpointStatus']})")
        else:
            print("No active endpoints found")
    except Exception as e:
        print(f"Error listing endpoints: {e}")

# Print the available cleanup helpers
for menu_line in (
    "\nEndpoint Management Options:",
    "1. delete_endpoint() - Delete only the endpoint",
    "2. delete_endpoint_and_model() - Delete endpoint and model",
    "3. list_endpoints() - List all active endpoints",
):
    print(menu_line)

# Uncomment the line below when you want to delete the endpoint
# delete_endpoint()

# Report which endpoints currently exist
list_endpoints()