<a href="https://colab.research.google.com/github/betsenara/UCSD-ML-Engineering-AI/blob/main/SageMaker_Pipelines_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Import Packages and Declare Constants

In [None]:
import boto3
import sagemaker
import datetime as dt
import pandas as pd

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [None]:
# Replace this value with the name of the S3 bucket created for this project.
# Later cells upload the data files and processing scripts to this bucket and
# pass it to get_pipeline as default_bucket.
default_bucket = "sagemaker-studio-211125403081-f0htqm1fzcs"

In [None]:
# Static names and versions referenced by later cells.
sklearn_processor_version = "0.23-1"
model_package_group_name = "ChurnModelPackageGroup"
pipeline_name = "ChurnModelSMPipeline"

# AWS context for this notebook session.
region = boto3.Session().region_name
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()

# NOTE(review): despite its name, this URI is resolved for the 'sklearn'
# framework image; it is later passed to get_pipeline as custom_image_uri.
clarify_image = sagemaker.image_uris.retrieve(
    framework='sklearn',
    version=sklearn_processor_version,
    region=region,
)

## Step 2: Generate Baseline Dataset

In [None]:
def preprocess_data(file_path):
    """Load the store CSV at ``file_path`` and turn it into a feature table.

    Steps, in order:
      1. Parse ``firstorder`` and ``lastorder`` as datetimes; values that
         cannot be parsed become NaT.
      2. Drop every row that contains a null in any column.
      3. Add ``first_last_days_diff`` (days from first to last order) and
         ``created_first_days_diff`` (days from first order to record creation).
      4. Remove ``custid``, ``created``, ``firstorder`` and ``lastorder``.
      5. One-hot encode ``favday`` and ``city``.

    Returns the resulting DataFrame; the row index of the surviving rows is
    left as read from the file (it is not reset).
    """
    raw = pd.read_csv(file_path)

    # Coerce bad order dates to NaT so the dropna below discards those rows.
    parsed = raw.assign(
        firstorder=pd.to_datetime(raw["firstorder"], errors='coerce'),
        lastorder=pd.to_datetime(raw["lastorder"], errors='coerce'),
    )
    complete = parsed.dropna()

    # 'created' is parsed strictly (no coercion) and only after null rows are gone.
    created = pd.to_datetime(complete['created'])
    first_order = complete['firstorder']

    features = complete.assign(
        first_last_days_diff=(complete['lastorder'] - first_order).dt.days,
        created_first_days_diff=(created - first_order).dt.days,
    ).drop(columns=['custid', 'created', 'firstorder', 'lastorder'])

    categorical = ['favday', 'city']
    return pd.get_dummies(features, prefix=categorical, columns=categorical)

In [None]:
# Preprocessed features without the "retained" column
# (presumably the target label - confirm against the training script).
baseline_data = preprocess_data("data/storedata_total.csv").drop(columns=["retained"])

# Small random subset written out as the baseline dataset in the next cell.
# A fixed random_state keeps the sample identical across Restart & Run All.
baseline_sample = baseline_data.sample(frac=0.0002, random_state=42)

In [None]:
# Write the baseline sample as a bare CSV: no header row and no index column.
baseline_sample.to_csv("data/baseline.csv", index=False, header=False)

## Step 3: Generate Batch Dataset

In [None]:
# Preprocessed features without the "retained" column
# (presumably the target label - confirm against the training script).
batch_data = preprocess_data("data/storedata_total.csv").drop(columns=["retained"])

# 20% random subset written out as the batch dataset in the next cell.
# A fixed random_state keeps the sample identical across Restart & Run All.
batch_sample = batch_data.sample(frac=0.2, random_state=42)

In [None]:
# Write the batch sample as a bare CSV: no header row and no index column.
batch_sample.to_csv("data/batch.csv", index=False, header=False)

## Step 4: Copy Data and Scripts to S3 Bucket

In [None]:
# s3_client is a boto3 S3 *resource*; the next cell reuses it for the scripts.
s3_client = boto3.resource('s3')
data_bucket = s3_client.Bucket(default_bucket)

# (local path, S3 object key) for each dataset the pipeline reads.
data_uploads = (
    ("data/storedata_total.csv", "data/storedata_total.csv"),
    ("data/batch.csv", "data/batch/batch.csv"),
    ("data/baseline.csv", "input/baseline/baseline.csv"),
)
for local_path, object_key in data_uploads:
    data_bucket.upload_file(local_path, object_key)

In [None]:
code_bucket = s3_client.Bucket(default_bucket)

# (local path, S3 object key) for each pipeline script.
script_uploads = (
    ("pipelines/customerchurn/preprocess.py", "input/code/preprocess.py"),
    ("pipelines/customerchurn/evaluate.py", "input/code/evaluate.py"),
    ("pipelines/customerchurn/generate_config.py", "input/code/generate_config.py"),
)
for script_path, object_key in script_uploads:
    code_bucket.upload_file(script_path, object_key)

## Step 5: Get the Pipeline Instance

In [None]:
from pipelines.customerchurn.pipeline import get_pipeline

# Everything get_pipeline needs, gathered from the configuration cells above.
pipeline_settings = {
    "region": region,
    "role": role,
    "default_bucket": default_bucket,
    "model_package_group_name": model_package_group_name,
    "pipeline_name": pipeline_name,
    # Holds the image URI resolved via sagemaker.image_uris.retrieve earlier.
    "custom_image_uri": clarify_image,
    "sklearn_processor_version": sklearn_processor_version,
}
pipeline = get_pipeline(**pipeline_settings)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [None]:
# Display the pipeline definition as a JSON string (parameters, steps, ...).
pipeline.definition()



'{"Version": "2020-12-01", "Metadata": {}, "Parameters": [{"Name": "ProcessingInstanceType", "Type": "String", "DefaultValue": "ml.m5.large"}, {"Name": "ProcessingInstanceCount", "Type": "Integer", "DefaultValue": 1}, {"Name": "TrainingInstanceType", "Type": "String", "DefaultValue": "ml.m5.large"}, {"Name": "InputData", "Type": "String", "DefaultValue": "s3://sagemaker-studio-211125403081-f0htqm1fzcs/data/storedata_total.csv"}, {"Name": "BatchData", "Type": "String", "DefaultValue": "s3://sagemaker-studio-211125403081-f0htqm1fzcs/data/batch/batch.csv"}, {"Name": "UseSpotInstances", "Type": "Boolean", "DefaultValue": true}, {"Name": "MaxRun", "Type": "Integer", "DefaultValue": 9000}, {"Name": "MaxWait", "Type": "Integer", "DefaultValue": 10000}], "PipelineExperimentConfig": {"ExperimentName": {"Get": "Execution.PipelineName"}, "TrialName": {"Get": "Execution.PipelineExecutionId"}}, "Steps": [{"Name": "ChurnModelProcess", "Type": "Processing", "Arguments": {"ProcessingResources": {"Clus

## Step 6: Submit the pipeline to SageMaker and start execution

In [None]:
# Submit the pipeline to SageMaker under the notebook's execution role;
# the response includes the PipelineArn.
pipeline.upsert(role_arn=role)



{'PipelineArn': 'arn:aws:sagemaker:us-east-2:211125403081:pipeline/ChurnModelSMPipeline',
 'ResponseMetadata': {'RequestId': 'a57f5c34-8c79-4562-b2fb-d18522ae080e',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'a57f5c34-8c79-4562-b2fb-d18522ae080e',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '88',
   'date': 'Mon, 04 Nov 2024 23:35:20 GMT'},
  'RetryAttempts': 0}}

In [None]:
# Start a pipeline execution and keep the handle for the status cells below.
execution = pipeline.start()

In [None]:
# Show the execution summary, including PipelineExecutionStatus and,
# if the run failed, a FailureReason.
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-east-2:211125403081:pipeline/ChurnModelSMPipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-east-2:211125403081:pipeline/ChurnModelSMPipeline/execution/bph7w3oxle60',
 'PipelineExecutionDisplayName': 'execution-1730763324155',
 'PipelineExecutionStatus': 'Failed',
 'PipelineExperimentConfig': {'ExperimentName': 'churnmodelsmpipeline',
  'TrialName': 'bph7w3oxle60'},
 'FailureReason': 'Step failure: One or multiple steps failed.',
 'CreationTime': datetime.datetime(2024, 11, 4, 23, 35, 24, 85000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2024, 11, 4, 23, 35, 27, 323000, tzinfo=tzlocal()),
 'CreatedBy': {'IamIdentity': {'Arn': 'arn:aws:sts::211125403081:assumed-role/AmazonSageMaker-ExecutionRole-20241030T160189/SageMaker',
   'PrincipalId': 'AROATCKAN7XERMTMUSYWW:SageMaker'}},
 'LastModifiedBy': {'IamIdentity': {'Arn': 'arn:aws:sts::211125403081:assumed-role/AmazonSageMaker-ExecutionRole-20241030T160189/SageMaker',
   'Principa

In [None]:
# List each step with its status; a failed step carries its own FailureReason
# (e.g. an instance-quota error when creating the processing job).
execution.list_steps()

[{'StepName': 'ChurnModelProcess',
  'StartTime': datetime.datetime(2024, 11, 4, 23, 35, 25, 705000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2024, 11, 4, 23, 35, 26, 996000, tzinfo=tzlocal()),
  'StepStatus': 'Failed',
  'FailureReason': "ClientError: Failed to invoke sagemaker:CreateProcessingJob. Error Details: The account-level service limit 'ml.m5.large for processing job usage' is 0 Instances, with current utilization of 0 Instances and a request delta of 1 Instances. Please use AWS Service Quotas to request an increase for this quota. If AWS Service Quotas is not available, contact AWS support to request an increase for this quota.\nRetry not appropriate on execution of step with PipelineExecutionArn arn:aws:sagemaker:us-east-2:211125403081:pipeline/churnmodelsmpipeline/execution/bph7w3oxle60 and StepId ChurnModelProcess. No retry policy configured for the exception type SAGEMAKER_RESOURCE_LIMIT.",
  'Metadata': {},
  'AttemptCount': 1}]