### Setting Up AWS Environment for SageMaker Session

In [10]:
import sagemaker
import boto3
from sagemaker import get_execution_role

# Getting the execution role for SageMaker
aws_role = get_execution_role()

# Getting the AWS region using the boto3 session
aws_region = boto3.Session().region_name

# Creating a SageMaker session
session = sagemaker.Session()

# Getting the default S3 bucket associated with the SageMaker session
bucket = session.default_bucket()

# Defining a prefix for the S3 location
prefix = "catboost-demo"

### Data Preprocessing Pipeline for Loan Eligibility Prediction

In [17]:
%%writefile preprocessing.py

import argparse
import os
import warnings

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, KBinsDiscretizer
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.exceptions import DataConversionWarning

# Ignore warnings related to data conversion
warnings.filterwarnings(action="ignore", category=DataConversionWarning)

# Define columns of interest
columns = [
    "Loan_Status",
    "ApplicantIncome", 
    "CoapplicantIncome",
    "LoanAmount", 
    "Loan_Amount_Term",
    "Gender", 
    "Married", 
    "Dependents", 
    "Education",
    "Credit_History",
    "Property_Area",
]

if __name__ == "__main__":
    # Parse command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--train-test-split-ratio", type=float, default=0.3)
    args, _ = parser.parse_known_args()
    
    print("Received Arguments: {}".format(args))
    
    # Load input data
    input_data_path = os.path.join("/opt/ml/processing/input/loan-train.csv")
    df = pd.read_csv(input_data_path, usecols=columns)
    
    # Map target labels to binary values
    df["Loan_Status"] = df["Loan_Status"].map({"Y": 1, "N": 0})
    
    # Split data into train and test sets
    split_ratio = args.train_test_split_ratio
    print("Splitting data into train and test sets with ratio {}".format(split_ratio))
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(columns=["Loan_Status"]), df["Loan_Status"], test_size=split_ratio, random_state=42
    )
    
    # Define preprocessing pipeline
    preprocess = make_column_transformer(
        (Pipeline([
            ("imputer", SimpleImputer(strategy="mean")), 
            ("disretizer", KBinsDiscretizer(encode="ordinal", n_bins=5)),
        ]), ["ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term"]),
        (Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")), 
            ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=1024)),
        ]), ["Gender", "Married", "Dependents", "Education"]),
        (Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")), 
            ("encoder", OneHotEncoder(handle_unknown="ignore")),
        ]), ["Property_Area"]),
    )
    
    # Apply preprocessing pipeline
    print("Running preprocessing and feature engineering transformations")
    X_train = preprocess.fit_transform(X_train)
    X_test = preprocess.transform(X_test)
    
    # Display shapes of train and test data after preprocessing
    print("Train data shape after preprocessing: {}".format(X_train.shape))
    print("Test data shape after preprocessing: {}".format(X_test.shape))
    
    # Concatenate target variable with features
    train = np.column_stack([y_train, X_train])
    test = np.column_stack([y_test, X_test])
    
    # Define paths to save processed data
    train_data_output_path = os.path.join("/opt/ml/processing/train", "data.csv")
    test_data_output_path = os.path.join("/opt/ml/processing/validation", "data.csv")
    
    # Save processed train and test data
    print("Saving training data to {}".format(train_data_output_path))
    pd.DataFrame(train).to_csv(train_data_output_path, header=False, index=False)
    
    print("Saving test data to {}".format(test_data_output_path))
    pd.DataFrame(test).to_csv(test_data_output_path, header=False, index=False)


Overwriting preprocessing.py


### SageMaker SKLearn Processor for Data Preprocessing

In [19]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Define S3 bucket and prefix for input data
bucket = "farukcan-loan-eligibility"
prefix = "demo-1/source"
source_s3_path = f"s3://{bucket}/{prefix}/loan-train.csv"

# Initialize SKLearnProcessor
sklearn_preprocessor = SKLearnProcessor(
    framework_version="1.0-1",
    role=aws_role,  # Define AWS IAM role for accessing resources
    instance_type="ml.m5.xlarge",  # Choose instance type for processing
    instance_count=1,  # Number of instances to use for processing
)

# Run SKLearnProcessor with defined parameters
sklearn_preprocessor.run(
    code="preprocessing.py",  # Script to be executed
    inputs=[ProcessingInput(source=source_s3_path, destination="/opt/ml/processing/input")],  # Define input data location
    outputs=[
        ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),  # Define output location for training data
        ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),  # Define output location for validation data
    ],
    arguments=["--train-test-split-ratio", "0.2"],  # Pass arguments to the preprocessing script
)


INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker:Creating processing-job with name sagemaker-scikit-learn-2024-03-28-09-46-06-395


...........................[34mReceived Arguments: Namespace(train_test_split_ratio=0.2)[0m
[34mSplitting data into train and test sets with ratio 0.2[0m
[34mRunning preprocessing and feature engineering transformations[0m
[34mTrain data shape after preprocessing: (491, 11)[0m
[34mTest data shape after preprocessing: (123, 11)[0m
[34mSaving training data to /opt/ml/processing/train/data.csv[0m
[34mSaving test data to /opt/ml/processing/validation/data.csv[0m



### Retrieving Processed Data URIs from SageMaker Processing Job

In [20]:
# Retrieve information about the latest processing job
processing_job_description = sklearn_preprocessor.jobs[-1].describe()

# Extract output information from the processing job description
outputs = processing_job_description["ProcessingOutputConfig"]["Outputs"]

# Extract URIs for processed train and test data from the output information
processed_train_data_uri = outputs[0]["S3Output"]["S3Uri"]
processed_test_data_uri = outputs[1]["S3Output"]["S3Uri"]