### Setting Up AWS Environment for SageMaker Session

In [123]:
import sagemaker
import boto3
from sagemaker import get_execution_role

# Key prefix under which every artifact of this demo is stored in S3
prefix = "catboost-demo"

# SageMaker session and the default bucket it manages for this account/region
session = sagemaker.Session()
bucket = session.default_bucket()

# IAM role the notebook runs as; later handed to the training job
aws_role = get_execution_role()

# Region resolved by the default boto3 session
aws_region = boto3.Session().region_name

### Downloading Loan Dataset from AWS S3 via "download_file"

In [2]:
import pandas as pd

# S3 client used for the plain file-to-disk downloads below
s3 = boto3.client("s3")

# Bucket holding the raw loan-eligibility data, and (object key, local file) pairs to fetch
source_bucket = "farukcan-loan-eligibility"
raw_downloads = [
    ("demo-1/source/train/data.csv", "train.csv"),
    ("demo-1/source/test/data.csv", "test.csv"),
]

# Fetch the training file first, then the testing file, into the working directory
for object_key, local_path in raw_downloads:
    s3.download_file(source_bucket, object_key, local_path)

# Load both local CSV files into pandas DataFrames
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

### Downloading Loan Dataset from AWS S3 via "download_fileobj"

In [124]:
import io
import pandas as pd

# S3 client and the bucket that holds the raw loan-eligibility data
s3 = boto3.client("s3")
source_bucket = "farukcan-loan-eligibility"

# Stream the training object straight into an in-memory buffer (nothing is written to disk)
train_buffer = io.BytesIO()
s3.download_fileobj(source_bucket, "demo-1/source/train/data.csv", train_buffer)

# Same for the testing object
test_buffer = io.BytesIO()
s3.download_fileobj(source_bucket, "demo-1/source/test/data.csv", test_buffer)

# download_fileobj leaves the cursor at the end of the buffer, so rewind before parsing
train_buffer.seek(0)
train = pd.read_csv(train_buffer)

test_buffer.seek(0)
test = pd.read_csv(test_buffer)

### Data Preparation: Preprocessing Loan Training and Testing Datasets

In [100]:
import pandas as pd

# Encode the label: 'Y' -> 1, 'N' -> 0.
# Series.map returns NaN for any value missing from the mapping, which would
# silently produce NaN targets; fail loudly instead so bad labels are noticed here.
label_to_int = {"Y": 1, "N": 0}
encoded_target = train["Loan_Status"].map(label_to_int)
if encoded_target.isna().any():
    unexpected_labels = sorted(
        train.loc[encoded_target.isna(), "Loan_Status"].astype(str).unique()
    )
    raise ValueError(f"Unexpected Loan_Status values: {unexpected_labels}")
train["target"] = encoded_target

# Drop the raw label and the identifier column; neither is a model feature
train = train.drop(columns=["Loan_Status", "Loan_ID"])

# Put 'target' first. Selecting the feature columns by name (rather than
# "everything except the last column") keeps this correct regardless of where
# 'target' currently sits in the frame.
feature_columns = [col for col in train.columns if col != "target"]
train = train[["target"] + feature_columns]

# The testing dataset has no label; only the identifier column is removed
test = test.drop(columns=["Loan_ID"])

In [101]:
# Preview the first five rows of the training frame (head() defaults to n=5)
train.head()

Unnamed: 0,target,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban
1,0,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
2,1,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
3,1,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
4,1,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban


In [102]:
# Preview the first five rows of the testing frame (head() defaults to n=5)
test.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


### Splitting Dataset into Training and Validation Sets

In [103]:
# Importing the train_test_split function from the sklearn.model_selection module
from sklearn.model_selection import train_test_split

# Splitting the training dataset into train and validation sets with a 70-30 ratio.
# Stratifying on 'target' keeps the class distribution the same in both sets.
# A fixed random_state makes the shuffle deterministic, so the split (and every
# downstream result) is the same on each Restart & Run All.
# NOTE: this cell rebinds `train`; re-running it without re-running the cells
# above would split the already-reduced training set again.
train, valid = train_test_split(
    train,
    test_size=0.3,
    shuffle=True,
    stratify=train["target"],
    random_state=42,
)

# Resetting the index of both train and validation sets after the split
train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)

### Handling Missing Values: Imputation for Numerical and Categorical Features

In [104]:
# Importing the SimpleImputer class from the sklearn.impute module
from sklearn.impute import SimpleImputer

# Column groups: numerical features are mean-imputed, categorical ones get the mode
X_num_column_names = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term", "Credit_History"]
X_cat_column_names = ["Gender", "Married", "Dependents", "Education", "Self_Employed", "Property_Area"]

# One imputer per column group
num_imputer = SimpleImputer(strategy="mean")
cat_imputer = SimpleImputer(strategy="most_frequent")

# Fit each imputer on the training split only, then apply the learned fill values
# to the validation and testing splits (numerical group first, then categorical)
for imputer, column_group in (
    (num_imputer, X_num_column_names),
    (cat_imputer, X_cat_column_names),
):
    train[column_group] = imputer.fit_transform(train[column_group])
    valid[column_group] = imputer.transform(valid[column_group])
    test[column_group] = imputer.transform(test[column_group])

### Encoding Categorical Features: Ordinal Encoding for Selected Features

In [105]:
# Names of the categorical feature columns
cat_col_names = ["Gender", "Married", "Dependents", "Education", "Self_Employed", "Property_Area"]

# Positions of those columns within the training frame, in column order.
# 'target' was moved to the front earlier, so it occupies position 0.
cat_idx_json = {
    "cat_idx": [
        position
        for position, column in enumerate(train.columns)
        if column in cat_col_names
    ]
}

In [106]:
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

# Creating an OrdinalEncoder that maps each category to an int32 code.
# With the default handle_unknown="error", transform() raises if the validation or
# testing split contains a category that never appeared in the training split
# (possible for rare categories after a random split). Such values are encoded
# as -1 instead, which cannot collide with the learned codes 0..n-1.
encoder = OrdinalEncoder(
    dtype=np.int32,
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)

# Fit on the training split only, then apply the same mapping to validation and test
train[cat_col_names] = encoder.fit_transform(train[cat_col_names])
valid[cat_col_names] = encoder.transform(valid[cat_col_names])
test[cat_col_names] = encoder.transform(test[cat_col_names])

### Uploading Preprocessed Data to S3 Bucket via "upload_file"

In [9]:
import os
import json

# Saving preprocessed data to local CSV files, without header or index columns
# (the training frame has 'target' as its first column).
# NOTE: "train.csv" and "test.csv" overwrite the raw files fetched by the
# download_file cell, if that cell was used.
train.to_csv("train.csv", index=False, header=False)
valid.to_csv("valid.csv", index=False, header=False)
test.to_csv("test.csv", index=False, header=False)

# Saving the categorical column indices to a local JSON file.
# The SageMaker built-in CatBoost input interface documents this optional file as
# "categorical_index.json" holding a dict with the key "cat_index_list"; a file
# named "cat_idx.json" with key "cat_idx" does not match that contract.
with open("categorical_index.json", "w") as f:
    json.dump({"cat_index_list": cat_idx_json["cat_idx"]}, f)

# One bucket handle for all uploads instead of a new Session/resource per file
s3_bucket = boto3.Session().resource("s3").Bucket(bucket)

# (local file, key relative to `prefix`). S3 keys always use "/", so they are
# built with an f-string rather than os.path.join, which is OS-dependent.
uploads = [
    ("train.csv", "train/data.csv"),
    ("valid.csv", "validation/data.csv"),
    ("test.csv", "test/data.csv"),
    ("categorical_index.json", "categorical_index.json"),
]
for local_path, key_suffix in uploads:
    s3_bucket.Object(f"{prefix}/{key_suffix}").upload_file(local_path)

### Uploading Preprocessed Data to S3 Bucket via "upload_fileobj"

In [107]:
import io
import os
import json

# Convert dataframes to CSV text (no header, no index) and wrap the UTF-8 bytes in buffers
train_buffer = io.BytesIO(train.to_csv(index=False, header=False).encode("utf-8"))
valid_buffer = io.BytesIO(valid.to_csv(index=False, header=False).encode("utf-8"))
test_buffer = io.BytesIO(test.to_csv(index=False, header=False).encode("utf-8"))

# Categorical column indices, serialized under the key "cat_index_list".
# The SageMaker built-in CatBoost input interface documents this optional file as
# "categorical_index.json" with that key; "cat_idx.json" / "cat_idx" does not match it.
cat_idx_buffer = io.BytesIO(
    json.dumps({"cat_index_list": cat_idx_json["cat_idx"]}).encode("utf-8")
)

# One bucket handle for all uploads instead of a new Session/resource per object
s3_bucket = boto3.Session().resource("s3").Bucket(bucket)

# (key relative to `prefix`, in-memory payload). S3 keys always use "/", so they
# are built with an f-string rather than os.path.join, which is OS-dependent.
uploads = [
    ("train/data.csv", train_buffer),
    ("validation/data.csv", valid_buffer),
    ("test/data.csv", test_buffer),
    ("categorical_index.json", cat_idx_buffer),
]
for key_suffix, payload in uploads:
    s3_bucket.Object(f"{prefix}/{key_suffix}").upload_fileobj(payload)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


### Retrieving SageMaker Resources for Training

In [108]:
from sagemaker import image_uris, model_uris, script_uris

# Identity of the JumpStart model and the flavour of resources to fetch for it
train_model_id = "catboost-classification-model"
train_model_version = "2.1.0"  # Pinned model version used throughout this notebook
train_scope = "training"
train_instance_type = "ml.m5.xlarge"  # Instance type for training

# Every retrieve() call below identifies the model the same way
model_identity = {"model_id": train_model_id, "model_version": train_model_version}

# Docker image URI for training (region and framework are left unset)
train_image_uri = image_uris.retrieve(
    region=None,
    framework=None,
    image_scope=train_scope,
    instance_type=train_instance_type,
    **model_identity,
)

# URI of the training source script bundle
train_source_uri = script_uris.retrieve(script_scope=train_scope, **model_identity)

# URI of the model artifact tarball used as the starting point for training
train_model_uri = model_uris.retrieve(model_scope=train_scope, **model_identity)

### Retrieving Default Hyperparameters and Setting Custom Values

In [109]:
# Importing necessary functions from SageMaker
from sagemaker import hyperparameters

# Start from the default hyperparameters published for this model id/version
params = hyperparameters.retrieve_default(
    model_id=train_model_id,
    model_version=train_model_version,
)

# Override the iteration count, learning rate and early-stopping patience
# (values are passed as strings, like the defaults)
params.update(
    {
        "iterations": "500",
        "learning_rate": "0.05",
        "early_stopping_rounds": "100",
    }
)

# Show the final hyperparameter set
params

{'iterations': '500',
 'early_stopping_rounds': '100',
 'eval_metric': 'Auto',
 'learning_rate': '0.05',
 'depth': '6',
 'l2_leaf_reg': '3',
 'random_strength': '1.0',
 'max_leaves': '31',
 'rsm': '1',
 'sampling_frequency': 'PerTreeLevel',
 'min_data_in_leaf': '1',
 'bagging_temperature': '1',
 'boosting_type': 'Auto',
 'scale_pos_weight': '1.0',
 'max_bin': 'Auto',
 'grow_policy': 'SymmetricTree',
 'random_seed': '0',
 'thread_count': '-1',
 'verbose': '1'}

### Setting Up SageMaker Estimator for Training Job

In [110]:
# Importing necessary classes from SageMaker
from sagemaker.estimator import Estimator
from sagemaker.utils import name_from_base

# S3 location where the training job writes its output.
# NOTE(review): this sits under f"s3://{bucket}/{prefix}", the same prefix that is
# passed as the "training" channel when fit() is called, so artifacts from earlier
# runs presumably get pulled into later training jobs as input. Confirm and
# consider a separate output prefix.
s3_output_location = f"s3://{bucket}/{prefix}/output"

# Everything the training job needs, grouped by concern
estimator_config = {
    # Container, script bundle and starting model artifact
    "image_uri": train_image_uri,
    "source_dir": train_source_uri,
    "model_uri": train_model_uri,
    "entry_point": "transfer_learning.py",
    # Permissions and compute
    "role": aws_role,
    "instance_count": 1,
    "instance_type": train_instance_type,
    "max_run": 1000,
    # Training configuration and output destination
    "hyperparameters": params,
    "output_path": s3_output_location,
}

# Creating an Estimator object for training
estimator = Estimator(**estimator_config)

### Initiating Training Job with Training and Validation Datasets

In [111]:
# Unique, timestamped name for this training job
train_job_name = name_from_base("catboost-train-job")

# Root S3 prefix of the uploaded data; it is passed as a plain S3 URI string
train_data_s3_path = f"s3://{bucket}/{prefix}"
training_channels = {"training": train_data_s3_path}

# Launch the training job and stream its logs into the notebook
estimator.fit(
    training_channels,
    job_name=train_job_name,
    logs=True,
)

INFO:sagemaker:Creating training-job with name: catboost-train-job-2024-04-01-10-35-25-479


2024-04-01 10:35:25 Starting - Starting the training job...
2024-04-01 10:35:42 Starting - Preparing the instances for training...
2024-04-01 10:36:06 Downloading - Downloading input data...
2024-04-01 10:36:52 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-04-01 10:37:00,707 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-04-01 10:37:00,709 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-04-01 10:37:00,718 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-04-01 10:37:00,721 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2024-04-01 10:37:02,620 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[3

### Deploying Trained Model as an Endpoint for Inference

In [112]:
# Instance type that will host the endpoint
inference_instance_type = "ml.m5.large"

# Unique, timestamped endpoint name
endpoint_name = name_from_base("catboost-endpoint")

# The inference resources are looked up for the same model id/version used in training
model_identity = {"model_id": train_model_id, "model_version": train_model_version}

# Docker image URI for inference (region and framework are left unset)
deploy_image_uri = image_uris.retrieve(
    region=None,
    framework=None,
    image_scope="inference",
    instance_type=inference_instance_type,
    **model_identity,
)

# URI of the inference source script bundle
deploy_source_uri = script_uris.retrieve(script_scope="inference", **model_identity)

# Deploy the trained model behind a real-time endpoint
deployment_config = {
    "initial_instance_count": 1,  # Number of instances to start with
    "instance_type": inference_instance_type,
    "image_uri": deploy_image_uri,
    "source_dir": deploy_source_uri,
    "entry_point": "inference.py",  # Script that handles inference requests
    "endpoint_name": endpoint_name,
}
predictor = estimator.deploy(**deployment_config)

INFO:sagemaker:Repacking model artifact (s3://sagemaker-eu-north-1-339713058917/catboost-demo/output/catboost-train-job-2024-04-01-10-35-25-479/output/model.tar.gz), script artifact (s3://jumpstart-cache-prod-eu-north-1/source-directory-tarballs/catboost/inference/classification/v1.1.2/sourcedir.tar.gz), and dependencies ([]) into single tar.gz file located at s3://sagemaker-eu-north-1-339713058917/sagemaker-jumpstart-2024-04-01-10-38-07-749/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: sagemaker-jumpstart-2024-04-01-10-38-07-749
INFO:sagemaker:Creating endpoint-config with name catboost-endpoint-2024-04-01-10-38-07-748
INFO:sagemaker:Creating endpoint with name catboost-endpoint-2024-04-01-10-38-07-748


------!

### Make Inference Requests on Test Data

In [113]:
import os
# Create a buffer to store the downloaded data from S3
# NOTE: the next cell creates a fresh buffer and repeats this download, so this
# cell's result is not what gets sent to the endpoint.
buffer = io.BytesIO()

# Initialize the S3 client
s3 = boto3.client("s3")

# Download the preprocessed test data into the buffer.
# S3 keys always use "/", so the key is built with an f-string rather than
# os.path.join, whose separator depends on the operating system.
s3.download_fileobj(bucket, f"{prefix}/test/data.csv", buffer)

In [114]:
import json
import numpy as np

# Initialize the SageMaker runtime client used to call the endpoint
client = boto3.client("runtime.sagemaker")

# The request body is the raw CSV produced during preprocessing
content_type = "text/csv"

# Download the preprocessed test data from S3 into an in-memory buffer.
# S3 keys always use "/", so the key is built with an f-string rather than
# os.path.join, whose separator depends on the operating system.
buffer = io.BytesIO()
s3 = boto3.client("s3")
s3.download_fileobj(bucket, f"{prefix}/test/data.csv", buffer)

# Invoke the endpoint with the whole file in a single request
# (getvalue() returns the full buffer contents regardless of cursor position)
response = client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType=content_type,
    Body=buffer.getvalue(),
)

# Parse the JSON response. It is kept under its own name so that `predictions`
# only ever refers to the final array of class indices.
response_payload = json.loads(response["Body"].read())

# Per-row class probabilities returned by the endpoint
probabilities = np.array(response_payload["probabilities"])

# Predicted class = index of the highest probability in each row
predictions = probabilities.argmax(axis=1)

### Deleting Model and Endpoint Resources in SageMaker

In [19]:
# Tear down the deployed resources so the endpoint stops incurring cost.
# NOTE: the captured log below comes from an earlier session (different
# execution count and endpoint name); re-run this cell after deploying.

# Delete the model
predictor.delete_model()

# Delete the endpoint (the captured log shows the endpoint configuration
# being deleted as part of this call)
predictor.delete_endpoint()

INFO:sagemaker:Deleting model with name: sagemaker-jumpstart-2024-03-27-10-01-37-013
INFO:sagemaker:Deleting endpoint configuration with name: my-first-endpoint-2024-03-27-10-01-37-013
INFO:sagemaker:Deleting endpoint with name: my-first-endpoint-2024-03-27-10-01-37-013
