### Setting Up AWS Environment for SageMaker Session

In [1]:
import sagemaker
import boto3
from sagemaker import get_execution_role

# IAM role that SageMaker jobs launched from this notebook will run under
aws_role = get_execution_role()

# Region configured for the default boto3 session
aws_region = boto3.Session().region_name

# SageMaker session; the default bucket below is looked up through it
session = sagemaker.Session()

# S3 location for everything this notebook writes:
# the session's default bucket plus a fixed key prefix
bucket, prefix = session.default_bucket(), "demo-1"

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


### Downloading Loan Dataset from AWS S3 via "download_file"

In [60]:
import pandas as pd

# Bucket holding the raw loan data, and the S3 key -> local filename pairs to fetch
source_bucket = "farukcan-loan-eligibility"
raw_files = {
    "demo-1/source/loan-train.csv": "train.csv",
    "demo-1/source/loan-test.csv": "test.csv",
}

# One S3 client is enough for both downloads
s3 = boto3.client("s3")
for s3_key, local_name in raw_files.items():
    s3.download_file(source_bucket, s3_key, local_name)

# Load the local copies into DataFrames
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

### Downloading Loan Dataset from AWS S3 via "download_fileobj"

In [2]:
import io
import pandas as pd

# S3 client and the bucket that holds the raw loan data
s3 = boto3.client("s3")
source_bucket = "farukcan-loan-eligibility"

# Training data: stream the object into memory, rewind the buffer
# (the download leaves the position at the end), then parse it as CSV
train_buffer = io.BytesIO()
s3.download_fileobj(source_bucket, "demo-1/source/loan-train.csv", train_buffer)
train_buffer.seek(0)
train = pd.read_csv(train_buffer)

# Testing data: same steps with its own buffer
test_buffer = io.BytesIO()
s3.download_fileobj(source_bucket, "demo-1/source/loan-test.csv", test_buffer)
test_buffer.seek(0)
test = pd.read_csv(test_buffer)

### Data Preparation: Preprocessing Loan Training and Testing Datasets

In [3]:
import pandas as pd

# Binary label derived from 'Loan_Status': 'Y' -> 1, 'N' -> 0
loan_approved = train["Loan_Status"].map({"Y": 1, "N": 0})

# Feature columns only: the raw label and the row identifier are removed
train_features = train.drop(columns=["Loan_Status", "Loan_ID"])

# Put the label in front so that 'target' is the first column of the training frame
train_features.insert(0, "target", loan_approved)
train = train_features

# The testing dataset only needs its identifier column removed
test = test.drop(columns=["Loan_ID"])

In [4]:
train.head(5)

Unnamed: 0,target,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban
1,0,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
2,1,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
3,1,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
4,1,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban


In [5]:
test.head(5)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


### Splitting Dataset into Training and Validation Sets

In [6]:
# Importing the train_test_split function from the sklearn.model_selection module
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for validation.
# - stratify keeps the class distribution of 'target' the same in both parts.
# - random_state pins the shuffle so that Restart & Run All reproduces the same
#   split (and therefore the same downstream results); 42 matches the seed
#   already used for TruncatedSVD in this notebook.
train, valid = train_test_split(
    train,
    test_size=0.3,
    shuffle=True,
    stratify=train.target,
    random_state=42,
)

# Give both parts a fresh 0..n-1 index: later cells concatenate them column-wise
# with newly built DataFrames, and pd.concat(axis=1) aligns rows on the index.
train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)

### Handling Missing Values: Imputation for Numerical and Categorical Features

In [7]:
# Importing the SimpleImputer class from the sklearn.impute module
from sklearn.impute import SimpleImputer

# Feature columns grouped by how their missing values are filled
X_cat_column_names = ["Gender", "Married", "Dependents", "Education", "Self_Employed", "Property_Area"]
X_num_column_names = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term", "Credit_History"]

# Numerical gaps are filled with the column mean, categorical gaps with the most frequent value
num_imputer = SimpleImputer(strategy="mean")
cat_imputer = SimpleImputer(strategy="most_frequent")

# Each imputer is fitted on the training split only and then applied unchanged
# to the validation and test splits
for imputer, column_names in (
    (num_imputer, X_num_column_names),
    (cat_imputer, X_cat_column_names),
):
    train[column_names] = imputer.fit_transform(train[column_names])
    valid[column_names] = imputer.transform(valid[column_names])
    test[column_names] = imputer.transform(test[column_names])

### Encoding Categorical Features: Ordinal Encoding for Selected Features

In [8]:
# Importing the OrdinalEncoder class from the sklearn.preprocessing module
from sklearn.preprocessing import OrdinalEncoder

# Columns that are replaced by numeric category codes
ordinal_col_names = ["Dependents", "Gender", "Married", "Education", "Self_Employed"]

# Fit ONE encoder on all ordinal columns of the training split. OrdinalEncoder
# learns a separate category list per column, so the resulting codes are the
# same as fitting column by column, but the fitted encoder now retains every
# column's mapping (encoder.categories_) instead of only the last column's.
encoder = OrdinalEncoder()
train[ordinal_col_names] = encoder.fit_transform(train[ordinal_col_names])

# Validation and test reuse the categories learned from training. With the
# default handle_unknown="error", a category unseen in training raises here.
valid[ordinal_col_names] = encoder.transform(valid[ordinal_col_names])
test[ordinal_col_names] = encoder.transform(test[ordinal_col_names])

### Encoding Categorical Features: One-Hot Encoding for Selected Features

In [9]:
# Importing the OneHotEncoder class from the sklearn.preprocessing module
from sklearn.preprocessing import OneHotEncoder

# Creating a OneHotEncoder object with sparse_output set to False for dense output
encoder = OneHotEncoder(sparse_output=False)

# Defining column names for categorical features to be one-hot encoded
onehot_col_names = ["Property_Area"]


def _replace_with_one_hot(frame, fitted_encoder, col_names):
    """Return a new frame in which `col_names` are replaced by one-hot columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing the columns listed in `col_names`.
    fitted_encoder : OneHotEncoder
        Encoder that has already been fitted on those columns.
    col_names : list of str
        Columns to encode and then remove from the result.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of `frame` followed by the one-hot columns,
        named by `fitted_encoder.get_feature_names_out()`.
    """
    one_hot = pd.DataFrame(
        fitted_encoder.transform(frame[col_names]),
        columns=fitted_encoder.get_feature_names_out(),
        # Reuse the frame's own index: pd.concat(axis=1) aligns rows on the
        # index, so a fresh 0..n-1 index would misalign any frame whose index
        # is not already 0..n-1.
        index=frame.index,
    )
    return pd.concat([frame.drop(columns=col_names), one_hot], axis=1)


# The categories are learned from the training split only ...
encoder.fit(train[onehot_col_names])

# ... and the same fitted encoder is applied to all three splits
train = _replace_with_one_hot(train, encoder, onehot_col_names)
valid = _replace_with_one_hot(valid, encoder, onehot_col_names)
test = _replace_with_one_hot(test, encoder, onehot_col_names)

### Dimensionality Reduction: Truncated Singular Value Decomposition (SVD) for Feature Transformation

In [10]:
# Importing the TruncatedSVD class from the sklearn.decomposition module
from sklearn.decomposition import TruncatedSVD

# Every column after the first ('target') is treated as a feature
X_column_names = train.columns[1:]

# 10-component truncated SVD with a fixed seed, fitted on the training features only.
# NOTE(review): the features are not rescaled anywhere before this step, so the
# printed ratio is presumably dominated by the large-magnitude income columns --
# confirm whether scaling is wanted.
svd = TruncatedSVD(n_components=10, n_iter=10, random_state=42)
svd.fit(train[X_column_names])

# Total share of variance captured by the retained components
total_explained = svd.explained_variance_ratio_.sum()
print("SVD Explained Variance Ratio: {:.4f}".format(total_explained))

# Project all three splits with the same fitted model
X_train, X_valid, X_test = (
    svd.transform(frame[X_column_names]) for frame in (train, valid, test)
)

SVD Explained Variance Ratio: 1.0000


In [11]:
# Rebuild the training frame as: label column followed by the SVD components
train_components = pd.DataFrame(X_train)
train = pd.concat([train["target"], train_components], axis=1)
train.head()

Unnamed: 0,target,0,1,2,3,4,5,6,7,8,9
0,0,2487.498399,-359.241196,-411.955677,-56.634263,-0.737743,0.717549,-0.178683,0.418683,-0.106411,-0.123231
1,1,5197.870114,6517.92496,-43.027121,7.884235,-1.049943,-0.568556,-0.472483,0.340707,0.053364,-0.280527
2,1,2888.882363,1219.123088,-237.04732,-45.562903,-0.747907,-0.563071,-0.512018,0.790443,-0.021422,-0.027585
3,1,4851.581591,1387.598444,-185.867717,12.796768,1.293758,0.694074,-0.201757,0.013021,-0.26868,-0.104215
4,1,3512.921528,-516.911541,-269.820123,-14.417922,0.19654,0.761185,-0.180114,0.997915,-0.017266,-0.111557


In [12]:
# Rebuild the validation frame as: label column followed by the SVD components
valid_components = pd.DataFrame(X_valid)
valid = pd.concat([valid["target"], valid_components], axis=1)
valid.head()

Unnamed: 0,target,0,1,2,3,4,5,6,7,8,9
0,0,2964.965785,-434.746709,-279.872157,-34.026609,-0.582896,0.694165,-0.136328,0.329779,0.01727,0.530203
1,1,3596.27509,2524.382992,-192.354337,-6.1172,-0.628901,-0.601735,-0.55546,-0.449758,-0.29387,0.072372
2,1,651.685722,2862.120073,-274.172065,-24.515829,1.494883,0.745779,-0.146195,0.261706,0.634675,0.113011
3,1,5636.406467,437.709456,-187.003017,21.597709,0.237222,-0.239987,0.792219,-0.261983,-0.372239,-0.073717
4,1,17077.34315,-2557.927256,112.58963,8.722956,-1.205531,0.804555,-0.18441,0.612138,1.124751,-0.234933


In [13]:
# The test split has no label column, so its frame holds only the SVD components
test = pd.DataFrame(data=X_test)
test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,5666.572894,-841.260253,-203.571304,-29.901946,-0.531782,-0.632406,-0.595646,-0.510643,-0.33205,0.040606
1,3275.335166,1035.199335,-240.309458,-7.012766,0.39559,-0.653382,-0.554321,-0.236083,-0.334924,0.000871
2,5222.678166,1046.376258,-195.539623,57.997963,1.07244,-0.674123,-0.523643,0.102895,-0.274271,-0.046935
3,2702.117565,2178.385118,-222.234255,-33.639578,1.442499,-0.687172,-0.525264,0.009838,-0.35192,0.091784
4,3250.444819,-477.883726,-268.39082,-45.432145,-0.525566,-0.672907,-0.497303,-0.084201,0.850575,-0.34825


### Uploading Preprocessed Data to S3 Bucket via "upload_file"

In [86]:
import os

# Split name -> (DataFrame, local CSV filename). The split name is also the
# S3 "folder" under the prefix that receives the file.
preprocessed_splits = {
    "train": (train, "train_preprocessed.csv"),
    "valid": (valid, "valid_preprocessed.csv"),
    "test": (test, "test_preprocessed.csv"),
}

# One boto3 session/bucket handle is shared by all three uploads
s3_bucket = boto3.Session().resource("s3").Bucket(bucket)

for split_name, (frame, local_csv) in preprocessed_splits.items():
    # Written without header row and without the index column
    frame.to_csv(local_csv, index=False, header=False)

    # S3 keys always use "/" as separator, so the key is built with an f-string
    # rather than os.path.join (whose separator depends on the local OS). This
    # also matches the s3://{bucket}/{prefix}/... paths used for training.
    s3_bucket.Object(f"{prefix}/{split_name}/data.csv").upload_file(local_csv)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


### Uploading Preprocessed Data to S3 Bucket via "upload_fileobj"

In [14]:
import io
import os

# Serialize each split to CSV text (no header row, no index column) and wrap
# the UTF-8 bytes in an in-memory buffer
train_buffer = io.BytesIO(train.to_csv(index=False, header=False).encode("utf-8"))
valid_buffer = io.BytesIO(valid.to_csv(index=False, header=False).encode("utf-8"))
test_buffer = io.BytesIO(test.to_csv(index=False, header=False).encode("utf-8"))

# One boto3 session/bucket handle is shared by all three uploads
s3_bucket = boto3.Session().resource("s3").Bucket(bucket)

# S3 keys always use "/" as separator, so they are built with an f-string
# rather than os.path.join (whose separator depends on the local OS). This
# also matches the s3://{bucket}/{prefix}/... paths used for training.
for split_name, csv_buffer in (
    ("train", train_buffer),
    ("valid", valid_buffer),
    ("test", test_buffer),
):
    s3_bucket.Object(f"{prefix}/{split_name}/data.csv").upload_fileobj(csv_buffer)

### Retrieving SageMaker Resources for Training

In [22]:
from sagemaker import image_uris, model_uris, script_uris

# Identity of the JumpStart model and the settings used to train it
train_model_id = "lightgbm-classification-model"
# Pinned model version. NOTE(review): originally annotated as "the latest
# version" -- confirm that 2.1.0 is still the intended one.
train_model_version = "2.1.0"
train_scope = "training"
train_instance_type = "ml.m5.xlarge"  # Instance type for training

# Arguments shared by all three retrieve calls below
model_selector = {"model_id": train_model_id, "model_version": train_model_version}

# Docker image URI for training; region and framework are passed as None
train_image_uri = image_uris.retrieve(
    region=None,
    framework=None,
    image_scope=train_scope,
    instance_type=train_instance_type,
    **model_selector,
)

# URI of the training source script bundle
train_source_uri = script_uris.retrieve(script_scope=train_scope, **model_selector)

# URI of the pre-trained model tarball used as the starting point for fine-tuning
train_model_uri = model_uris.retrieve(model_scope=train_scope, **model_selector)

### Retrieving Default Hyperparameters and Setting Custom Values

In [23]:
# Importing necessary functions from SageMaker
from sagemaker import hyperparameters

# Start from the defaults published for this model id/version
params = hyperparameters.retrieve_default(
    model_id=train_model_id,
    model_version=train_model_version,
)

# Override the number of boosting rounds and the learning rate
# (values are given as strings, like the defaults)
params.update(num_boost_round="100", learning_rate="0.1")

# Display the resulting hyperparameters
params

{'num_boost_round': '100',
 'early_stopping_rounds': '30',
 'metric': 'auto',
 'learning_rate': '0.1',
 'num_leaves': '67',
 'feature_fraction': '0.74',
 'bagging_fraction': '0.53',
 'bagging_freq': '5',
 'max_depth': '11',
 'min_data_in_leaf': '26',
 'max_delta_step': '0.0',
 'lambda_l1': '0.0',
 'lambda_l2': '0.0',
 'boosting': 'gbdt',
 'min_gain_to_split': '0.0',
 'scale_pos_weight': '1.0',
 'tree_learner': 'serial',
 'feature_fraction_bynode': '1.0',
 'is_unbalance': 'False',
 'max_bin': '255',
 'num_threads': '0',
 'verbosity': '1',
 'use_dask': 'False'}

### Setting Up SageMaker Estimator for Training Job

In [24]:
# Importing necessary classes from SageMaker
from sagemaker.estimator import Estimator
from sagemaker.utils import name_from_base

# S3 location where the training job writes its model artifacts
s3_output_location = f"s3://{bucket}/{prefix}/output"

# Everything the training job needs, collected in one place
estimator_settings = dict(
    # What to run: container image, script bundle, entry script, base model
    image_uri=train_image_uri,
    source_dir=train_source_uri,
    entry_point="transfer_learning.py",
    model_uri=train_model_uri,
    # How to run it: permissions, hardware and the max_run limit
    role=aws_role,
    instance_count=1,
    instance_type=train_instance_type,
    max_run=1000,
    # Training configuration and where the result goes
    hyperparameters=params,
    output_path=s3_output_location,
)

estimator = Estimator(**estimator_settings)

### Initiating Training Job with Training and Validation Datasets

In [25]:
from sagemaker.inputs import TrainingInput

# S3 prefixes that hold the preprocessed training and validation CSV files
train_data_path = f"s3://{bucket}/{prefix}/train"
valid_data_path = f"s3://{bucket}/{prefix}/valid"

# Both channels are declared as CSV input
train_input = TrainingInput(s3_data=train_data_path, content_type="text/csv")
valid_input = TrainingInput(s3_data=valid_data_path, content_type="text/csv")
input_channels = {"train": train_input, "validation": valid_input}

# Job name generated from a fixed base by name_from_base
train_job_name = name_from_base("lightgbm-train-job")

# Launch the training job with both channels; logs=True prints the job logs in the cell output
estimator.fit(
    inputs=input_channels,
    logs=True,
    job_name=train_job_name,
)

INFO:sagemaker:Creating training-job with name: my-first-example-2024-03-27-10-14-09-762


2024-03-27 10:14:09 Starting - Starting the training job......
2024-03-27 10:14:50 Starting - Preparing the instances for training...
2024-03-27 10:15:30 Downloading - Downloading the training image...
2024-03-27 10:16:01 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-03-27 10:16:08,539 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-03-27 10:16:08,541 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-03-27 10:16:08,552 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-03-27 10:16:08,554 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2024-03-27 10:16:08,781 sagemaker-training-toolkit INFO     Installing dependencies from requirements.t

### Deploying Trained Model as an Endpoint for Inference

In [26]:
# Hardware for the endpoint and a generated endpoint name
inference_instance_type = "ml.m5.large"
endpoint_name = name_from_base("lightgbm-endpoint")

# Same model id/version as used for training
deploy_model_selector = {"model_id": train_model_id, "model_version": train_model_version}

# Docker image URI for inference; region and framework are passed as None
deploy_image_uri = image_uris.retrieve(
    region=None,
    framework=None,
    image_scope="inference",
    instance_type=inference_instance_type,
    **deploy_model_selector,
)

# URI of the inference source script bundle
deploy_source_uri = script_uris.retrieve(script_scope="inference", **deploy_model_selector)

# Deploy the trained model behind a single-instance endpoint
predictor = estimator.deploy(
    endpoint_name=endpoint_name,
    initial_instance_count=1,
    instance_type=inference_instance_type,
    image_uri=deploy_image_uri,
    source_dir=deploy_source_uri,
    entry_point="inference.py",  # script that handles inference requests
)

INFO:sagemaker:Repacking model artifact (s3://sagemaker-eu-north-1-339713058917/demo-1/output/my-first-example-2024-03-27-10-14-09-762/output/model.tar.gz), script artifact (s3://jumpstart-cache-prod-eu-north-1/source-directory-tarballs/lightgbm/inference/classification/v1.2.2/sourcedir.tar.gz), and dependencies ([]) into single tar.gz file located at s3://sagemaker-eu-north-1-339713058917/sagemaker-jumpstart-2024-03-27-10-20-29-390/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: sagemaker-jumpstart-2024-03-27-10-20-29-390
INFO:sagemaker:Creating endpoint-config with name lightgbm-endpoint-2024-03-27-10-20-29-390
INFO:sagemaker:Creating endpoint with name lightgbm-endpoint-2024-03-27-10-20-29-390


------!

### Make Inference Requests on Test Data

In [27]:
import json
import numpy as np

# Initialize the SageMaker runtime client
client = boto3.client("runtime.sagemaker")

# Define the content type of the input data
content_type = "text/csv"

# Create a buffer to store the downloaded data from S3
buffer = io.BytesIO()

# Initialize the S3 client
s3 = boto3.client("s3")

# Download the preprocessed test file from S3 into the buffer. S3 keys always
# use "/" as separator, so the key is built with an f-string rather than
# os.path.join (whose separator depends on the local OS).
s3.download_fileobj(bucket, f"{prefix}/test/data.csv", buffer)

# Send the raw CSV bytes to the endpoint in a single request
response = client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType=content_type,
    Body=buffer.getvalue(),
)

# Parse the JSON body of the response. It gets its own name so that
# `predictions` below only ever refers to the array of predicted classes.
response_payload = json.loads(response["Body"].read())

# Probabilities returned under the "probabilities" key, as an array
probabilities = np.array(response_payload["probabilities"])

# Predicted class = index of the largest probability along axis 1
predictions = probabilities.argmax(axis=1)

### Deleting Model and Endpoint Resources in SageMaker

In [28]:
# Tear down what the deployment created: first the model, then the endpoint
for remove_resource in (predictor.delete_model, predictor.delete_endpoint):
    remove_resource()

INFO:sagemaker:Deleting model with name: sagemaker-jumpstart-2024-03-27-10-20-29-390
INFO:sagemaker:Deleting endpoint configuration with name: lightgbm-endpoint-2024-03-27-10-20-29-390
INFO:sagemaker:Deleting endpoint with name: lightgbm-endpoint-2024-03-27-10-20-29-390
