### Setting Up AWS Environment for SageMaker Session

In [2]:
import sagemaker
import boto3
from sagemaker import get_execution_role

# One SageMaker session is created up front; the default bucket is looked up through it
session = sagemaker.Session()
bucket = session.default_bucket()

# Key prefix under which this demo stores its objects in the bucket
prefix = "sklearn-demo"

# Execution role of the current notebook environment
aws_role = get_execution_role()

# Region configured on the default boto3 session
aws_region = boto3.Session().region_name

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


### Downloading Loan Dataset from AWS S3 via "download_fileobj"

In [64]:
import io
import pandas as pd

# Bucket that holds the raw loan CSV files
source_bucket = "farukcan-loan-eligibility"

# boto3 client used for the S3 downloads
s3 = boto3.client("s3")

# In-memory byte streams that receive the downloaded objects
train_buffer, test_buffer = io.BytesIO(), io.BytesIO()

# Fetch each object into its buffer, then rewind the buffer so that the
# subsequent read starts at the first byte rather than at the end of the data
for object_key, target_buffer in (
    ("demo-1/source/train/data.csv", train_buffer),
    ("demo-1/source/test/data.csv", test_buffer),
):
    s3.download_fileobj(source_bucket, object_key, target_buffer)
    target_buffer.seek(0)

# Parse the downloaded bytes into DataFrames
train = pd.read_csv(train_buffer)
test = pd.read_csv(test_buffer)

### Data Preparation: Preprocessing Loan Training and Testing Datasets

In [65]:
import pandas as pd

# Binary encoding of the label: 'Y' -> 1, 'N' -> 0
label_map = {"Y": 1, "N": 0}

# Feature columns only: the raw label and the row identifier are removed
train_features = train.drop(columns=["Loan_Status", "Loan_ID"])

# Put the encoded label in front so that 'target' is the first column
train_features.insert(0, "target", train["Loan_Status"].map(label_map))
train = train_features

# The test set has no label column; only the identifier is removed
test = test.drop(columns=["Loan_ID"])

In [66]:
# Preview the first five training rows (head() returns five rows by default)
train.head()

Unnamed: 0,target,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban
1,0,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
2,1,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
3,1,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
4,1,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban


In [67]:
# Preview the first five test rows (head() returns five rows by default)
test.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


### Splitting Dataset into Training and Validation Sets

In [68]:
# Importing the train_test_split function from the sklearn.model_selection module
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for validation. Stratifying on 'target' keeps the
# class ratio the same in both parts. The fixed random_state makes the split identical
# on every Restart & Run All, so the data uploaded for training is reproducible.
# NOTE: this cell overwrites `train`; re-running it without re-running the cells above
# would split the already-reduced training set again.
train, valid = train_test_split(
    train,
    test_size=0.3,
    shuffle=True,
    stratify=train["target"],
    random_state=42,
)

# The split keeps the original row labels; renumber both frames from 0
train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)

### Imputing Numeric and Categorical Features

In [69]:
from sklearn.impute import SimpleImputer

# Columns filled with the column mean
num_cols = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term"]
# Columns filled with the most frequent value
cat_cols = ["Gender", "Married", "Dependents", "Education", "Self_Employed", "Property_Area", "Credit_History"]

num_imputer = SimpleImputer(strategy="mean")
cat_imputer = SimpleImputer(strategy="most_frequent")

# Each imputer is fitted on the training split only and then applied unchanged to the
# validation and test frames, so no statistics are learned from held-out data
for imputer, columns in ((num_imputer, num_cols), (cat_imputer, cat_cols)):
    train[columns] = imputer.fit_transform(train[columns])
    for frame in (valid, test):
        frame[columns] = imputer.transform(frame[columns])

### Encoding Categorical Features: Ordinal Encoding for Selected Features

In [70]:
from sklearn.preprocessing import OrdinalEncoder

# Columns to be ordinal-encoded
cat_col_names = ["Gender", "Married", "Dependents", "Education", "Self_Employed", "Property_Area"]

# OrdinalEncoder (not LabelEncoder) handles several feature columns at once
encoder = OrdinalEncoder()

# Fit the encoder on the training split, then reuse the fitted encoder for the
# validation and test frames
train[cat_col_names] = encoder.fit_transform(train[cat_col_names])
for frame in (valid, test):
    frame[cat_col_names] = encoder.transform(frame[cat_col_names])

In [71]:
# Training rows after imputation and encoding (first five)
train.head(5)

Unnamed: 0,target,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1.0,1.0,3.0,0.0,0.0,4342.0,189.0,124.0,360.0,1.0,1.0
1,1,1.0,1.0,2.0,0.0,0.0,3510.0,4416.0,243.0,360.0,1.0,0.0
2,0,1.0,0.0,0.0,0.0,1.0,6050.0,4333.0,120.0,180.0,1.0,2.0
3,1,0.0,0.0,0.0,0.0,0.0,3692.0,0.0,93.0,360.0,1.0,0.0
4,1,1.0,1.0,2.0,0.0,0.0,6250.0,1695.0,210.0,360.0,1.0,1.0


In [72]:
# Validation rows after imputation and encoding (first five)
valid.head(5)

Unnamed: 0,target,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,0.0,0.0,0.0,0.0,0.0,8333.0,0.0,280.0,360.0,1.0,1.0
1,1,1.0,1.0,1.0,0.0,0.0,5468.0,1032.0,26.0,360.0,1.0,1.0
2,1,1.0,0.0,0.0,0.0,0.0,2500.0,20000.0,103.0,360.0,1.0,1.0
3,1,1.0,1.0,0.0,1.0,0.0,2875.0,1750.0,105.0,360.0,1.0,1.0
4,1,0.0,0.0,0.0,1.0,0.0,2165.0,0.0,70.0,360.0,1.0,1.0


In [73]:
# Test rows after imputation and encoding (first five)
test.head(5)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1.0,1.0,0.0,0.0,0.0,5720.0,0.0,110.0,360.0,1.0,2.0
1,1.0,1.0,1.0,0.0,0.0,3076.0,1500.0,126.0,360.0,1.0,2.0
2,1.0,1.0,2.0,0.0,0.0,5000.0,1800.0,208.0,360.0,1.0,2.0
3,1.0,1.0,2.0,0.0,0.0,2340.0,2546.0,100.0,360.0,1.0,2.0
4,1.0,0.0,0.0,1.0,0.0,3276.0,0.0,78.0,360.0,1.0,2.0


### Uploading Preprocessed Data to S3 Bucket via "upload_fileobj"

In [74]:
import io
import os
import json

# Handle to the session's default bucket, created once and reused for every upload
# (previously a new boto3 session was created for each of the three uploads)
s3_bucket = boto3.Session().resource("s3").Bucket(bucket)

# Each split is written as a headerless, index-free CSV to <prefix>/<split>/data.csv.
# `train` and `valid` carry the 'target' label as their first column; `test` has no label.
for split_name, frame in (("train", train), ("validation", valid), ("test", test)):
    # Serialise the frame to UTF-8 CSV bytes held in memory
    csv_buffer = io.BytesIO(frame.to_csv(index=False, header=False).encode("utf-8"))

    # S3 keys always use "/" as the separator, so the key is joined explicitly rather
    # than with os.path.join, whose separator depends on the operating system
    object_key = "/".join((prefix, split_name, "data.csv"))

    s3_bucket.Object(object_key).upload_fileobj(csv_buffer)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


### Writing SKLearn Estimator Script to File

In [75]:
%%writefile sklearn-estimator.py

import os
import joblib
import argparse
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

def model_fn(model_dir):
    """Load the fitted classifier for inference.

    The SageMaker scikit-learn serving container imports this script and calls
    ``model_fn`` to obtain the model object; the script must provide it, otherwise
    an endpoint created with ``deploy()`` cannot load the trained model.

    Args:
        model_dir: Directory that contains ``model.joblib`` as written by the
            training code below.

    Returns:
        The object stored in ``model.joblib`` (the fitted RandomForestClassifier).
    """
    return joblib.load(os.path.join(model_dir, "model.joblib"))


if __name__ == "__main__":
    # Training entry point: read the CSV files of the "train" channel, fit a random
    # forest on them and store the fitted model in the model directory.
    parser = argparse.ArgumentParser()

    # These options are parsed so that the values passed by the estimator are
    # accepted, but nothing below reads them: the classifier is built with its
    # default settings.
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--learning-rate', type=float, default=0.05)

    # SageMaker environment variables for I/O
    parser.add_argument('--output-data-dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))

    # Parse command-line arguments
    args = parser.parse_args()

    # Every file found in the training channel directory is treated as input
    input_files = [os.path.join(args.train, file) for file in os.listdir(args.train)]

    # Fail early with an explanatory message when the channel is empty
    if len(input_files) == 0:
        raise ValueError(("There are no files in {}.\n" +
                          "This usually indicates that the channel ({}) was incorrectly specified, \n" +
                          "the data specification in S3 was incorrectly specified, or the specified role\n" +
                          "does not have permission to access the data.").format(args.train, "train"))

    # The files are read without a header row and stacked into one frame
    raw_data = [pd.read_csv(file, header=None) for file in input_files]
    concat_data = pd.concat(raw_data)

    # Column 0 holds the label, all remaining columns are features
    X_train = concat_data.iloc[:, 1:]  # Features
    y_train = concat_data.iloc[:, 0]   # Labels

    # Fit a random forest with default settings
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)

    # Persist the fitted model; model_fn above loads this same file
    joblib.dump(clf, os.path.join(args.model_dir, "model.joblib"))


Overwriting sklearn-estimator.py


### AWS SageMaker SKLearn Estimator Configuration

In [76]:
from sagemaker import get_execution_role
from sagemaker.sklearn.estimator import SKLearn

# Get the execution role
role = get_execution_role()

# S3 location of the training data, derived from the same `bucket` and `prefix` the
# upload cell wrote "<prefix>/train/data.csv" to. Building it here instead of
# hard-coding an account- and region-specific URI keeps the notebook runnable in any
# account and keeps the account ID out of the code.
train_input = "s3://{}/{}/train/".format(bucket, prefix)

# Configure the SKLearn estimator
sklearn_estimator = SKLearn(
    entry_point='sklearn-estimator.py',  # Script written by the %%writefile cell
    role=role,  # SageMaker execution role
    instance_type='ml.m5.xlarge',  # Instance type for training
    framework_version='1.0-1',  # SKLearn framework version
    # Passed to the entry point as command-line arguments; the script parses these
    # three options but does not use their values.
    hyperparameters={'epochs': 20, 'batch-size': 64, 'learning-rate': 0.1},
)

### Fitting the SKLearn Estimator to Training Data

In [77]:
# Start the training job; the "train" channel points at the uploaded training data
sklearn_estimator.fit(inputs={"train": train_input})

INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2024-03-31-15-44-12-327


2024-03-31 15:44:12 Starting - Starting the training job......
2024-03-31 15:45:05 Starting - Preparing the instances for training...
2024-03-31 15:45:34 Downloading - Downloading input data...
2024-03-31 15:45:49 Downloading - Downloading the training image...
2024-03-31 15:46:29 Training - Training image download completed. Training in progress.[34m2024-03-31 15:46:35,428 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2024-03-31 15:46:35,432 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-03-31 15:46:35,435 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-03-31 15:46:35,450 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2024-03-31 15:46:35,681 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-03-31 15:46:35,684 sagemaker-training-toolkit INFO     No 

### Deploying SKLearn Estimator as Endpoint and Making Predictions

In [None]:
# Create a real-time endpoint backed by the trained estimator
predictor = sklearn_estimator.deploy(
    initial_instance_count=1,  # Number of instances to deploy initially
    instance_type='ml.m5.xlarge',  # Instance type for the endpoint
)

# The returned predictor sends data to the endpoint:
#   - `data` should be a NumPy array or a Python list.
#   - `response` will be a NumPy array containing the predictions.
# Example:
# response = predictor.predict(data)