# Modeling with XGBoost

### Steps Covered:  
1. **Load Data from Feature Store / S3**
2. **Train XGBoost Model using SageMaker Training Job**
3. **Evaluate Performance from the Training Job Metrics** (AUC, AUC-PR, log loss, error rate)
4. **Register Model in SageMaker Model Registry**
5. **Deploy Model as a SageMaker Endpoint**
6. **Decommission Endpoint** (optional)

In [9]:
!pip install awswrangler --quiet

In [50]:
import pandas as pd
import sagemaker
from sagemaker.feature_store.feature_group import FeatureGroup
import boto3
from sklearn.model_selection import train_test_split

from sagemaker.estimator import Estimator
from sagemaker.predictor import Predictor
from sagemaker.inputs import TrainingInput
from sagemaker.model import Model
from sagemaker import ModelPackage

from sagemaker.clarify import BiasConfig, DataConfig, ModelConfig
from sagemaker.model_monitor import DataCaptureConfig, ModelBiasMonitor
from sagemaker.serializers import CSVSerializer

from io import StringIO

## Setting up Retrieval from Feature Store

In [2]:
# --- SageMaker session, its default bucket, and the execution role ---
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# --- Region and account of the current credentials ---
region = boto3.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

# --- Service clients used further down ---
sm = boto3.client("sagemaker", region_name=region)
s3 = boto3.client("s3")
featurestore_runtime = boto3.client("sagemaker-featurestore-runtime")

# --- Feature group this notebook reads from ---
feature_group_name = "employee-attrition-feature-store"

In [3]:
# Handle to the feature group named above, bound to the notebook's SageMaker session
feature_group = FeatureGroup(
    sagemaker_session=sess,
    name=feature_group_name,
)

In [18]:
# Read the feature group's description and pull out the S3 location of its offline store
feature_group_description = feature_group.describe()
s3_storage_config = feature_group_description["OfflineStoreConfig"]["S3StorageConfig"]
offline_store_uri = s3_storage_config["S3Uri"]
print(f"📍 Offline Store S3 Path: {offline_store_uri}")

📍 Offline Store S3 Path: s3://sagemaker-us-east-1-203012117619/aai-540-group-3-final-project/data/db_source/


## Data Splits

In [19]:
# Fetch the source CSV object from the session's default bucket
source_key = "aai-540-group-3-final-project/data/db_source/remaining_data.csv"
response = s3.get_object(Key=source_key, Bucket=bucket)

# Parse the returned object body directly into a DataFrame
offline_data = pd.read_csv(response["Body"])

offline_data

Unnamed: 0,Employee ID,Age,Gender,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,...,Number of Dependents,Job Level,Company Size,Company Tenure,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition,Attrition
0,10861,37,0,27,4,12617,1,0,0,1,...,2,0,2,57,1,0,0,0,0,0
1,33332,35,1,12,0,5935,3,0,0,2,...,1,2,1,19,0,0,0,3,0,0
2,17066,52,1,34,0,3908,3,3,0,1,...,2,1,1,63,0,1,0,3,2,0
3,62940,35,1,21,2,5663,2,2,0,0,...,2,2,1,70,0,0,0,1,2,1
4,65686,30,1,4,2,8184,1,0,2,4,...,3,1,2,50,0,0,0,3,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44694,33489,25,0,1,0,7550,0,0,0,0,...,3,1,2,17,1,0,0,2,2,1
44695,71741,38,1,23,0,4199,2,3,0,1,...,4,0,1,35,0,0,0,2,1,1
44696,14104,22,1,2,2,7631,2,0,0,0,...,3,0,2,41,0,0,0,2,2,0
44697,65630,50,1,36,4,7472,3,0,1,3,...,0,1,1,72,1,0,0,3,3,1


In [21]:
# Target is the Attrition column; every other column is kept as a feature.
# NOTE(review): that includes identifier-like columns such as `Employee ID`
# (visible in the preview above) — confirm this is intended.
y = offline_data["Attrition"]
X = offline_data.drop(columns=["Attrition"])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Sanity-check the sizes of the four pieces
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(35759, 23) (8940, 23)
(35759,) (8940,)


## Adding Split Data to S3

In [22]:
# Assemble each split with the label as the first column, followed by the features.
# (Label-first, header-less CSV is the layout documented for the SageMaker built-in XGBoost CSV input.)
train_data = pd.concat([y_train, X_train], axis="columns")
test_data = pd.concat([y_test, X_test], axis="columns")

# Write both splits to local CSV files with no index column and no header row
for split_frame, csv_name in (
    (train_data, "training_data.csv"),
    (test_data, "testing_data.csv"),
):
    split_frame.to_csv(csv_name, header=False, index=False)

In [24]:
# S3 prefixes (under the session's default bucket) that receive the two splits
train_path = f"s3://{bucket}/aai-540-group-3-final-project/data/splits/train"
test_path = f"s3://{bucket}/aai-540-group-3-final-project/data/splits/test"

train_path, test_path

('s3://sagemaker-us-east-1-203012117619/aai-540-group-3-final-project/data/splits/train',
 's3://sagemaker-us-east-1-203012117619/aai-540-group-3-final-project/data/splits/test')

### Uploading to S3

In [25]:
# Uploading Test Data to S3
!aws s3 cp "testing_data.csv" $test_path/

upload: ./testing_data.csv to s3://sagemaker-us-east-1-203012117619/aai-540-group-3-final-project/data/splits/test/testing_data.csv


In [26]:
# Uploading Train Data to S3
!aws s3 cp "training_data.csv" $train_path/

upload: ./training_data.csv to s3://sagemaker-us-east-1-203012117619/aai-540-group-3-final-project/data/splits/train/training_data.csv


## XGBoost Training

In [35]:
# S3 prefix under which model artifacts are written
model_path = f"s3://{bucket}/aai-540-group-3-final-project/data/model"

# Model name (not referenced by the rest of this cell)
model_name = "retain_ai_xgb_fp_model"

# Container image for the SageMaker built-in XGBoost algorithm in this region
image_uri = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="0.90-1",
)

# Estimator describing the training job: one ml.m5.large instance, artifacts under <model_path>/output
xgb = Estimator(
    image_uri=image_uri,
    role=role,
    sagemaker_session=sess,
    instance_type="ml.m5.large",
    instance_count=1,
    output_path=f"{model_path}/output",
)

# Hyperparameters for a binary classifier, plus the metrics reported per channel:
#   auc     - area under the ROC curve
#   aucpr   - area under the precision-recall curve (not an F1 score)
#   logloss - negative log-likelihood
#   error   - binary classification error rate (i.e. 1 - accuracy)
hyperparameters = dict(
    objective="binary:logistic",
    num_round=100,
    max_depth=5,
    eta=0.2,
    subsample=0.8,
    eval_metric="auc,aucpr,logloss,error",
)
xgb.set_hyperparameters(**hyperparameters)

# CSV input channels.
# NOTE(review): the held-out test split is passed as the "validation" channel,
# so the validation metrics are computed on the test set — confirm this is intended.
train_input = TrainingInput(s3_data=train_path, content_type="text/csv")
test_input = TrainingInput(s3_data=test_path, content_type="text/csv")
channels = {"train": train_input, "validation": test_input}

# Launch the training job
xgb.fit(inputs=channels)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-02-18-03-27-20-877


2025-02-18 03:27:24 Starting - Starting the training job...
2025-02-18 03:27:37 Starting - Preparing the instances for training...
2025-02-18 03:28:04 Downloading - Downloading input data...
2025-02-18 03:28:44 Downloading - Downloading the training image...
2025-02-18 03:29:20 Training - Training image download completed. Training in progress..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value auc,aucpr,logloss,error to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined del

## Model Evaluation

In [36]:
# Name of the training job most recently attached to the estimator
training_job_name = xgb.latest_training_job.name

# Look up the job description and take its list of final metric values
training_job_info = sess.describe_training_job(training_job_name)
metrics = training_job_info["FinalMetricDataList"]

# One "name: value" line per reported metric
for entry in metrics:
    print(f"{entry['MetricName']}: {entry['Value']}")


validation:aucpr: 0.8571680188179016
validation:logloss: 0.48814401030540466
train:error: 0.20014500617980957
validation:auc: 0.8390949964523315
train:auc: 0.8919510245323181
validation:error: 0.2503359913825989
train:logloss: 0.41408899426460266
train:aucpr: 0.9076799750328064


## Registering Model

In [38]:
# Pair the trained artifact with its container image so it can be registered
model = Model(
    model_data=xgb.model_data,
    image_uri=image_uri,
    sagemaker_session=sess,
    role=role,
)

# Add a version to the "retain-ai-xgboost" model package group, already marked Approved
model_package = model.register(
    approval_status="Approved",
    model_package_group_name="retain-ai-xgboost",
)

print(f"Model registered: {model_package.model_package_arn}")

Model registered: arn:aws:sagemaker:us-east-1:203012117619:model-package/retain-ai-xgboost/1


## Deploying Endpoint

In [49]:
# Define model S3 path
# CAVEAT: the training-job name below belongs to one specific run and may need to be
# adjusted when running your notebook. The bucket comes from the session (as it does
# for the other S3 paths in this notebook) instead of being hard-coded.
deployed_training_job = "sagemaker-xgboost-2025-02-18-03-27-20-877"
model_url = (
    f"s3://{bucket}/aai-540-group-3-final-project/data/model/output/"
    f"{deployed_training_job}/output/model.tar.gz"
)

# Create SageMaker Model
model = Model(
    image_uri=image_uri,
    model_data=model_url,
    role=role,
    sagemaker_session=sess
)

# Define an endpoint name
endpoint_name = "retain-ai-xgb-endpoint"

# Enable data capture for bias monitoring
data_capture_config = DataCaptureConfig(
    enable_capture=True,
    sampling_percentage=100,  # Capture 100% of requests
    destination_s3_uri=f"s3://{bucket}/aai-540-group-3-final-project/data/monitoring"
)

# Deploy model with data capture
model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name,
    data_capture_config=data_capture_config
)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2025-02-18-03-40-41-931
INFO:sagemaker:Creating endpoint-config with name retain-ai-xgb-endpoint
INFO:sagemaker:Creating endpoint with name retain-ai-xgb-endpoint


-----!

In [None]:
# Client-side handle for invoking the deployed endpoint; requests are serialized as CSV
predictor = Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,  # the notebook's Session object is named `sess`; `session` is never defined
    serializer=CSVSerializer()
)

In [None]:
# Report the endpoint name used for the deployment
print(f"Model deployed at endpoint: {endpoint_name}")

## Spinning Down Endpoint (Optional)

In [None]:
# Delete the endpoint
# `sm` is the boto3 SageMaker client created in the setup cell; `sagemaker_client` is never defined.
# NOTE(review): only the endpoint is deleted here; the endpoint config and model created by
# deploy() are left in place — confirm whether they should be removed as well.
sm.delete_endpoint(EndpointName=endpoint_name)
print(f"✅ Endpoint '{endpoint_name}' has been deleted.")

## Deleting Local Files
Removes the locally generated CSV files as clean-up before pushing to our repository.

In [None]:
import os

def delete_csv_files(*filenames):
    """Delete the given local CSV files and print the outcome for each one.

    Args:
        *filenames: Paths of the files to remove. A path is deleted only if it
            ends with ``.csv`` and exists on disk.

    Returns:
        None. One status line is printed per path. If no paths are passed, a
        usage hint is printed and nothing is deleted.
    """
    if not filenames:
        # The guard fires only for an empty call, so the hint asks for at least one file.
        print("Please specify at least one CSV file to delete.")
        return

    for file in filenames:
        if file.endswith(".csv") and os.path.exists(file):
            try:
                os.remove(file)
                print(f"Deleted: {file}")
            except OSError as e:
                # Report the failure and continue with the remaining files
                print(f"Error deleting {file}: {e}")
        else:
            print(f"File not found or not a CSV: {file}")

# Remove the two split files written locally earlier in the notebook
local_split_files = ("training_data.csv", "testing_data.csv")
delete_csv_files(*local_split_files)