# Setup
Upgrade pip and install the required libraries.

In [None]:
pip install --upgrade pip

In [None]:
%pip install boto3 botocore sagemaker numpy pandas pyyaml python-dateutil

In [None]:
# import libraries
import boto3, sagemaker
import numpy as np
import pandas as pd
import time
import os
from invst_utility import *
from sagemaker.feature_store.feature_group import ( FeatureGroup )

# Create the SageMaker session that the rest of the notebook operates in.
session = sagemaker.Session()
# IAM role this notebook runs as; it is handed to the training job later on.
role = sagemaker.get_execution_role()
print("Execution role :",role)
bucket = session.default_bucket()
print("Bucket:", bucket)
region = session.boto_region_name
print("Region:", region)

# To use a bucket other than the session default, assign `bucket` here.

# using XGBoost as example
# NOTE(review): "latest" is a version alias and may not resolve to the newest
# XGBoost release — confirm against the SageMaker image URI documentation
# before relying on a specific XGBoost version.
xgboost_container = sagemaker.image_uris.retrieve("xgboost", region, "latest")
xgboost_container

# Access feature group data via Athena.  
## Feature group data reside in the Central Feature Store account. Metadata and permissions reside in the Central Governance account.


In [None]:
import awswrangler as wr
sts_client = boto3.client('sts')

# Assume the Athena consumer role by passing its ARN and a session name to STS.
# NOTE(review): `${AWS::AccountId}` is a template placeholder — presumably it
# is substituted when the notebook is deployed; otherwise replace it with the
# real account ID before running this cell.
assumed_role_object=sts_client.assume_role(
    RoleArn="arn:aws:iam::${AWS::AccountId}:role/AthenaConsumerAssumeRole",
    RoleSessionName="AssumeRoleSession1"
)

# The response carries temporary credentials for the assumed role; they are
# used for the Athena call below.
credentials=assumed_role_object['Credentials']


boto3_session = boto3.Session(aws_access_key_id=credentials['AccessKeyId'], 
                           aws_secret_access_key=credentials['SecretAccessKey'], 
                           aws_session_token=credentials['SessionToken'], region_name="us-east-1")

# Replace fg_bank_marketing_1717511943 with the table name of your own feature group.
query='SELECT * FROM "rl_centralfeaturestore"."fg_bank_marketing_1717511943"'

try:
    # Retrieve the feature group data from Amazon Athena as a DataFrame,
    # using the assumed-role session.
    athena_results_df = wr.athena.read_sql_query(
        query,
        database='rl_centralfeaturestore',
        boto3_session=boto3_session,
        ctas_approach=False,
        keep_files=False
    )

    print("Query completed, data retrieved successfully!")
except Exception as e:
    print(f"Something went wrong... the error is:{e}")
    # A bare `raise` re-raises the original exception with its own type and
    # traceback instead of wrapping it in a generic Exception.
    raise


In [None]:
# Bind the query result to the name used by the data-preparation cells.
# This is a plain assignment: both names refer to the same DataFrame object.
model_data = athena_results_df

In [None]:
# Display the DataFrame as the cell output for a visual check.
model_data

In [None]:
# Show the dtype of every column.
model_data.dtypes

Verify data types

# Prepare data for training
- One-hot encode the categorical columns.
- Convert boolean and object-type columns into 1/0.

In [None]:
# Feature prep - drop the columns that are not used as model features
# (presumably record timestamps, the partition key and the customer id).
# The frame is derived from `athena_results_df` rather than from `model_data`
# itself, so re-running this cell does not fail with a KeyError on columns
# that an earlier run already removed.
model_data = athena_results_df.drop(
    labels=[
        "write_time",
        "eventtime",
        "api_invocation_time",
        "customerid",
        "partition_0",
    ],
    axis="columns",
)

# One hot encode categorical variables
model_data = pd.get_dummies(model_data)

In [None]:
# Convert every boolean column (True/False) to integer 1/0 in one pass.
bool_cols = model_data.select_dtypes(include=["bool"]).columns
model_data = model_data.astype({flag_col: int for flag_col in bool_cols})

model_data.head()

In [None]:
# Build the target column "y_yes" and move it to the first position, which is
# where XGBoost expects the label in CSV training data.
# NOTE(review): the labels are drawn at random (1 with probability 0.64,
# 0 with probability 0.36); they are not read from the feature group, and any
# existing "y_yes" column is overwritten. Confirm that a placeholder target is
# what is intended here.
y_yes = [1, 0]
# Fixed seed so that Restart & Run All reproduces the same labels.
label_rng = np.random.default_rng(1729)
# One vectorized draw for all rows instead of one call per row.
model_data["y_yes"] = label_rng.choice(y_yes, size=len(model_data), p=[0.64, 0.36])
predict_col = model_data.pop("y_yes")
model_data.insert(0, "y_yes", predict_col)
model_data.head()

Split the data for training, validation and testing

In [None]:
# Split the data into train (70%), validation (20%) and test (10%).
# Rows are shuffled once with a fixed seed and then sliced positionally.
# Plain `iloc` slicing is used instead of `np.split`, which relies on the
# deprecated `DataFrame.swapaxes` when given a DataFrame.
shuffled_data = model_data.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(model_data))
val_end = int(0.9 * len(model_data))

train_data = shuffled_data.iloc[:train_end]
val_data = shuffled_data.iloc[train_end:val_end]
test_data = shuffled_data.iloc[val_end:]

Upload the data to the S3 bucket for training

In [None]:
# Local staging directories and the S3 prefix the CSV files are uploaded under.
base_dest= "./data/"
s3_output_bucket = "s3://"+bucket+"/query_results/"
s3_path="ml-workshop-module4"
train_path = base_dest +"train"
val_path = base_dest + "validation"
test_path = base_dest + "test"

boto3.set_stream_logger("boto3.resources", boto3.logging.INFO)
boto3.setup_default_session(region_name=region)

# Create each directory independently. `exist_ok=True` makes a re-run safe and
# ensures that one pre-existing directory does not stop the others from being
# created.
for local_dir in (train_path, val_path, test_path):
    os.makedirs(local_dir, exist_ok=True)

# Write the splits without index and without a header row.
train_data.to_csv(train_path + "/train.csv", index=False, header=False)
val_data.to_csv(val_path + "/validation.csv", index=False, header=False)
test_data.to_csv(test_path + "/test.csv", index=False, header=False)

# (local file, S3 key) pairs. S3 keys always use "/" as separator, so they are
# built with string formatting rather than os.path.join.
uploads = [
    (train_path + "/train.csv", f"{s3_path}/train/train.csv"),
    (val_path + "/validation.csv", f"{s3_path}/validation/validation.csv"),
    (test_path + "/test.csv", f"{s3_path}/test/test.csv"),
]

s3_bucket_resource = boto3.Session().resource('s3').Bucket(bucket)
try:
    for local_file, s3_key in uploads:
        s3_bucket_resource.Object(s3_key).upload_file(local_file)
except Exception as e:
    print(e)
    # Training reads these objects from S3, so a failed upload must stop the
    # notebook here instead of surfacing later as a confusing training error.
    raise

# Training inputs pointing at the uploaded prefixes.
s3_input_train = sagemaker.inputs.TrainingInput(s3_data=f"s3://{bucket}/{s3_path}/train", content_type='csv')
s3_input_val = sagemaker.inputs.TrainingInput(s3_data=f"s3://{bucket}/{s3_path}/validation", content_type='csv')
s3_input_test = sagemaker.inputs.TrainingInput(s3_data=f"s3://{bucket}/{s3_path}/test", content_type='csv')

# Model training

In [None]:
# Look up the AWS account ID of the credentials this notebook runs with.
sts_client = boto3.client("sts")
caller_identity = sts_client.get_caller_identity()
accountId = caller_identity["Account"]

# From here on `bucket` no longer refers to the session default bucket
# assigned earlier; the training output path is built from this value.
bucket = f"sagemaker-{accountId}-mlops"
bucket

In [None]:
# Configure the XGBoost training job. Replace output_kms_key with your own
# KMS key ARN before running.
# NOTE(review): the placeholder ARN below ends in a doubled "}" — replace the
# whole value rather than editing it in place.
xgb = sagemaker.estimator.Estimator(
    xgboost_container,
    role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_kms_key="arn:aws:kms:us-east-1:${AWS::AccountId}:key/{replace-key}}",  # replace output_kms_key
    output_path='s3://{}/{}/output'.format(bucket, s3_path),
    sagemaker_session=session,
)

# Hyperparameters for a binary classifier trained for 100 boosting rounds.
training_hyperparameters = dict(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    silent=0,
    objective='binary:logistic',
    num_round=100,
)
xgb.set_hyperparameters(**training_hyperparameters)

# Start training with the train and validation channels.
training_channels = {'train': s3_input_train, "validation": s3_input_val}
xgb.fit(training_channels)


In [None]:
# Fetch the description of the most recent training job started by `xgb`
# and display it as the cell output.
job_details = xgb.latest_training_job.describe()
job_details

In [None]:
# S3 location of the trained model artifact, read from the job description.
model_s3 = job_details['ModelArtifacts']['S3ModelArtifacts']
model_s3

# Register trained model
Register the model in the model registry.

In [None]:
# SageMaker client used for both Model Registry calls in this cell.
registry = boto3.client('sagemaker')

# Step 1: create a model package group. The current epoch time (rounded to
# seconds) is appended to the group name.
model_package_group_name = "module4-" + str(round(time.time()))
model_package_group_input_dict = dict(
    ModelPackageGroupName=model_package_group_name,
    ModelPackageGroupDescription="Module 4 model package group",
)
create_model_package_group_response = registry.create_model_package_group(
    **model_package_group_input_dict
)
print('ModelPackageGroup Arn : {}'.format(create_model_package_group_response['ModelPackageGroupArn']))

# Step 2: register the trained model as a version in that group.
model_url = model_s3

# The inference container pairs the XGBoost image with the model artifact.
inference_container = {
    "Image": xgboost_container,
    "ModelDataUrl": model_url,
}
modelpackage_inference_specification = {
    "InferenceSpecification": {
        "Containers": [inference_container],
        "SupportedContentTypes": ["text/csv"],
        "SupportedResponseMIMETypes": ["text/csv"],
    }
}

# The version is created with status PendingManualApproval.
create_model_package_input_dict = {
    "ModelPackageGroupName": model_package_group_name,
    "ModelPackageDescription": "Bank loan default",
    "ModelApprovalStatus": "PendingManualApproval",
    **modelpackage_inference_specification,
}

create_model_package_response = registry.create_model_package(**create_model_package_input_dict)
model_package_arn = create_model_package_response["ModelPackageArn"]
print('ModelPackage Version ARN : {}'.format(model_package_arn))