# ML Model Training
This notebook retrieves the data from the feature store and trains an ML model using this data. The model is then deployed as a SageMaker endpoint. The model predicts a hotel cluster based on user characteristics. 

## Imports

In [None]:
from sagemaker.serializers import CSVSerializer
from sagemaker.inputs import TrainingInput
from sagemaker.predictor import Predictor
from sagemaker import get_execution_role
import concurrent.futures
from pathlib import Path
import pandas as pd
import numpy as np
import sagemaker
import logging
import psutil
import boto3
import json
import time
import sys
import os

In [None]:
# Make the project's shared `utils` module importable: it lives in the `utils`
# directory that sits next to this notebook's working directory.
path = Path(os.path.abspath(os.getcwd()))
package_dir = f'{str(path.parent)}/utils'
print(package_dir)
# Register the absolute directory (rather than the relative '../utils') so the
# entry on sys.path stays valid even if the working directory changes later.
sys.path.insert(0, package_dir)
import utils

In [None]:
# install PyAthena if not already installed
import pip
def import_or_install(package):
    """Import ``package``; if it is missing, install it with pip and return.

    Args:
        package: a module name, optionally followed by a pip requirement
            suffix such as a version specifier (for example
            ``"pyathena==2.3.2"``). The full string is what gets handed to
            ``pip install``; only the leading name is used for the import check.

    Raises:
        subprocess.CalledProcessError: if the ``pip install`` command fails.
    """
    import importlib
    import re
    import subprocess
    import sys

    # A requirement string such as "pyathena==2.3.2" is not an importable module
    # name, so strip everything from the first specifier/extras/marker character.
    module_name = re.split(r"[=<>!~\[;\s]", package.strip(), maxsplit=1)[0]
    try:
        importlib.import_module(module_name)
    except ImportError:
        # pip.main() is not part of pip's public API (and is absent in pip >= 10);
        # invoking pip through the running interpreter also guarantees that the
        # package lands in this kernel's environment.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        # let the import system notice the freshly installed package
        importlib.invalidate_caches()
# make sure the pinned PyAthena release is available before importing from it
import_or_install("pyathena==2.3.2")
# `connect` is used later in this notebook to open the Athena connection
from pyathena import connect

## Setup Logging

In [None]:
# Logger for this notebook, named after the running module via __name__
# (the original passed the literal string '__name__' instead).
logger = logging.getLogger(__name__)
# Comma-separated record layout: timestamp, file, function, line, level, process id, message.
logging.basicConfig(format="%(asctime)s,%(filename)s,%(funcName)s,%(lineno)s,%(levelname)s,p%(process)s,%(message)s", level=logging.INFO)
# Record the library versions in use so that a run can be reproduced later.
logger.info(f'Using SageMaker version: {sagemaker.__version__}')
logger.info(f'Using Pandas version: {pd.__version__}')

## Global Constants

In [None]:
# global constants
# name of the CloudFormation stack whose outputs and parameters configure this notebook
STACK_NAME = "expedia-feature-store-demo-v2"
# seed used when shuffling the dataset ahead of the train/validation/test split
RANDOM_STATE = 123
# key prefix under which the datasets are uploaded to the data bucket
S3_DATA_DIR = "data"
# local directory into which the test dataset CSV is written
LOCAL_DATA_DIR = "../data"
# region passed to the Athena connection
REGION = "us-east-1"
# Athena database that holds the feature group tables joined by the training query
AWS_FEATURE_STORE_DATABASE = "sagemaker_featurestore"
# value passed as the XGBoost `num_round` hyperparameter
ML_MODEL_TRAINING_ROUNDS = 100

## Setup Config Variables
Read the config variables used by this notebook from the cloud formation outputs and parameters.

In [None]:
# read output variables from cloud formation stack, these will be used as parameters throughout
# the code
# bucket that receives the train/validation/test datasets and the model artifacts
data_bucket_name = utils.get_cfn_stack_outputs(STACK_NAME, 'DataBucketName')
# NOTE(review): in the cells visible here model_bucket_name is only logged, never used — confirm
model_bucket_name = utils.get_cfn_stack_outputs(STACK_NAME, 'MLModelBucketName')
# bucket used as the staging location for Athena query results
athena_query_results_bucket_name = utils.get_cfn_stack_outputs(STACK_NAME, 'AthenaQueryResultsBucketName')
feature_store_bucket_name = utils.get_cfn_stack_outputs(STACK_NAME, 'FeatureStoreBucketName')
hotel_cluster_prediction_fn_arn = utils.get_cfn_stack_outputs(STACK_NAME, 'HotelClusterPredictionFunction')

# log everything that was read so a misconfigured stack is easy to spot
logger.info(f"data_bucket_name={data_bucket_name},\nathena_query_results_bucket_name={athena_query_results_bucket_name},\n"
            f"model_bucket_name={model_bucket_name}\nfeature_store_bucket_name={feature_store_bucket_name},\n"
            f"hotel_cluster_prediction_fn_arn={hotel_cluster_prediction_fn_arn}\n")

In [None]:
# Feature group names and their Athena table names are read via utils.read_param;
# everything else comes from the CloudFormation stack parameters.
customer_inputs_fg_name = utils.read_param("customer_inputs_fg_name")
destinations_fg_name = utils.read_param("destinations_fg_name")
customer_inputs_fg_table = utils.read_param("customer_inputs_fg_table")
destinations_fg_table = utils.read_param("destinations_fg_table")
raw_data_dir = utils.get_cfn_stack_parameters(STACK_NAME, 'RawDataDir')
app_name = utils.get_cfn_stack_parameters(STACK_NAME, 'AppName')

# file names (S3 object names) of the three dataset splits
training_dataset_fname = utils.get_cfn_stack_parameters(STACK_NAME, 'TrainingDatasetFileName')
test_dataset_fname = utils.get_cfn_stack_parameters(STACK_NAME, 'TestDatasetFileName')
validation_dataset_fname = utils.get_cfn_stack_parameters(STACK_NAME, 'ValidationDatasetFileName')

# training job sizing; fall back to a default instance type when the stack does not define one
training_job_instance_type = utils.get_cfn_stack_parameters(STACK_NAME, 'TrainingJobInstanceType')
if training_job_instance_type is None:
    training_job_instance_type = "ml.m5.xlarge"
# NOTE(review): unlike the instance type, the counts have no fallback, so int() raises
# if the parameter lookup returns None — confirm these parameters are always present
training_job_instance_count = int(utils.get_cfn_stack_parameters(STACK_NAME, 'TrainingJobNodeInstanceCount'))

# endpoint sizing
model_ep_instance_type = utils.get_cfn_stack_parameters(STACK_NAME, 'ModelEndpointInstanceType')
model_ep_instance_count = int(utils.get_cfn_stack_parameters(STACK_NAME, 'ModelEndpointInstanceCount'))

customer_input_stream_name = utils.get_cfn_stack_parameters(STACK_NAME, 'CustomerInputStreamName')

# log every value that was read (the stray '-' that used to precede the validation
# file name in this message has been removed)
logger.info(f"customer_inputs_fg_table={customer_inputs_fg_table},\ndestinations_fg_table={destinations_fg_table},\n"
            f"customer_inputs_fg_name={customer_inputs_fg_name},\ndestinations_fg_name={destinations_fg_name}\n"
            f"raw_data_dir={raw_data_dir},\ntraining_dataset_fname={training_dataset_fname},\n"
            f"test_dataset_fname={test_dataset_fname},\nvalidation_dataset_fname={validation_dataset_fname}\n"
            f"training_job_instance_type={training_job_instance_type},\ntraining_job_instance_count={training_job_instance_count},\n"
            f"model_ep_instance_type={model_ep_instance_type},\nmodel_ep_instance_count={model_ep_instance_count},\ncustomer_input_stream_name={customer_input_stream_name}")

## Retrieve training data from the offline feature stores

At this point the data needed for training the model exists in two separate feature groups, the customer inputs feature group and the destinations feature group. We will use Athena to run a SQL query to join the data and then read the results into a Pandas dataframe. We want to use Athena to do the heavy lifting of joining the large datasets rather than joining it here in this notebook.

In [None]:
# IAM role of this notebook; handed to the training job estimator later on
role = get_execution_role()
# region configured for the default boto3 session; all clients below are created in it
region = boto3.Session().region_name
boto_session = boto3.Session(region_name=region)

# SageMaker control plane client
sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)

# feature store runtime client
featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)

# AWS account id of the caller, resolved through STS
account_id = boto3.client('sts').get_caller_identity()["Account"]

# SageMaker session wired to the feature store runtime client created above
feature_store_session = sagemaker.Session(boto_session=boto_session, 
                                          sagemaker_client=sagemaker_client, 
                                          sagemaker_featurestore_runtime_client=featurestore_runtime)

logger.info(f"role={role}, region={region}, account_id={account_id}")

In [None]:
# default SageMaker session; passed to the XGBoost estimator later in this notebook
sagemaker_session = sagemaker.Session()

Set up the SQL query for the join. We join the destination principal components with the customer inputs. We leave out feature store bookkeeping columns such as is_deleted and api_invocation_time that are not needed during model training; note that user_id is still selected by the query below.

Note the "sagemaker_featurestore" that is the default database in which AWS keeps the feature store data.

In [None]:
# Athena SQL that left-joins the customer inputs table (alias L) with the destinations
# table (alias R) on srch_destination_id. Only the target (hotel_cluster), the customer
# input columns listed below and the destination components pc1..pc3 are selected.
# Database and table names are interpolated from the constants/parameters read above.
query_string = f"""
select 
    L.hotel_cluster,
    L.site_name,
    L.posa_continent,
    L.user_location_country,
    L.user_location_region,
    L.user_location_city,
    L.orig_destination_distance,
    L.user_id,
    L.is_mobile,
    L.is_package,
    L.channel,
    L.srch_adults_cnt,
    L.srch_children_cnt,
    L.srch_rm_cnt,
    L.srch_destination_id,
    L.srch_destination_type_id,
    L.hotel_continent,
    L.hotel_country,
    L.hotel_market,
    L.duration,
    L.days_to_trip,
    L.start_of_trip_weekend,
    L.end_of_trip_weekend,
    R.pc1,
    R.pc2,
    R.pc3
from (
        "{AWS_FEATURE_STORE_DATABASE}"."{customer_inputs_fg_table}" as L
        left join "{AWS_FEATURE_STORE_DATABASE}"."{destinations_fg_table}" as R on L.srch_destination_id = R.srch_destination_id
    )
"""

In [None]:
logger.info(f"going to run the following query using Athena -> {query_string}")
# s3_staging_dir points at the Athena query results bucket read from the stack outputs.
# NOTE(review): the connection uses the hard-coded REGION constant, whereas the boto3 and
# SageMaker clients in this notebook use the session's region — confirm the two always match.
conn = connect(s3_staging_dir=f's3://{athena_query_results_bucket_name}/',
               region_name=REGION)

# run the join in Athena and load the result set into a pandas dataframe
df = pd.read_sql(query_string, conn)
logger.info(f"results of the query are in a dataframe of shape {df.shape}")
df.head()

## ML model training

At this point we are ready for ML model training. We have already excluded features we did not need for training from the Athena query so no further data preparation is required. 

We do a train/validation/test split and store the three datasets in S3. The SageMaker ML model training job will retrieve the data directly from S3. We use an XGBoost container for training this model. The model is a multi-class classification model with the hotel cluster being the target variable. All data is already available in numeric form as needed by XGBoost.

In [None]:
# list the data types of each feature
# NOTE(review): this comment used to say that some int features would be converted to
# categorical (object), but no such conversion appears in the cells visible here — confirm
df.dtypes

SageMaker xgboost requires that the target column be the first column in the dataframe.

In [None]:
# rearrange columns by extracting the target column (hotel_cluster) and then adding it as the first column
# in the dataframe
# pop() removes the column from df in place and returns it ...
first_column = df.pop('hotel_cluster')
# ... and insert() puts it back at position 0, so df is modified in place
df.insert(0, 'hotel_cluster', first_column)
df.head()

In [None]:
# print a summary of the dataframe after the column reordering
df.info()

In [None]:
# number of distinct values of the target; used as the XGBoost `num_class` hyperparameter later on
num_hotel_clusters = len(df.hotel_cluster.unique())
logger.info(f"there are {num_hotel_clusters} unique hotel clusters in the data")

In [None]:
# train/validation/test split (70% / 20% / 10%)
# df.sample with frac=1 simply shuffles the whole dataset (reproducibly, thanks to the seed).
# The shuffled frame is then cut at the 70% and 90% marks: rows [0, a) form the training set,
# [a, b) the validation set and [b, end) the test set. These are the same pieces that
# np.split(df_shuffled, [a, b]) yields, but positional slicing does not depend on numpy being
# able to split a DataFrame (which relies on the deprecated DataFrame.swapaxes).
df_shuffled = df.sample(frac=1, random_state=RANDOM_STATE)
train_end = int(.7*len(df))
validation_end = int(.9*len(df))
df_train = df_shuffled.iloc[:train_end]
df_validation = df_shuffled.iloc[train_end:validation_end]
df_test = df_shuffled.iloc[validation_end:]

In [None]:
# sanity check: the three shapes should add up to the full dataset
logger.info(f"shape of df_train={df_train.shape}, df_validation={df_validation.shape}, df_test={df_test.shape}")

In [None]:
# Save a copy of the test split locally; it is later replayed as streaming data when
# testing real-time inference. The target and the pc1/2/3 columns are left out of the
# file since the latter will be retrieved from the online feature store.
cols_to_be_excluded = ['hotel_cluster', 'pc1', 'pc2', 'pc3']
os.makedirs(LOCAL_DATA_DIR, exist_ok=True)
fpath = os.path.join(LOCAL_DATA_DIR, test_dataset_fname)
# errors='ignore' mirrors the original column mask, which tolerated absent columns
df_test.drop(columns=cols_to_be_excluded, errors='ignore').to_csv(fpath, index=False)

In [None]:
# upload dataframe to S3
# each split goes to <S3_DATA_DIR>/<app_name>/<file name> in the data bucket; the file
# names come from the CloudFormation stack parameters
utils.upload_df_to_s3(df_train, data_bucket_name, f'{S3_DATA_DIR}/{app_name}/{training_dataset_fname}')
utils.upload_df_to_s3(df_validation, data_bucket_name, f'{S3_DATA_DIR}/{app_name}/{validation_dataset_fname}')
utils.upload_df_to_s3(df_test, data_bucket_name, f'{S3_DATA_DIR}/{app_name}/{test_dataset_fname}')

In [None]:
# NOTE(review): the same shapes were already logged right after the split — this cell looks redundant
logger.info(f"df_train shape={df_train.shape}, df_validation shape={df_validation.shape}, df_test shape={df_test.shape}")

In [None]:
# preview the first rows of each split, in train / validation / test order
display(df_train.head(), df_validation.head(), df_test.head())

In [None]:
# S3 URIs of the three uploaded splits. The object names are taken from the same stack
# parameters that the upload step uses, so the training job always reads exactly the
# files that were written (the previously hard-coded train.csv / validation.csv /
# test.csv only matched when the stack used those exact names).
train_set_location = f's3://{data_bucket_name}/{S3_DATA_DIR}/{app_name}/{training_dataset_fname}'
validation_set_location = f's3://{data_bucket_name}/{S3_DATA_DIR}/{app_name}/{validation_dataset_fname}'
test_set_location = f's3://{data_bucket_name}/{S3_DATA_DIR}/{app_name}/{test_dataset_fname}'

# wrap each location in a TrainingInput so it can be handed to the estimator as a CSV channel
train_set_pointer = TrainingInput(s3_data=train_set_location, content_type='csv')
validation_set_pointer = TrainingInput(s3_data=validation_set_location, content_type='csv')
test_set_pointer = TrainingInput(s3_data=test_set_location, content_type='csv')
logger.info(f"train_set_pointer -> {json.dumps(train_set_pointer.__dict__, indent=2)},\n"
            f"validation_set_pointer -> {json.dumps(validation_set_pointer.__dict__, indent=2)},\n"
            f"test_set_pointer -> {json.dumps(test_set_pointer.__dict__, indent=2)}")

In [None]:
# a short sleep to make sure that files got uploaded to S3
# NOTE(review): if utils.upload_df_to_s3 returns only after the upload has completed,
# this fixed wait is probably unnecessary — confirm before removing
time.sleep(10)

In [None]:
# URI of the SageMaker XGBoost training image for the session's region
container_uri = sagemaker.image_uris.retrieve(region=region,
                                              framework='xgboost',
                                              version='1.0-1',
                                              image_scope='training')
# underscores in the app name are replaced with hyphens for the job name prefix
job_name = f"{app_name.replace('_', '-')}-ml-model"
# NOTE(review): model artifacts are written to the data bucket, not to model_bucket_name — confirm intended
xgb = sagemaker.estimator.Estimator(image_uri=container_uri,
                                    role=role,
                                    instance_count=training_job_instance_count,
                                    instance_type=training_job_instance_type,
                                    output_path=f's3://{data_bucket_name}/{app_name}/model-artifacts',
                                    sagemaker_session=sagemaker_session,
                                    base_job_name=job_name)

# multi:softmax requires every label to lie in [0, num_class). The number of distinct
# labels only satisfies that when the cluster ids are contiguous and start at 0, so also
# take the largest label into account; for contiguous 0-based ids the value is unchanged.
num_class = max(num_hotel_clusters, int(df.hotel_cluster.max()) + 1)
xgb.set_hyperparameters(objective='multi:softmax',
                        num_class=num_class,
                        num_round=ML_MODEL_TRAINING_ROUNDS)
# train on the 'train' channel and evaluate on the 'validation' channel
xgb.fit({'train': train_set_pointer, 'validation': validation_set_pointer})

In [None]:
# Persist the name of the training job that just ran; the ML lineage module reads it back
training_job_info = xgb.latest_training_job.describe()
if training_job_info is not None:
    training_job_name = training_job_info["TrainingJobName"]
    utils.write_param("training_job_name", training_job_name)

## Host the trained model as a SageMaker Endpoint

In [None]:
logger.info(f"going to deploy the trained model to model_ep_instance_type={model_ep_instance_type}, model_ep_instance_count={model_ep_instance_count}")
# deploy the trained estimator as an endpoint, sized by the stack parameters read earlier
xgb_predictor = xgb.deploy(initial_instance_count=model_ep_instance_count,
                           instance_type=model_ep_instance_type)

In [None]:
# the model will be accepting csv as input
csv_serializer = CSVSerializer()

# store the endpoint in a filename for next stage (lineage tracking)
endpoint_name = xgb_predictor.endpoint_name
utils.write_param("endpoint_name", endpoint_name)

# setup the predictor endpoint: a Predictor bound to the deployed endpoint that serializes
# its inputs as CSV; the inference helper defined later reads this module-level `predictor`
predictor = Predictor(endpoint_name=endpoint_name, 
                      serializer=csv_serializer)


## Batch inference

Use Python multiprocessing to get inference for the entire dataframe: first split it into as many dataframes as there are cores on this machine, then get predictions one row at a time for each dataframe.

In [None]:
# handy function for getting inference
def get_inference(df):
    """Return the endpoint's predicted class for every row of ``df``.

    The first column of ``df`` is treated as the target and is not sent to the
    endpoint; the remaining values of each row are passed, one row per request,
    to the module-level ``predictor``.

    Args:
        df: dataframe whose first column is the target and whose remaining
            columns are the model features.

    Returns:
        list of int: one prediction per row, in row order.
    """
    def _predict_one(row):
        # element 0 is the target, so only the values after it are features
        response = predictor.predict(row[1:])
        # the prediction comes back as a float rendered as text ("64.0" for 64), which
        # int() cannot parse directly, hence the detour through float()
        return int(float(response.decode('utf-8')))

    return [_predict_one(row) for row in df.values]

In [None]:
# Number of worker processes: the physical core count. psutil returns None when it cannot
# determine it, so fall back to the logical count and finally to a single worker.
num_procs = psutil.cpu_count(logical=False) or os.cpu_count() or 1
logger.info(f"num_procs={num_procs}")

# If an earlier run already added the prediction column, leave it out: otherwise it would
# be sent to the endpoint as an extra feature.
df_test_inputs = df_test.drop(columns=['hotel_cluster_predicted'], errors='ignore')

# split the dataframe into as many contiguous parts as there are cores on this instance
# (the row positions are split rather than the frame itself, so this does not depend on
# numpy being able to split a DataFrame)
row_chunks = np.array_split(np.arange(len(df_test_inputs)), num_procs)
df_splitted = [df_test_inputs.iloc[rows] for rows in row_chunks]

# list for holding predictions for each dataframe
y_hat_list = []

start = time.time()
# setup parallel predictions for each dataframe
with concurrent.futures.ProcessPoolExecutor(max_workers=num_procs) as executor:
    results = [executor.submit(get_inference, df=df_part) for df_part in df_splitted]
    # Collect in SUBMISSION order. as_completed() yields futures in whatever order they
    # finish, which would shuffle the chunks and misalign predictions with df_test rows.
    for result in results:
        try:
            y_hat_list.append(result.result())
        except Exception as ex:
            # A missing chunk would leave fewer predictions than rows, so log and stop
            # here instead of failing later with a confusing length mismatch.
            logger.error(str(ex))
            raise
end = time.time()
logger.info(f"PPID {os.getpid()}, all done in {round(end-start,2)}s")

# flatten out the list (we have one list of predicted values per split of the original
# dataframe, so at this point we have a list of lists)
y_hat = [y_hat for y_hat_sublist in y_hat_list for y_hat in y_hat_sublist]


In [None]:
# add the prediction as a new column to the dataframe
# fail early with a clear message if some predictions are missing
if len(y_hat) != len(df_test):
    raise ValueError(f"got {len(y_hat)} predictions for {len(df_test)} test rows")
# df_test is a slice of the shuffled dataset; assign() builds a new frame instead of
# writing into that slice, and re-running this cell simply overwrites the column
df_test = df_test.assign(hotel_cluster_predicted=y_hat)

In [None]:
# count the rows whose predicted cluster equals the actual cluster and report accuracy
is_correct = df_test.hotel_cluster == df_test.hotel_cluster_predicted
correct = is_correct.sum()
logger.info(f"the model predicted {correct} correctly out of {df_test.shape[0]}, accuracy={round(100*(correct/df_test.shape[0]), 2)}%")