# ML Model Training
This notebook retrieves the data from the feature store and trains an ML model using this data. The model is then deployed as a SageMaker endpoint. The model predicts a hotel cluster based on user characterestics. 

## Imports

In [1]:
from sagemaker.serializers import CSVSerializer
from sagemaker.inputs import TrainingInput
from sagemaker.predictor import Predictor
from sagemaker import get_execution_role
import concurrent.futures
from pathlib import Path
import pandas as pd
import numpy as np
import sagemaker
import logging
import psutil
import boto3
import json
import time
import sys
import os

In [2]:
# import from a different path
sys.path.insert(0, '../utils')
path = Path(os.path.abspath(os.getcwd()))
package_dir = f'{str(path.parent)}/utils'
print(package_dir)
import utils

/home/ec2-user/SageMaker/feature-store-expedia/utils


In [3]:
# install PyAthena if not already installed
import pip
def import_or_install(package):
    try:
        __import__(package)
    except ImportError:
        pip.main(['install', package])
import_or_install("pyathena==2.3.2")
from pyathena import connect

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.




## Setup Logging

In [4]:
logger = logging.getLogger('__name__')
logging.basicConfig(format="%(asctime)s,%(filename)s,%(funcName)s,%(lineno)s,%(levelname)s,p%(process)s,%(message)s", level=logging.INFO)       
logger.info(f'Using SageMaker version: {sagemaker.__version__}')
logger.info(f'Using Pandas version: {pd.__version__}')

Using SageMaker version: 2.86.2
Using Pandas version: 1.1.5


## Global Constants

In [5]:
# global constants
STACK_NAME = "expedia-feature-store-demo-v2"
RANDOM_STATE = 123
S3_DATA_DIR = "data"
LOCAL_DATA_DIR = "../data"
REGION = "us-east-1"
AWS_FEATURE_STORE_DATABASE = "sagemaker_featurestore"
ML_MODEL_TRAINING_ROUNDS = 100

## Setup Config Variables
Read the config variables used by this notebook from the cloud formation outputs and parameters.

In [6]:
# read output variables from cloud formation stack, these will be used as parameters throughout
# the code
data_bucket_name = utils.get_cfn_stack_outputs(STACK_NAME, 'DataBucketName')
model_bucket_name = utils.get_cfn_stack_outputs(STACK_NAME, 'MLModelBucketName')
athena_query_results_bucket_name = utils.get_cfn_stack_outputs(STACK_NAME, 'AthenaQueryResultsBucketName')
feature_store_bucket_name = utils.get_cfn_stack_outputs(STACK_NAME, 'FeatureStoreBucketName')
hotel_cluster_prediction_fn_arn = utils.get_cfn_stack_outputs(STACK_NAME, 'HotelClusterPredictionFunction')

logger.info(f"data_bucket_name={data_bucket_name},\nathena_query_results_bucket_name={athena_query_results_bucket_name},\n"
            f"model_bucket_name={model_bucket_name}\nfeature_store_bucket_name={feature_store_bucket_name},\n"
            f"hotel_cluster_prediction_fn_arn={hotel_cluster_prediction_fn_arn}\n")



data_bucket_name=expedia-customer-behavior-data-2345bbc0,
athena_query_results_bucket_name=athena-query-results-2345bbc0,
model_bucket_name=expedia-ml-models-2345bbc0
feature_store_bucket_name=expedia-feature-store-offline-2345bbc0,
hotel_cluster_prediction_fn_arn=arn:aws:lambda:us-east-1:924873211303:function:PredictHotelCluster



In [7]:
# this notebook reads the feature group name from the file in the data folder
customer_inputs_fg_name = utils.read_param("customer_inputs_fg_name")
destinations_fg_name = utils.read_param("destinations_fg_name")
customer_inputs_fg_table = utils.read_param("customer_inputs_fg_table")
destinations_fg_table = utils.read_param("destinations_fg_table")
raw_data_dir = utils.get_cfn_stack_parameters(STACK_NAME, 'RawDataDir')
app_name = utils.get_cfn_stack_parameters(STACK_NAME, 'AppName')

training_dataset_fname = utils.get_cfn_stack_parameters(STACK_NAME, 'TrainingDatasetFileName')
test_dataset_fname = utils.get_cfn_stack_parameters(STACK_NAME, 'TestDatasetFileName')
validation_dataset_fname = utils.get_cfn_stack_parameters(STACK_NAME, 'ValidationDatasetFileName')

training_job_instance_type = utils.get_cfn_stack_parameters(STACK_NAME, 'TrainingJobInstanceType')
if training_job_instance_type is None:
    training_job_instance_type = "ml.m5.xlarge"
training_job_instance_count = int(utils.get_cfn_stack_parameters(STACK_NAME, 'TrainingJobNodeInstanceCount'))

model_ep_instance_type = utils.get_cfn_stack_parameters(STACK_NAME, 'ModelEndpointInstanceType')
model_ep_instance_count = int(utils.get_cfn_stack_parameters(STACK_NAME, 'ModelEndpointInstanceCount'))

customer_input_stream_name = utils.get_cfn_stack_parameters(STACK_NAME, 'CustomerInputStreamName')
            
logger.info(f"customer_inputs_fg_table={customer_inputs_fg_table},\ndestinations_fg_table={destinations_fg_table},\n"
            f"customer_inputs_fg_name={customer_inputs_fg_name},\ndestinations_fg_name={destinations_fg_name}\n"
            f"raw_data_dir={raw_data_dir},\ntraining_dataset_fname={training_dataset_fname},\n"
            f"test_dataset_fname={test_dataset_fname},\nvalidation_dataset_fname=-{validation_dataset_fname}\n"
            f"training_job_instance_type={training_job_instance_type},\ntraining_job_instance_count={training_job_instance_count},\n"
            f"model_ep_instance_type={model_ep_instance_type},\nmodel_ep_instance_count={model_ep_instance_count},\ncustomer_input_stream_name={customer_input_stream_name}")

read_param, fpath=../config/customer_inputs_fg_name, read customer_inputs_fg_name=expedia-customer-inputs-2022-6-24-21-43
read_param, fpath=../config/destinations_fg_name, read destinations_fg_name=expedia-destinations-2022-6-24-21-43
read_param, fpath=../config/customer_inputs_fg_table, read customer_inputs_fg_table=expedia-customer-inputs-2022-6-24-21-43-1656107063
read_param, fpath=../config/destinations_fg_table, read destinations_fg_table=expedia-destinations-2022-6-24-21-43-1656107143
customer_inputs_fg_table=expedia-customer-inputs-2022-6-24-21-43-1656107063,
destinations_fg_table=expedia-destinations-2022-6-24-21-43-1656107143,
customer_inputs_fg_name=expedia-customer-inputs-2022-6-24-21-43,
destinations_fg_name=expedia-destinations-2022-6-24-21-43
raw_data_dir=raw_data,
training_dataset_fname=train.csv,
test_dataset_fname=test.csv,
validation_dataset_fname=-validation.csv
training_job_instance_type=ml.m5.xlarge,
training_job_instance_count=2,
model_ep_instance_type=ml.m5.xlarg

## Retreve training data from the offline feature stores

At this point the data needed for training the model exists in two separate feature groups, the customer inputs feature group and the destinations feature group. We will use Athena to run a SQL query to join the data and then read the results into a Pandas dataframe. We want to use Athena to do the heavy lifting of joining the large datasets rather than joining it here in this notebook.

In [8]:
role = get_execution_role()
region = boto3.Session().region_name
boto_session = boto3.Session(region_name=region)

sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)

featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)

account_id = boto3.client('sts').get_caller_identity()["Account"]

feature_store_session = sagemaker.Session(boto_session=boto_session, 
                                          sagemaker_client=sagemaker_client, 
                                          sagemaker_featurestore_runtime_client=featurestore_runtime)

logger.info(f"role={role}, region={region}, account_id={account_id}")

role=arn:aws:iam::924873211303:role/expedia-feature-store-demo-v2-SageMakerRole-1RP4Q27JNDN68, region=us-east-1, account_id=924873211303


In [9]:
sagemaker_session = sagemaker.Session()

Setup the SQL query for the join. We join the destination principal components with the customer inputs. We exclude out columns such as user_id and is_deleted, api_invocation_time etc that are not needed during model training.

Note the "sagemaker_featurestore" that is the default database in which AWS keeps the feature store data.

In [10]:
query_string = f"""
select 
    L.hotel_cluster,
    L.site_name,
    L.posa_continent,
    L.user_location_country,
    L.user_location_region,
    L.user_location_city,
    L.orig_destination_distance,
    L.user_id,
    L.is_mobile,
    L.is_package,
    L.channel,
    L.srch_adults_cnt,
    L.srch_children_cnt,
    L.srch_rm_cnt,
    L.srch_destination_id,
    L.srch_destination_type_id,
    L.hotel_continent,
    L.hotel_country,
    L.hotel_market,
    L.duration,
    L.days_to_trip,
    L.start_of_trip_weekend,
    L.end_of_trip_weekend,
    R.pc1,
    R.pc2,
    R.pc3
from (
        "{AWS_FEATURE_STORE_DATABASE}"."{customer_inputs_fg_table}" as L
        left join "{AWS_FEATURE_STORE_DATABASE}"."{destinations_fg_table}" as R on L.srch_destination_id = R.srch_destination_id
    )
"""

In [11]:
logger.info(f"going to run the following query using Athena -> {query_string}")
conn = connect(s3_staging_dir=f's3://{athena_query_results_bucket_name}/',
               region_name=REGION)

df = pd.read_sql(query_string, conn)
logger.info(f"results of the query are in a dataframe of shape {df.shape}")
df.head()

going to run the following query using Athena -> 
select 
    L.hotel_cluster,
    L.site_name,
    L.posa_continent,
    L.user_location_country,
    L.user_location_region,
    L.user_location_city,
    L.orig_destination_distance,
    L.user_id,
    L.is_mobile,
    L.is_package,
    L.channel,
    L.srch_adults_cnt,
    L.srch_children_cnt,
    L.srch_rm_cnt,
    L.srch_destination_id,
    L.srch_destination_type_id,
    L.hotel_continent,
    L.hotel_country,
    L.hotel_market,
    L.duration,
    L.days_to_trip,
    L.start_of_trip_weekend,
    L.end_of_trip_weekend,
    R.pc1,
    R.pc2,
    R.pc3
from (
        "sagemaker_featurestore"."expedia-customer-inputs-2022-6-24-21-43-1656107063" as L
        left join "sagemaker_featurestore"."expedia-destinations-2022-6-24-21-43-1656107143" as R on L.srch_destination_id = R.srch_destination_id
    )

results of the query are in a dataframe of shape (19689, 26)


Unnamed: 0,hotel_cluster,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,hotel_continent,hotel_country,hotel_market,duration,days_to_trip,start_of_trip_weekend,end_of_trip_weekend,pc1,pc2,pc3
0,59,2,3,66,351,19310,2038.725,414732,0,0,...,2,50,663,2.0,2.0,0,0,0.815249,-0.211815,0.039287
1,91,2,3,66,321,46490,745.8792,429419,0,0,...,2,50,447,1.0,1.0,0,0,0.230422,-0.246673,-0.011226
2,21,2,3,66,435,5391,4264.8641,590837,0,0,...,6,68,275,2.0,14.0,1,0,1.046715,-0.431724,-0.58026
3,40,2,3,66,348,48862,355.5705,1103512,0,0,...,2,50,620,1.0,2.0,0,0,-0.115928,-0.224723,-0.198649
4,28,2,3,66,448,46588,2424.2759,350179,0,0,...,2,50,1230,1.0,122.0,0,1,1.124057,-0.18195,-0.166327


## ML model training

At this point we are ready for ML model training. We have already excluded features we did not need for training from the Athena query so no further data preparation is required. 

We do a train/validation/test split and store the three datasets in S3. The Sagemaker ML model training job will retrieve the data directly from S3. We use an XGBoost container for training this model. The model is a mlti-class classification model with the hotel cluster being the target variable. All data is already available in numeric form as needed by XGBoost.

In [12]:
# list the data types of each feature, we would be converting some of the int features to categorical (object)
df.dtypes

hotel_cluster                  int64
site_name                      int64
posa_continent                 int64
user_location_country          int64
user_location_region           int64
user_location_city             int64
orig_destination_distance    float64
user_id                       object
is_mobile                      int64
is_package                     int64
channel                        int64
srch_adults_cnt                int64
srch_children_cnt              int64
srch_rm_cnt                    int64
srch_destination_id           object
srch_destination_type_id       int64
hotel_continent                int64
hotel_country                  int64
hotel_market                   int64
duration                     float64
days_to_trip                 float64
start_of_trip_weekend          int64
end_of_trip_weekend            int64
pc1                          float64
pc2                          float64
pc3                          float64
dtype: object

SageMaker xgboost requires that the target column be the first column in the dataframe.

In [13]:
# rearrange columns by extracting the target column (hotel_cluster) and then adding it as the first column
# in the dataframe
first_column = df.pop('hotel_cluster')
df.insert(0, 'hotel_cluster', first_column)
df.head()

Unnamed: 0,hotel_cluster,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,hotel_continent,hotel_country,hotel_market,duration,days_to_trip,start_of_trip_weekend,end_of_trip_weekend,pc1,pc2,pc3
0,59,2,3,66,351,19310,2038.725,414732,0,0,...,2,50,663,2.0,2.0,0,0,0.815249,-0.211815,0.039287
1,91,2,3,66,321,46490,745.8792,429419,0,0,...,2,50,447,1.0,1.0,0,0,0.230422,-0.246673,-0.011226
2,21,2,3,66,435,5391,4264.8641,590837,0,0,...,6,68,275,2.0,14.0,1,0,1.046715,-0.431724,-0.58026
3,40,2,3,66,348,48862,355.5705,1103512,0,0,...,2,50,620,1.0,2.0,0,0,-0.115928,-0.224723,-0.198649
4,28,2,3,66,448,46588,2424.2759,350179,0,0,...,2,50,1230,1.0,122.0,0,1,1.124057,-0.18195,-0.166327


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19689 entries, 0 to 19688
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   hotel_cluster              19689 non-null  int64  
 1   site_name                  19689 non-null  int64  
 2   posa_continent             19689 non-null  int64  
 3   user_location_country      19689 non-null  int64  
 4   user_location_region       19689 non-null  int64  
 5   user_location_city         19689 non-null  int64  
 6   orig_destination_distance  19689 non-null  float64
 7   user_id                    19689 non-null  object 
 8   is_mobile                  19689 non-null  int64  
 9   is_package                 19689 non-null  int64  
 10  channel                    19689 non-null  int64  
 11  srch_adults_cnt            19689 non-null  int64  
 12  srch_children_cnt          19689 non-null  int64  
 13  srch_rm_cnt                19689 non-null  int

In [15]:
num_hotel_clusters = len(df.hotel_cluster.unique())
logger.info(f"there are {num_hotel_clusters} unique hotel clusters in the data")

there are 100 unique hotel clusters in the data


In [16]:
# train/test/validation split
# Note: numpy.split works like this: for the second param (indices_or_sections) when specified as a 1-D list  say [a,b] then first
# split return elements from 0 to a, second split contains a to b and third split contains b to the end of the array being split
# the df.sample with frac=1 is simply shuffling the dataset
df_train, df_validation, df_test = np.split(df.sample(frac=1, random_state=RANDOM_STATE), [int(.7*len(df)), int(.9*len(df))])

In [17]:
logger.info(f"shape of df_train={df_train.shape}, df_validation={df_validation.shape}, df_test={df_test.shape}")

shape of df_train=(13782, 26), df_validation=(3938, 26), df_test=(1969, 26)


In [18]:
# write the df_test to a data folder so that we can use this for streaming data when 
# testing real-time inference
# exclude the pc1/2/3 columns since they will be retrieved from the online feature store
os.makedirs(LOCAL_DATA_DIR, exist_ok=True)
fpath = os.path.join(LOCAL_DATA_DIR, test_dataset_fname)
cols_to_be_excluded = ['hotel_cluster', 'pc1', 'pc2', 'pc3']
df_test.loc[:, ~df_test.columns.isin(cols_to_be_excluded)].to_csv(fpath, index=False)

In [19]:
# upload dataframe to S3
utils.upload_df_to_s3(df_train, data_bucket_name, f'{S3_DATA_DIR}/{app_name}/{training_dataset_fname}')
utils.upload_df_to_s3(df_validation, data_bucket_name, f'{S3_DATA_DIR}/{app_name}/{validation_dataset_fname}')
utils.upload_df_to_s3(df_test, data_bucket_name, f'{S3_DATA_DIR}/{app_name}/{test_dataset_fname}')

upload_df_to_s3, going to upload df of shape=(13782, 26) to s3 object expedia-customer-behavior-data-2345bbc0/data/hotel_cluster_prediction/train.csv
upload_df_to_s3, going to upload df of shape=(3938, 26) to s3 object expedia-customer-behavior-data-2345bbc0/data/hotel_cluster_prediction/validation.csv




upload_df_to_s3, going to upload df of shape=(1969, 26) to s3 object expedia-customer-behavior-data-2345bbc0/data/hotel_cluster_prediction/test.csv


In [20]:
logger.info(f"df_train shape={df_train.shape}, df_validation shape={df_validation.shape}, df_test shape={df_test.shape}")

df_train shape=(13782, 26), df_validation shape=(3938, 26), df_test shape=(1969, 26)


In [21]:
display(df_train.head())
display(df_validation.head())
display(df_test.head())

Unnamed: 0,hotel_cluster,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,hotel_continent,hotel_country,hotel_market,duration,days_to_trip,start_of_trip_weekend,end_of_trip_weekend,pc1,pc2,pc3
5048,59,2,3,66,220,53780,455.0091,1121162,1,1,...,2,50,350,4.0,10.0,0,1,0.758594,-0.269274,0.159704
9928,42,2,3,66,447,17734,285.1325,99349,1,0,...,2,50,630,2.0,18.0,0,0,0.826747,-0.003045,0.622666
16968,28,2,3,66,435,16159,14.9961,213054,1,0,...,2,50,647,3.0,-1.0,0,1,0.726189,-0.388916,-0.059577
2449,48,34,3,205,155,42000,180.8638,783315,0,0,...,2,198,370,1.0,0.0,0,0,0.632124,-0.198417,0.19785
19663,4,2,3,66,174,40365,60.042,1138556,0,0,...,2,50,1230,1.0,4.0,0,1,1.124057,-0.18195,-0.166327


Unnamed: 0,hotel_cluster,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,hotel_continent,hotel_country,hotel_market,duration,days_to_trip,start_of_trip_weekend,end_of_trip_weekend,pc1,pc2,pc3
12080,33,34,3,205,155,8177,73.5084,975356,0,0,...,2,50,435,1.0,14.0,0,0,0.32276,-0.122403,-0.013122
10730,4,2,3,66,174,1112,1859.5847,26182,0,0,...,2,50,637,3.0,70.0,0,1,1.308345,-0.368567,-0.028236
10576,13,34,3,205,155,14703,156.1108,478444,0,0,...,2,50,551,1.0,2.0,0,0,0.310552,-0.032776,0.244337
8357,31,2,3,66,220,13647,228.0965,1050092,0,0,...,4,128,1400,1.0,0.0,0,1,1.333655,1.186722,0.047902
6473,94,2,3,66,447,30932,423.035,1120486,0,0,...,2,50,633,1.0,2.0,1,1,1.26653,-0.136094,0.491362


Unnamed: 0,hotel_cluster,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,hotel_continent,hotel_country,hotel_market,duration,days_to_trip,start_of_trip_weekend,end_of_trip_weekend,pc1,pc2,pc3
8882,98,2,3,66,311,47357,820.7915,203704,0,1,...,2,50,1442,4.0,52.0,0,0,0.632215,0.719063,-0.254314
6942,32,2,3,66,363,12346,421.5762,188748,0,0,...,2,50,640,1.0,54.0,0,0,0.449168,-0.307637,-0.040287
9412,83,2,3,66,448,24848,8165.4505,378594,0,0,...,3,171,61,4.0,29.0,0,0,0.701995,0.001459,-0.224099
18658,57,2,3,215,646,51733,789.4612,191608,0,0,...,4,8,126,3.0,5.0,1,0,1.28445,1.102881,0.088481
899,91,2,3,66,174,24294,1929.5831,186010,0,0,...,2,50,511,2.0,1.0,0,0,0.117765,-0.205437,-0.015675


In [22]:
train_set_location = f's3://{data_bucket_name}/{S3_DATA_DIR}/{app_name}/train.csv'
validation_set_location = f's3://{data_bucket_name}/{S3_DATA_DIR}/{app_name}/validation.csv'
test_set_location = f's3://{data_bucket_name}/{S3_DATA_DIR}/{app_name}/test.csv'

train_set_pointer = TrainingInput(s3_data=train_set_location, content_type='csv')
validation_set_pointer = TrainingInput(s3_data=validation_set_location, content_type='csv')
test_set_pointer = TrainingInput(s3_data=test_set_location, content_type='csv')
logger.info(f"train_set_pointer -> {json.dumps(train_set_pointer.__dict__, indent=2)},\n"
            f"validation_set_pointer -> {json.dumps(validation_set_pointer.__dict__, indent=2)},\n"
            f"test_set_pointer -> {json.dumps(test_set_pointer.__dict__, indent=2)}")

train_set_pointer -> {
  "config": {
    "DataSource": {
      "S3DataSource": {
        "S3DataType": "S3Prefix",
        "S3Uri": "s3://expedia-customer-behavior-data-2345bbc0/data/hotel_cluster_prediction/train.csv",
        "S3DataDistributionType": "FullyReplicated"
      }
    },
    "ContentType": "csv"
  }
},
validation_set_pointer -> {
  "config": {
    "DataSource": {
      "S3DataSource": {
        "S3DataType": "S3Prefix",
        "S3Uri": "s3://expedia-customer-behavior-data-2345bbc0/data/hotel_cluster_prediction/validation.csv",
        "S3DataDistributionType": "FullyReplicated"
      }
    },
    "ContentType": "csv"
  }
},
test_set_pointer -> {
  "config": {
    "DataSource": {
      "S3DataSource": {
        "S3DataType": "S3Prefix",
        "S3Uri": "s3://expedia-customer-behavior-data-2345bbc0/data/hotel_cluster_prediction/test.csv",
        "S3DataDistributionType": "FullyReplicated"
      }
    },
    "ContentType": "csv"
  }
}


In [23]:
# a short sleep to make sure that files got uploaded to S3
time.sleep(10)

In [24]:
container_uri = sagemaker.image_uris.retrieve(region=region, 
                                              framework='xgboost', 
                                              version='1.0-1', 
                                              image_scope='training')
job_name = f"{app_name.replace('_', '-')}-ml-model"
xgb = sagemaker.estimator.Estimator(image_uri=container_uri,
                                    role=role, 
                                    instance_count=training_job_instance_count, 
                                    instance_type=training_job_instance_type,
                                    output_path=f's3://{data_bucket_name}/{app_name}/model-artifacts',
                                    sagemaker_session=sagemaker_session,
                                    base_job_name=job_name)

xgb.set_hyperparameters(objective='multi:softmax',
                        num_class=num_hotel_clusters,
                        num_round=ML_MODEL_TRAINING_ROUNDS)
xgb.fit({'train': train_set_pointer, 'validation': validation_set_pointer})

Defaulting to only available Python version: py3
Defaulting to only supported image scope: cpu.
Defaulting to the only supported framework/algorithm version: latest.
Ignoring unnecessary instance type: None.
Creating training-job with name: hotel-cluster-prediction-ml-model-2022-06-26-14-51-50-674
2022-06-26 14:51:50 Starting - Starting the training job...
2022-06-26 14:52:14 Starting - Preparing the instances for trainingProfilerReport-1656255110: InProgress
......
2022-06-26 14:53:15 Downloading - Downloading input data...
2022-06-26 14:53:46 Training - Downloading the training image.....[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value multi:softmax to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mod

In [25]:
# Saving training job information to be used in the ML lineage module
training_job_info = xgb.latest_training_job.describe()
if training_job_info != None :
    training_job_name = training_job_info["TrainingJobName"]
    utils.write_param("training_job_name", training_job_name)

write_param, fpath=../config/training_job_name, writing training_job_name=hotel-cluster-prediction-ml-model-2022-06-26-14-51-50-674


## Host the trained model as a SageMaker Endpoint

In [26]:
logger.info(f"going to deploy the trained model to model_ep_instance_type={model_ep_instance_type}, model_ep_instance_count={model_ep_instance_count}")
xgb_predictor = xgb.deploy(initial_instance_count=model_ep_instance_count,
                           instance_type=model_ep_instance_type)

going to deploy the trained model to model_ep_instance_type=ml.m5.xlarge, model_ep_instance_count=2
Creating model with name: hotel-cluster-prediction-ml-model-2022-06-26-15-50-01-705
Creating endpoint-config with name hotel-cluster-prediction-ml-model-2022-06-26-15-50-01-705
Creating endpoint with name hotel-cluster-prediction-ml-model-2022-06-26-15-50-01-705
-----!

In [27]:
# the model will be accepting csv as input
csv_serializer = CSVSerializer()

# store the endpoint in a filename for next stage (lineage tracking)
endpoint_name = xgb_predictor.endpoint_name
utils.write_param("endpoint_name", endpoint_name)

# setup the predictor endpoint    
predictor = Predictor(endpoint_name=endpoint_name, 
                      serializer=csv_serializer)


write_param, fpath=../config/endpoint_name, writing endpoint_name=hotel-cluster-prediction-ml-model-2022-06-26-15-50-01-705


## Batch inference

Use Python multiprocessing to get inference for the entire dataframe by first splitting it into as many dataframes as there are cores on this machine and then get predictions one row at a time for each dataframe.

In [28]:
# handy function for getting inference
def get_inference(df):
    # get all the dataframe content as ndarray
    y_hat_list = []
    for r in df.values:
        # the first element of each row is the target variable
        y = r[0]
        
        # everything from the second element onwards is a feature for that row
        X = r[1:]
        
        # get the prediction. The prediction is returned as a float string so
        # "64.0" for 64, so we first convert the string to float and then to int
        # cant directly cast to int (invalid literal for int() with base 10: '' error)
        y_hat = int(float(predictor.predict(X).decode('utf-8')))
        
        # append it to a list so that at the end we have a list containing predictions
        # for each row of the input dataframe
        y_hat_list.append(y_hat)
    return y_hat_list

In [29]:
num_procs  = psutil.cpu_count(logical=False)
logger.info(f"num_procs={num_procs}")

# df_test = df_test.drop('hotel_cluster_predicted', axis=1)
# split the dataframe into as many parts as their are cores on this instance
df_splitted = np.array_split(df_test, num_procs)

# list for holding predictions for each dataframe
y_hat_list = []

start = time.time()
# setup parallel predictions for each dataframe
with concurrent.futures.ProcessPoolExecutor(max_workers=num_procs) as executor:
    results = [ executor.submit(get_inference, df=df) for df in df_splitted ]
    for result in concurrent.futures.as_completed(results):
        try:
            y_hat_list.append(result.result())
        except Exception as ex:
            logger.error(str(ex))
            pass
end = time.time()
logger.info(f"PPID {os.getpid()}, all done in {round(end-start,2)}s")

# flatten out the list (remember we have a list containing predicted values for each split of the original dataframe, so at this time we have a list of lists)
y_hat = [y_hat for y_hat_sublist in y_hat_list for y_hat in y_hat_sublist]


num_procs=4
PPID 22005, all done in 9.44s


In [30]:
# add the prediction as a new column to the dataframe
df_test['hotel_cluster_predicted'] = y_hat

In [31]:
# how many did we predict correctly?
correct = sum(df_test.hotel_cluster == df_test.hotel_cluster_predicted)
logger.info(f"the model predicted {correct} correctly out of {df_test.shape[0]}, accuracy={round(100*(correct/df_test.shape[0]), 2)}%")

NumExpr defaulting to 8 threads.
the model predicted 41 correctly out of 1969, accuracy=2.08%
