In [1]:
# !pip install sagemaker==2.21.0 boto3==1.16.40

Collecting sagemaker==2.21.0
  Using cached sagemaker-2.21.0-py2.py3-none-any.whl
Installing collected packages: sagemaker
  Attempting uninstall: sagemaker
    Found existing installation: sagemaker 2.20.0
    Uninstalling sagemaker-2.20.0:
      Successfully uninstalled sagemaker-2.20.0
Successfully installed sagemaker-2.21.0
You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
# %store -z
# Restore the variables previously persisted with %store into this kernel,
# then list everything that is currently stored.
%store -r
%store

Stored variables and their in-db values:
output_path             -> 's3://sagemaker-us-east-2-645431112437/export-flow
timestamp               -> '2021-02-01-17-10'


In [3]:
import sagemaker
import sagemaker.amazon.common as smac
from sagemaker.lineage import context, artifact, association, action
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import JSONSerializer
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import CreateModelStep
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.workflow.parameters import ParameterInteger, ParameterFloat, ParameterString

from model_package_src.inference_specification import InferenceSpecification

In [4]:
import boto3
import csv
import io
import json
import numpy as np
import pandas as pd
import datetime
import time
from scipy.sparse import lil_matrix
from sklearn.model_selection import train_test_split
import awswrangler as wr

In [5]:
def _version_tuple(version):
    """Return the leading numeric components of a dotted version string as ints.

    Parsing stops at the first component that does not start with a digit, so
    '2.21.0rc1' and '2.21.0.dev0' both yield (2, 21, 0).
    """
    numbers = []
    for piece in version.split('.'):
        digits = ''
        for char in piece:
            if char not in '0123456789':
                break
            digits += char
        if not digits:
            break
        numbers.append(int(digits))
    return tuple(numbers)


# Compare numerically: a plain string comparison orders '2.100.0' (and '10.0.0')
# before '2.21.0', which would reject newer SDK releases.
assert _version_tuple(sagemaker.__version__) >= (2, 21, 0)

In [6]:
# All AWS clients in this notebook are pinned to a single, hard-coded region.
region = "us-east-2"
# Also set the boto3 default session so module-level helpers such as
# boto3.resource(...) use the same region.
boto3.setup_default_session(region_name=region)
boto_session = boto3.Session(region_name=region)

s3_client = boto3.client('s3', region_name=region)

# Low-level SageMaker client plus the higher-level SDK session built on top of it.
sagemaker_boto_client = boto_session.client('sagemaker')
sagemaker_session = sagemaker.session.Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_boto_client)
sagemaker_role = sagemaker.get_execution_role()

# Bucket used below for the training data, the model output and the metrics report.
bucket = sagemaker_session.default_bucket()

In [7]:
# Reuse the timestamp restored by `%store -r` when there is one; otherwise create
# a new minute-resolution timestamp and persist it for later kernels.
if 'timestamp' not in locals():
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M')
    %store timestamp
else:
    print(f'Using existing timestamp: {timestamp}')

Using existing timestamp: 2021-02-01-17-10


# Read the data

The data comes from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Online+Retail)

In [8]:
# `output_path` is not defined in this notebook; it is one of the variables
# restored by `%store -r` above.
df = wr.s3.read_csv(output_path)
print(df.shape)
df.head()

(274399, 6)


Unnamed: 0,StockCode,CustomerID,Country,UnitPrice,Quantity,Description_features
0,"(3650,[1817],[1.0])","(4327,[109],[1.0])","(38,[5],[1.0])",0.85,12,"(2231,[1354,1495,1503],[1.0,1.0,1.0])"
1,"(3650,[1817],[1.0])","(4327,[2314],[1.0])","(38,[4],[1.0])",0.85,24,"(2231,[1354,1495,1503],[1.0,1.0,1.0])"
2,"(3650,[1817],[1.0])","(4327,[584],[1.0])","(38,[2],[1.0])",0.85,48,"(2231,[1354,1495,1503],[1.0,1.0,1.0])"
3,"(3650,[1817],[1.0])","(4327,[214],[1.0])","(38,[2],[1.0])",0.85,12,"(2231,[1354,1495,1503],[1.0,1.0,1.0])"
4,"(3650,[1817],[1.0])","(4327,[1934],[1.0])","(38,[1],[1.0])",0.85,1,"(2231,[1354,1495,1503],[1.0,1.0,1.0])"


In [9]:
def _sparse_vector_size(encoded):
    """Return the declared length of a sparse-vector string such as '(38,[5],[1.0])'."""
    return int(encoded.split(',')[0].strip('()'))


def _sparse_vector_indices(encoded):
    """Return the active positions of a sparse-vector string as a list of ints.

    '(2231,[1354,1495],[1.0,1.0])' yields [1354, 1495]; an empty vector such as
    '(38,[],[])' yields an empty list.
    """
    index_text = encoded.split(',[')[1].strip(']')
    return [int(token) for token in index_text.split(',') if token.strip()]


def loadDataset(dataframe):
    """Convert the encoded retail dataframe into a sparse feature matrix and labels.

    The 'CustomerID', 'StockCode', 'Country' and 'Description_features' columns
    hold sparse vectors rendered as strings ('(size,[indices],[values])').  The
    feature layout is:

        column 0                      -> UnitPrice
        next n_customers columns      -> CustomerID indicator
        next n_items columns          -> StockCode indicator
        next n_countries columns      -> Country indicator
        next n_tokens columns         -> Description token indicators

    Block sizes are read from the first row.  Rows are addressed by position,
    so the dataframe does not need a 0..n-1 index.

    Args:
        dataframe: pandas DataFrame with the columns named above plus
            'UnitPrice' and 'Quantity'.

    Returns:
        (X, y): X is a float32 scipy ``lil_matrix`` of shape
        (n_rows, n_features); y is a float32 numpy vector of 'Quantity' values.

    Raises:
        ValueError: if the dataframe has no rows (feature sizes cannot be inferred).
    """
    n_rows = dataframe.shape[0]
    if n_rows == 0:
        raise ValueError('loadDataset needs at least one row to infer the feature sizes')

    # Positional access: the first row's label is not necessarily 0.
    first_row = dataframe.iloc[0]
    n_customers = _sparse_vector_size(first_row['CustomerID'])
    n_items = _sparse_vector_size(first_row['StockCode'])
    n_countries = _sparse_vector_size(first_row['Country'])
    n_tokens = _sparse_vector_size(first_row['Description_features'])
    n_features = n_customers + n_items + n_countries + n_tokens + 1   # plus one is for the UnitPrice feature

    # Starting column of each block; column 0 is reserved for UnitPrice.
    customer_offset = 1
    item_offset = customer_offset + n_customers
    country_offset = item_offset + n_items
    token_offset = country_offset + n_countries

    # Features are one-hot encoded in a sparse matrix
    X = lil_matrix((n_rows, n_features), dtype='float32')

    encoded_rows = zip(dataframe['UnitPrice'],
                       dataframe['CustomerID'],
                       dataframe['StockCode'],
                       dataframe['Country'],
                       dataframe['Description_features'])

    for position, (price, customer, item, country, desc) in enumerate(encoded_rows):
        X[position, 0] = price
        blocks = ((customer_offset, customer),
                  (item_offset, item),
                  (country_offset, country),
                  (token_offset, desc))
        for offset, encoded in blocks:
            for index in _sparse_vector_indices(encoded):
                X[position, offset + index] = 1

    # Labels are stored in a vector
    y = np.array(dataframe['Quantity'].tolist()).astype('float32')

    return X, y

In [10]:
# Build the sparse feature matrix and the Quantity label vector for the full dataset.
X, y = loadDataset(df)

In [11]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((219519, 10247), (54880, 10247), (219519,), (54880,))

In [12]:
# S3 layout, all under the default bucket:
#   <prefix>/train/train.protobuf, <prefix>/test/test.protobuf, <prefix>/output
prefix = 'personalization'

train_key      = 'train.protobuf'
train_prefix   = f'{prefix}/train'

test_key       = 'test.protobuf'
test_prefix    = f'{prefix}/test'

output_prefix  = f's3://{bucket}/{prefix}/output'

In [13]:
def writeDatasetToProtobuf(X, y, bucket, prefix, key):
    """Serialize a sparse dataset to a sparse tensor buffer and upload it to S3.

    Args:
        X: sparse feature matrix passed to ``smac.write_spmatrix_to_sparse_tensor``.
        y: label vector written alongside ``X``.
        bucket: name of the destination S3 bucket.
        prefix: key prefix inside the bucket.
        key: object name appended to ``prefix``.

    Returns:
        The ``s3://bucket/prefix/key`` URI of the uploaded object.
    """
    object_key = '{}/{}'.format(prefix, key)

    payload = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(payload, X, y)
    # Rewind so the upload starts from the first byte written.
    payload.seek(0)

    destination = boto3.resource('s3').Bucket(bucket).Object(object_key)
    destination.upload_fileobj(payload)
    return 's3://{}/{}'.format(bucket, object_key)
    
# Upload both splits and keep their S3 URIs for the training channels.
train_data = writeDatasetToProtobuf(X_train, y_train, bucket, train_prefix, train_key)    
test_data  = writeDatasetToProtobuf(X_test, y_test, bucket, test_prefix, test_key)    
  
print(train_data)
print(test_data)
print('Output: {}'.format(output_prefix))

s3://sagemaker-us-east-2-645431112437/personalization/train/train.protobuf
s3://sagemaker-us-east-2-645431112437/personalization/test/test.protobuf
Output: s3://sagemaker-us-east-2-645431112437/personalization/output


In [14]:
# Resolve the built-in Factorization Machines image for the session's region.
container = sagemaker.image_uris.retrieve("factorization-machines", region=boto_session.region_name)

fm = sagemaker.estimator.Estimator(container,
                                   sagemaker_role, 
                                   instance_count=1, 
                                   instance_type='ml.c5.xlarge',
                                   output_path=output_prefix,
                                   sagemaker_session=sagemaker_session)

# feature_dim is taken from the training matrix so it always matches the
# layout produced by loadDataset.
fm.set_hyperparameters(feature_dim=X_train.shape[1],
                       predictor_type='regressor',
                       mini_batch_size=1000,
                       num_factors=64,
                       epochs=20)

In [15]:
# Train only when no training job name is known (e.g. restored by `%store -r`);
# otherwise reuse the earlier job.
# NOTE(review): when the name is restored, `fm` has not been fitted in this
# kernel, so the later `fm.deploy(...)` cell presumably has no job to deploy
# from — confirm before relying on Restart & Run All.
if 'training_job_name' not in locals():
    
    fm.fit({'train': train_data, 'test': test_data})
    training_job_name = fm.latest_training_job.job_name
    %store training_job_name
    
else:
    print(f'Using previous training job: {training_job_name}')

2021-02-01 17:27:02 Starting - Starting the training job...
2021-02-01 17:27:25 Starting - Launching requested ML instancesProfilerReport-1612200422: InProgress
......
2021-02-01 17:28:26 Starting - Preparing the instances for training...
2021-02-01 17:28:46 Downloading - Downloading input data
2021-02-01 17:28:46 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  from numpy.testing import nosetester[0m
[34m[02/01/2021 17:29:09 INFO 140187809592704] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'factors_lr': u'0.0001', u'linear_init_sigma': u'0.01', u'epochs': 1, u'_wd': u'1.0', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'factors_init_sigma': u'0.001', u'_log_level': u'info', u'bias_init_method': u'normal', u'linear_init_method': u'normal', u'linear_lr': u'0.001', u'factors_init_method': u'normal',

In [16]:
# Full description of the training job; later cells read its input channels,
# model artifact location, training image and final metrics.
training_job_info = sagemaker_boto_client.describe_training_job(TrainingJobName=training_job_name)

### Training data artifact

In [17]:
# NOTE(review): takes the first input channel; presumably this is the 'train'
# channel — verify the channel order returned by describe_training_job.
training_data_s3_uri = training_job_info['InputDataConfig'][0]['DataSource']['S3DataSource']['S3Uri']

# Look for a lineage artifact already registered for this S3 URI.
matching_artifacts = list(artifact.Artifact.list(
    source_uri=training_data_s3_uri,
    sagemaker_session=sagemaker_session))

# Reuse the first match, otherwise register a new Dataset artifact.
if matching_artifacts:
    training_data_artifact = matching_artifacts[0]
    print(f'Using existing artifact: {training_data_artifact.artifact_arn}')
else:
    training_data_artifact = artifact.Artifact.create(
        artifact_name='TrainingData',
        source_uri=training_data_s3_uri,
        artifact_type='Dataset',
        sagemaker_session=sagemaker_session)
    print(f'Create artifact {training_data_artifact.artifact_arn}: SUCCESSFUL')

Using existing artifact: arn:aws:sagemaker:us-east-2:645431112437:artifact/cdd7fbecb4eefa22c43b2ad48140acc2


### Model artifact

In [18]:
# S3 location of the model produced by the training job.
trained_model_s3_uri = training_job_info['ModelArtifacts']['S3ModelArtifacts']

# Lineage artifacts already registered for that location, if any.
matching_artifacts = list(
    artifact.Artifact.list(
        source_uri=trained_model_s3_uri,
        sagemaker_session=sagemaker_session,
    )
)

if not matching_artifacts:
    # Nothing registered yet: create the Model artifact.
    model_artifact = artifact.Artifact.create(
        artifact_name='TrainedModel',
        source_uri=trained_model_s3_uri,
        artifact_type='Model',
        sagemaker_session=sagemaker_session,
    )
    print(f'Create artifact {model_artifact.artifact_arn}: SUCCESSFUL')
else:
    # Reuse the first registered artifact.
    model_artifact = matching_artifacts[0]
    print(f'Using existing artifact: {model_artifact.artifact_arn}')

Using existing artifact: arn:aws:sagemaker:us-east-2:645431112437:artifact/34578c9e5251ea3f609cb909fff9b782


#### Set artifact associations

In [19]:
# Look up the trial component named '<training job name>-aws-training-job';
# its ARN is the destination of the artifact associations below.
trial_component = sagemaker_boto_client.describe_trial_component(TrialComponentName=training_job_name+'-aws-training-job')
trial_component_arn = trial_component['TrialComponentArn']

#### Store artifacts

In [20]:
# Pair each lineage artifact with the association type that links it to the
# training job's trial component.
artifact_list = (
                 (training_data_artifact, 'ContributedTo'),
                 (model_artifact, 'Produced')
)

# The loop variable is deliberately NOT named `artifact`: that name is the
# imported `sagemaker.lineage.artifact` module used by the artifact cells, and
# rebinding it here would break those cells when they are re-run.
for lineage_artifact, assoc in artifact_list:
    try:
        association.Association.create(
            source_arn=lineage_artifact.artifact_arn,
            destination_arn=trial_component_arn,
            association_type=assoc,
            sagemaker_session=sagemaker_session)
        print(f"Association with {lineage_artifact.artifact_type}: SUCCESSFUL")
    except sagemaker_boto_client.exceptions.ClientError as error:
        # Best effort: presumably the association already exists from an earlier
        # run. The service message is printed so that any other failure is not
        # silently reported as a duplicate.
        print(f"Association with {lineage_artifact.artifact_type} was not created "
              f"(it may already exist): {error}")

Association already exists with DataSet
Association with Model: SUCCEESFUL


In [21]:
model_name = 'retail-recommendations'

# `NameContains` is a substring filter, so a differently named model (for
# example 'retail-recommendations-v2') would also be returned.  Walk every
# result page and keep only the exact name so creation is skipped solely when
# this specific model already exists.
model_matches = [
    summary
    for page in sagemaker_boto_client.get_paginator('list_models').paginate(NameContains=model_name)
    for summary in page['Models']
    if summary['ModelName'] == model_name
]

if not model_matches:
    print(f'Creating model {model_name}')
    # Build the model from the training job's artifacts, served with the same
    # image that was used for training.
    model = sagemaker_session.create_model_from_job(
        name=model_name,
        training_job_name=training_job_info['TrainingJobName'],
        role=sagemaker_role,
        image_uri=training_job_info['AlgorithmSpecification']['TrainingImage'])
else:

    print(f"Model {model_name} already exists.")

Creating model retail-recommendations


Using already existing model: retail-recommendations


## SageMaker Model Registry

Once a useful model has been trained and its artifacts properly associated, the next step is to register the model for future reference and possible deployment.

### Create Model Package Group

A Model Package Group holds multiple versions or iterations of a model. Though it is not required to create one for every model in the registry, Model Package Groups help organize models that share the same purpose and provide automatic versioning.

In [22]:
# Reuse a restored Model Package Group name when there is one; otherwise derive
# it from the run timestamp and persist it with %store.
if 'mpg_name' not in locals():
    mpg_name = f'retail-recommendation-{timestamp}'
    %store mpg_name

print(f'Model Package Group name: {mpg_name}')

Stored 'mpg_name' (str)
Model Package Group name: retail-recommendation-2021-02-01-17-10


In [23]:
# Keyword arguments for create_model_package_group in the next cell.
mpg_input_dict = {
    'ModelPackageGroupName': mpg_name,
    'ModelPackageGroupDescription': 'Recommendation for Online Retail Sales'
}

In [24]:
# Groups whose name contains `mpg_name`; any hit is treated as "already created".
matching_mpg = sagemaker_boto_client.list_model_package_groups(
    NameContains=mpg_name
)['ModelPackageGroupSummaryList']

if not matching_mpg:
    # No such group yet: create it from the prepared arguments.
    mpg_response = sagemaker_boto_client.create_model_package_group(**mpg_input_dict)
    print(f'Create Model Package Group {mpg_name}: SUCCESSFUL')
else:
    print(f'Using existing Model Package Group: {mpg_name}')

Create Model Package Group retail-recommendation-2021-02-01-17-10: SUCCESSFUL


In [25]:
# Report layout: {'regression_metrics': {<metric name>: {'value': <final value>}}}
model_metrics_report = {'regression_metrics': {}}

# One entry per final metric reported by the training job.
for metric in training_job_info['FinalMetricDataList']:
    model_metrics_report['regression_metrics'][metric['MetricName']] = {'value': metric['Value']}

# The report is stored under the training job's own key prefix.
metrics_s3_key = f"training_jobs/{training_job_info['TrainingJobName']}/training_metrics.json"

# Write the report locally, then upload that file to the bucket.
with open('training_metrics.json', 'w') as f:
    f.write(json.dumps(model_metrics_report))

s3_client.upload_file(Filename='training_metrics.json', Bucket=bucket, Key=metrics_s3_key)

#### Define the inference spec

In [26]:
# Build the inference specification with the project helper imported from
# model_package_src, using the training image as the serving image.
# NOTE(review): 'text/csv' is declared as the response MIME type while the
# predictor below deserializes JSON — confirm this is the intended value.
mp_inference_spec = InferenceSpecification().get_inference_specification_dict(
    ecr_image=training_job_info['AlgorithmSpecification']['TrainingImage'],
    supports_gpu=False,
    supported_content_types=['application/x-recordio-protobuf', 'application/json'],
    supported_mime_types=['text/csv'])

# Point the first (only referenced) container at the trained model artifacts.
mp_inference_spec['InferenceSpecification']['Containers'][0]['ModelDataUrl'] = training_job_info['ModelArtifacts']['S3ModelArtifacts']

#### Define model metrics
Metrics other than model quality can be defined. See the Boto3 documentation for [creating a model package](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker.html#SageMaker.Client.create_model_package).

In [27]:
# Model quality statistics point at the metrics report uploaded to S3 earlier.
model_metrics = {
    'ModelQuality': {
        'Statistics': {
            'ContentType': 'application/json',
            'S3Uri': f's3://{bucket}/{metrics_s3_key}'
        }
    }
}

In [28]:
# Register a new model package version in the group; it starts out awaiting
# manual approval.
mp_input_dict = {
    'ModelPackageGroupName': mpg_name,
    'ModelPackageDescription': 'Factorization Machine Model to create personalized retail recommendations',
    'ModelApprovalStatus': 'PendingManualApproval',
    'ModelMetrics': model_metrics
}

# Merge in the 'InferenceSpecification' entry built above before calling the API.
mp_input_dict.update(mp_inference_spec)
mp_response = sagemaker_boto_client.create_model_package(**mp_input_dict)

### Wait until model package is completed

In [29]:
# Poll every 5 seconds until the model package reaches a terminal state,
# printing each observed status once.
while True:
    mp_info = sagemaker_boto_client.describe_model_package(ModelPackageName=mp_response['ModelPackageArn'])
    mp_status = mp_info['ModelPackageStatus']
    print(f'model package status: {mp_status}')
    if mp_status in ['Completed', 'Failed']:
        break
    time.sleep(5)

# Stop here on failure: the following cells approve and deploy the package,
# which must not happen for a package that failed to build.
if mp_status == 'Failed':
    raise RuntimeError(
        f"Model package {mp_response['ModelPackageArn']} failed: "
        f"{mp_info.get('ModelPackageStatusDetails')}")

model package status: Completed


In [30]:
# NOTE(review): takes the first package listed for the group; presumably this is
# the package created above — verify the default sort order of list_model_packages.
model_package = sagemaker_boto_client.list_model_packages(ModelPackageGroupName=mpg_name)['ModelPackageSummaryList'][0]
model_package_update = {
    'ModelPackageArn': model_package['ModelPackageArn'],
    'ModelApprovalStatus': 'Approved'
}

# Flip the package's approval status to 'Approved'.
update_response = sagemaker_boto_client.update_model_package(**model_package_update)

__Create endpoint config and endpoint__

In [31]:
# NOTE(review): `primary_container` is not used anywhere in this cell; the
# endpoint config below references the model by `model_name`, not the model package.
primary_container = {'ModelPackageName': model_package['ModelPackageArn']}
endpoint_config_name=f'{model_name}-endpoint-config'
# NameContains is a substring filter: any config whose name contains this string counts as existing.
existing_configs = sagemaker_boto_client.list_endpoint_configs(NameContains=endpoint_config_name)['EndpointConfigs']

# Create the endpoint config only when no matching config was found.
if not existing_configs:
    create_ep_config_response = sagemaker_boto_client.create_endpoint_config(
        EndpointConfigName=endpoint_config_name,
        ProductionVariants=[{
            'InstanceType': 'ml.m4.xlarge',
            'InitialVariantWeight': 1,
            'InitialInstanceCount': 1,
            'ModelName': model_name,
            'VariantName': 'AllTraffic'
        }]
    )
    %store endpoint_config_name

endpoint_name = f'{model_name}-endpoint'
existing_endpoints = sagemaker_boto_client.list_endpoints(NameContains=endpoint_name)['Endpoints']

# Create the endpoint only when no matching endpoint was found.
if not existing_endpoints:
    create_endpoint_response = sagemaker_boto_client.create_endpoint(
        EndpointName=endpoint_name,
        EndpointConfigName=endpoint_config_name)
    %store endpoint_name

endpoint_info = sagemaker_boto_client.describe_endpoint(EndpointName=endpoint_name)
endpoint_status = endpoint_info['EndpointStatus']

# Poll once a minute while the endpoint is still being created; any other
# status (including a failure status) ends the loop without further checks.
while endpoint_status == 'Creating':
    endpoint_info = sagemaker_boto_client.describe_endpoint(EndpointName=endpoint_name)
    endpoint_status = endpoint_info['EndpointStatus']
    print('Endpoint status:', endpoint_status)
    if endpoint_status == 'Creating':
        time.sleep(60)

# Make Predictions
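Here we take a single customer and predict the quantity that customer would purchase of each of the most popular items; the items with the highest predicted quantities become the recommendations.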

In [38]:
class FMSerializer(JSONSerializer):
    """JSON serializer that wraps each input row as a {'features': [...]} instance."""

    def serialize(self, data):
        """Return a JSON string of the form {'instances': [{'features': row}, ...]}.

        Each element of ``data`` must provide ``tolist()`` (e.g. a numpy row).
        """
        instances = [{'features': row.tolist()} for row in data]
        return json.dumps({'instances': instances})

# Deploy the estimator and get a predictor that sends FMSerializer JSON and
# parses JSON responses.
# NOTE(review): this deploys from `fm` rather than using the endpoint created in
# the previous cell, and presumably requires `fm.fit` to have run in this kernel — confirm.
fm_predictor = fm.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    serializer=FMSerializer(),
    deserializer= JSONDeserializer()
)

-------------!

In [46]:
# Number of candidate items to score.
n = 100

# Rank (StockCode, UnitPrice) pairs by how many distinct customers bought them.
popular_items = (
    df.groupby(['StockCode', 'UnitPrice'])['CustomerID']
    .nunique()
    .sort_values(ascending=False)
    .reset_index()
)

# Keep the codes and prices of the n most widely bought pairs.
top_n_items = popular_items['StockCode'].head(n).values
top_n_prices = popular_items['UnitPrice'].head(n).values

In [56]:
# stock codes can have multiple descriptions, so we will choose whichever description is most common
item_desc_map = df.groupby('StockCode').agg(lambda x: x.value_counts().index[0])['Description_features']
customer_id = df.iloc[0]['CustomerID']
country = df.iloc[0]['Country']

In [57]:
# One candidate row per popular item, all for the same customer and country.
data = {'StockCode': top_n_items,
        'Description_features': [item_desc_map[i] for i in top_n_items],
        'CustomerID': customer_id,
        'Country': country,
        'UnitPrice': top_n_prices,
       }

# Reuse the training frame's column order; 'Quantity' is absent from `data`
# and therefore comes out empty (it is the value being predicted).
df_inference = pd.DataFrame(data, columns=df.columns)

In [59]:
# Preview the candidate rows that will be scored.
df_inference.head()

Unnamed: 0,StockCode,CustomerID,Country,UnitPrice,Quantity,Description_features
0,"(3650,[0],[1.0])","(4327,[109],[1.0])","(38,[5],[1.0])",12.75,,"(2231,[15,58,148,214],[1.0,1.0,1.0,1.0])"
1,"(3650,[1],[1.0])","(4327,[109],[1.0])","(38,[5],[1.0])",2.95,,"(2231,[4,13,19,20,25],[1.0,1.0,1.0,1.0,1.0])"
2,"(3650,[4],[1.0])","(4327,[109],[1.0])","(38,[5],[1.0])",1.69,,"(2231,[60,77,89,337],[1.0,1.0,1.0,1.0])"
3,"(3650,[3],[1.0])","(4327,[109],[1.0])","(38,[5],[1.0])",4.95,,"(2231,[65,66],[1.0,1.0])"
4,"(3650,[6],[1.0])","(4327,[109],[1.0])","(38,[5],[1.0])",4.95,,"(2231,[0,1,8,12,15,46,68],[1.0,1.0,1.0,1.0,1.0..."


In [60]:
# Encode the candidates with the same feature layout as training; the labels are discarded.
X_inference, _ = loadDataset(df_inference)

In [61]:
# The serializer calls row.tolist() on each row, so the sparse matrix is
# converted to a dense array before it is sent to the endpoint.
result = fm_predictor.predict(X_inference.toarray())
result

{'predictions': [{'score': 15.505157470703125},
  {'score': 45.32623291015625},
  {'score': 47.73848342895508},
  {'score': 19.60230255126953},
  {'score': 16.89789581298828},
  {'score': 51.341312408447266},
  {'score': 18.874170303344727},
  {'score': 17.15357208251953},
  {'score': 12.38580322265625},
  {'score': 26.633277893066406},
  {'score': 32.23994445800781},
  {'score': 30.322702407836914},
  {'score': 47.75290298461914},
  {'score': 37.8108024597168},
  {'score': 26.108715057373047},
  {'score': 8.026677131652832},
  {'score': 18.226085662841797},
  {'score': 13.117279052734375},
  {'score': 29.297754287719727},
  {'score': 31.275550842285156},
  {'score': 30.882699966430664},
  {'score': 25.639904022216797},
  {'score': 20.291793823242188},
  {'score': 34.24357604980469},
  {'score': 18.868614196777344},
  {'score': 28.178415298461914},
  {'score': 34.03327560424805},
  {'score': 16.524681091308594},
  {'score': 31.802452087402344},
  {'score': 26.684755325317383},
  {'scor

In [62]:
# Predicted score for each candidate item, in candidate order.
preds = [prediction['score'] for prediction in result['predictions']]

# Candidate positions ordered from lowest to highest score.
index_array = np.argsort(preds)

# Walk that order backwards and keep the five best-scoring stock codes.
top_5_recs = np.array(top_n_items)[index_array][::-1][:5]

print(f'The top 5 recommended products have Stock Numbers: {top_5_recs}')

The top 5 recommended products have Stock Numbers: ['(3650,[7],[1.0])' '(3650,[50],[1.0])' '(3650,[2],[1.0])'
 '(3650,[4],[1.0])' '(3650,[1],[1.0])']
