In [2]:
import sys
sys.path.append('../')
import pickle
from preprocessing.imdb_datareader import IMDBDataReader
from preprocessing.imdb_datareader import PopularityReader
from preprocessing.factorization_machine_transformer import  FactorizationMachineTransformer

# Load the pre-built interaction list and the user/item feature tables.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted pipeline.
# Files are opened with `with` so the handles are closed deterministically
# (the bare open(...) form leaves them open until garbage collection).
with open("data/user_item.p", "rb") as fh:
    user_item = pickle.load(fh)
with open("data/users.p", "rb") as fh:
    users = pickle.load(fh)
with open("data/items.p", "rb") as fh:
    items = pickle.load(fh)

# Positional 80/20 split: the first 80% of interactions (in stored order, no
# shuffling) are used for training, the remainder for testing.
split_index = int(len(user_item) * 0.8)
train_user_item = user_item[:split_index]
test_user_item = user_item[split_index:]

# Popularity is derived from the training interactions only.
# presumably pop_info maps item id -> popularity value; verify against PopularityReader
popreader = PopularityReader()
pop_info = popreader.read_item_data(train_user_item)

# Append one popularity entry to every item's feature list; items that do not
# appear in pop_info get 0.
for k, v in items.items():
    v.append(pop_info[k] if k in pop_info else 0)

# Build feature vectors for both splits with the same transformer instance.
transformer = FactorizationMachineTransformer(users, items, train_user_item)
X_train, Y_train, _, _, nFeatures = transformer.get_feature_vectors(users, items, train_user_item)
# nFeatures is re-assigned by this call; the value from the test split is the one
# used by later cells (presumably identical to the training value — confirm).
X_test, Y_test, X_cold_test, Y_cold_test, nFeatures = transformer.get_feature_vectors(users, items, test_user_item)

In [None]:
# One-time setup: create the S3 bucket via the AWS CLI.
# NOTE(review): this creates "recommendation-demo-yianc-0814" in us-west-2, while
# the following cells upload to bucket 'recommendation-demo-yianc' and the
# recorded container image is in us-east-1 — confirm which bucket/region is intended.
!aws s3api create-bucket --bucket recommendation-demo-yianc-0814 --region us-west-2 --create-bucket-configuration LocationConstraint=us-west-2

In [3]:
# S3 layout used by the rest of the notebook.
# Bucket that receives the training data and the model artifacts.
bucket = 'recommendation-demo-yianc'
# Common key prefix for everything this notebook writes.
prefix = 'sagemaker/fm-movielens'
# Object name of the serialized training set.
train_key      = 'train.protobuf'
# Key prefix of the training channel: <prefix>/train
train_prefix   = '{}/{}'.format(prefix, 'train')
# Full S3 URI passed to the estimator as output_path: s3://<bucket>/<prefix>/output
output_prefix  = 's3://{}/{}/output'.format(bucket, prefix)


import io,boto3
import sagemaker.amazon.common as smac


def writeDatasetToProtobuf(X, bucket, prefix, key, d_type, Y=None):
    """Serialize a feature matrix with ``sagemaker.amazon.common`` and upload it to S3.

    Args:
        X: Feature matrix handed to the ``smac`` writer.
        bucket: Name of the destination S3 bucket.
        prefix: Key prefix inside the bucket.
        key: Object name appended to ``prefix``.
        d_type: ``"sparse"`` selects ``write_spmatrix_to_sparse_tensor``; any
            other value selects ``write_numpy_to_dense_tensor``.
        Y: Optional labels forwarded to the writer as ``labels``.

    Returns:
        The ``s3://<bucket>/<prefix>/<key>`` URI of the uploaded object.
    """
    object_key = '{}/{}'.format(prefix, key)

    # Pick the writer matching the matrix representation.
    serialize = (
        smac.write_spmatrix_to_sparse_tensor
        if d_type == "sparse"
        else smac.write_numpy_to_dense_tensor
    )

    stream = io.BytesIO()
    serialize(stream, X, labels=Y)
    # Rewind so the upload reads the buffer from its first byte.
    stream.seek(0)

    destination = boto3.resource('s3').Bucket(bucket).Object(object_key)
    destination.upload_fileobj(stream)
    return 's3://{}/{}'.format(bucket, object_key)
 
    
# Upload the sparse training matrix and its labels; keep the resulting S3 URI
# for the estimator's 'train' channel.
train_data = writeDatasetToProtobuf(
    X=X_train,
    bucket=bucket,
    prefix=train_prefix,
    key=train_key,
    d_type="sparse",
    Y=Y_train,
)

In [4]:
# Echo the resolved S3 locations so they can be checked before training starts.
print('Output: {}'.format(output_prefix))
print('Train data: {}'.format(train_data))

Output: s3://recommendation-demo-yianc/sagemaker/fm-movielens/output
Train data: s3://recommendation-demo-yianc/sagemaker/fm-movielens/train/train.protobuf


In [5]:
import sagemaker 

from sagemaker import get_execution_role
import boto3 
from sagemaker.amazon.amazon_estimator import get_image_uri
# Resolve the Factorization Machines algorithm image for the region of the
# default boto3 session.
region = boto3.Session().region_name
container = get_image_uri(
    region_name=region,
    repo_name='factorization-machines',
    repo_version='latest',
)
# Display the resolved image URI as the cell output.
container


'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


'382416733822.dkr.ecr.us-east-1.amazonaws.com/factorization-machines:latest'

In [6]:

import boto3 
import sagemaker
from sagemaker import get_execution_role

# Execution role of the notebook environment; passed to the estimator so the
# training job can run under it.
crole = get_execution_role()

# Generic Estimator wrapping the Factorization Machines container resolved
# earlier. Model artifacts are written under output_prefix.
fm = sagemaker.estimator.Estimator(container,
                                   crole,
                                   train_instance_count=1,
                                   train_instance_type='ml.c4.xlarge',
                                   output_path=output_prefix,
                                   sagemaker_session=sagemaker.Session())

# feature_dim must match the width of the vectors produced by the transformer.
# predictor_type is 'binary_classifier' here; 'regressor' is the alternative
# setting that was previously tried in this cell.
fm.set_hyperparameters(
    feature_dim=nFeatures,
    predictor_type='binary_classifier',
    mini_batch_size=200,
    num_factors=64,
    epochs=50)

# Start the training job on the uploaded data; only a 'train' channel is
# supplied (no validation/test channel).
fm.fit({'train': train_data})


Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2020-08-20 09:00:55 Starting - Starting the training job...
2020-08-20 09:00:58 Starting - Launching requested ML instances......
2020-08-20 09:02:15 Starting - Preparing the instances for training......
2020-08-20 09:03:20 Downloading - Downloading input data......
2020-08-20 09:04:25 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  from numpy.testing import nosetester[0m
[34m[08/20/2020 09:04:28 INFO 139776720611136] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'factors_lr': u'0.0001', u'linear_init_sigma': u'0.01', u'epochs': 1, u'_wd': u'1.0', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'factors_init_sigma': u'0.001', u'_log_level': u'info', u'bias_init_method': u'normal', u'linear_init_method': u'normal', u'linear_lr': u'0.001', u'factors_init_method': u'normal', u'_

[34m[2020-08-20 09:04:33.395] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 6, "duration": 1259, "num_examples": 400, "num_bytes": 8462008}[0m
[34m[08/20/2020 09:04:33 INFO 139776720611136] #quality_metric: host=algo-1, epoch=2, train binary_classification_accuracy <score>=0.64025[0m
[34m[08/20/2020 09:04:33 INFO 139776720611136] #quality_metric: host=algo-1, epoch=2, train binary_classification_cross_entropy <loss>=0.652206644106[0m
[34m[08/20/2020 09:04:33 INFO 139776720611136] #quality_metric: host=algo-1, epoch=2, train binary_f_1.000 <score>=0.68233995585[0m
[34m#metrics {"Metrics": {"update.time": {"count": 1, "max": 1261.350154876709, "sum": 1261.350154876709, "min": 1261.350154876709}}, "EndTime": 1597914273.395885, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1597914272.134135}
[0m
[34m[08/20/2020 09:04:33 INFO 139776720611136] #progress_metric: host=algo-1, comp

[34m[2020-08-20 09:04:43.527] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 22, "duration": 1250, "num_examples": 400, "num_bytes": 8462008}[0m
[34m[08/20/2020 09:04:43 INFO 139776720611136] #quality_metric: host=algo-1, epoch=10, train binary_classification_accuracy <score>=0.6906[0m
[34m[08/20/2020 09:04:43 INFO 139776720611136] #quality_metric: host=algo-1, epoch=10, train binary_classification_cross_entropy <loss>=0.603687685061[0m
[34m[08/20/2020 09:04:43 INFO 139776720611136] #quality_metric: host=algo-1, epoch=10, train binary_f_1.000 <score>=0.724824902724[0m
[34m#metrics {"Metrics": {"update.time": {"count": 1, "max": 1253.1969547271729, "sum": 1253.1969547271729, "min": 1253.1969547271729}}, "EndTime": 1597914283.528388, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1597914282.274662}
[0m
[34m[08/20/2020 09:04:43 INFO 139776720611136] #progress_metric: host=algo-

[34m[2020-08-20 09:04:53.518] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 38, "duration": 1220, "num_examples": 400, "num_bytes": 8462008}[0m
[34m[08/20/2020 09:04:53 INFO 139776720611136] #quality_metric: host=algo-1, epoch=18, train binary_classification_accuracy <score>=0.6722875[0m
[34m[08/20/2020 09:04:53 INFO 139776720611136] #quality_metric: host=algo-1, epoch=18, train binary_classification_cross_entropy <loss>=0.646315648603[0m
[34m[08/20/2020 09:04:53 INFO 139776720611136] #quality_metric: host=algo-1, epoch=18, train binary_f_1.000 <score>=0.706682628299[0m
[34m#metrics {"Metrics": {"update.time": {"count": 1, "max": 1222.2530841827393, "sum": 1222.2530841827393, "min": 1222.2530841827393}}, "EndTime": 1597914293.518862, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1597914292.29622}
[0m
[34m[08/20/2020 09:04:53 INFO 139776720611136] #progress_metric: host=alg

[34m[2020-08-20 09:05:03.506] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 54, "duration": 1226, "num_examples": 400, "num_bytes": 8462008}[0m
[34m[08/20/2020 09:05:03 INFO 139776720611136] #quality_metric: host=algo-1, epoch=26, train binary_classification_accuracy <score>=0.6766[0m
[34m[08/20/2020 09:05:03 INFO 139776720611136] #quality_metric: host=algo-1, epoch=26, train binary_classification_cross_entropy <loss>=0.654398579097[0m
[34m[08/20/2020 09:05:03 INFO 139776720611136] #quality_metric: host=algo-1, epoch=26, train binary_f_1.000 <score>=0.710480965063[0m
[34m#metrics {"Metrics": {"update.time": {"count": 1, "max": 1228.6770343780518, "sum": 1228.6770343780518, "min": 1228.6770343780518}}, "EndTime": 1597914303.507238, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1597914302.278078}
[0m
[34m[08/20/2020 09:05:03 INFO 139776720611136] #progress_metric: host=algo-

[34m[2020-08-20 09:05:13.342] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 70, "duration": 1214, "num_examples": 400, "num_bytes": 8462008}[0m
[34m[08/20/2020 09:05:13 INFO 139776720611136] #quality_metric: host=algo-1, epoch=34, train binary_classification_accuracy <score>=0.6962625[0m
[34m[08/20/2020 09:05:13 INFO 139776720611136] #quality_metric: host=algo-1, epoch=34, train binary_classification_cross_entropy <loss>=0.631318675423[0m
[34m[08/20/2020 09:05:13 INFO 139776720611136] #quality_metric: host=algo-1, epoch=34, train binary_f_1.000 <score>=0.727586632137[0m
[34m#metrics {"Metrics": {"update.time": {"count": 1, "max": 1216.3240909576416, "sum": 1216.3240909576416, "min": 1216.3240909576416}}, "EndTime": 1597914313.342865, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1597914312.125989}
[0m
[34m[08/20/2020 09:05:13 INFO 139776720611136] #progress_metric: host=al

[34m[2020-08-20 09:05:23.241] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 86, "duration": 1249, "num_examples": 400, "num_bytes": 8462008}[0m
[34m[08/20/2020 09:05:23 INFO 139776720611136] #quality_metric: host=algo-1, epoch=42, train binary_classification_accuracy <score>=0.6505625[0m
[34m[08/20/2020 09:05:23 INFO 139776720611136] #quality_metric: host=algo-1, epoch=42, train binary_classification_cross_entropy <loss>=0.762225919437[0m
[34m[08/20/2020 09:05:23 INFO 139776720611136] #quality_metric: host=algo-1, epoch=42, train binary_f_1.000 <score>=0.684384631886[0m
[34m#metrics {"Metrics": {"update.time": {"count": 1, "max": 1251.352071762085, "sum": 1251.352071762085, "min": 1251.352071762085}}, "EndTime": 1597914323.241622, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1597914321.989819}
[0m
[34m[08/20/2020 09:05:23 INFO 139776720611136] #progress_metric: host=algo-


2020-08-20 09:05:39 Uploading - Uploading generated training model
2020-08-20 09:05:39 Completed - Training job completed
Training seconds: 139
Billable seconds: 139


In [7]:
# Deploy the trained model behind a single-instance endpoint and keep the
# predictor handle for the evaluation cells.
# NOTE(review): no cell in this view deletes the endpoint afterwards — confirm
# cleanup happens elsewhere so the instance does not keep running.
fm_predictor = fm.deploy(
    initial_instance_count=1,
    instance_type='ml.c4.xlarge',
)


Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


-----------------!

In [8]:
sys.path.append('../')
import sagemaker_utils
from sagemaker_utils.query_serializer import serialize as fmserialize 
from sagemaker.predictor import  json_deserializer
from sklearn.metrics import accuracy_score
import numpy 


# Configure how requests to the endpoint are encoded and how responses are decoded.
# The project serializer reads the feature width from a module-level variable,
# so it is set here before any predict call.
sagemaker_utils.query_serializer.nFeatures = nFeatures
# Content type announced to the endpoint, as defined by the serializer module.
fm_predictor.content_type = sagemaker_utils.query_serializer.CONTENT_TYPE
# Requests go through the project's serialize function ...
fm_predictor.serializer = fmserialize
# ... and responses are parsed with the SDK's JSON deserializer.
fm_predictor.deserializer = json_deserializer


def model_accuracy(X_test, Y_test, threshold=0.5, predictor=None):
    """Count how many samples the deployed model classifies correctly.

    ``X_test`` is sent to the endpoint in a single ``predict`` call. Each entry
    of the response's ``'predictions'`` list is turned into a 0/1 label by
    comparing its ``'score'`` with ``threshold``, and the labels are compared
    with ``Y_test``.

    Args:
        X_test: Feature vectors to score.
        Y_test: Ground-truth 0/1 labels, one per row of ``X_test``.
        threshold: Scores strictly greater than this value are labelled 1,
            everything else 0. Defaults to 0.5.
        predictor: Object exposing ``predict``; defaults to the notebook-level
            ``fm_predictor`` returned by ``fm.deploy``.

    Returns:
        The NUMBER of correctly classified samples (``normalize=False``), not a
        fraction. Divide by ``len(Y_test)`` to obtain the accuracy.
    """
    if predictor is None:
        # Resolved at call time so the function can be defined before deployment.
        predictor = fm_predictor

    result = predictor.predict(X_test)
    y_pred = [1 if p['score'] > threshold else 0 for p in result['predictions']]
    return accuracy_score(Y_test, y_pred, normalize=False)

In [9]:
# model_accuracy returns the count of correct predictions (normalize=False),
# so divide by the number of samples to get the accuracy on the test split.
accuracy = model_accuracy(X_test, Y_test) / len(Y_test)
accuracy 

0.6915822563744324

In [10]:
# Same computation on the X_cold_test / Y_cold_test subset returned by the
# transformer for the test split (presumably cold-start samples — confirm).
# NOTE: this overwrites the `accuracy` value computed for the full test split.
accuracy = model_accuracy(X_cold_test, Y_cold_test) / len(Y_cold_test)
accuracy

0.6853932584269663