In [1]:
import sagemaker
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role
from sagemaker.predictor import json_deserializer

import boto3, csv, io, json
import numpy as np
from scipy.sparse import lil_matrix

In [2]:
nbUsers=943
nbFoodItems=1682
nbFeatures=nbUsers+nbFoodItems

nbRatingsTrain=90570
nbRatingsTest=943*1682

In [3]:
foodItemsByUser={}
for userId in range(nbUsers):
    foodItemsByUser[str(userId)] = []
    
with open('food.base','r') as f:
    samples=csv.reader(f,delimiter='\t')
    for userId,foodItemId,rating in samples:
        foodItemsByUser[str(int(userId)-1)].append(int(foodItemId)-1)

In [4]:
def loadDataset(filename, lines, columns):
    # Features are one-hot encoded in a sparse matrix
    X = lil_matrix((lines, columns)).astype('float32')
    # Labels are stored in a vector
    Y = []
    line=0
    with open(filename,'r') as f:
        samples=csv.reader(f,delimiter='\t')
        for userId,foodItemId,rating in samples:
            X[line,int(userId)-1] = 1
            X[line,int(nbUsers)+int(foodItemId)-1] = 1
            if int(rating) >= 4:
                Y.append(1)
            else:
                Y.append(0)
            line=line+1
            
    Y=np.array(Y).astype('float32')
    return X,Y

In [5]:
X_train, Y_train = loadDataset('food.base', nbRatingsTrain, nbFeatures)
X_test, Y_test = loadDataset('food1.test',nbRatingsTest,nbFeatures)
print("train and test data loaded")

train and test data loaded


In [6]:
print(X_train.shape)
print(Y_train.shape)
assert X_train.shape == (nbRatingsTrain, nbFeatures)
assert Y_train.shape == (nbRatingsTrain, )
zero_labels = np.count_nonzero(Y_train)
print("Training labels: %d zeros, %d ones" % (zero_labels, nbRatingsTrain-zero_labels))

print(X_test.shape)
print(Y_test.shape)
assert X_test.shape  == (nbRatingsTest, nbFeatures)
assert Y_test.shape  == (nbRatingsTest, )
zero_labels = np.count_nonzero(Y_test)
print("Test labels: %d zeros, %d ones" % (zero_labels, nbRatingsTest-zero_labels))

(90570, 2625)
(90570,)
Training labels: 49906 zeros, 40664 ones
(1586126, 2625)
(1586126,)
Test labels: 635777 zeros, 950349 ones


In [8]:
bucket = 'food-recommender'
prefix = 'sagemaker/fm-food'

train_key      = 'train.protobuf'
train_prefix   = '{}/{}'.format(prefix, 'train3')

test_key       = 'test.protobuf'
test_prefix    = '{}/{}'.format(prefix, 'test3')

output_prefix  = 's3://{}/{}/output'.format(bucket, prefix)

In [9]:
def writeDatasetToProtobuf(X, Y, bucket, prefix, key):
    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, X, Y)
    buf.seek(0)
    obj = '{}/{}'.format(prefix, key)
    boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)
    return 's3://{}/{}'.format(bucket,obj)
    
train_data = writeDatasetToProtobuf(X_train, Y_train, bucket, train_prefix, train_key)    
test_data  = writeDatasetToProtobuf(X_test, Y_test, bucket, test_prefix, test_key)    
  
print(train_data)
print(test_data)
print('Output: {}'.format(output_prefix))

s3://food-recommender/sagemaker/fm-food/train3/train.protobuf
s3://food-recommender/sagemaker/fm-food/test3/test.protobuf
Output: s3://food-recommender/sagemaker/fm-food/output


In [12]:
containers = {'us-west-2': '174872318107.dkr.ecr.us-west-2.amazonaws.com/factorization-machines:latest',
              'us-east-1': '382416733822.dkr.ecr.us-east-1.amazonaws.com/factorization-machines:latest',
              'us-east-2': '404615174143.dkr.ecr.us-east-2.amazonaws.com/factorization-machines:latest',
              'eu-west-1': '438346466558.dkr.ecr.eu-west-1.amazonaws.com/factorization-machines:latest',
              'ap-south-1':'991648021394.dkr.ecr.ap-south-1.amazonaws.com/factorization-machines:latest'
             }

In [13]:
fm = sagemaker.estimator.Estimator(containers[boto3.Session().region_name],
'arn:aws:iam::851078371316:role/FoodSageMakerRole', 
                                   train_instance_count=1, 
                                   train_instance_type='ml.c4.xlarge',
                                   output_path=output_prefix,
                                   sagemaker_session=sagemaker.Session())

fm.set_hyperparameters(feature_dim=nbFeatures,
                      predictor_type='binary_classifier',
                      mini_batch_size=1000,
                      num_factors=64,
                      epochs=100)

fm.fit({'train': train_data, 'test': test_data})
#print(fm)

INFO:sagemaker:Creating training-job with name: factorization-machines-2019-03-16-08-30-58-152


2019-03-16 08:30:58 Starting - Starting the training job...
2019-03-16 08:30:59 Starting - Launching requested ML instances......
2019-03-16 08:32:00 Starting - Preparing the instances for training......
2019-03-16 08:33:23 Downloading - Downloading input data..
[31mDocker entrypoint called with argument(s): train[0m
[31m[03/16/2019 08:33:41 INFO 139869249693504] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'factors_lr': u'0.0001', u'linear_init_sigma': u'0.01', u'epochs': 1, u'_wd': u'1.0', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'factors_init_sigma': u'0.001', u'_log_level': u'info', u'bias_init_method': u'normal', u'linear_init_method': u'normal', u'linear_lr': u'0.001', u'factors_init_method': u'normal', u'_tuning_objective_metric': u'', u'bias_wd': u'0.01', u'use_linear': u'true', u'bias_lr': u'0.1', u'mini_batch_size': u'1000', u'_use_full_symbolic': u'true', u'batch_metrics_publish_interval': u

In [14]:
fm_predictor = fm.deploy(instance_type='ml.c4.xlarge', initial_instance_count=1)

INFO:sagemaker:Creating model with name: factorization-machines-2019-03-16-08-35-40-204
INFO:sagemaker:Creating endpoint with name factorization-machines-2019-03-16-08-30-58-152


------------------------------------------------------------------!

In [15]:
def fm_serializer(data):
    js = {'instances': []}
    for row in data:
        js['instances'].append({'features': row.tolist()})
    #print js
    return json.dumps(js)

fm_predictor.content_type = 'application/json'
fm_predictor.serializer = fm_serializer
fm_predictor.deserializer = json_deserializer

In [19]:
resultSet = {
}
#for i in range(0,943*1682-200,200):
for i in range(0,943*1682-200,200):
    result = fm_predictor.predict(X_test[i:i+200].toarray())
    #print(result)


    with open('food1.test') as j, open('productDataJson.json') as productDataJson:
        pdJ = json.load(productDataJson)
        c=0
        for i,j in zip(result['predictions'],j.readlines()[i:i+200]):
            s = dict()
            k = j.split('\t')
            #s['uid'],s['pid'],s['isRecommended'],s['calorificValue'],s['cuisine'] = int(k[0]),int(k[1]),int(i['predicted_label']),pdJ[k[1]]["calorificValue"],pdJ[k[1]]["cuisine"]
            if int(k[0]) not in resultSet:
                resultSet[int(k[0])] = dict()
                #print("true")
            resultSet[int(k[0])][int(k[1])] = dict()
            resultSet[int(k[0])][int(k[1])]['isRecommended'],resultSet[int(k[0])][int(k[1])]['calorificValue'],resultSet[int(k[0])][int(k[1])]['cuisine'] = int(i['predicted_label']),pdJ[k[1]]["calorificValue"],pdJ[k[1]]["cuisine"]

            #resultSet['predictions'].append(s)
            """
            if(c==100):
                break
            c = c+1
            """
    #print(resultSet)
    f = open('predictions1.json','w+')
    f.write(json.dumps(resultSet))
    f.close()