In [43]:
import sagemaker
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role
from sagemaker.serializers import SimpleBaseSerializer,JSONSerializer
from sagemaker.deserializers import JSONDeserializer

import boto3, csv, io, json
import numpy as np
from scipy.sparse import lil_matrix

# Fetch Data

In [44]:
!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -o ml-100k.zip

--2021-09-05 21:55:05--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip.1’


2021-09-05 21:55:06 (5.19 MB/s) - ‘ml-100k.zip.1’ saved [4924029/4924029]

Archive:  ml-100k.zip
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base         
  inflating:

In [46]:
print("Training Data:")
!head -10 ./ml-100k/ua.base

print("\nTesting Data:")
!head -10 ./ml-100k/ua.test

Training Data:
1	1	5	874965758
1	2	3	876893171
1	3	4	878542960
1	4	3	876893119
1	5	3	889751712
1	6	5	887431973
1	7	4	875071561
1	8	1	875072484
1	9	5	878543541
1	10	3	875693118

Testing Data:
1	20	4	887431883
1	33	4	878542699
1	61	4	878542420
1	117	3	874965739
1	155	2	878542201
1	160	4	875072547
1	171	5	889751711
1	189	3	888732928
1	202	5	875072442
1	265	4	878542441


# Load Data


SageMaker's Factorization Machines algorithm expects the data in the following format:
  - A sparse matrix; zero entries are not stored
  - The target variable is the rating the user gave
  - A one-hot encoding of the user (the first $N$ columns)
  - A one-hot encoding of the movie (the next $M$ columns)
  
  

|Rating|User1|User2|...|UserN|Movie1|Movie2|Movie3|...|MovieM|Feature1|Feature2|...|
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|4|1|0|...|0|0|0|0|...|0|20|2.2|...|
|5|0|1|...|0|0|0|0|...|0|17|9.1|...|
|3|0|0|...|0|1|0|0|...|0|3|11.0|...|
|4|0|0|...|0|0|1|0|...|0|15|6.4|...|



In [47]:
def get_samples(csv_reader):
    """Turn the rows of a ratings file into a list of dictionaries.

    Args:
        csv_reader: Iterable yielding rows of exactly four items, in the
            order (userId, movieId, rating, timestamp).

    Returns:
        A list with one dict per row, keyed 'userId', 'movieId', 'rating'
        and 'timestamp'. Values are stored as they were read (no casting).
    """
    field_names = ('userId', 'movieId', 'rating', 'timestamp')
    return [
        dict(zip(field_names, (user, movie, score, stamp)))
        for user, movie, score, stamp in csv_reader
    ]
    
    
def get_maximums(samples):
    """Find the largest user ID and the largest movie ID among the samples.

    Args:
        samples: Iterable of dicts with 'userId' and 'movieId' entries that
            can be converted with int().

    Returns:
        Tuple (max_user_id, max_movie_id) of ints.

    Raises:
        ValueError: If samples is empty (max() of an empty sequence).
    """
    # Convert both IDs of each sample in a single pass.
    id_pairs = [(int(entry['userId']), int(entry['movieId'])) for entry in samples]

    user_ids = [user_id for user_id, _ in id_pairs]
    movie_ids = [movie_id for _, movie_id in id_pairs]

    return max(user_ids), max(movie_ids)


def get_matrix_shape(max_user_id, max_movie_id, samples):
    """Compute the (rows, columns) shape of the one-hot feature matrix.

    Args:
        max_user_id: Largest user ID; width of the user one-hot block.
        max_movie_id: Largest movie ID; width of the movie one-hot block.
        samples: Sized collection of samples; one matrix row per sample.

    Returns:
        Tuple (len(samples), max_user_id + max_movie_id).
    """
    return len(samples), max_user_id + max_movie_id


def fill_data(data, labels, samples, num_users=943):
    """One-hot encode every sample into ``data`` and collect its rating.

    Row ``i`` of ``data`` receives two ones: one in the user block
    (column ``userId - 1``) and one in the movie block
    (column ``num_users + movieId - 1``).

    Args:
        data: 2-D matrix supporting ``data[row, col] = value`` assignment,
            with one row per sample. It is modified in place.
        labels: List that the integer ratings are appended to (it is mutated).
        samples: Iterable of dicts with 'userId', 'movieId' and 'rating'
            entries convertible with int().
        num_users: Width of the user one-hot block, i.e. the column where the
            movie block starts. Defaults to 943, the value that used to be
            hard-coded here, so existing three-argument calls behave as before.

    Returns:
        Tuple ``(data, labels)`` where ``labels`` is a new float32 NumPy array
        built from the (extended) input list.
    """
    for row, sample in enumerate(samples):
        # IDs are 1-based in the file; matrix columns are 0-based.
        user_index = int(sample['userId']) - 1
        movie_index = num_users + int(sample['movieId']) - 1

        data[row, user_index] = 1
        data[row, movie_index] = 1

        # The regression target is the raw rating, not a liked/disliked flag.
        labels.append(int(sample['rating']))

    # Convert the labels list to a float32 array.
    labels = np.array(labels).astype('float32')

    return data, labels


def load_dataset(training_data_file_path, testing_data_file_path):
    """Load the training and testing rating files as sparse one-hot matrices.

    Both files are tab-separated with the columns
    userId, movieId, rating, timestamp.

    The feature dimension is derived from the largest user and movie IDs found
    in *either* file, so a test sample whose ID exceeds every training ID no
    longer causes an out-of-range index while filling the test matrix. Both
    matrices share the same column layout.

    Args:
        training_data_file_path: Path of the training ratings file.
        testing_data_file_path: Path of the testing ratings file.

    Returns:
        ``((training_data, training_labels), (testing_data, testing_labels))``
        where each data item is a float32 ``lil_matrix`` and each labels item
        is a float32 NumPy array.
    """
    # newline='' is what the csv module recommends for files given to csv.reader.
    with open(training_data_file_path, 'r', newline='') as file:
        training_samples = get_samples(csv.reader(file, delimiter='\t'))

    with open(testing_data_file_path, 'r', newline='') as file:
        testing_samples = get_samples(csv.reader(file, delimiter='\t'))

    # Size the one-hot blocks from both splits so they share one layout.
    max_user_id, max_movie_id = get_maximums(training_samples + testing_samples)

    # Training data
    training_matrix_shape = get_matrix_shape(max_user_id, max_movie_id, training_samples)
    training_data = lil_matrix(training_matrix_shape, dtype='float32')
    training_data, training_labels = fill_data(
        training_data, [], training_samples, num_users=max_user_id
    )

    # Testing data
    testing_matrix_shape = get_matrix_shape(max_user_id, max_movie_id, testing_samples)
    testing_data = lil_matrix(testing_matrix_shape, dtype='float32')
    testing_data, testing_labels = fill_data(
        testing_data, [], testing_samples, num_users=max_user_id
    )

    return (training_data, training_labels), (testing_data, testing_labels)

In [48]:
# The "ua" split of MovieLens 100k: ua.base for training, ua.test for testing.
training_data_file_path = './ml-100k/ua.base'
testing_data_file_path = './ml-100k/ua.test'

training_split, testing_split = load_dataset(
    training_data_file_path,
    testing_data_file_path,
)
training_data, training_labels = training_split
testing_data, testing_labels = testing_split

# Summary Statistics

### Shapes

In [49]:
# Report the feature-matrix shape and the label-vector shape of each split,
# training first and testing second.
print("(Ratings, Features)")
for features, targets in (
    (training_data, training_labels),
    (testing_data, testing_labels),
):
    print(features.shape)
    print(targets.shape)

(Ratings, Features)
(90570, 2625)
(90570,)
(9430, 2625)
(9430,)


### Insight

In [50]:
# Peek at rows 10-14 of each split: the non-zero (row, column) entries of the
# one-hot matrix followed by the matching ratings.
preview = slice(10, 15)
for features, targets in (
    (training_data, training_labels),
    (testing_data, testing_labels),
):
    print(features[preview])
    print(targets[preview])

  (0, 0)	1.0
  (0, 953)	1.0
  (1, 0)	1.0
  (1, 954)	1.0
  (2, 0)	1.0
  (2, 955)	1.0
  (3, 0)	1.0
  (3, 956)	1.0
  (4, 0)	1.0
  (4, 957)	1.0
[2. 5. 5. 5. 5.]
  (0, 1)	1.0
  (0, 955)	1.0
  (1, 1)	1.0
  (1, 992)	1.0
  (2, 1)	1.0
  (2, 1193)	1.0
  (3, 1)	1.0
  (3, 1222)	1.0
  (4, 1)	1.0
  (4, 1223)	1.0
[4. 5. 5. 3. 3.]


## Convert to protobuf and save to S3

In [51]:
# S3 layout: everything for this notebook lives under s3://<bucket>/<prefix>/.
bucket = 'andrei-work-eu'  # Put your bucket
prefix = 'temp/movielens'

# Object keys of the serialized training and testing datasets.
training_data_key = f'{prefix}/training-data/training.protobuf'
testing_data_key = f'{prefix}/testing-data/testing.protobuf'

# Location handed to the estimator as its output path.
output_path = f's3://{bucket}/{prefix}/output'

In [52]:
def write_dataset_to_protobuf(data, labels, bucket, key):
    """Serialize a sparse dataset in memory and upload it to S3.

    Args:
        data: Sparse feature matrix passed to
            ``smac.write_spmatrix_to_sparse_tensor``.
        labels: Label array written alongside ``data``.
        bucket: Name of the destination S3 bucket.
        key: Object key inside ``bucket``.

    Returns:
        The ``s3://bucket/key`` URI of the uploaded object.
    """
    stream = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(stream, data, labels)

    # Rewind so the upload starts from the first byte written above.
    stream.seek(0)

    destination = boto3.resource('s3').Object(bucket, key)
    destination.upload_fileobj(stream)

    return f's3://{bucket}/{key}'

In [53]:
# Upload both splits and remember where they ended up.
training_data_location = write_dataset_to_protobuf(
    training_data, training_labels, bucket, training_data_key
)
testing_data_location = write_dataset_to_protobuf(
    testing_data, testing_labels, bucket, testing_data_key
)

for message, location in (
    ('Training data written to: {}', training_data_location),
    ('Testing data written to: {}', testing_data_location),
    ('Output location: {}', output_path),
):
    print(message.format(location))

Training data written to: s3://andrei-work-eu/temp/movielens/training-data/training.protobuf
Testing data written to: s3://andrei-work-eu/temp/movielens/testing-data/testing.protobuf
Output location: s3://andrei-work-eu/temp/movielens/output


## Run training

In [54]:
# One session is shared by the image lookup and the estimator. The algorithm
# image is taken from the session's own region rather than a hard-coded one,
# so the image always matches the region the training job runs in.
sagemaker_session = sagemaker.Session()
image_name = sagemaker.image_uris.retrieve(
    "factorization-machines", sagemaker_session.boto_region_name
)
print("User image :", image_name)

fm = sagemaker.estimator.Estimator(image_name,
                                   get_execution_role(),
                                   instance_count = 1,  # Instance count
                                   instance_type = 'ml.m5.large',
                                   output_path = output_path,
                                   sagemaker_session = sagemaker_session,
                                   use_spot_instances=True, # We want to use spot instances to lower the bill
                                   max_wait=1*3600,  # seconds to wait for spot capacity plus training
                                   max_run=1*3600    # seconds the training itself may take
                                  )

fm.set_hyperparameters(feature_dim = training_data.shape[1],  # number of columns of the one-hot matrix
                       predictor_type = 'regressor', # We want to make regression to predict the rating
                       mini_batch_size = 1000,
                       num_factors = 10, # number of latent factors learned per user and per movie
                       epochs = 200
                      )

# The 'train' and 'test' channels point at the datasets uploaded to S3 earlier.
fm.fit({ 'train': training_data_location, 'test': testing_data_location })

User image : 664544806723.dkr.ecr.eu-central-1.amazonaws.com/factorization-machines:1
2021-09-05 21:59:52 Starting - Starting the training job...
2021-09-05 21:59:54 Starting - Launching requested ML instancesProfilerReport-1630879192: InProgress
...
2021-09-05 22:00:42 Starting - Preparing the instances for training......
2021-09-05 22:01:50 Downloading - Downloading input data...
2021-09-05 22:02:23 Training - Downloading the training image...
2021-09-05 22:02:43 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  from collections import Mapping, MutableMapping, Sequence[0m
  """[0m
  """[0m
[34m[09/05/2021 22:02:40 INFO 140434797135680] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-conf.json: {'epochs': 1, 'mini_batch_size': '1000', 'use_bias': 'true', 'use_linear': 'true', 'bias_lr': '0.1', 

## Deploy

In [13]:
# Host the trained model behind a real-time endpoint backed by a single
# ml.t2.medium instance.
fm_predictor = fm.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
)

-------------------!

In [40]:
class FactorizationMachineSerializer(SimpleBaseSerializer):
    """Serialize feature rows into the JSON request body sent to the endpoint.

    The content type is inherited from SimpleBaseSerializer, which already
    uses "application/json" by default.
    """

    def serialize(self, data):
        """Build the JSON payload for a batch of rows.

        Args:
            data: Iterable of rows, each exposing ``tolist()`` (for example
                the rows of a 2-D NumPy array).

        Returns:
            A JSON string of the form
            ``{"instances": [{"features": [...]}, ...]}``.
        """
        instances = [{"features": row.tolist()} for row in data]
        return json.dumps({"instances": instances})


# Requests are encoded by the custom serializer (which also supplies the
# "application/json" content type, so it is not set on the predictor itself)
# and JSON responses are decoded into Python objects.
fm_predictor.serializer = FactorizationMachineSerializer()
fm_predictor.deserializer = JSONDeserializer()

## Run predictions

In [41]:
# Score one test sample through the deployed predictor and compare the
# predicted rating ("Actual") with the true rating ("Expected").
index = 349

sample_features = testing_data[index].toarray()
result = fm_predictor.predict(sample_features)

predicted_score = result["predictions"][0]["score"]
print("Actual:", predicted_score)
print("Expected:", testing_labels[index])

Actual: 2.8320682048797607
Expected: 3.0


# Run prediction with loaded model

In [66]:

# Call an already-deployed endpoint directly through the low-level runtime
# client instead of a sagemaker Predictor object.
client = boto3.client('runtime.sagemaker')

index = 220

# Dense 1 x feature_dim row, encoded as {"instances": [{"features": [...]}]}.
payload = FactorizationMachineSerializer().serialize(testing_data[index].toarray())

result = client.invoke_endpoint(
    EndpointName='factorization-machines-2021-09-05-21-01-55-437',
    Body=payload,
    ContentType='application/json',
    # Accept must be a MIME type; the response is parsed as JSON below.
    Accept='application/json'
)

# The response body is a stream holding the JSON document with the predictions.
result = json.loads(result['Body'].read().decode())

print("Actual:", result["predictions"][0]["score"])
print("Expected:", testing_labels[index])

Actual: 4.073488712310791
Expected: 4.0
