Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
SPDX-License-Identifier: MIT-0

Permission is hereby granted, free of charge, to any person obtaining a copy of this
software and associated documentation files (the "Software"), to deal in the Software
without restriction, including without limitation the rights to use, copy, modify,
merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Building Amazon Personalize Online Recommendation

## Imports
Python ships with a broad collection of libraries and we need to import those as well as the ones installed to help us like [boto3](https://aws.amazon.com/sdk-for-python/) (AWS SDK for python) and [Pandas](https://pandas.pydata.org/)/[Numpy](https://numpy.org/) which are core data science tools.

In [None]:
# Imports
import boto3
import json
import numpy as np
import pandas as pd
import time
import datetime
from yaml import safe_load
import requests
import random 
from decimal import *
import ast

Next you will want to validate that your environment can communicate successfully with Amazon Personalize, the lines below do just that.

In [None]:
# Configure the SDK to Personalize:
# low-level clients for S3 and for the Personalize control-plane and runtime APIs
s3_client, personalize_client, personalize_runtime = (
    boto3.client(service_name)
    for service_name in ('s3', 'personalize', 'personalize-runtime')
)
# resource-style handles for S3 and DynamoDB
s3_resource, dynamodb_resource = (
    boto3.resource(service_name) for service_name in ('s3', 'dynamodb')
)

## Specify an S3 Bucket and Data Output Location

Amazon Personalize will need an S3 bucket to act as the source of your data. The code below will create a bucket with a unique `bucket_name`.

The Amazon S3 bucket needs to be in the same region as the Amazon Personalize resources. 

In [None]:
# Use the same region as the current Amazon SageMaker notebook: the region is
# the fourth ':'-separated field of the ARN stored in the notebook metadata file.
with open('/opt/ml/metadata/resource-metadata.json') as notebook_info:
    data = json.load(notebook_info)
resource_arn = data['ResourceArn']
region = resource_arn.split(':')[3]
print('region:', region)

# Or you can specify the region where your bucket and model will be domiciled
# region = "us-east-1" 

s3 = boto3.client('s3')
account_id = boto3.client('sts').get_caller_identity().get('Account')
bucket_name = "personalize-demo-{}".format(account_id)
print('bucket_name:', bucket_name)

# A LocationConstraint is only sent for regions other than us-east-1.
create_bucket_kwargs = {'Bucket': bucket_name}
if region != "us-east-1":
    create_bucket_kwargs['CreateBucketConfiguration'] = {'LocationConstraint': region}

try:
    s3.create_bucket(**create_bucket_kwargs)
except s3.exceptions.BucketAlreadyOwnedByYou:
    # Re-running the cell is fine: keep using the bucket we already own.
    print("Bucket already exists. Using bucket", bucket_name)

## Download, Prepare, and Upload Training Data

First we need to download the training data. In this tutorial we'll use the purchase history dataset from a retail store. The dataset contains the user ID, the item ID, the type of interaction between the customer and the item, and the time at which the interaction took place (timestamp).

### Download and Explore and Prepare the Dataset

In [None]:
# Copy the interactions CSV from the retail-demo-store bucket into the current
# working directory (shell command run through the notebook's `!` escape).
# The items.csv download below is left disabled.
# !aws s3 cp s3://retail-demo-store-us-east-1/csvs/items.csv .
!aws s3 cp s3://retail-demo-store-us-east-1/csvs/interactions.csv .

The dataset has been successfully downloaded as `interactions.csv`.

Let's learn more about the dataset by viewing its characteristics.

### Interactions

In [None]:
# Load the downloaded interactions file into a DataFrame.
df = pd.read_csv('./interactions.csv')
# Count rows per event type. NOTE: this is not the last expression of the cell,
# so the notebook does not display the result.
df.EVENT_TYPE.value_counts()
def convert_event_type(event_type_in_some_format):
    """Translate a raw event name into the name used for training.

    "ProductViewed" becomes "View" and "OrderCompleted" becomes "Purchase";
    any other value is returned unchanged.
    """
    renames = (
        ("ProductViewed", "View"),
        ("OrderCompleted", "Purchase"),
    )
    for raw_name, training_name in renames:
        if event_type_in_some_format == raw_name:
            return training_name
    return event_type_in_some_format

# Rename the raw event types to the "View" / "Purchase" names.
df['EVENT_TYPE'] = df['EVENT_TYPE'].apply(convert_event_type)
# Print the counts explicitly: a bare expression in the middle of a cell is
# evaluated but never displayed.
print(df.EVENT_TYPE.value_counts())

# DISCOUNT is not part of the interactions schema defined later, so drop it.
df = df.drop(columns=['DISCOUNT'])
display(df.sample(5))
# index=False keeps the pandas row index out of the file, matching how
# users.csv and items.csv are written; otherwise the CSV gains an extra
# unnamed leading column that the schema does not describe.
df.to_csv("cleaned_training_data.csv", index=False)

#### Users

In [None]:
# One row per distinct user seen in the interactions dataset.
user_ids = df['USER_ID'].unique()
users_df = pd.DataFrame({"USER_ID": user_ids})

# Synthetic GENDER column: a seeded 50/50 draw so reruns give the same file.
np.random.seed(123)
possible_genders = ['female', 'male']
random_ = np.random.choice(possible_genders, size=len(users_df.index), p=[0.5, 0.5])
users_df["GENDER"] = random_

users_df.to_csv("users.csv", index=False)

users_df.head()

#### Items

In [None]:
def put_random_inventory(x):
    """Return a random {store id: stock count} mapping.

    The argument is ignored; it exists so the function can be applied
    element-wise to a DataFrame. Five (store, stock) pairs are drawn from
    "store1".."store10" and 10..100 (steps of 10). A store drawn more than
    once keeps only its last stock value, so the result has 1 to 5 entries.
    """
    store_names = ["store{}".format(number) for number in range(1, 11)]
    stock_levels = np.arange(1, 11) * 10

    inventory = {}
    for _ in range(5):
        # Draw order matters for reproducibility: store first, then stock.
        store_pos = random.randint(0, 9)
        stock_pos = random.randint(0, 9)
        inventory[store_names[store_pos]] = stock_levels[stock_pos]
    return inventory

In [None]:
# Download the product catalogue (YAML) from the retail-demo-store repository
# and flatten it into one row per product.
items_url = 'https://raw.githubusercontent.com/aws-samples/retail-demo-store/master/src/products/src/products-service/data/products.yaml'
raw_txt = requests.get(items_url).content

items_df = pd.json_normalize(safe_load(raw_txt))

# Columns that are not part of the items schema defined later.
drop_col_names = ['current_stock', 'image', 'gender_affinity', 'where_visible', 'featured', 'image_license', 'link', 'aliases']

items_df = items_df.drop(labels=drop_col_names, axis=1)

# Rename the remaining columns to the upper-case names used by the items schema.
items_df = items_df.rename(columns={'id': 'ITEM_ID',
                                    'name': 'NAME',
                                    'category': 'CATEGORY_L1',
                                    'style': 'CATEGORY_L2',
                                    'description': 'PRODUCT_DESCRIPTION',
                                    'price': 'PRICE',
                                    })

# Stamp every item with the current time as whole Unix-epoch seconds (the
# items schema declares CREATION_TIMESTAMP as a long). datetime.timestamp()
# is used instead of strftime('%s'), which is a platform-specific extension
# that is not available everywhere.
ts = int(datetime.datetime.now().timestamp())
items_df["CREATION_TIMESTAMP"] = ts

# Attach a random {store id: stock} dict to every item.
random_store_inv_df = items_df[['ITEM_ID']].applymap(put_random_inventory)
items_df[['STORE_INVENTORY']] = random_store_inv_df

# The set of stores that carry each item.
items_df['STORE_IDS_AVAILABLE'] = items_df['STORE_INVENTORY'].apply(lambda x: set(x.keys()))

# Full version (including per-store stock) kept for local use.
items_df.to_csv("items_with_store_inv.csv", index=False)

# Version to import into Personalize: store ids joined with "|" and the
# inventory dict dropped. Sorting makes the joined string reproducible, since
# set iteration order for strings varies between interpreter runs.
items_df['STORE_IDS_AVAILABLE'] = items_df['STORE_IDS_AVAILABLE'].apply(lambda x: "|".join(sorted(x)))
items_df = items_df.drop(labels=["STORE_INVENTORY"], axis=1)
items_df.to_csv("items.csv", index=False)
items_df.head()

In the cells below, we will copy the prepared data files (`cleaned_training_data.csv`, `users.csv` and `items.csv`) into a `datasets` folder and upload them to S3.

In [None]:
# Copy all the dataset group files into the ./datasets folder
# (created if missing; the originals stay in the working directory).
!mkdir -p datasets
!cp cleaned_training_data.csv ./datasets/cleaned_training_data.csv
!cp users.csv ./datasets/users.csv
!cp items.csv ./datasets/items.csv

In [None]:
# Upload each CSV from the working directory to its own prefix in the bucket
# and remember the resulting s3:// URI for the import jobs.
uploads_bucket = boto3.Session().resource('s3').Bucket(bucket_name)

# Interactions
interactions_file_name = 'cleaned_training_data.csv'
s3_prefix = "dataset/uploads/interaction"
interactions_key = s3_prefix + "/" + interactions_file_name
uploads_bucket.Object(interactions_key).upload_file(interactions_file_name)
interactions_s3DataPath = "s3://{}/{}".format(bucket_name, interactions_key)
print(interactions_s3DataPath)

# Users
users_file_name = 'users.csv'
s3_prefix = "dataset/uploads/users"
users_key = s3_prefix + "/" + users_file_name
uploads_bucket.Object(users_key).upload_file(users_file_name)
users_s3DataPath = "s3://{}/{}".format(bucket_name, users_key)
print(users_s3DataPath)

# Items
items_file_name = 'items.csv'
s3_prefix = "dataset/uploads/items"
items_key = s3_prefix + "/" + items_file_name
uploads_bucket.Object(items_key).upload_file(items_file_name)
items_s3DataPath = "s3://{}/{}".format(bucket_name, items_key)
print(items_s3DataPath)


## Configure an S3 bucket and an IAM role

So far, we have downloaded, manipulated, and saved the data onto the Amazon EBS volume attached to the instance running this Jupyter notebook. However, Amazon Personalize will need an S3 bucket to act as the source of your data, as well as IAM roles for accessing that bucket. Let's set all of that up.


## Set the S3 bucket policy
Amazon Personalize needs to be able to read the contents of your S3 bucket. So add a bucket policy which allows that.

Note: Make sure the role you are using to run the code in this notebook has the necessary permissions to modify the S3 bucket policy.

In [None]:
s3 = boto3.client("s3")

# The policy covers the bucket itself and every object inside it.
bucket_arn = "arn:aws:s3:::{}".format(bucket_name)
objects_arn = "arn:aws:s3:::{}/*".format(bucket_name)

# Allow the Personalize service principal to list the bucket and to read and
# write its objects.
personalize_statement = {
    "Sid": "PersonalizeS3BucketAccessPolicy",
    "Effect": "Allow",
    "Principal": {"Service": "personalize.amazonaws.com"},
    "Action": ["s3:GetObject", "s3:ListBucket", "s3:PutObject"],
    "Resource": [bucket_arn, objects_arn],
}
policy = {
    "Version": "2012-10-17",
    "Id": "PersonalizeS3BucketAccessPolicy",
    "Statement": [personalize_statement],
}

s3.put_bucket_policy(Bucket=bucket_name, Policy=json.dumps(policy))

## Create and Wait for Dataset Group
The largest grouping in Personalize is a Dataset Group, this will isolate your data, event trackers, solutions, Recommenders, and campaigns. Grouping things together that share a common collection of data. Feel free to alter the name below if you'd like.

### Create Dataset Group

In [None]:
# Create the dataset group in the ECOMMERCE domain and keep its ARN for the
# cells that follow.
dataset_group_settings = {
    'name': 'personalize_demo_ecomemerce_dsg_v1',
    'domain': 'ECOMMERCE',
}
response = personalize_client.create_dataset_group(**dataset_group_settings)

dataset_group_arn = response['datasetGroupArn']
print(json.dumps(response, indent=2))

### Wait for Dataset Group to Have ACTIVE Status
Before we can use the Dataset Group in any items below it must be active, execute the cell below and wait for it to show active.

In [None]:
%%time

max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_group_response = personalize_client.describe_dataset_group(
        datasetGroupArn = dataset_group_arn
    )
    status = describe_dataset_group_response["datasetGroup"]["status"]
    print("DatasetGroup: {}".format(status))
    
    if status == "ACTIVE" or status == "CREATE FAILED":
        break
        
    time.sleep(60)

## Create Interactions + User + Items Schema
A core component of how Personalize understands your data comes from the Schema that is defined below. This configuration tells the service how to digest the data provided via your CSV file. Note the columns and types align to what was in the file you created above.

In [None]:
# Avro schema for the interactions CSV: one entry per (column name, Avro type).
interactions_columns = (
    ("USER_ID", "string"),
    ("ITEM_ID", "string"),
    ("TIMESTAMP", "long"),
    ("EVENT_TYPE", "string"),
)
interactions_schema = {
    "type": "record",
    "name": "Interactions",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
        {"name": column_name, "type": column_type}
        for column_name, column_type in interactions_columns
    ],
    "version": "1.0",
}

# Register the schema. Any failure (for example when the cell is re-run) is
# printed rather than raised, in which case interactions_schema_arn is not
# (re)assigned.
try:
    interactions_schema_response = personalize_client.create_schema(
        name="personalize-demo-ecommerce-interaction-schema-v1",
        domain="ECOMMERCE",
        schema=json.dumps(interactions_schema),
    )
    interactions_schema_arn = interactions_schema_response['schemaArn']
    print(json.dumps(interactions_schema_response, indent=2))
except Exception as error:
    print(error)

In [None]:
# Avro schema for users.csv; GENDER is flagged as a categorical field.
users_schema_fields = [
    {"name": "USER_ID", "type": "string"},
    {"name": "GENDER", "type": "string", "categorical": True},
]
users_schema = {
    "type": "record",
    "name": "Users",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": users_schema_fields,
    "version": "1.0",
}

# Register the schema. Any failure (for example when the cell is re-run) is
# printed rather than raised, in which case users_schema_arn is not
# (re)assigned.
try:
    users_schema_response = personalize_client.create_schema(
        name="personalize-demo-ecommerce-users-schema-v1",
        domain="ECOMMERCE",
        schema=json.dumps(users_schema),
    )
    users_schema_arn = users_schema_response['schemaArn']
    print(json.dumps(users_schema_response, indent=2))
except Exception as error:
    print(error)

In [None]:
# Avro schema for items.csv.
items_schema = {
   "type":"record",
   "name":"Items",
   "namespace":"com.amazonaws.personalize.schema",
   "fields":[
      {
         "name":"ITEM_ID",
         "type":"string"
      },
      {
         # items.csv stores several store ids per item joined with "|".
         # The field is marked categorical so each store id is treated as its
         # own value; the filter created later tests
         # `Items.STORE_IDS_AVAILABLE IN ($STORE_ID)` against one store id.
         # NOTE(review): this also makes the field a categorical training
         # feature — confirm that is acceptable.
         "name":"STORE_IDS_AVAILABLE",
         "type":"string",
         "categorical":True
      },
      {
         "name":"NAME",
         "type":"string"
      },
      {
         "name":"CATEGORY_L1",
         "type":[
            "string"
         ],
         "categorical":True
      },
      {
         "name":"CATEGORY_L2",
         "type":[
            "string"
         ],
         "categorical":True
      },
      {
         # Free-text field; nullable.
         "name":"PRODUCT_DESCRIPTION",
         "type": [
            "null",
            "string"
          ],
          "textual": True
        },
      {
         "name":"PRICE",
         "type":"float"
      },
      {
         "name":"CREATION_TIMESTAMP",
         "type":"long"
      }
   ],
   "version":"1.0"
}

# Register the schema. Any failure (for example when the cell is re-run and the
# schema name already exists) is printed rather than raised, in which case
# items_schema_arn is not (re)assigned. A schema that already exists under this
# name is NOT updated by re-running the cell.
try:
    items_schema_response = personalize_client.create_schema(
        name = "personalize-demo-ecommerce-items-schema-v1",
        domain = "ECOMMERCE",
        schema = json.dumps(items_schema)
    )

    items_schema_arn = items_schema_response['schemaArn']
    print(json.dumps(items_schema_response, indent=2))
except Exception as e:
    print(e)

## Create Datasets
After the group, the next thing to create is the actual datasets.

### Create Interactions Dataset

In [None]:
# Create the INTERACTIONS dataset inside the dataset group. Failures are
# printed, not raised, and then interactions_dataset_arn is not (re)assigned.
dataset_type = "INTERACTIONS"
try:
    interactions_dataset_settings = {
        "name": "personalize_demo_ecommerce_interactions_v1",
        "datasetType": dataset_type,
        "datasetGroupArn": dataset_group_arn,
        "schemaArn": interactions_schema_arn,
    }
    create_dataset_response = personalize_client.create_dataset(**interactions_dataset_settings)
    interactions_dataset_arn = create_dataset_response['datasetArn']
    print(json.dumps(create_dataset_response, indent=2))
except Exception as error:
    print(error)

In [None]:
# Create the USERS dataset inside the dataset group. Failures are printed,
# not raised, and then users_dataset_arn is not (re)assigned.
dataset_type = "USERS"
try:
    users_dataset_settings = {
        "name": "personalize_demo_ecommerce_users_v1",
        "datasetType": dataset_type,
        "datasetGroupArn": dataset_group_arn,
        "schemaArn": users_schema_arn,
    }
    create_dataset_response = personalize_client.create_dataset(**users_dataset_settings)
    users_dataset_arn = create_dataset_response['datasetArn']
    print(json.dumps(create_dataset_response, indent=2))
except Exception as error:
    print(error)

In [None]:
# Create the ITEMS dataset inside the dataset group. Failures are printed,
# not raised, and then items_dataset_arn is not (re)assigned.
dataset_type = "ITEMS"
try:
    items_dataset_settings = {
        "name": "personalize_demo_ecommerce_items_v1",
        "datasetType": dataset_type,
        "datasetGroupArn": dataset_group_arn,
        "schemaArn": items_schema_arn,
    }
    create_dataset_response = personalize_client.create_dataset(**items_dataset_settings)
    items_dataset_arn = create_dataset_response['datasetArn']
    print(json.dumps(create_dataset_response, indent=2))
except Exception as error:
    print(error)

## Create Personalize Role
Also Amazon Personalize needs the ability to assume Roles in AWS in order to have the permissions to execute certain tasks, the lines below grant that.

Note: Make sure the role you are using to run the code in this notebook has the necessary permissions to create a role.

In [None]:
iam = boto3.client("iam")

role_name = "PersonalizeDemoRoleEcommerceRecommender"
# Trust policy: lets the Personalize service assume this role.
assume_role_policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
          "Effect": "Allow",
          "Principal": {
            "Service": "personalize.amazonaws.com"
          },
          "Action": "sts:AssumeRole"
        }
    ]
}

try:
    create_role_response = iam.create_role(
        RoleName = role_name,
        AssumeRolePolicyDocument = json.dumps(assume_role_policy_document)
    )
except iam.exceptions.EntityAlreadyExistsException:
    # Re-running the cell (or a previous run of this notebook) already created
    # the role: reuse it instead of failing and leaving role_arn undefined.
    # get_role returns the same {"Role": {"Arn": ...}} shape read below.
    print("Role already exists. Using role", role_name)
    create_role_response = iam.get_role(RoleName = role_name)

# AmazonPersonalizeFullAccess provides access to any S3 bucket with a name that includes "personalize" or "Personalize" 
# if you would like to use a bucket with a different name, please consider creating and attaching a new policy
# that provides read access to your bucket or attaching the AmazonS3ReadOnlyAccess policy to the role
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonPersonalizeFullAccess"
iam.attach_role_policy(
    RoleName = role_name,
    PolicyArn = policy_arn
)

# Now add S3 support
# NOTE(review): AmazonS3FullAccess grants access to every bucket in the
# account; consider a policy scoped to the demo bucket outside of a demo.
iam.attach_role_policy(
    PolicyArn='arn:aws:iam::aws:policy/AmazonS3FullAccess',
    RoleName=role_name
)
time.sleep(60) # wait for a minute to allow IAM role policy attachment to propagate

role_arn = create_role_response["Role"]["Arn"]
print(role_arn)

## Import the data
Earlier you created the DatasetGroup and Dataset to house your information, now you will execute an import job that will load the data from S3 into Amazon Personalize for usage building your model.
### Create Interactions + Users + Items Dataset Import Job

In [None]:
# Start the import of the interactions CSV from S3 into its dataset, using
# the role created above.
interactions_import_settings = {
    "jobName": "personalize_demo_ecommerce_interactions_import_v1",
    "datasetArn": interactions_dataset_arn,
    "dataSource": {"dataLocation": interactions_s3DataPath},
    "roleArn": role_arn,
}
create_interactions_dataset_import_job_response = personalize_client.create_dataset_import_job(
    **interactions_import_settings
)

dataset_interactions_import_job_arn = create_interactions_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_interactions_dataset_import_job_response, indent=2))

In [None]:
# Start the import of the users CSV from S3 into its dataset, using the role
# created above.
users_import_settings = {
    "jobName": "personalize_demo_ecommerce_users_import_v1",
    "datasetArn": users_dataset_arn,
    "dataSource": {"dataLocation": users_s3DataPath},
    "roleArn": role_arn,
}
create_users_dataset_import_job_response = personalize_client.create_dataset_import_job(
    **users_import_settings
)

dataset_users_import_job_arn = create_users_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_users_dataset_import_job_response, indent=2))

In [None]:
# Start the import of the items CSV from S3 into its dataset, using the role
# created above.
items_import_settings = {
    "jobName": "personalize_demo_ecommerce_items_import_v1",
    "datasetArn": items_dataset_arn,
    "dataSource": {"dataLocation": items_s3DataPath},
    "roleArn": role_arn,
}
create_items_dataset_import_job_response = personalize_client.create_dataset_import_job(
    **items_import_settings
)

dataset_items_import_job_arn = create_items_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_items_dataset_import_job_response, indent=2))

### Wait for Dataset Import Job to Have ACTIVE Status
It can take a while before the import job completes, please wait until you see that it is active below.

In [None]:
%%time
    
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_import_job_response = personalize_client.describe_dataset_import_job(
        datasetImportJobArn = dataset_interactions_import_job_arn
    )
    status = describe_dataset_import_job_response["datasetImportJob"]['status']
    print("Interactions DatasetImportJob: {}".format(status))
    
    if status == "ACTIVE" or status == "CREATE FAILED":
        break
        
    time.sleep(60)
    
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_import_job_response = personalize_client.describe_dataset_import_job(
        datasetImportJobArn = dataset_users_import_job_arn
    )
    status = describe_dataset_import_job_response["datasetImportJob"]['status']
    print("Users DatasetImportJob: {}".format(status))
    
    if status == "ACTIVE" or status == "CREATE FAILED":
        break
        
    time.sleep(60)
    
    
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_import_job_response = personalize_client.describe_dataset_import_job(
        datasetImportJobArn = dataset_items_import_job_arn
    )
    status = describe_dataset_import_job_response["datasetImportJob"]['status']
    print("Items DatasetImportJob: {}".format(status))
    
    if status == "ACTIVE" or status == "CREATE FAILED":
        break
        
    time.sleep(60)

# Train and Deploy Models

In [None]:
# Recipe ARNs offered for the ECOMMERCE domain.
available_recipes = personalize_client.list_recipes(domain='ECOMMERCE') # See a list of recommenders for the domain. 
for recipe in available_recipes['recipes']:
    recipe_arn = recipe['recipeArn']
    print(recipe_arn)

print("***************************")

# Recipe ARNs returned when no domain is given.
personalize_recipes = personalize_client.list_recipes()
for recipe in personalize_recipes['recipes']:
    recipe_arn = recipe['recipeArn']
    print(recipe_arn)

We are going to create a recommender of the type "Frequently bought together". This recommender gives recommendations for items that customers frequently buy together with an item that you specify. With this use case, Amazon Personalize automatically finds related items based on the item ID that you specify and `Purchase` events.

In [None]:
# ARNs of the recipes used in this notebook; they all share one prefix.
recipe_arn_prefix = 'arn:aws:personalize:::recipe/'
frequently_bought_together_recipe_arn = recipe_arn_prefix + 'aws-ecomm-frequently-bought-together'
recommended_for_you_recipe_arn = recipe_arn_prefix + 'aws-ecomm-recommended-for-you'
sims_recipe_arn = recipe_arn_prefix + 'aws-similar-items'
rerank_recipe_arn = recipe_arn_prefix + 'aws-personalized-ranking'

## 1: AWS Similar Items

#### a) Create the solution

As with the previous solution, start by creating the solution first. Although you provide the dataset ARN in this step, the model is not yet trained. See this as an identifier instead of a trained model.

In [None]:
# Create the similar-items solution; nothing is trained until a solution
# version is created.
sim_solution_settings = {
    "name": "personalize-demo-sim-v1",
    "datasetGroupArn": dataset_group_arn,
    "recipeArn": sims_recipe_arn,
}
sim_solution_response = personalize_client.create_solution(**sim_solution_settings)

sim_solution_arn = sim_solution_response['solutionArn']
print(json.dumps(sim_solution_response, indent=2))

#### b) Create the solution version

Once you have a solution, you need to create a version in order to complete the model training. The training can take a while to complete, upwards of 25 minutes, and an average of 35 minutes for this recipe with our dataset. Normally, we would use a while loop to poll until the task is completed. However the task would block other cells from executing, and the goal here is to create many models and deploy them quickly. So we will set up the while loop for all of the solutions further down in the notebook. There, you will also find instructions for viewing the progress in the AWS console.

In [None]:
# Request a solution version (the training run) for the similar-items solution.
sim_solution_version_response = personalize_client.create_solution_version(solutionArn=sim_solution_arn)

In [None]:
# Show the raw response, then keep the ARN used to poll the training status.
print(json.dumps(sim_solution_version_response, indent=2))
sim_solution_version_arn = sim_solution_version_response['solutionVersionArn']

In [None]:
# Solution versions whose training has not reached a terminal status yet.
in_progress_solution_versions = [
    sim_solution_version_arn
]

# Poll once a minute, for at most three hours, until every version has either
# succeeded or failed.
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    # Iterate over a snapshot: removing entries from the list that is being
    # iterated would skip the element following each removal.
    for solution_version_arn in list(in_progress_solution_versions):
        version_response = personalize_client.describe_solution_version(
            solutionVersionArn = solution_version_arn
        )
        status = version_response["solutionVersion"]["status"]
        
        if status == "ACTIVE":
            print("Build succeeded for {}".format(solution_version_arn))
            in_progress_solution_versions.remove(solution_version_arn)
        elif status == "CREATE FAILED":
            print("Build failed for {}".format(solution_version_arn))
            in_progress_solution_versions.remove(solution_version_arn)
    
    if not in_progress_solution_versions:
        break
    else:
        print("At least one solution build is still in progress")
        
    time.sleep(60)

## 2: Frequently Bought Together Items

In [None]:
# Create the "frequently bought together" recommender in the dataset group.
fbt_recommender_settings = {
    "name": 'personalize-demo-frequently-bought-together-v1',
    "recipeArn": frequently_bought_together_recipe_arn,
    "datasetGroupArn": dataset_group_arn,
}
create_recommender_response = personalize_client.create_recommender(**fbt_recommender_settings)
frequently_bought_together_arn = create_recommender_response["recommenderArn"]
print(json.dumps(create_recommender_response, indent=2))

In [None]:
%%time

max_time = time.time() + 10*60*60 # 10 hours

while time.time() < max_time:

    version_response = personalize_client.describe_recommender(
        recommenderArn = frequently_bought_together_arn
    )
    status = version_response["recommender"]["status"]

    if status == "ACTIVE":
        print("Build succeeded for {}".format(frequently_bought_together_arn))
        
    elif status == "CREATE FAILED":
        print("Build failed for {}".format(frequently_bought_together_arn))
        

    if status == "ACTIVE" or status == "CREATE FAILED":
        break
    else:
        print('The "Customers who viewed X also viewed" Recommender build is still in progress')
        
    time.sleep(60)

## 3: Personalized Ranking

### Personalized Ranking

Personalized Ranking is an interesting application of HRNN. Instead of just recommending what is most probable for the user in question, this algorithm takes in a user and a list of items as well. The items are then rendered back in the order of most probable relevance for the user. The use case here is for filtering on genre for example, or when you have a broad collection that you would like better ordered for a particular user.

For our use case, using the retail store data, we could imagine that a particular brand is paying us to recommend its products to our users in a special promotion. Therefore, we know the list of products we want to recommend, but we want to find out which of these products each user will like most. We would use personalized ranking to re-order the list of products for each user, based on their previous interaction history.

#### a) Create the solution

As with the previous solution, start by creating the solution first. Although you provide the dataset ARN in this step, the model is not yet trained. See this as an identifier instead of a trained model.

In [None]:
# Create the personalized-ranking solution; nothing is trained until a
# solution version is created.
rerank_solution_settings = {
    "name": "personalize-demo-ranking-v1",
    "datasetGroupArn": dataset_group_arn,
    "recipeArn": rerank_recipe_arn,
}
rerank_create_solution_response = personalize_client.create_solution(**rerank_solution_settings)

rerank_solution_arn = rerank_create_solution_response['solutionArn']
print(json.dumps(rerank_create_solution_response, indent=2))

#### b) Create the solution version

In [None]:
# Request a solution version (the training run) for the ranking solution.
rerank_create_solution_version_response = personalize_client.create_solution_version(solutionArn=rerank_solution_arn)

In [None]:
# Show the raw response, then keep the ARN used to poll the training status
# and to deploy the campaign.
print(json.dumps(rerank_create_solution_version_response, indent=2))
rerank_solution_version_arn = rerank_create_solution_version_response['solutionVersionArn']

In [None]:
# Solution versions whose training has not reached a terminal status yet.
in_progress_solution_versions = [
    rerank_solution_version_arn
]

# Poll once a minute, for at most three hours, until every version has either
# succeeded or failed.
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    # Iterate over a snapshot: removing entries from the list that is being
    # iterated would skip the element following each removal.
    for solution_version_arn in list(in_progress_solution_versions):
        version_response = personalize_client.describe_solution_version(
            solutionVersionArn = solution_version_arn
        )
        status = version_response["solutionVersion"]["status"]
        
        if status == "ACTIVE":
            print("Build succeeded for {}".format(solution_version_arn))
            in_progress_solution_versions.remove(solution_version_arn)
        elif status == "CREATE FAILED":
            print("Build failed for {}".format(solution_version_arn))
            in_progress_solution_versions.remove(solution_version_arn)
    
    if not in_progress_solution_versions:
        break
    else:
        print("At least one solution build is still in progress")
        
    time.sleep(60)

#### c) Create campaigns <a class="anchor" id="create"></a>
[Back to top](#top)

A campaign is a hosted solution version; an endpoint which you can query for recommendations. Pricing is set by estimating throughput capacity (requests from users for personalization per second). When deploying a campaign, you set a minimum throughput per second (TPS) value. This service, like many within AWS, will automatically scale based on demand, but if latency is critical, you may want to provision ahead for larger demand. For this POC and demo, all minimum throughput thresholds are set to 1. For more information, see the [pricing page](https://aws.amazon.com/personalize/pricing/).

Let's start deploying the campaigns.

In [None]:
# Deploy the trained ranking solution version as a campaign with the minimum
# provisioned throughput of 1 TPS.
rerank_campaign_settings = {
    "name": "personalize-demo-campaign-rerank-v1",
    "solutionVersionArn": rerank_solution_version_arn,
    "minProvisionedTPS": 1,
}
rerank_create_campaign_response = personalize_client.create_campaign(**rerank_campaign_settings)

rerank_campaign_arn = rerank_create_campaign_response['campaignArn']
print(json.dumps(rerank_create_campaign_response, indent=2))

#### View campaign creation status

As promised, how to view the status updates in the console:

* In another browser tab you should already have the AWS Console up from opening this notebook instance. 
* Switch to that tab and search at the top for the service `Personalize`, then go to that service page. 
* Click `View dataset groups`.
* Click the name of your dataset group (`personalize_demo_ecomemerce_dsg_v1`, unless you changed it above).
* Click `Campaigns`.
* You will now see a list of all of the campaigns you created above, including a column with the status of the campaign. Once it is `Active`, your campaign is ready to be queried.

Or simply run the cell below to keep track of the campaign creation status.

In [None]:
# Campaigns whose deployment has not reached a terminal status yet.
in_progress_campaigns = [
    rerank_campaign_arn
]

# Poll once a minute, for at most three hours, until every campaign has either
# become active or failed.
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    # Iterate over a snapshot: removing entries from the list that is being
    # iterated would skip the element following each removal.
    for campaign_arn in list(in_progress_campaigns):
        version_response = personalize_client.describe_campaign(
            campaignArn = campaign_arn
        )
        status = version_response["campaign"]["status"]
        
        if status == "ACTIVE":
            print("Build succeeded for {}".format(campaign_arn))
            in_progress_campaigns.remove(campaign_arn)
        elif status == "CREATE FAILED":
            print("Build failed for {}".format(campaign_arn))
            in_progress_campaigns.remove(campaign_arn)
    
    if not in_progress_campaigns:
        break
    else:
        print("At least one campaign build is still in progress")
        
    time.sleep(60)

## 4: Recommended For you

In [None]:
# Create the "recommended for you" recommender in the dataset group.
rfy_recommender_settings = {
    "name": 'personalize-demo-recommended-for-you-v1',
    "recipeArn": recommended_for_you_recipe_arn,
    "datasetGroupArn": dataset_group_arn,
}
create_recommender_response = personalize_client.create_recommender(**rfy_recommender_settings)
recommended_for_you_arn = create_recommender_response["recommenderArn"]
print(json.dumps(create_recommender_response, indent=2))

In [None]:
%%time

max_time = time.time() + 10*60*60 # 10 hours

while time.time() < max_time:

    version_response = personalize_client.describe_recommender(
        recommenderArn = recommended_for_you_arn
    )
    status = version_response["recommender"]["status"]

    if status == "ACTIVE":
        print("Build succeeded for {}".format(recommended_for_you_arn))
        
    elif status == "CREATE FAILED":
        print("Build failed for {}".format(recommended_for_you_arn))
        

    if status == "ACTIVE" or status == "CREATE FAILED":
        break
    else:
        print('The "Customers who viewed X also viewed" Recommender build is still in progress')
        
    time.sleep(60)

# Create Filter

In [None]:
# Filter that keeps only items whose STORE_IDS_AVAILABLE matches the store id
# supplied at request time through the $STORE_ID placeholder.
# Alternative expression, filtering on category instead:
#   'INCLUDE ItemID WHERE Items.CATEGORY_L1 = $category_l1'
available_items_filter_settings = {
    "name": 'available_items_filter',
    "datasetGroupArn": dataset_group_arn,
    "filterExpression": 'INCLUDE ItemID WHERE Items.STORE_IDS_AVAILABLE IN ($STORE_ID)',
}
response = personalize_client.create_filter(**available_items_filter_settings)
filter_arn = response["filterArn"]
print("Filter ARN: " + filter_arn)

In [None]:
%%time

# Poll the filter every 10 seconds until it reaches a terminal status
# (ACTIVE or CREATE FAILED), giving up after 10 hours.
max_time = time.time() + 10*60*60 # 10 hours

while time.time() < max_time:

    version_response = personalize_client.describe_filter(filterArn= filter_arn)
    status = version_response["filter"]["status"]

    if status == "ACTIVE":
        print("Build succeeded for {}".format(filter_arn))
        break

    if status == "CREATE FAILED":
        # Previously printed the garbled text "F failed for ...".
        print("Build failed for {}".format(filter_arn))
        break

    print('The "available_items_filter" Filter build is still in progress')

    time.sleep(10)
else:
    # Reached only when the deadline passes without a terminal status.
    print("Timed out waiting for {}".format(filter_arn))

In [None]:
# Fixed demo inputs: one user and one seed item.
test_user_id = "777"
test_item_id = "8fbe091c-f73c-4727-8fe7-d27eabd17bea"

# Promotion settings: 50 percent of the results are promoted items selected
# through the filter created above, with STORE_ID bound to "store1".
store_promotion = {
    "name" : "Seasonal-Items-Promotion",
    "percentPromotedItems" : 50,
    "filterArn": filter_arn,
    "filterValues": {"STORE_ID": json.dumps("store1")},
}

# Ask the recommender for 10 items related to the seed item for this user.
get_recommendations_response = personalize_runtime.get_recommendations(
    recommenderArn=frequently_bought_together_arn,
    itemId=test_item_id,
    userId=test_user_id,
    numResults=10,
    promotions=[store_promotion],
)

# Print every returned item ID; promoted items also show their promotion name.
item_list = get_recommendations_response['itemList']

for item in item_list:
    if "promotionName" not in item:
        print(item['itemId'])
        continue
    print(item['itemId'] , "   ", item['promotionName'])

In [None]:
user_id = "777"

# Candidate items to be re-ranked for the user.
rerank_item_list = [
    '0790267c-c708-424d-81f5-46903a9c8444',
    '575c0ac0-5494-4c64-a886-a9c0cf8b779a',
    '4cf78f85-4200-469c-b7b9-05c93770bf44',
    'b20ba076-58a7-4602-9b56-4bee46e98388',
    'aff05423-76e8-4339-a478-fc17d51ed985',
    'a6432520-a9fe-42a3-8e04-58cd50d18fb0',
    '4a43c5f7-090c-4cce-93fe-36062539ec38',
    'f1e0660b-53db-4e9a-a86a-8a990d6b2988',
]

# Value bound to the $STORE_ID placeholder of the filter expression.
store_filter_values = {"STORE_ID": json.dumps("store1")}

# Rank the candidates for the user through the re-ranking campaign, applying
# the store filter to the request.
get_recommendations_response_rerank = personalize_runtime.get_personalized_ranking(
    campaignArn=rerank_campaign_arn,
    userId=user_id,
    inputList=rerank_item_list,
    filterArn=filter_arn,
    filterValues=store_filter_values,
)

get_recommendations_response_rerank

In [None]:
# Demo user for both requests below.
user_id = "777"

print(filter_arn)

# Request 1: use the filter as a promotion, so 50 percent of the 10 results
# are promoted items selected by the filter for store "store1".
store_promotion = {
    "name" : "Seasonal-Items-Promotion",
    "percentPromotedItems" : 50,
    "filterArn": filter_arn,
    "filterValues": {"STORE_ID": json.dumps("store1")},
}
get_recommendations_response = personalize_runtime.get_recommendations(
    recommenderArn=recommended_for_you_arn,
    userId=user_id,
    numResults=10,
    promotions=[store_promotion],
)

print(json.dumps(get_recommendations_response['itemList'], indent=2))

# Request 2: apply the same filter to the whole result list instead.
get_recommendations_response = personalize_runtime.get_recommendations(
    recommenderArn=recommended_for_you_arn,
    userId=user_id,
    numResults=10,
    filterArn=filter_arn,
    filterValues={"STORE_ID": json.dumps("store1")},
)

print(json.dumps(get_recommendations_response['itemList'], indent=2))

## Notes for the Next Notebook:

There are a few values you will need for the next notebook, execute the cells below to store them so they can be copied and pasted into the next part of the exercise.


In [None]:
# Save dataset_group_arn with the IPython %store magic for the next notebook.
%store dataset_group_arn

In [None]:
# Save region with the IPython %store magic for the next notebook.
%store region

In [None]:
# Save role_name with the IPython %store magic for the next notebook.
%store role_name

# Getting recommendations

### a) AWS Similar Items

In [None]:
# Load the item data so that item IDs returned by Personalize can be mapped
# back to readable item details in the cells below.
items_df = pd.read_csv('./items.csv')

num_of_items = len(items_df.index)

# Sampling as many rows as the frame holds yields every item ID exactly once,
# in shuffled order.
shuffled_items = items_df.sample(n=num_of_items)
item_list = shuffled_items['ITEM_ID'].tolist()
print(len(item_list))


In [None]:
# Write the batch-inference input file: one JSON object per line, each of the
# form {"itemId": "<id>"}.
json_input_filename = "items.jsonl"
with open(json_input_filename, 'w') as json_input:
    for item in item_list:
        # json.dumps escapes quotes/backslashes inside the ID, and str() keeps
        # non-string IDs from raising; for plain string IDs the line written
        # is identical to the previous hand-built one.
        json_input.write(json.dumps({"itemId": str(item)}) + "\n")

In [None]:
# Upload the JSONL input file under the batch prefix and build the S3 URIs
# used as input and output locations by the batch inference job.
s3_prefix_batch = "dataset/uploads/items"
s3_ip_val = "/".join([s3_prefix_batch, json_input_filename])

input_object = boto3.Session().resource('s3').Bucket(bucket_name).Object(s3_ip_val)
input_object.upload_file(json_input_filename)

s3_input_path = "s3://{}/{}".format(bucket_name, s3_ip_val)
print(s3_input_path)

s3_op_val = "batched_output"
s3_output_path = "s3://{}/{}/".format(bucket_name, s3_op_val)
print(s3_output_path)

In [None]:
# Start a batch inference job for the similar-items solution version, reading
# the uploaded JSONL file and writing results under the output prefix.
batch_job_settings = dict(
    solutionVersionArn=sim_solution_version_arn,
    jobName="sim-itesm-batch-inference-0",
    roleArn=role_arn,
    jobInput={"s3DataSource": {"path": s3_input_path}},
    jobOutput={"s3DataDestination": {"path": s3_output_path}},
)
batchInferenceJobResponse = personalize_client.create_batch_inference_job(**batch_job_settings)

# ARN polled by the next cell.
batchInferenceJobArn = batchInferenceJobResponse['batchInferenceJobArn']

In [None]:
# NOTE: this import rebinds the notebook-level name `datetime` from the module
# (imported at the top of the notebook) to the class. It is left in place so
# that cells run after this one see the same binding as before.
from datetime import datetime
current_time = datetime.now()
print("Batch inference started on: ", current_time.strftime("%I:%M:%S %p"))

# Poll the batch inference job once a minute until it reaches a terminal
# status (ACTIVE or CREATE FAILED), giving up after 3 hours.
max_time = time.time() + 3*60*60 # 3 hours
status = None
while time.time() < max_time:
    describe_dataset_inference_job_response = personalize_client.describe_batch_inference_job(
        batchInferenceJobArn = batchInferenceJobArn
    )
    status = describe_dataset_inference_job_response["batchInferenceJob"]['status']
    print("BatchInferenceJob: {}".format(status))

    if status == "ACTIVE" or status == "CREATE FAILED":
        break

    time.sleep(60)

# Report the real outcome instead of always claiming completion.
current_time = datetime.now()
if status == "ACTIVE":
    print("Batch inference completed on: ", current_time.strftime("%I:%M:%S %p"))
elif status == "CREATE FAILED":
    print("Batch inference failed on: ", current_time.strftime("%I:%M:%S %p"))
else:
    print("Stopped waiting for batch inference on: ", current_time.strftime("%I:%M:%S %p"))

### b) Frequently Bought Together 
Now that the recommenders have been trained, let's have a look at the recommendations we can get for our users!

In [None]:
def get_item_by_id(item_id, item_df):
    """
    Return the PRODUCT_DESCRIPTION of the first row in `item_df` whose
    ITEM_ID equals `item_id`.

    `item_id` is converted to a string before the comparison, so the
    ITEM_ID column is expected to hold strings.

    The lookup is best-effort: if the expected columns are missing or no
    row matches, the offending `item_id` is printed and a placeholder
    message is returned instead of raising.

    Parameters:
        item_id: item identifier, e.g. taken from a recommendation response.
        item_df: DataFrame with ITEM_ID and PRODUCT_DESCRIPTION columns.

    Returns:
        The product description, or "Error obtaining item description".
    """
    try:
        # Use the DataFrame that was passed in rather than a notebook global.
        matches = item_df.loc[item_df["ITEM_ID"] == str(item_id), "PRODUCT_DESCRIPTION"]
        return matches.values[0]
    except (KeyError, IndexError):
        # KeyError: a required column is absent; IndexError: no matching row.
        print (item_id)
        return "Error obtaining item description"

Let us get some recommendations using the "Frequently bought together" Recommender:

In [None]:
# Fixed demo inputs: one user and one seed item.
test_user_id = "777"
test_item_id = "8fbe091c-f73c-4727-8fe7-d27eabd17bea"

# Ask the recommender for 10 items related to the seed item for this user.
get_recommendations_response = personalize_runtime.get_recommendations(
    recommenderArn=frequently_bought_together_arn,
    itemId=test_item_id,
    userId=test_user_id,
    numResults=10,
)

# Translate each recommended item ID into its product description.
item_list = get_recommendations_response['itemList']
recommendation_list = [
    get_item_by_id(recommended['itemId'], items_df) for recommended in item_list
]

# Single-column frame whose header is the seed item's description.
seed_description = get_item_by_id(test_item_id, items_df)
user_recommendations_df = pd.DataFrame(recommendation_list, columns=[seed_description])

pd.options.display.max_rows = 10
display(user_recommendations_df)

### c) Personalized Ranking
The core use case for personalized ranking is to take a collection of items and to render them in priority or probable order of interest for a user. To demonstrate this, we will need a random user and a random collection of 10 items.

In [None]:
# Pick one random user and ten random items as input for the re-ranking demo.
df = pd.read_csv("interactions.csv")
# Sample an actual user ID. The previous code took the sampled row's index,
# which is just a row number of the interactions file, not a user.
# NOTE(review): assumes the interactions file has a USER_ID column, as an
# Amazon Personalize interactions dataset does - confirm against the CSV.
rerank_user = df['USER_ID'].sample(1).tolist()[0]
rerank_items = items_df.sample(10)['ITEM_ID'].tolist()

Now build a nice dataframe that shows the input data.

In [None]:
# Show the candidate items, in their original order, as product descriptions
# in a single column headed by the selected user.
rerank_list = [get_item_by_id(candidate, items_df) for candidate in rerank_items]
rerank_df = pd.DataFrame(rerank_list, columns=[rerank_user])
rerank_df

Then make the personalized ranking API call.

In [None]:
# The API call below is given the user and every candidate item as strings.
user_id = str(rerank_user)
rerank_item_list = [str(candidate) for candidate in rerank_items]

# Ask the re-ranking campaign to order the candidates for this user.
ranking_request = dict(
    campaignArn=rerank_campaign_arn,
    userId=user_id,
    inputList=rerank_item_list,
)
get_recommendations_response_rerank = personalize_runtime.get_personalized_ranking(**ranking_request)

get_recommendations_response_rerank

Now add the reranked items as a second column to the original dataframe, for a side-by-side comparison.

In [None]:
# Turn the ranked item IDs into descriptions and place them next to the
# original ordering as a 'Re-Ranked' column.
item_list = get_recommendations_response_rerank['personalizedRanking']
ranked_list = [get_item_by_id(ranked['itemId'], items_df) for ranked in item_list]
ranked_df = pd.DataFrame(ranked_list, columns=['Re-Ranked'])
rerank_df = pd.concat([rerank_df, ranked_df], axis=1)
rerank_df

You can see above how each entry was re-ordered based on the model's understanding of the user. This is a popular task when you have a collection of items to surface to a user, a list of promotions for example, or if you are filtering on a category and want to show the most likely good items.

### d) Recommended For you

In [None]:
# Demo user for the "Recommended for you" recommender.
user_id = "777"

# Request 10 recommendations for the user and display the returned items.
recommended_for_you_request = dict(
    recommenderArn=recommended_for_you_arn,
    userId=user_id,
    numResults=10,
)
get_recommendations_response = personalize_runtime.get_recommendations(**recommended_for_you_request)
get_recommendations_response['itemList']

## Put Event

In [None]:
import uuid
import time

# Client used to send interaction events to Amazon Personalize.
personalize_events = boto3.client('personalize-events')

# Demo identifiers for the event sent further below; the session ID is a
# freshly generated UUID rendered as a string.
user_id, item_id = '4353', '6579c22f-be2b-444c-a52b-0116dd82df6c'
session_id = str(uuid.uuid1())

def create_event_tracker(event_tracker_name, dataset_group_arn):
    """
    Create an event tracker with the given name in the given dataset group.

    Returns the raw response of `personalize_client.create_event_tracker`.
    """
    return personalize_client.create_event_tracker(
        name=event_tracker_name,
        datasetGroupArn=dataset_group_arn,
    )


def send_movie_click(user_id, item_id, session_id, event_tracking_id):
    """
    Send one 'View' event for `item_id` on behalf of `user_id`.

    The item ID is converted to a string and passed inside the event's
    JSON-encoded `properties`; `sentAt` is the current time in whole seconds.

    Returns the raw response of `personalize_events.put_events`.
    """
    view_event = {
        'sentAt': int(time.time()),
        'eventType': 'View',
        'properties': json.dumps({"itemId": str(item_id)}),
    }
    return personalize_events.put_events(
        trackingId=event_tracking_id,
        userId=user_id,
        sessionId=session_id,
        eventList=[view_event],
    )

# Create the tracker and keep both identifiers: the ARN is used for status
# polling, the tracking ID for sending events.
event_tracker_name = 'websiteClickEvent'
res = create_event_tracker(event_tracker_name, dataset_group_arn)

event_tracker_arn, tracking_id = res['eventTrackerArn'], res['trackingId']

for created_value in (event_tracker_arn, tracking_id):
    print(created_value)


In [None]:
%%time

# Poll the event tracker every 10 seconds until it reaches a terminal status
# (ACTIVE or CREATE FAILED), giving up after 10 hours.
max_time = time.time() + 10*60*60 # 10 hours

while time.time() < max_time:

    version_response = personalize_client.describe_event_tracker(eventTrackerArn= event_tracker_arn)
    status = version_response["eventTracker"]["status"]

    # The messages report the event tracker ARN; they previously printed the
    # unrelated filter ARN and used filter wording.
    if status == "ACTIVE":
        print("Build succeeded for {}".format(event_tracker_arn))
        break

    if status == "CREATE FAILED":
        print("Build failed for {}".format(event_tracker_arn))
        break

    print('The "websiteClickEvent" Event Tracker build is still in progress')

    time.sleep(10)
else:
    # Reached only when the deadline passes without a terminal status.
    print("Timed out waiting for {}".format(event_tracker_arn))


In [None]:
# Send one view event. The third argument is the session ID generated when
# the events client was set up; the user ID was previously passed here.
send_movie_click(user_id, item_id, session_id, tracking_id)

## Review
Using the code above, you have successfully trained a deep learning model to generate item recommendations based on prior user behavior. You have created two recommenders for two foundational use cases. 
Going forward, you can adapt this code to create other recommenders.

## Only run the cells below after deploying the CDK stack!

In [None]:
# TODO: fill in both values before running this cell; it is intentionally
# incomplete until they are supplied.

# S3 bucket name taken from the CDK attribute "SimilarItemsInput".
sim_items_input_bucket_name =  # e.g. sim_items_input_bucket_name = 're-similaritemsinputa49dd15f-15butx9ehcxmk'

# DynamoDB table name taken from the CDK attribute "ItemsTable".
items_table_name =  # e.g. items_table_name = 'RE-ItemsTable5AAC2C46-U483L1G6EUB8'

In [None]:
# Load the item data (with store inventory) into the DynamoDB table named by
# the CDK "ItemsTable" attribute, renaming the CSV columns to the attribute
# names used for the table items.
df = pd.read_csv("./datasets/items_with_store_inv.csv")
df = df.rename(columns={ 'ITEM_ID':'id',
                         'NAME':'name',
                         'CATEGORY_L1':'categoryL1',
                         'CATEGORY_L2':'categoryL2',
                         'PRODUCT_DESCRIPTION':'productDescription',
                         'PRICE':'price',
                         'CREATION_TIMESTAMP':'creationTimestamp',
                         'STORE_INVENTORY':'storeInventory',
                         'STORE_IDS_AVAILABLE':'storeIdsAvailable'
                       }
              )

total_items = len(df)
table = dynamodb_resource.Table(items_table_name)
load_error = None
tic = time.time()
try:
    with table.batch_writer() as batch:
        for _, row in df.iterrows():
            # Round-trip the row through JSON so values become plain Python
            # types, with floats parsed as Decimal.
            a = json.loads(row.to_json(), parse_float=Decimal)
            # These two columns are parsed from their string form into Python
            # objects (presumably dicts/lists stored as text in the CSV).
            a["storeInventory"] = ast.literal_eval(a["storeInventory"])
            a["storeIdsAvailable"] = ast.literal_eval(a["storeIdsAvailable"])
            batch.put_item(Item = a)
except Exception as e:
    # Best-effort load: keep the cell alive, but remember the failure so the
    # summary below does not claim success.
    load_error = e
    print(e)
toc = time.time()
exec_time = round((toc-tic), 2)
if load_error is None:
    response = "time took to put {} items in dynamoDb is {} seconds".format(total_items, exec_time)
else:
    response = "failed to put all {} items in dynamoDb after {} seconds: {}".format(total_items, exec_time, load_error)
response

### Upload to S3
Now that our training data is ready for Amazon Personalize, the next step is to upload it to the S3 bucket created earlier.

In [None]:
# Upload the interactions file to the CDK-created input bucket and print its
# S3 URI.
interactions_file_name = 'cleaned_training_data.csv'
s3_prefix = "dataset/uploads/interaction"

interactions_key = "{}/{}".format(s3_prefix, interactions_file_name)
interactions_bucket = boto3.Session().resource('s3').Bucket(sim_items_input_bucket_name)
interactions_bucket.Object(interactions_key).upload_file(interactions_file_name)

interactions_s3DataPath = "s3://{}/{}".format(sim_items_input_bucket_name, interactions_key)
print(interactions_s3DataPath)

In [None]:
# Upload the users file to the CDK-created input bucket and print its S3 URI.
users_file_name = 'users.csv'
s3_prefix = "dataset/uploads/users"

users_key = "{}/{}".format(s3_prefix, users_file_name)
users_bucket = boto3.Session().resource('s3').Bucket(sim_items_input_bucket_name)
users_bucket.Object(users_key).upload_file(users_file_name)

users_s3DataPath = "s3://{}/{}".format(sim_items_input_bucket_name, users_key)
print(users_s3DataPath)


In [None]:
# Upload the items file to the CDK-created input bucket and print its S3 URI.
items_file_name = 'items.csv'
s3_prefix = "dataset/uploads/items"

items_key = "{}/{}".format(s3_prefix, items_file_name)
items_bucket = boto3.Session().resource('s3').Bucket(sim_items_input_bucket_name)
items_bucket.Object(items_key).upload_file(items_file_name)

items_s3DataPath = "s3://{}/{}".format(sim_items_input_bucket_name, items_key)
print(items_s3DataPath)


In [None]:
# Upload the JSONL item list to the root of the CDK-created input bucket.
json_input_filename = 'items.jsonl'
s3_client.upload_file(json_input_filename, sim_items_input_bucket_name, json_input_filename)
# Build the URI from the bucket the file was actually uploaded to; it was
# previously built from the unrelated `bucket_name`.
s3_items_jsonl_path = "s3://" + sim_items_input_bucket_name + "/"+ json_input_filename
print(s3_items_jsonl_path)