# Amazon Personalize User Segmentation - Airline Tickets marketing campaigns

Amazon Personalize offers two recipes that segment your users based on their interest in different product categories, brands and more. 
1. Item affinity recipe `aws-item-affinity` identifies users based on their interest in the individual items in your catalog. 
1. Item attribute affinity recipe `aws-item-attribute` identifies users based on the attributes of items in your catalog such as airline, promotion, season, cities etc. This allows you to better engage users with your marketing campaigns and improve retention through targeted messaging.

This notebook demonstrates how to use the `aws-item-affinity` recipe to create user segments based on their preferences for airline products in a sample dataset. We use one dataset group which contains user-item interaction data and item metadata. We use these datasets to train solutions using the two recipes and create user segments in batch.


This notebook guides you through the deployment of the following architecture. 


As we can see above, we will deploy the following resources:
1. S3 bucket used to store the training files, plus our inference input and output files
1. A dataset group
1. Three datasets - Interactions, Items, Users
1. Solutions and solution versions configured with each of our new User Segmentation recipes
1. Two batch inference jobs

Once we have the batch inference job results, we will analyze them.

## Preprocess library 


In [None]:
import pandas as pd
import json
import numpy as np
from datetime import datetime
import boto3
import time
from time import sleep
from lxml import html
import sys
from tqdm import tqdm
import datetime as dt

### Get the Personalize API model Json and Personalize Boto3 Client

In [None]:
# Create the boto3 clients for S3 and for the Amazon Personalize control-plane,
# runtime and events APIs. A failure here usually points at missing AWS
# credentials or region configuration -- NOTE(review): building a client does
# not by itself call the service.
s3 = boto3.client('s3')

personalize = boto3.client('personalize')
personalize_runtime = boto3.client('personalize-runtime')
personalize_events = boto3.client('personalize-events')

## Upload data to S3

In [None]:
import boto3

# A default boto3 session exposes the region configured for this environment.
session = boto3.Session()
region = session.region_name

# Show which region the rest of the notebook will create resources in.
print(f"Current AWS region: {region}")


### Create S3 bucket

In [None]:
# S3 client plus the caller's AWS account id (reused later to build the IAM role name).
s3 = boto3.client('s3')
account_id = boto3.client('sts').get_caller_identity().get('Account')

# Fixed-width, digits-only random suffix shared by every resource name in this
# notebook. Slicing the repr of a random float could yield a shorter or
# non-numeric suffix (short reprs, scientific notation), so format an integer.
suffix = f"{np.random.randint(0, 100000):05d}"
bucket_name = "personalize-user-segment" + suffix
print(bucket_name)

# us-east-1 is created without a LocationConstraint; every other region passes
# its own name as the constraint.
if region != "us-east-1":
    s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={'LocationConstraint': region})
else:
    s3.create_bucket(Bucket=bucket_name)

### Upload datasets into S3

In [None]:
# Interactions dataset: upload the local CSV to the bucket under the same key.
interactions_filename = 'df_interactions.csv'
boto3.Session().resource('s3').Object(bucket_name, interactions_filename).upload_file(interactions_filename)

In [None]:
# Items dataset: upload the local CSV to the bucket under the same key.
item_metadata_file = 'df_item_deduplicated.csv'
boto3.Session().resource('s3').Object(bucket_name, item_metadata_file).upload_file(item_metadata_file)

In [None]:
# Users dataset: upload the local CSV to the bucket under the same key.
user_metadata_file = 'df_users_deduplicated.csv'
boto3.Session().resource('s3').Object(bucket_name, user_metadata_file).upload_file(user_metadata_file)


### Helper functions
The following helper functions will be used later in the notebook.

In [None]:
def print_s3_file_content(bucket, key, limit=None):
    """Print the lines of a UTF-8 text object stored in S3.

    Args:
        bucket: Name of the S3 bucket.
        key: Key of the object inside the bucket.
        limit: Maximum number of lines to print; ``None`` prints every line.

    The whole object is read into memory before printing.
    """
    obj = s3.get_object(Bucket=bucket, Key=key)
    lines = obj['Body'].read().decode("utf-8").split("\n")

    for printed, line in enumerate(lines):
        # Stop once exactly `limit` lines are out (the previous `i > limit`
        # check let one extra line through).
        if limit is not None and printed >= limit:
            break
        print(line)

# Module-level deadline three hours from now. The wait_for_* helpers compute
# their own local deadline, so they do not read this value.
max_time = 3 * 60 * 60 + time.time()

def wait_for_dataset_group_job(dataset_group_arn):
    """Poll a dataset group until it is ACTIVE, has failed, or 3 hours elapse.

    Args:
        dataset_group_arn: ARN of the dataset group to watch.

    The status is printed on every poll (once per minute). A message is also
    printed if the deadline passes before a terminal status is seen.
    """
    max_time = time.time() + 3 * 60 * 60
    while time.time() < max_time:
        describe_dataset_group_response = personalize.describe_dataset_group(
            datasetGroupArn=dataset_group_arn
        )
        status = describe_dataset_group_response["datasetGroup"]["status"]
        print("DatasetGroup: {}".format(status))

        if status in ("ACTIVE", "CREATE FAILED"):
            break

        time.sleep(60)
    else:
        # Reached only when the loop ends without `break`, i.e. on timeout.
        print("DatasetGroup: timed out after 3 hours without reaching a terminal status")
        
def wait_for_dataset_import_job(dataset_import_job_arn):
    """Poll a dataset import job until it is ACTIVE, has failed, or 3 hours elapse.

    Args:
        dataset_import_job_arn: ARN of the dataset import job to watch.

    The status is printed on every poll (every two minutes). A message is also
    printed if the deadline passes before a terminal status is seen.
    """
    max_time = time.time() + 3 * 60 * 60
    while time.time() < max_time:
        describe_dataset_import_job_response = personalize.describe_dataset_import_job(
            datasetImportJobArn=dataset_import_job_arn
        )
        status = describe_dataset_import_job_response["datasetImportJob"]["status"]
        print("DatasetImportJob: {}".format(status))

        if status in ("ACTIVE", "CREATE FAILED"):
            break

        time.sleep(120)
    else:
        # Reached only when the loop ends without `break`, i.e. on timeout.
        print("DatasetImportJob: timed out after 3 hours without reaching a terminal status")
            
def wait_for_solution_version_job(solution_version_arn):
    """Poll a solution version until it is ACTIVE, has failed, or 3 hours elapse.

    Args:
        solution_version_arn: ARN of the solution version to watch.

    The status is printed on every poll (every three minutes). On a terminal
    status the elapsed time (last update minus creation) is printed, followed
    by the failure reason when the job failed. A message is also printed if
    the deadline passes before a terminal status is seen.
    """
    max_time = time.time() + 3 * 60 * 60
    while time.time() < max_time:
        response = personalize.describe_solution_version(
            solutionVersionArn=solution_version_arn
        )
        solution_version = response["solutionVersion"]
        status = solution_version["status"]
        print("SolutionVersion: {}".format(status))

        if status in ("ACTIVE", "CREATE FAILED"):
            # Timestamps are only needed once the job has finished.
            elapsed = solution_version["lastUpdatedDateTime"] - solution_version["creationDateTime"]
            print("Time took: {}".format(elapsed))
            if status == "CREATE FAILED":
                print("Job Failed: {}".format(solution_version.get("failureReason", "unknown")))
            break

        time.sleep(180)
    else:
        # Reached only when the loop ends without `break`, i.e. on timeout.
        print("SolutionVersion: timed out after 3 hours without reaching a terminal status")
        
def wait_for_batch_segment_job(batch_segment_job_arn):
    """Poll a batch segment job until it is ACTIVE, has failed, or 3 hours elapse.

    Args:
        batch_segment_job_arn: ARN of the batch segment job to watch.

    The status is printed on every poll (every three minutes). On a terminal
    status the elapsed time (last update minus creation) is printed, followed
    by the failure reason when the job failed. A message is also printed if
    the deadline passes before a terminal status is seen.
    """
    max_time = time.time() + 3 * 60 * 60
    while time.time() < max_time:
        describe_job_response = personalize.describe_batch_segment_job(
            batchSegmentJobArn=batch_segment_job_arn
        )
        job = describe_job_response["batchSegmentJob"]
        status = job["status"]
        print("Batch Segment Job: {}".format(status))

        if status in ("ACTIVE", "CREATE FAILED"):
            # Timestamps are only needed once the job has finished.
            elapsed = job["lastUpdatedDateTime"] - job["creationDateTime"]
            print("Time took: {}".format(elapsed))
            if status == "CREATE FAILED":
                print("Job Failed: {}".format(job.get("failureReason", "unknown")))
            break

        time.sleep(180)
    else:
        # Reached only when the loop ends without `break`, i.e. on timeout.
        print("Batch Segment Job: timed out after 3 hours without reaching a terminal status")
        
        

# Item affinity recipe

The Item affinity recipe will recommend users that are likely to engage with a given item.


## 1. Create dataset group, datasets and upload datasets

### Create a Dataset Group
The following cell will create a new dataset group with the name airlines-dataset-group + a suffix

In [None]:
# Name the dataset group with the shared random suffix, create it, and keep
# its ARN for the dataset and solution cells.
dataset_group_name = f"airlines-dataset-group-{suffix}"
create_dataset_group_response = personalize.create_dataset_group(name=dataset_group_name)

dataset_group_arn = create_dataset_group_response['datasetGroupArn']
print(json.dumps(create_dataset_group_response, indent=2))

Before we can use the dataset group, it must be active. This can take a minute or two. Execute the cell below and wait for it to show the ACTIVE status.

In [None]:
# Poll until the dataset group reports a terminal status.
wait_for_dataset_group_job(dataset_group_arn=dataset_group_arn)

### Prepare interaction schema

In [None]:
schema_name = f"airlines-interaction-schema-{suffix}"

# Avro schema for the Interactions dataset, one field per line.
schema = {
    "type": "record",
    "name": "Interactions",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
        {"name": "ITEM_ID", "type": "string"},
        {"name": "USER_ID", "type": "string"},
        {"name": "TIMESTAMP", "type": "long"},
        {"name": "CABIN_TYPE", "type": "string", "categorical": True},
        {"name": "EVENT_TYPE", "type": "string"},
        {"name": "EVENT_VALUE", "type": "float"},
    ],
    "version": "1.0",
}

# Register the schema (serialized to JSON) and keep its ARN for dataset creation.
create_schema_response = personalize.create_schema(name=schema_name, schema=json.dumps(schema))

schema_arn = create_schema_response['schemaArn']
print(json.dumps(create_schema_response, indent=2))



### Prepare interaction dataset

In [None]:
# Create the Interactions dataset inside the dataset group using the schema above.
dataset_type = "INTERACTIONS"
create_dataset_response = personalize.create_dataset(
    name=f"airlines-dataset-interactions-{suffix}",
    datasetGroupArn=dataset_group_arn,
    datasetType=dataset_type,
    schemaArn=schema_arn,
)

interactions_dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))

### Prepare users schema

In [None]:
user_metadata_schema_name = "airlines-users-schema-" + suffix

# Avro schema for the Users dataset.
user_metadata_schema = {
    "type": "record",
    "name": "Users",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
        {
            "name": "USER_ID",
            "type": "string"
        },
        {
            "name": "memberClass",
            "type": "string",
            "categorical": True
        }
    ],
    "version": "1.0"
}

create_metadata_schema_response = personalize.create_schema(
    name = user_metadata_schema_name,
    schema = json.dumps(user_metadata_schema)
)

# Keep the schema ARN under an accurately named variable.
user_metadata_schema_arn = create_metadata_schema_response['schemaArn']
# Backward compatibility: the users-dataset cell reads the schema ARN from
# `user_metadata_dataset_arn` and then overwrites it with the dataset ARN.
user_metadata_dataset_arn = user_metadata_schema_arn
print(json.dumps(create_metadata_schema_response, indent=2))

### Prepare users Dataset

In [None]:
# Create the Users dataset. At this point `user_metadata_dataset_arn` still
# holds the users *schema* ARN assigned by the schema cell; it is overwritten
# with the real dataset ARN right after creation.
dataset_type = "USERS"
create_metadata_dataset_response = personalize.create_dataset(
    name=f"airlines-metadata-dataset-users-{suffix}",
    datasetGroupArn=dataset_group_arn,
    datasetType=dataset_type,
    schemaArn=user_metadata_dataset_arn,
)

user_metadata_dataset_arn = create_metadata_dataset_response['datasetArn']
print(json.dumps(create_metadata_dataset_response, indent=2))

### Prepare items schema

In [None]:
item_metadata_schema_name = f"airlines-item-schema-{suffix}"

# Avro schema for the Items dataset, one field per line. Nullable categorical
# string columns use the ["null", "string"] union type.
items_schema = {
    "type": "record",
    "name": "Items",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
        {"name": "ITEM_ID", "type": "string"},
        {"name": "DSTCity", "type": ["null", "string"], "categorical": True},
        {"name": "SRCCity", "type": ["null", "string"], "categorical": True},
        {"name": "Airline", "type": ["null", "string"], "categorical": True},
        {"name": "DurationDays", "type": "int"},
        {"name": "Season", "type": ["null", "string"], "categorical": True},
        {"name": "numberOfSearchByUser", "type": "int"},
        {"name": "Promotion", "type": ["null", "string"], "categorical": True},
        {"name": "DynamicPrice", "type": "int"},
        {"name": "DiscountForMember", "type": "float"},
        {"name": "Expired", "type": ["null", "string"], "categorical": True},
    ],
    "version": "1.0",
}

# Register the schema (serialized to JSON) and keep its ARN for dataset creation.
create_metadata_schema_response = personalize.create_schema(
    name=item_metadata_schema_name,
    schema=json.dumps(items_schema),
)

metadata_schema_arn = create_metadata_schema_response['schemaArn']
print(json.dumps(create_metadata_schema_response, indent=2))


### Prepare items dataset

In [None]:
# Create the Items dataset inside the dataset group using the items schema.
dataset_type = "ITEMS"
create_metadata_dataset_response = personalize.create_dataset(
    name=f"airlines-metadata-dataset-items-{suffix}",
    datasetGroupArn=dataset_group_arn,
    datasetType=dataset_type,
    schemaArn=metadata_schema_arn,
)

metadata_dataset_arn = create_metadata_dataset_response['datasetArn']
print(json.dumps(create_metadata_dataset_response, indent=2))

## Configure an S3 bucket and an IAM role

### Set the S3 bucket policy
Amazon Personalize needs to be able to read the contents of your S3 bucket. So add a bucket policy which allows that.


In [None]:
s3 = boto3.client("s3")
bucket = bucket_name

# Bucket policy allowing the Amazon Personalize service principal to list the
# bucket and to read and write its objects.
policy = {
    "Version": "2012-10-17",
    "Id": "PersonalizeS3BucketAccessPolicy",
    "Statement": [
        {
            "Sid": "PersonalizeS3BucketAccessPolicy",
            "Effect": "Allow",
            "Principal": {"Service": "personalize.amazonaws.com"},
            "Action": ["s3:GetObject", "s3:ListBucket", "s3:PutObject"],
            "Resource": [
                f"arn:aws:s3:::{bucket}",
                f"arn:aws:s3:::{bucket}/*",
            ],
        }
    ],
}

# Attach the policy to the bucket (skip this call if it is already attached).
s3.put_bucket_policy(Bucket=bucket, Policy=json.dumps(policy))

### Create personalize role with Personalize FullAccess and S3 FullAccess



### If you have already created the role and attached its policies, you do not need to create it again
Please run the cell below to check whether the role and its attached policies already exist.

In [None]:
# The role name embeds the account id; the ARN is derived from both.
role_name = "PersonalizeRoleDemo" + account_id
role_arn = "arn:aws:iam::{}:role/{}".format(account_id, role_name)

print(f"IAM Role ARN: {role_arn}")

In [None]:
import boto3
import botocore.exceptions

# Initialize the IAM client
iam = boto3.client('iam')

try:
    # List the managed policies attached to the IAM role.
    response = iam.list_attached_role_policies(RoleName=role_name)
except botocore.exceptions.ClientError as e:
    if e.response['Error']['Code'] == 'NoSuchEntity':
        # NoSuchEntity means the role itself is missing, not merely its policies.
        print(f"The IAM role '{role_name}' does not exist yet.")
    else:
        # Report any other client error without stopping the notebook.
        print(f"An error occurred: {e}")
else:
    # The key is always present on success, so test for an empty list rather
    # than for key membership.
    attached_policies = response.get('AttachedPolicies', [])
    if attached_policies:
        # Use a dedicated loop name so the bucket `policy` dict is not overwritten.
        for attached_policy in attached_policies:
            print("Attached Policy Name:", attached_policy['PolicyName'])
            print("Policy ARN:", attached_policy['PolicyArn'])
            print()
    else:
        print(f"There haven't any policies attached to the IAM role '{role_name}'.")



### If the cell above lists the attached policies as shown below, the role and its policies already exist and you can skip the following step


Attached Policy Name: AmazonPersonalizeFullAccess
Policy ARN: arn:aws:iam::aws:policy/service-role/AmazonPersonalizeFullAccess

Attached Policy Name: AmazonS3FullAccess
Policy ARN: arn:aws:iam::aws:policy/AmazonS3FullAccess

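### If the cell instead reports that no policies are attached or that the role cannot be found (for example the error below), run the following cell to create the role and attach the policies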

"NoSuchEntityException: An error occurred (NoSuchEntity) when calling the ListAttachedRolePolicies operation: The role with name PersonalizeRoleDemo cannot be found"

In [None]:
iam = boto3.client("iam")
role_name = "PersonalizeRoleDemo"+account_id

# Trust policy letting the Amazon Personalize service assume this role.
assume_role_policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
          "Effect": "Allow",
          "Principal": {
            "Service": "personalize.amazonaws.com"
          },
          "Action": "sts:AssumeRole"
        }
    ]
}

try:
    create_role_response = iam.create_role(
        RoleName = role_name,
        AssumeRolePolicyDocument = json.dumps(assume_role_policy_document)
    )
except iam.exceptions.EntityAlreadyExistsException:
    # Re-running the cell: reuse the existing role instead of failing. The
    # get_role response also carries the ARN under ["Role"]["Arn"].
    create_role_response = iam.get_role(RoleName = role_name)

# AmazonPersonalizeFullAccess provides access to any S3 bucket with a name that includes "personalize" or "Personalize"
# if you would like to use a bucket with a different name, please consider creating and attaching a new policy
# that provides read access to your bucket or attaching the AmazonS3ReadOnlyAccess policy to the role
# NOTE: despite its name, `policy_arn` holds the attach_role_policy API response.
policy_arn = iam.attach_role_policy(
    RoleName = role_name,
    PolicyArn = "arn:aws:iam::aws:policy/service-role/AmazonPersonalizeFullAccess"
)

# Now add S3 support
# NOTE(review): AmazonS3FullAccess is broader than this demo needs; consider a bucket-scoped policy.
iam.attach_role_policy(
    PolicyArn='arn:aws:iam::aws:policy/AmazonS3FullAccess',
    RoleName=role_name
)
time.sleep(60) # wait for a minute to allow IAM role policy attachment to propagate

role_arn = create_role_response["Role"]["Arn"]
print(role_arn)

### Import the interactions data

In [None]:
# Show the inputs of the interactions import job, one per line.
print(bucket_name, interactions_filename, interactions_dataset_arn, role_arn, sep="\n")

In [None]:
# Start importing the interactions CSV from S3 into the Interactions dataset.
create_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName=f"airlines-dataset-import-job-{suffix}",
    datasetArn=interactions_dataset_arn,
    dataSource={"dataLocation": f"s3://{bucket_name}/{interactions_filename}"},
    roleArn=role_arn,
)

dataset_import_job_arn = create_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_dataset_import_job_response, indent=2))

In [None]:
# Poll until the interactions import job reports a terminal status.
wait_for_dataset_import_job(dataset_import_job_arn=dataset_import_job_arn)

### Import the items data

In [None]:
# Show the inputs of the items import job, one per line.
print(bucket_name, item_metadata_file, metadata_dataset_arn, role_arn, sep="\n")

In [None]:
# Start importing the items CSV from S3 into the Items dataset.
create_metadata_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName=f"airlines-items-metadata-dataset-import-job-{suffix}",
    datasetArn=metadata_dataset_arn,
    dataSource={"dataLocation": f"s3://{bucket_name}/{item_metadata_file}"},
    roleArn=role_arn,
)

metadata_dataset_import_job_arn = create_metadata_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_metadata_dataset_import_job_response, indent=2))

In [None]:
# Poll until the items import job reports a terminal status.
wait_for_dataset_import_job(dataset_import_job_arn=metadata_dataset_import_job_arn)

### Import the user data

In [None]:
# Show the inputs of the users import job, one per line.
print(bucket_name, user_metadata_file, user_metadata_dataset_arn, role_arn, sep="\n")

In [None]:
# Start importing the users CSV from S3 into the Users dataset.
create_user_metadata_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName=f"airlines-user-metadata-dataset-import-job-{suffix}",
    datasetArn=user_metadata_dataset_arn,
    dataSource={"dataLocation": f"s3://{bucket_name}/{user_metadata_file}"},
    roleArn=role_arn,
)

user_metadata_dataset_import_job_arn = create_user_metadata_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_user_metadata_dataset_import_job_response, indent=2))

In [None]:
# Poll until the users import job reports a terminal status.
wait_for_dataset_import_job(dataset_import_job_arn=user_metadata_dataset_import_job_arn)

## Prepare marketing promotion item

In [None]:
import pandas as pd

# Local copy of the items metadata CSV (the same file that was uploaded to S3).
file_path = "df_item_deduplicated.csv"
df_item_deduplicated = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first rows.
df_item_deduplicated.head(5)

In [None]:
# Keep only promoted items for the October season whose destination is Hong Kong.
result_df = df_item_deduplicated.loc[
    (df_item_deduplicated['Promotion'] == 'Yes')
    & (df_item_deduplicated['Season'] == 'October')
    & (df_item_deduplicated['DSTCity'] == 'Hong Kong')
]

# Preview the matching rows.
result_df.head(5)

In [None]:
# Pick one matching promotion item at random (a one-row DataFrame).
random_row = result_df.sample(1)

In [None]:
# Drop the columns that are left out of the promotion item metadata.
columns_to_remove = ['numberOfSearchByUser', 'Promotion', 'Expired']
random_row = random_row.drop(columns_to_remove, axis="columns")

In [None]:


# Write the selected row to disk as a JSON array holding one record.
random_row.to_json(path_or_buf='promotion_item_metadata.json', orient='records')

# Show the selected row.
random_row.head(5)

### Prepare for LangChain prompting metadata, test-metadata.json

In [None]:
import json

# Load the one-element JSON array written by the previous cell.
with open('promotion_item_metadata.json', 'r') as src:
    data = json.load(src)

# Unwrap the single record so the output file holds a plain JSON object.
json_data = data[0]

with open('test-metadata.json', 'w') as dst:
    json.dump(json_data, dst)

print('Data saved to "test-metadata.json"')

### Prepare for user-segment batch job query data input file, item-affinity-query.json

In [None]:
# One-row query frame holding the ITEM_ID of the selected promotion item.
df_item_affinity_query = pd.DataFrame({"itemId": [random_row['ITEM_ID'].iloc[0]]})

In [None]:
# Preview the query frame.
df_item_affinity_query.head(5)


In [None]:
# Write the query frame as a JSON array of records; the next cell unwraps it.
df_item_affinity_query.to_json(path_or_buf='item-affinity-query.json', orient='records')


In [None]:
import json

# Load the one-element JSON array written by the previous cell.
with open('item-affinity-query.json', 'r') as src:
    data = json.load(src)

# Unwrap the single record and overwrite the file with a plain JSON object.
json_data = data[0]

with open('item-affinity-query.json', 'w') as dst:
    json.dump(json_data, dst)

print('Data saved to "item-affinity-query.json"')

## 2. Create Solution

### 2.1 Select item-affinity recipe

In [None]:
# ARN of the item-affinity user segmentation recipe.
item_user_recipe = "arn:aws:personalize:::recipe/aws-item-affinity"

### 2.2 Create solution

In [None]:
# Show the dataset group the solution will be created in.
print(dataset_group_arn)

In [None]:
# Create the item-affinity solution. The name carries the shared random suffix,
# like every other resource in this notebook, so that a re-run with a new
# suffix does not collide with a solution left over from an earlier run.
create_solution_response = personalize.create_solution(
    name = "item-affinity-airline-meta-demo-" + suffix,
    datasetGroupArn = dataset_group_arn,
    recipeArn = item_user_recipe,
)
solution_arn = create_solution_response['solutionArn']

In [None]:
# Display the solution description returned by the service.
personalize.describe_solution(solutionArn=solution_arn)

### 2.3 Create Solution Version

In [None]:
# Start training: create a solution version and keep its ARN.
create_solution_version_response = personalize.create_solution_version(solutionArn=solution_arn)
solution_version_arn = create_solution_version_response['solutionVersionArn']
print(solution_version_arn)

#### Wait for Solution Version to Have ACTIVE Status

In [None]:
# Poll until the solution version reports a terminal status.
wait_for_solution_version_job(solution_version_arn=solution_version_arn)

# Note: model training takes about 33 minutes. In the meantime, please run the other Bedrock SDK notebook to create the promotion content first.

### 2.4 Get Metrics
Note: these metrics are Amazon Personalize’s offline metrics that are used to evaluate results across solution versions. These should not be confused with the metrics that we will derive from the test dataset that we built when preprocessing the data.

In [None]:
# Fetch and print the offline metrics of the trained solution version.
get_solution_metrics_response = personalize.get_solution_metrics(
    solutionVersionArn=solution_version_arn,
)
print(get_solution_metrics_response['metrics'])

## 3. Create Batch Segment Job


### 3.1 Prepare input query data by item-affinity-query.json

In [None]:
# The batch input file holds one JSON object per line, for example:
# {"itemId": "1"}
# {"itemId": "2"}
# {"itemId": "3"}
# ...
batch_file_name = 'item-affinity-query.json'

# Upload the query file to the bucket under the same key.
boto3.Session().resource('s3').Object(bucket_name, batch_file_name).upload_file(batch_file_name)

# S3 locations of the job input file and of the output prefix.
batch_input_path = f"s3://{bucket_name}/{batch_file_name}"
batch_output_path = f"s3://{bucket_name}/output/"

# Print the first lines of the uploaded file to confirm its contents.
print_s3_file_content(bucket=bucket_name, key=batch_file_name, limit=3)

### 3.2 Create Batch Segment Job

In [None]:
# Import the module under an alias: a plain `import datetime` would rebind the
# `datetime` name that the top of the notebook imports as the class.
import datetime as dt

# Define the prefix
prefix = "item-affinity-query-query-"

# Get the current timestamp in the desired format
current_time = dt.datetime.now().strftime("%Y%m%d%H%M%S")

# Combine the prefix and current timestamp to create the job name
job_name = f"{prefix}{current_time}"




In [None]:

# Launch the batch segment job: for each item in the input file, request the
# 5 users returned by the trained solution version, written under the output prefix.
create_batch_segment_response = personalize.create_batch_segment_job(
    jobName=job_name,
    solutionVersionArn=solution_version_arn,
    numResults=5,
    jobInput={"s3DataSource": {"path": batch_input_path}},
    jobOutput={"s3DataDestination": {"path": batch_output_path}},
    roleArn=role_arn,
)

batch_segment_job_arn = create_batch_segment_response['batchSegmentJobArn']
print(batch_segment_job_arn)

In [None]:
# Poll until the batch segment job reports a terminal status.
wait_for_batch_segment_job(batch_segment_job_arn=batch_segment_job_arn)

## Note: the batch job takes about 7 minutes to run

### Download the result from S3

In [None]:
# Show where the batch job wrote its results.
print(bucket_name, batch_output_path, sep="\n")

In [None]:

# Key of the batch output object and the local file name to save it under.
object_key = 'output/item-affinity-query.json.out'
local_file_name = 'item-affinity-query-results.json'

# Copy the result object from S3 into the current directory.
s3.download_file(Bucket=bucket_name, Key=object_key, Filename=local_file_name)

In [None]:
# The batch output is expected to hold one JSON object per line, one for each
# input line, so parse it line by line; a single json.load would fail as soon
# as the input held more than one item.
with open(local_file_name, 'r') as file:
    records = [json.loads(line) for line in file if line.strip()]

# Backward compatibility: with a single record, `data` stays the parsed object
# itself (as before); with several records it is the list of parsed objects.
data = records[0] if len(records) == 1 else records
print(data)