Take a look at the data files you have downloaded.

## Prepare your data <a class="anchor" id="prepare"></a>
[Back to top](#top)

The next thing to be done is to load the data and confirm the data is in a good state, then save it to a CSV where it is ready to be used with Amazon Personalize.

To get started, import a collection of Python libraries commonly used in data science.

In [1]:
import time
from time import sleep
import json
from datetime import datetime
import numpy as np
import boto3
import pandas as pd

In [2]:
# Configure the SDK to Personalize:
personalize = boto3.client('personalize')
personalize_runtime = boto3.client('personalize-runtime')

In [3]:

item_df = pd.read_csv("ml-100k/items.csv")

In [4]:
item_df.head()

Unnamed: 0,ITEM_ID,title,release_date,video_release_date,imdb url,unknown,Action,Adventure,Animation,Childrens,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,CREATION_TIMESTAMP
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,788918400.0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,788918400.0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,788918400.0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,788918400.0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,788918400.0


In [5]:

user_df = pd.read_csv("ml-100k/users.csv")

In [6]:

ui = pd.read_csv("ml-100k/interactions.csv")

In [7]:
ui['TIMESTAMP'].max(), ui['TIMESTAMP'].min()

(893286638, 874724710)

Next,open the data file and take a look at the first several rows.

### Prepare Training Data and Testing Data 

In [8]:
train_data = ui[(ui['TIMESTAMP']<=888700934) ] #20210426
test_data = ui[(ui['TIMESTAMP']>888700934) & (ui['TIMESTAMP']<=893286638)] #20210426 - 20210429


### Offline Evaluation 

In [10]:
!pip install tqdm
from tqdm import tqdm_notebook
from metrics import mean_reciprocal_rank, ndcg_at_k, precision_at_k



In [11]:
sample_number = 1000
unique_user = test_data['USER_ID'].unique()
np.random.shuffle(unique_user)
sampled_user = unique_user[:sample_number]

In [12]:


sampled_results = test_data[test_data['USER_ID'].isin(sampled_user)].groupby('USER_ID').ITEM_ID
sampled_results

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f56ef807b70>

In [25]:

rerank_campaign_arn = "arn:aws:personalize:us-west-2:230755935769:campaign/p13n-ml100k-96536"

relevance = []
for user_id, true_items in tqdm_notebook(sampled_results):
    rec_response = personalize_runtime.get_recommendations(
        campaignArn = rerank_campaign_arn,
        userId = str(user_id)
    )
    rec_items = [int(x['itemId']) for x in rec_response['itemList']]
    relevance.append([int(x in true_items.values) for x in rec_items])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/330 [00:00<?, ?it/s]

In [27]:
print('mean_reciprocal_rank', np.mean([mean_reciprocal_rank(r) for r in relevance]))
print('precision_at_5', np.mean([precision_at_k(r, 5) for r in relevance]))
print('precision_at_10', np.mean([precision_at_k(r, 10) for r in relevance]))
print('precision_at_25', np.mean([precision_at_k(r, 25) for r in relevance]))
print('normalized_discounted_cumulative_gain_at_5', np.mean([ndcg_at_k(r, 5) for r in relevance]))
print('normalized_discounted_cumulative_gain_at_10', np.mean([ndcg_at_k(r, 10) for r in relevance]))
print('normalized_discounted_cumulative_gain_at_25', np.mean([ndcg_at_k(r, 25) for r in relevance]))

mean_reciprocal_rank 0.5135726169932475
precision_at_5 0.3066666666666667
precision_at_10 0.2833333333333333
precision_at_25 0.2483636363636364
normalized_discounted_cumulative_gain_at_5 0.3572034776683027
normalized_discounted_cumulative_gain_at_10 0.40412757650037634
normalized_discounted_cumulative_gain_at_25 0.5675452762745384


### Calculate diversity, novelty and serendipity

In [15]:
genres = ['unknown','Action' , 'Adventure', 'Animation', 'Childrens' , 'Comedy' , 'Crime', \
                                        'Documentary', 'Drama' ,'Fantasy' , 'Film-Noir' , 'Horror' , 'Musical', \
                                        'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
userprofile = train_data.merge(item_df, on=['ITEM_ID'])[['USER_ID']+genres].groupby(['USER_ID']).sum().reset_index()

In [16]:
userprofile

Unnamed: 0,USER_ID,unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,74,42,10,24,84,24,4,103,2,1,13,12,5,42,41,50,25,6
1,2,0,7,2,1,1,15,8,0,30,0,2,2,0,4,16,4,10,3,0
2,5,1,56,33,14,29,82,9,0,27,2,1,28,12,3,19,33,19,14,2
3,6,0,25,22,10,20,66,14,1,104,3,6,4,13,12,41,13,24,21,5
4,8,0,39,17,0,3,10,9,0,17,0,1,1,0,1,7,17,18,11,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
729,937,0,6,3,1,2,12,2,1,25,0,0,0,0,1,9,3,4,4,0
730,939,0,18,10,1,1,15,3,0,18,1,0,1,1,1,10,8,12,3,0
731,940,0,22,11,2,5,39,7,0,47,0,3,4,7,3,22,18,18,14,0
732,941,0,10,7,3,2,7,1,0,5,0,0,0,1,1,1,8,7,1,0


In [17]:
## item object 

class Item: 
    def __init__(self, item_df, play_log_df):
        self.items = item_df
        self.play_log = play_log_df[play_log_df['EVENT_TYPE']==1].groupby(['ITEM_ID']).sum().reset_index()
        
    def get_contents_by_id(self, id): 
        return self.items[self.items['ITEM_ID']==id].values[0][5:-1] #categories 
    
    def get_recency_by_id(self, id):
        return self.items[self.items['ITEM_ID']==id].values[0][-1] #categories 
    
    def get_popularity_by_id(self, id):
        if len(self.play_log[self.play_log['ITEM_ID']==id].values) == 0:
            return 0
        return self.play_log[self.play_log['ITEM_ID']==id].values[0][3]

class User: 
    def __init__(self, train_data, item_df):
        genres = ['unknown','Action' , 'Adventure', 'Animation', 'Childrens' , 'Comedy' , 'Crime', \
                                        'Documentary', 'Drama' ,'Fantasy' , 'Film-Noir' , 'Horror' , 'Musical', \
                                        'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
        self.userprofile = train_data.merge(item_df, on=['ITEM_ID'])[['USER_ID']+genres].groupby(['USER_ID']).sum().reset_index()
    
    def get_user_profile(self, id):

        if len(self.userprofile[self.userprofile['USER_ID']==id].values)==0: 
            return [0 for i in range(0, 19)]
        else:
            raw_profile = self.userprofile[self.userprofile['USER_ID']==id].values[0][1:]
            avg = np.average(raw_profile)
            return [1 if x >=avg else 0 for x in raw_profile]
            
    
    
item_db =  Item(item_df, ui)
user_db = User(train_data, item_df)
print(item_db.get_contents_by_id(1))    
print(item_db.get_recency_by_id(1))    
print(item_db.get_popularity_by_id(1))  
print(user_db.get_user_profile(7))

[0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
788918400.0
0
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [18]:
ui

Unnamed: 0,USER_ID,ITEM_ID,EVENT_VALUE,TIMESTAMP,EVENT_TYPE
0,196,242,3,881250949,rating
1,186,302,3,891717742,rating
2,22,377,1,878887116,rating
3,244,51,2,880606923,rating
4,166,346,1,886397596,rating
...,...,...,...,...,...
99995,880,476,3,880175444,rating
99996,716,204,5,879795543,rating
99997,276,1090,1,874795795,rating
99998,13,225,2,882399156,rating


In [19]:
### done by inter-similarity of a recommendation list 


def diversity(pred, item_db):
    d = 0 
    for i, p1 in enumerate(pred): 
        for j, p2 in enumerate(pred):
            if j > i: 
                dist = sum(abs(item_db.get_contents_by_id(p1) - item_db.get_contents_by_id(p2))) 
                d += dist
    return d 

def novelty(pred, item_db):
    d = 0 
    for i, p in enumerate(pred):
        d += 1/(item_db.get_popularity_by_id(p)+1)
#     print(d)
    return d


def serendipity(pred, groud_truth, uid, user_db, item_db): 
    up = user_db.get_user_profile(uid)
    up_norm = [1 if i > 0 else 0 for i in up ]
    dist_total = 0 
    for p in pred:
        if p in groud_truth:
            contents = item_db.get_contents_by_id(p)
            dist = sum(abs(up_norm - contents))   
            dist_total += dist
    return  dist_total / len(pred)
    
    
 

In [28]:
total_diversity = 0 
total_novelty = 0 
total_serendipity = 0 


for user_id, true_items in tqdm_notebook(sampled_results):
    rec_response = personalize_runtime.get_recommendations(
        campaignArn = rerank_campaign_arn,
        userId = str(user_id)
    )
    rec_items = [int(x['itemId']) for x in rec_response['itemList']]
    total_diversity += diversity(rec_items, item_db)
    total_novelty += novelty(rec_items, item_db)
    total_serendipity += serendipity(rec_items, true_items, user_id, user_db, item_db)
    
users = test_data['USER_ID'].unique()    
print(total_diversity / sample_number)    
print(total_novelty / sample_number)
print(total_serendipity / sample_number)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/330 [00:00<?, ?it/s]

328.416
8.25
0.0010400000000000001


In [None]:
%store dataset_group_arn

In [None]:
%store schema_arn 
%store item_schema_arn
%store user_schema_arn

In [None]:
%store role_arn

In [None]:
dataset_group_arn