# Aula 10 - Recomendação baseada em sessão - Exercícios

In [1]:
import pandas as pd
import numpy as np

### Leitura do arquivo 2019-Oct-sample.csv (vide Aula 10 - Exemplos) caso não possua o arquivo

In [2]:
subset = pd.read_csv('./2019-Oct-sample.csv')
subset.head()

Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session
0,2019-10-31 06:23:12 UTC,view,1005115,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c
1,2019-10-31 06:23:52 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c
2,2019-10-31 06:25:30 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c
3,2019-10-31 06:26:58 UTC,view,1004858,electronics.smartphone,samsung,00000056-a206-40dd-b174-a072550fa38c
4,2019-10-31 06:28:21 UTC,view,1005104,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c


In [3]:
map_items = {item: idx for idx, item in enumerate(subset.product_id.unique())}
map_sessions = {item: idx for idx, item in enumerate(subset.user_session.unique())}
subset['itemId'] = subset['product_id'].map(map_items)
subset['sessionId'] = subset['user_session'].map(map_sessions)
subset.head()

Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session,itemId,sessionId
0,2019-10-31 06:23:12 UTC,view,1005115,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c,0,0
1,2019-10-31 06:23:52 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c,1,0
2,2019-10-31 06:25:30 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c,1,0
3,2019-10-31 06:26:58 UTC,view,1004858,electronics.smartphone,samsung,00000056-a206-40dd-b174-a072550fa38c,2,0
4,2019-10-31 06:28:21 UTC,view,1005104,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c,3,0


In [4]:
n_items = subset['itemId'].max()+1
print('No. items: ', n_items)
n_sessions = subset['sessionId'].max()+1
print('No. sessions: ', n_sessions)

No. items:  42581
No. sessions:  483508


In [5]:
# create a dataset
# remove sessions with less than 2 items
def create_data(df):
    df.sort_values(by=['sessionId', 'event_time'], inplace=True, ignore_index=True)
    sessions, session = [], []
    for index, value in df.iterrows():
        if index != 0:
            if value["sessionId"] == df.at[index-1, "sessionId"]:
                if value["event_type"] == 'view':
                    session.append(value["itemId"])
            else:
                if len(session) > 1:
                    sessions.append((df.at[index-1, "sessionId"], session))
                session = [value["itemId"]]
        else:
            session.append(value["itemId"])
    return sessions

In [6]:
sessions = create_data(subset)

In [7]:
import random

random.shuffle(sessions)
split = len(sessions) * 0.8
train = sessions[:int(split)]
test = sessions[int(split):]
print('No. train sessions: ', len(train))
print('No. test sessions: ', len(test))

No. train sessions:  237531
No. test sessions:  59383


In [8]:
def jaccard(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

In [36]:
actual_session = test[37]
target = actual_session[1][0:-1]
print(actual_session)
print(target)
subset.loc[subset.sessionId==actual_session[0]]

(np.int64(158854), [18, 17, 146, 484, 352, 113, 177, 177, 423, 483])
[18, 17, 146, 484, 352, 113, 177, 177, 423]


Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session,itemId,sessionId
657935,2019-10-15 05:19:43 UTC,view,1004856,electronics.smartphone,samsung,0659028a-8ece-42d8-991f-12180c7ee9e2,18,158854
657936,2019-10-15 05:19:54 UTC,view,1004870,electronics.smartphone,samsung,0659028a-8ece-42d8-991f-12180c7ee9e2,17,158854
657937,2019-10-15 05:20:31 UTC,view,1004873,electronics.smartphone,samsung,0659028a-8ece-42d8-991f-12180c7ee9e2,146,158854
657938,2019-10-15 05:21:43 UTC,view,1005100,electronics.smartphone,samsung,0659028a-8ece-42d8-991f-12180c7ee9e2,484,158854
657939,2019-10-15 05:21:57 UTC,view,1004766,electronics.smartphone,samsung,0659028a-8ece-42d8-991f-12180c7ee9e2,352,158854
657940,2019-10-15 05:22:19 UTC,view,1002633,electronics.smartphone,apple,0659028a-8ece-42d8-991f-12180c7ee9e2,113,158854
657941,2019-10-15 05:23:22 UTC,view,1004990,electronics.smartphone,oppo,0659028a-8ece-42d8-991f-12180c7ee9e2,177,158854
657942,2019-10-15 05:23:44 UTC,view,1004990,electronics.smartphone,oppo,0659028a-8ece-42d8-991f-12180c7ee9e2,177,158854
657943,2019-10-15 05:24:08 UTC,view,1005205,electronics.smartphone,oppo,0659028a-8ece-42d8-991f-12180c7ee9e2,423,158854
657944,2019-10-15 05:24:27 UTC,view,1004838,electronics.smartphone,oppo,0659028a-8ece-42d8-991f-12180c7ee9e2,483,158854


In [10]:
def compute_score(train, target, itemId):
    candidate_sessions = []
    for s in range(len(train)):
        if itemId in train[s][1]:
            candidate_sessions.append(train[s][1])
            
    score = 0
    for n in range(len(candidate_sessions)):
        score += jaccard(candidate_sessions[n], target)
    
    return score

In [37]:
categories = subset.loc[subset.sessionId==actual_session[0]]['category_code'].unique().tolist()
candidate_items = subset.loc[subset.category_code.isin(categories)]['itemId'].unique().tolist()
len(candidate_items)

1190

In [38]:
ranking = []
for i in range(len(candidate_items)):
    ranking.append((compute_score(train, target, candidate_items[i]), candidate_items[i]))

ranking.sort()
ranking.reverse()
print(ranking[0:10])

[(1095.2041239558525, 18), (569.8593082793881, 17), (450.40003494955977, 146), (385.45994554133887, 136), (385.2007971406133, 484), (289.93883927976356, 113), (262.1890167012076, 147), (165.2389084960345, 153), (136.572534320218, 352), (124.2570761022573, 0)]


In [39]:
subset.loc[subset.itemId==483]

Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session,itemId,sessionId
1033,2019-10-25 05:16:19 UTC,view,1004838,electronics.smartphone,oppo,000363e5-735b-483c-90c9-15b3674de2c1,483,296
2419,2019-10-20 05:44:38 UTC,view,1004838,electronics.smartphone,oppo,0006ca58-f3e4-48da-9d34-4393155a5046,483,619
2695,2019-10-16 15:40:31 UTC,view,1004838,electronics.smartphone,oppo,0007c11a-a649-4756-a77f-243493cd6da3,483,701
2696,2019-10-16 15:41:45 UTC,view,1004838,electronics.smartphone,oppo,0007c11a-a649-4756-a77f-243493cd6da3,483,701
3288,2019-10-09 14:37:12 UTC,view,1004838,electronics.smartphone,oppo,0009978a-59d0-4e69-adb0-6562776155dd,483,879
...,...,...,...,...,...,...,...,...
1999075,2019-10-24 08:05:50 UTC,cart,1004838,electronics.smartphone,oppo,13491c3c-3b46-4196-bddf-1e6464c5a8d7,483,483275
1999249,2019-10-28 15:49:19 UTC,view,1004838,electronics.smartphone,oppo,13499752-efa9-4c20-aa8d-fc5415175299,483,483320
1999804,2019-10-19 23:06:34 UTC,view,1004838,electronics.smartphone,oppo,134af63e-0534-4ab0-a56b-6b957c356bd9,483,483461
1999812,2019-10-13 13:04:42 UTC,view,1004838,electronics.smartphone,oppo,134afa6a-5c97-4a11-8189-bd7223f2d0a4,483,483464


***Exercício 01:*** A função compute_score() definida acima e explicada na aula, é a implementação do algoritmo Session-based KNN (S-KNN). Implemente uma variação da função que represente o algoritmo Sequential Session-based KNN (S-SKNN). Compare o desempenho de ambas as funções na recomendação do último item de uma sessão qualquer do conjunto de teste. Para fazer essa comparação, utilize a métrica Average Precision. 

In [14]:
# TODO

# Function that calculate the average precision for a session#
def average_precision(target_session, recommendations):
    # Get the recommended items
    recs_items = [rec[1] for rec in recommendations] 

    # Last item of the session (the target item that we want to recommend)
    relevant_item = target_session[-1]
    
    n_relevant_items = 0
    cumulative_precision = 0.0
    
    # Ierate over the recommendations and calc the precision
    for i, item in enumerate(recs_items):
        if item in relevant_item:
            n_relevant_items += 1
            # Precision at index i
            precision_at_i = n_relevant_items / (i + 1)
            cumulative_precision += precision_at_i
    
    # If no relevant item was found, return 0
    if n_relevant_items == 0:
        return 0.0
    
    # Calculate the average precision
    ap = cumulative_precision / n_relevant_items
    
    return ap


In [40]:
# Compute the recommendations using Session-based KMM (S-KNN)
ranking = []
for i in range(len(candidate_items)):
    ranking.append((compute_score(train, target, candidate_items[i]), candidate_items[i]))

ranking.sort()
ranking.reverse()

# Print the top 10 recommendations
print(ranking[0:10])

[(1095.2041239558525, 18), (569.8593082793881, 17), (450.40003494955977, 146), (385.45994554133887, 136), (385.2007971406133, 484), (289.93883927976356, 113), (262.1890167012076, 147), (165.2389084960345, 153), (136.572534320218, 352), (124.2570761022573, 0)]


In [41]:
# Calculate and print the Average Precision for S-KNN
print(f"Average Precision S-KNN: {average_precision(actual_session, ranking)}")

Average Precision S-KNN: 0.7273504273504273


In [17]:
# Function that calculate the score using Sequential Session-bases KNN (S-SKNN)
def compute_score_ssknn(train, target, itemId):
    candidate_sessions = []
    
    # Get all the sessions where the item appears
    for s in range(len(train)):
        if itemId in train[s][1]:
            candidate_sessions.append(train[s][1])
    
    score = 0
    
    # For each candidate session, calculate the weighted Jaccard similarity
    for n in range(len(candidate_sessions)):
        # Calculate the Jaccard similarity
        jaccard_sim = jaccard(candidate_sessions[n], target)
        
        weight = 0
        
        # Get the most recent item in target session that also appears in candidate session
        recent_common_item = None
        for i in range(len(target)-1, -1, -1):  
            if target[i] in candidate_sessions[n]:
                recent_common_item = target[i]
                break
        
        # If there is a recent commom item, calculate the weight
        if recent_common_item is not None:
            # Position of the recent item in target session
            target_position = target.index(recent_common_item) + 1
            
            # Calculate the weight
            weight = target_position / len(target) 

        # Multiply the Jaccard similarity by the weight
        weighted_jaccard = jaccard_sim * weight

        score += weighted_jaccard
    
    return score


In [42]:
# Compute the recommendations using Sequential Session-based KMM (S-SKNN)
ranking = []
for i in range(len(candidate_items)):
    ranking.append((compute_score_ssknn(train, target, candidate_items[i]), candidate_items[i]))

ranking.sort()
ranking.reverse()

# Print the top 10 recommendations
print(ranking[0:10])

[(244.0180301315601, 18), (194.57508856258903, 113), (180.43387959325736, 484), (174.59258790678928, 17), (173.23046258543127, 146), (133.5807685014861, 136), (84.56685102761746, 147), (78.75826133873976, 352), (67.04356056957289, 423), (65.46125620346633, 177)]


In [43]:
# Calculate and print the Average Precision for S-SKNN
print(f"Average Precision S-SKNN: {average_precision(actual_session, ranking)}")

Average Precision S-SKNN: 0.8911206077872743


***Exercício 02:*** Implemente outra variação da função compute_score() que represente o algoritmo Sequential Filter Session-based KNN (SF-SKNN). Compare o desempenho desse algoritmo com as demais abordagens para uma sessão qualquer do conjunto de teste.

In [20]:
# TODO

# Function that calculate the score using Sequential Filter Session-bases KNN (SF-SKNN)
def compute_score_sfsknn(train, target, itemId):
    candidate_sessions = []
    
    # Pegar o último item do target
    # Get the last item of target session
    last_item_target = target[-1]
    
    # Get all sessions that contains the last item of the target session followed by the desired item (itemId)
    for s in range(len(train)):
        session = train[s][1]
        
        if last_item_target in session:
            last_item_index = session.index(last_item_target)
            
            # Check if the item appears right after the last item of target session
            if last_item_index + 1 < len(session) and session[last_item_index + 1] == itemId:
                candidate_sessions.append(session)
    
    score = 0
    
    # For each valid candidate session, calculate the Jaccard similarity
    for n in range(len(candidate_sessions)):
        score += jaccard(candidate_sessions[n], target)
    
    return score


In [44]:
# Compute the recommendations using Sequential Filter Session-based KMM (SF-SKNN)
ranking = []
for i in range(len(candidate_items)):
    ranking.append((compute_score_sfsknn(train, target, candidate_items[i]), candidate_items[i]))

ranking.sort()
ranking.reverse()

# Print the top 10 recommendations
print(ranking[0:10])

[(21.69072508789284, 423), (5.186331904224047, 177), (3.37721115603125, 2278), (2.333279122112189, 605), (2.0720862470862467, 754), (1.7446600314553604, 483), (1.567925332321617, 37), (1.1582198770209249, 97), (0.9626800889475605, 180), (0.8717836629601335, 276)]


In [45]:
# Calculate and print the Average Precision for SF-SKNN
print(f"Average Precision SF-SKNN: {average_precision(actual_session, ranking)}")

Average Precision SF-SKNN: 0.36809295028429356


***Exercício 03:*** Na aula utilizamos uma estratégia trivial para selecionar itens candidatos para poder calcular seu escore: selecionamos apenas itens da mesma categoria que os itens que estão na sessão atual. Isso pode ser um problema, pois numa sessão, um usuário pode estar visualizando um produto e o sistema poderia recomendar um produto de outra categoria (exemplo: usuário visualiza/compra um smartphone, e o sistema recomenda uma capa protetora). Pense e implemente uma estratégia para selecionar os itens candidatos para os quais será calculado o escore via função compute_score(). Lembre-se de que quanto menor a quantidade de itens candidatos, mais rápido o sistema irá gerar a recomendação top N. Explique sua estratégia.

In [46]:
# TODO

# The stategy consists in select candidate items by combining items 
# from the same category and items purchased in related sessions

# Get the categories of the items of current session
categories = subset.loc[subset.sessionId==actual_session[0]]['category_code'].unique().tolist()

# Get all the items from the current session
session_items = subset.loc[subset.sessionId == actual_session[0]]['itemId'].tolist()

# Find the sessions that contain these items
related_sessions = subset.loc[subset.itemId.isin(session_items)]['sessionId'].unique().tolist()

# Select rows that correspond to the related sessions
related_sessions_df = subset[subset.sessionId.isin(related_sessions)]

# Filter to get only the items that were purchased in the related sessions
purchased_items_df = related_sessions_df[related_sessions_df.event_type == 'purchase']

# Extract the unique purchased item IDs from those sessions
purchased_items = purchased_items_df['itemId'].unique().tolist()

# Get candidate items based on category (those in the same category as current session items)
candidate_items = subset.loc[subset.category_code.isin(categories), 'itemId'].unique().tolist()

# Combine candidate items with purchased items, removing duplicates
candidate_items = list(set(candidate_items + purchased_items))

print(len(candidate_items))

1386


In [48]:
# Compute the recommendations using Sequential Filter Session-based KMM (SF-SKNN)
ranking_sknn = []
ranking_ssknn = []
ranking_sfsknn = []

for i in range(len(candidate_items)):
    ranking_sknn.append((compute_score(train, target, candidate_items[i]), candidate_items[i]))
    ranking_ssknn.append((compute_score_ssknn(train, target, candidate_items[i]), candidate_items[i]))
    ranking_sfsknn.append((compute_score_sfsknn(train, target, candidate_items[i]), candidate_items[i]))


ranking_sknn.sort()
ranking_sknn.reverse()

ranking_ssknn.sort()
ranking_ssknn.reverse()

ranking_sfsknn.sort()
ranking_sfsknn.reverse()

# Print the top 10 recommendations for each algorithm
print(f"SKNN Recommendations:\n{ranking_sknn[0:10]}")
print(f"\nS-SKNN Recommendations:\n{ranking_ssknn[0:10]}")
print(f"\nSF-SKNN Recommendations:\n{ranking_sfsknn[0:10]}\n")

# Calculate and print the Average Precision for each algorithm
print(f"Average Precision SKNN: {average_precision(actual_session, ranking_sknn)}")
print(f"Average Precision S-SKNN: {average_precision(actual_session, ranking_ssknn)}")
print(f"Average Precision SF-SKNN: {average_precision(actual_session, ranking_sfsknn)}")

SKNN Recommendations:
[(1095.2041239558525, 18), (569.8593082793881, 17), (450.40003494955977, 146), (385.45994554133887, 136), (385.2007971406133, 484), (289.93883927976356, 113), (262.1890167012076, 147), (165.2389084960345, 153), (136.572534320218, 352), (124.2570761022573, 0)]

S-SKNN Recommendations:
[(244.0180301315601, 18), (194.57508856258903, 113), (180.43387959325736, 484), (174.59258790678928, 17), (173.23046258543127, 146), (133.5807685014861, 136), (84.56685102761746, 147), (78.75826133873976, 352), (67.04356056957289, 423), (65.46125620346633, 177)]

SF-SKNN Recommendations:
[(21.69072508789284, 423), (5.186331904224047, 177), (3.37721115603125, 2278), (2.333279122112189, 605), (2.0720862470862467, 754), (1.7446600314553604, 483), (1.567925332321617, 37), (1.1582198770209249, 97), (0.9626800889475605, 180), (0.8717836629601335, 276)]

Average Precision SKNN: 0.7273504273504273
Average Precision S-SKNN: 0.8911206077872743
Average Precision SF-SKNN: 0.3679720928873892
