# Aula 10 - Recomendação baseada em sessão - Exercícios

In [1]:
import pandas as pd
import numpy as np

### Baixar a base de dados

In [None]:
# Fazer download no link: https://www.kaggle.com/datasets/mkechinov/ecommerce-behavior-data-from-multi-category-store?resource=download&select=2019-Oct.csv
# !tar -xvzf 2019-Oct.csv.zip
# ! unzip 2019-Oct.csv.zip

In [None]:
data = pd.read_csv('./2019-Oct.csv')
data.head()

In [None]:
# drop NaN values in specific columns
data = data.dropna(subset=["category_code", "brand", "user_session", "product_id"])

# keep only relevant columns in our dataset
data = data[["event_time", "event_type", "product_id", "category_code", "brand", "user_session"]]
data.head()

In [None]:
data.sort_values(by=['user_session', 'event_time'], inplace=True, ignore_index=True)
data.head()

In [None]:
# select where to split the data
split_at = 2000000

#  make sure the split doesn't cut off session data
while data["user_session"].iloc[split_at-1] == data["user_session"].iloc[split_at]:
    split_at += 1
    
# perform the split
split_range = list(range(0, split_at))
subset = data.iloc[split_range]
subset.shape

In [None]:
subset.to_csv('2019-Oct-sample.csv', index=False, header=['event_time', 'event_type', 'product_id', 'category_code', 'brand', 'user_session'], sep=',')

### Leitura do arquivo 2019-Oct-sample.csv (vide Aula 10 - Exemplos) caso não possua o arquivo

In [2]:
subset = pd.read_csv('./2019-Oct-sample.csv')
subset.head()

Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session
0,2019-10-31 06:23:12 UTC,view,1005115,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c
1,2019-10-31 06:23:52 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c
2,2019-10-31 06:25:30 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c
3,2019-10-31 06:26:58 UTC,view,1004858,electronics.smartphone,samsung,00000056-a206-40dd-b174-a072550fa38c
4,2019-10-31 06:28:21 UTC,view,1005104,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c


In [3]:
map_items = {item: idx for idx, item in enumerate(subset.product_id.unique())}
map_sessions = {item: idx for idx, item in enumerate(subset.user_session.unique())}
subset['itemId'] = subset['product_id'].map(map_items)
subset['sessionId'] = subset['user_session'].map(map_sessions)
subset.head()

Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session,itemId,sessionId
0,2019-10-31 06:23:12 UTC,view,1005115,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c,0,0
1,2019-10-31 06:23:52 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c,1,0
2,2019-10-31 06:25:30 UTC,view,1005105,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c,1,0
3,2019-10-31 06:26:58 UTC,view,1004858,electronics.smartphone,samsung,00000056-a206-40dd-b174-a072550fa38c,2,0
4,2019-10-31 06:28:21 UTC,view,1005104,electronics.smartphone,apple,00000056-a206-40dd-b174-a072550fa38c,3,0


In [4]:
n_items = subset['itemId'].max()+1
print('No. items: ', n_items)
n_sessions = subset['sessionId'].max()+1
print('No. sessions: ', n_sessions)

No. items:  42581
No. sessions:  483508


In [5]:
# create a dataset
# remove sessions with less than 2 items
def create_data(df):
    df.sort_values(by=['sessionId', 'event_time'], inplace=True, ignore_index=True)
    sessions, session = [], []
    for index, value in df.iterrows():
        if index != 0:
            if value["sessionId"] == df.at[index-1, "sessionId"]:
                if value["event_type"] == 'view':
                    session.append(value["itemId"])
            else:
                if len(session) > 1:
                    sessions.append((df.at[index-1, "sessionId"], session))
                session = [value["itemId"]]
        else:
            session.append(value["itemId"])
    return sessions

In [6]:
import os
import pickle

def cache(filename : str, func : callable, *args, **kwargs):
    # Ensure the precomp directory exists
    os.makedirs('./cache', exist_ok=True)
    
    # Full file path in the precomp directory
    file_path = f'./cache/{filename}'
    
    # Try loading the data from the file
    try:
        if os.path.exists(file_path):
            print(f"Loading data from {file_path}...")
            with open(file_path, 'rb') as f:
                return pickle.load(f)
        else:
            raise FileNotFoundError
    except (FileNotFoundError, EOFError, pickle.PickleError):
        print(f"File not found or corrupted. Precomputing data...")
        
        # Precompute the data by calling the passed function with *args and **kwargs
        data = func(*args, **kwargs)
        
        # Save the data to the file for future use
        with open(file_path, 'wb') as f:
            pickle.dump(data, f)
        
        return data


In [7]:
# Use precomp to either load or compute
sessions = cache('sessions_data.pkl', create_data, subset)
sessions

File not found or corrupted. Precomputing data...


[(0, [0, 1, 1, 2, 3, 4, 5]),
 (1, [6, 7, 8, 9, 10, 11, 12, 9, 13, 9, 0, 14, 1, 15, 16, 17]),
 (2, [18, 18, 19, 20, 20, 20]),
 (4, [22, 23]),
 (5, [24, 25, 26, 25]),
 (8, [1, 1, 15]),
 (9, [29, 30, 30, 31]),
 (10, [32, 32, 33, 34, 34]),
 (11, [31, 31, 35, 35, 35]),
 (12, [36, 37]),
 (19, [43, 43, 43, 44, 43]),
 (21, [46, 47, 47, 48, 48]),
 (22, [49, 49, 50]),
 (24, [52, 53]),
 (25, [54, 55]),
 (27, [57, 58, 58, 58]),
 (28, [14, 59, 14]),
 (30, [61, 61, 61, 61]),
 (31, [62, 63, 63, 64, 65, 65, 65]),
 (32, [66, 66, 66, 67, 68, 69, 70, 70]),
 (34, [31, 31, 31, 71, 72, 72, 73, 74]),
 (36, [76, 77, 77, 76, 78]),
 (38, [80, 80]),
 (39, [81, 82]),
 (42, [84, 85]),
 (43, [86, 25, 87, 87]),
 (47, [91, 92]),
 (49, [18, 18]),
 (54, [97, 98, 99, 100, 101, 102, 102, 103]),
 (55, [104, 105, 106, 106, 107, 108, 109, 110, 111]),
 (58, [113, 114, 113]),
 (59, [115, 115]),
 (63, [18, 73, 73, 73, 73, 73]),
 (64, [119, 119, 120, 120, 120, 119]),
 (66, [122, 123, 124, 125]),
 (67, [126, 126, 126]),
 (68, [6

In [8]:
import random

random.shuffle(sessions)
split = len(sessions) * 0.8
train = sessions[:int(split)]
test = sessions[int(split):]
print('No. train sessions: ', len(train))
print('No. test sessions: ', len(test))

No. train sessions:  237531
No. test sessions:  59383


In [9]:
def jaccard(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

In [10]:
def pre_jaccard(ses, target, jaccard_func=jaccard):
    return {
        k: jaccard_func(v, target) for k, v in ses
    }

In [11]:
actual_session = test[37]
target = actual_session[1][0:-1]
print(actual_session)
print(target)
subset.loc[subset.sessionId==actual_session[0]]

(257392, [17, 17, 17, 17, 17, 17, 17])
[17, 17, 17, 17, 17, 17]


Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session,itemId,sessionId
1064861,2019-10-24 02:45:18 UTC,view,1004870,electronics.smartphone,samsung,0a4725aa-7e48-4474-ac36-213e2a82f7f4,17,257392
1064862,2019-10-24 03:19:46 UTC,view,1004870,electronics.smartphone,samsung,0a4725aa-7e48-4474-ac36-213e2a82f7f4,17,257392
1064863,2019-10-24 05:08:34 UTC,view,1004870,electronics.smartphone,samsung,0a4725aa-7e48-4474-ac36-213e2a82f7f4,17,257392
1064864,2019-10-24 05:08:46 UTC,view,1004870,electronics.smartphone,samsung,0a4725aa-7e48-4474-ac36-213e2a82f7f4,17,257392
1064865,2019-10-24 06:19:00 UTC,view,1004870,electronics.smartphone,samsung,0a4725aa-7e48-4474-ac36-213e2a82f7f4,17,257392
1064866,2019-10-24 06:19:17 UTC,view,1004870,electronics.smartphone,samsung,0a4725aa-7e48-4474-ac36-213e2a82f7f4,17,257392
1064867,2019-10-24 06:22:24 UTC,purchase,1004870,electronics.smartphone,samsung,0a4725aa-7e48-4474-ac36-213e2a82f7f4,17,257392
1064868,2019-10-24 06:22:59 UTC,view,1004870,electronics.smartphone,samsung,0a4725aa-7e48-4474-ac36-213e2a82f7f4,17,257392


Observação: por conta do custo computacional, iremos medir a precisão apenas com a predição do de um target. Isso implica que nossa medida não é muito estatisticamente significante.

In [12]:
def check_precision(pred, actual=actual_session):
    for idx, (val, i) in enumerate(pred):
        if i == actual[0]:
            print('At', idx, 'Predicted: ', i, 'Actual: ', actual[0], 'Jaccard: ', val)
            return 1 / len(pred)
        
    print('Not found')
    return 0

In [13]:
jaccard_dict = cache('jaccard', pre_jaccard, train, target)
jaccard_dict

File not found or corrupted. Precomputing data...


{154963: 0.0,
 347212: 0.0,
 341709: 0.0,
 269547: 0.0,
 50073: 0.0,
 250828: 0.0,
 330563: 0.0,
 203451: 0.0,
 389268: 0.0,
 357342: 0.0,
 471082: 0.0,
 114103: 0.0,
 250677: 0.0,
 224666: 0.0,
 329717: 0.0,
 391929: 0.0,
 31247: 0.0,
 85312: 0.0,
 229047: 0.0,
 147931: 0.0,
 117336: 0.0,
 298837: 0.07142857142857142,
 44256: 0.0,
 277091: 0.0,
 96770: 0.0,
 102391: 0.0,
 41245: 0.0,
 118673: 0.0,
 176342: 0.0,
 402107: 0.0,
 116736: 0.0,
 28747: 0.0,
 246418: 0.0,
 382283: 0.0,
 249048: 0.0,
 106460: 0.0,
 472312: 0.0,
 352136: 0.0,
 78121: 0.0,
 154469: 0.0,
 14848: 0.0,
 308254: 0.0,
 338588: 0.0,
 395489: 0.0,
 380349: 0.0,
 397682: 0.0,
 178029: 0.0,
 470258: 0.0,
 365109: 0.0,
 390523: 0.0,
 252763: 0.0,
 343452: 0.0,
 238076: 0.0,
 481812: 0.0,
 353155: 0.0,
 156204: 0.0,
 176407: 0.0,
 77466: 0.0,
 83748: 0.0,
 174163: 0.0,
 223478: 0.0,
 164624: 0.09090909090909091,
 88975: 0.0,
 112176: 0.0,
 278689: 0.0,
 371776: 0.0,
 396521: 0.0,
 270645: 0.0,
 411068: 0.0,
 210027: 0.0,


In [14]:
def compute_score(train, target, itemId, jaccard_dict):
    candidate_sessions = []
    for ses, seq in train:
        if itemId in seq:
            candidate_sessions.append((ses, seq))
    
    score = 0
    for ses, seq in candidate_sessions:
        score += jaccard_dict.get(ses, 0)
    
    return score


In [15]:
categories = subset.loc[subset.sessionId==actual_session[0]]['category_code'].unique().tolist()
candidate_items = subset.loc[subset.category_code.isin(categories)]['itemId'].unique().tolist()
len(candidate_items)

1190

In [16]:
def comp_ranking(train, target, candidate_items, jaccard_dict, score_func=compute_score):
    _ranking = []
    
    for i, candidate in enumerate(candidate_items):
        _score = score_func(train, target, candidate, jaccard_dict)
        _ranking.append( (_score, candidate) )
        
        percentage = (i+1) / len(candidate_items) * 100
        print(f'Progress: {percentage:.2f}%', end='\r')
        
    print("All done!")
    
    _ranking.sort()
    _ranking.reverse()
    
    return _ranking

In [17]:
ranking = cache('ranking.pkl', comp_ranking, train, target, candidate_items, jaccard_dict)

ranking[0:10]

File not found or corrupted. Precomputing data...
All done! 100.00%


[(533.2718897183382, 17),
 (130.8356948056746, 136),
 (67.19312896331432, 146),
 (57.86484510729358, 18),
 (44.554054701151685, 153),
 (40.55997553575742, 147),
 (25.465538649807485, 201),
 (23.217619399412065, 6),
 (22.56573744156284, 0),
 (21.565681089681526, 210)]

In [18]:
subset.loc[subset.itemId==21083]

Unnamed: 0,event_time,event_type,product_id,category_code,brand,user_session,itemId,sessionId
262378,2019-10-28 11:56:34 UTC,view,21408343,electronics.clocks,romanson,02880533-d7ce-45bc-bc70-5d348203de11,21083,63162
329712,2019-10-11 15:52:48 UTC,view,21408343,electronics.clocks,romanson,032e7ca0-c78a-4167-b40b-e695e86a2f03,21083,79442
361345,2019-10-08 18:25:15 UTC,view,21408343,electronics.clocks,romanson,037aa64f-9798-4a10-b692-6d60d757ba6b,21083,87016
493880,2019-10-30 13:46:25 UTC,view,21408343,electronics.clocks,romanson,04c22360-b6c7-47bb-af50-501b85ce3399,21083,119192
657797,2019-10-08 17:07:25 UTC,view,21408343,electronics.clocks,romanson,0658adfb-8f43-4869-8f68-f11e16a40e45,21083,158819
667941,2019-10-09 07:40:36 UTC,view,21408343,electronics.clocks,romanson,06725817-89ff-4b8f-8c1a-adda02bd13a8,21083,161355
714952,2019-10-15 16:32:32 UTC,view,21408343,electronics.clocks,romanson,06e60861-aecc-4544-866b-a01499926aae,21083,172626
714953,2019-10-15 16:33:16 UTC,view,21408343,electronics.clocks,romanson,06e60861-aecc-4544-866b-a01499926aae,21083,172626
909150,2019-10-19 01:05:03 UTC,view,21408343,electronics.clocks,romanson,08c3c7ba-3874-45a7-a836-5952a02ed413,21083,219468
1055780,2019-10-07 10:35:35 UTC,view,21408343,electronics.clocks,romanson,0a3039ef-a7d3-4101-959b-30003cb161d4,21083,255199


In [19]:
check_precision(ranking)

Not found


0

***Exercício 01:*** A função compute_score() definida acima e explicada na aula, é a implementação do algoritmo Session-based KNN (S-KNN). Implemente uma variação da função que represente o algoritmo Sequential Session-based KNN (S-SKNN). Compare o desempenho de ambas as funções na recomendação do último item de uma sessão qualquer do conjunto de teste. Para fazer essa comparação, utilize a métrica Average Precision. 

In [20]:
def ssknn_one_func(s : list[int], n : list[int]):
    for i in range(len(s)-1, -1, -1):
        if s[i] in n:
            return i / len(s)
    return 0

def ssknn_compute_score(train, target, itemId, jaccard_dict):
    candidate_sessions = []
    for ses, seq in train:
        if itemId in seq:
            candidate_sessions.append((ses, seq))
    
    score = 0
    for ses, seq in candidate_sessions:
        score += jaccard_dict.get(ses, 0) * ssknn_one_func(seq, target)
    
    return score

In [21]:
ssknn_ranking = cache('ssknn_ranking.pkl', comp_ranking, train, target, candidate_items, jaccard_dict, ssknn_compute_score)

ssknn_ranking[0:10]

File not found or corrupted. Precomputing data...
All done! 100.00%


[(222.26443418631638, 17),
 (60.85153144201054, 136),
 (23.838793334716037, 18),
 (23.2101587054539, 146),
 (16.88079314462143, 147),
 (16.330484661848285, 153),
 (9.932685520103192, 0),
 (9.741686861545402, 201),
 (8.137228387640171, 285),
 (8.00502806552571, 6)]

In [22]:
check_precision(ssknn_ranking)

Not found


0

***Exercício 02:*** Implemente outra variação da função compute_score() que represente o algoritmo Sequential Filter Session-based KNN (SF-SKNN). Compare o desempenho desse algoritmo com as demais abordagens para uma sessão qualquer do conjunto de teste.

In [23]:
def sfknn_compute_score(train, target, itemId, jaccard_dict):
    last = target[-1]
    score = 0
    for ses, seq in train:
        # Check if 'itemId' occurs immediately after 'last' in 'seq'
        if any(x == last and y == itemId for x, y in zip(seq[:-1], seq[1:])):
            score += jaccard_dict.get(ses, 0)
    return score


In [24]:
sfknn_raking = cache('sfknn_ranking.pkl', comp_ranking, train, target, candidate_items, jaccard_dict, sfknn_compute_score)

sfknn_raking[0:10]

File not found or corrupted. Precomputing data...
All done! 100.00%


[(246.7788023062898, 17),
 (40.85011933143059, 146),
 (38.54667309051084, 136),
 (24.176637123859685, 153),
 (13.612539177369928, 18),
 (11.10044724823541, 147),
 (10.06056968178851, 201),
 (8.714885675345595, 6),
 (7.522923598635667, 210),
 (7.282152728038273, 285)]

In [25]:
check_precision(sfknn_raking)

Not found


0

***Exercício 03:*** Na aula utilizamos uma estratégia trivial para selecionar itens candidatos para poder calcular seu escore: selecionamos apenas itens da mesma categoria que os itens que estão na sessão atual. Isso pode ser um problema, pois numa sessão, um usuário pode estar visualizando um produto e o sistema poderia recomendar um produto de outra categoria (exemplo: usuário visualiza/compra um smartphone, e o sistema recomenda uma capa protetora). Pense e implemente uma estratégia para selecionar os itens candidatos para os quais será calculado o escore via função compute_score(). Lembre-se de que quanto menor a quantidade de itens candidatos, mais rápido o sistema irá gerar a recomendação top N. Explique sua estratégia.

Proposta: Partindo do pressuposto que consumidores desenvolvem preferências por marcas e tendem a comprar itens da mesma marca (mesmo itens bem diferentes), podemos adicionar à lista de candidatos itens da mesma marca, porem de outras categorias. Isso pode aumentar a precisão, mas irá aumentar o custo computacional.

In [26]:
brands = subset.loc[subset.sessionId==actual_session[0]]['brand'].unique().tolist()
candidates_q3 = subset.loc[(subset.category_code.isin(categories)) | (subset.brand.isin(brands))]['itemId'].unique().tolist()
len(candidates_q3)

1689

In [27]:
ssknn_ranking_q3 = cache('ssknn_ranking_q3.pkl', comp_ranking, train, target, candidates_q3, jaccard_dict, ssknn_compute_score)
ssknn_ranking_q3[0:10]

File not found or corrupted. Precomputing data...
All done! 100.00%


[(222.26443418631638, 17),
 (60.85153144201054, 136),
 (23.838793334716037, 18),
 (23.2101587054539, 146),
 (16.88079314462143, 147),
 (16.330484661848285, 153),
 (9.932685520103192, 0),
 (9.741686861545402, 201),
 (8.137228387640171, 285),
 (8.00502806552571, 6)]

In [28]:
check_precision(ssknn_ranking_q3)

Not found


0