In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys  
sys.path.insert(0, '..')

In [47]:
from main import get_tafeng_graph
from knowledge_graph.datasets import KgPosNegTriples, TimeSplittedDataset, KgCustomers
from knowledge_graph.layer_generators import LayerNodeGenerator
from utils import get_dates_for_split, get_graph_splits, get_test_interactions
from models.Model import Model
from models.config import Config
from toolz import valmap
import numpy as np
import torch
from datetime import datetime

In [4]:
knowledge_graph = get_tafeng_graph(user_k_core=2, item_k_core=1)

2021-05-09 16:29:36,986 - numexpr.utils - [INFO] - Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2021-05-09 16:29:36,987 - numexpr.utils - [INFO] - NumExpr defaulting to 8 threads.
2021-05-09 16:29:39,882 - TaFengGraph - [INFO] - loading entities
2021-05-09 16:29:40,181 - TaFengGraph - [INFO] - loading relations
2021-05-09 16:31:07,311 - TaFengGraph - [INFO] - loaded purchase
2021-05-09 16:31:58,514 - TaFengGraph - [INFO] - loaded bought_in
2021-05-09 16:32:51,999 - TaFengGraph - [INFO] - loaded belongs_to_age_group
2021-05-09 16:33:45,214 - TaFengGraph - [INFO] - loaded belongs_to_subclass


In [5]:
len(knowledge_graph.entity_set)

55868

In [6]:
len(knowledge_graph.entity_set.customer)

30130

In [7]:
timestamps = knowledge_graph.relation_set.get_all_timestamps()
splitting_points = get_dates_for_split(timestamps, n_points=3)

In [8]:
splitting_points

[datetime.datetime(2000, 11, 28, 3, 0),
 datetime.datetime(2001, 1, 1, 3, 0),
 datetime.datetime(2001, 1, 30, 3, 0)]

In [9]:
splits = get_graph_splits(knowledge_graph, splitting_points)

2021-05-09 16:33:46,583 - TaFengGraph - [INFO] - converting purchase
2021-05-09 16:33:49,842 - TaFengGraph - [INFO] - converting belongs_to_subclass
2021-05-09 16:33:50,394 - TaFengGraph - [INFO] - converting belongs_to_age_group
2021-05-09 16:33:50,642 - TaFengGraph - [INFO] - converting bought_in
2021-05-09 16:33:51,720 - TaFengGraph - [INFO] - converting purchase
2021-05-09 16:33:54,584 - TaFengGraph - [INFO] - converting belongs_to_subclass
2021-05-09 16:33:54,799 - TaFengGraph - [INFO] - converting belongs_to_age_group
2021-05-09 16:33:55,047 - TaFengGraph - [INFO] - converting bought_in
2021-05-09 16:33:56,372 - TaFengGraph - [INFO] - converting purchase
2021-05-09 16:33:59,081 - TaFengGraph - [INFO] - converting belongs_to_subclass
2021-05-09 16:33:59,337 - TaFengGraph - [INFO] - converting belongs_to_age_group
2021-05-09 16:33:59,623 - TaFengGraph - [INFO] - converting bought_in
2021-05-09 16:34:01,445 - TaFengGraph - [INFO] - converting purchase
2021-05-09 16:34:04,842 - TaFen

In [10]:
len(splits)

4

In [11]:
train_splits = splits[:-1]
test_split = splits[-1]

In [12]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Init model

In [13]:
config = Config(
    entity_embedding_dim=10,
    relation_embedding_dim=10,
    n_entities=len(knowledge_graph.entity_set),
    n_relations=len(knowledge_graph.relation_set),
    n_layers=2,
    transR_l2_weight=0.05,
    concat_layers=True
)
model = Model(
    config,
    layer_generators=[LayerNodeGenerator(split, n_neighbours=5) for split in train_splits],
    device=device
)

# TransR part

In [14]:
pos_neg_triples_ds = [KgPosNegTriples(split) for split in train_splits]
ts_ds = TimeSplittedDataset(pos_neg_triples_ds)

In [15]:
pos_neg_triples_ds[0][1]

(563, 0, 34149, 9473)

In [16]:
pos_neg_triples_ds[0][1:3]

PosNegBatch(head=[563, 39064], relation=[0, 0], pos_tail=[34149, 8399], neg_tail=[15168, 31438])

In [17]:
batches = list(map(lambda x: x.data, ts_ds[1:5]))

In [18]:
batches

[PosNegBatch(head=[563, 39064, 35030, 32840], relation=[0, 0, 0, 0], pos_tail=[33046, 14114, 1006, 9332], neg_tail=[10465, 40378, 3238, 34403]),
 PosNegBatch(head=[50625, 21613, 16031, 29728], relation=[3, 0, 0, 2], pos_tail=[32153, 32418, 37087, 32142], neg_tail=[21705, 18745, 17074, 36062]),
 PosNegBatch(head=[15033, 39476, 44290, 13502], relation=[2, 0, 0, 2], pos_tail=[32146, 14443, 22932, 32142], neg_tail=[33068, 55405, 22532, 868])]

In [19]:
time_outputs = model(
    list(map(lambda x: list(set(x.head + x.pos_tail + x.neg_tail)), batches)),
    mode='transR'
)

In [20]:
len(time_outputs)

3

In [21]:
time_outputs[0]

{10465: tensor([ 0.6276, -0.0023,  0.4895, -0.0070,  0.0892,  0.0228,  0.5493, -0.0086,
         -0.0046,  0.2370], grad_fn=<DivBackward0>),
 14114: tensor([ 6.8483e-01, -1.9292e-03,  4.4356e-02, -1.4948e-04, -2.1145e-03,
          1.7615e-01,  6.5749e-01, -5.1472e-03,  1.5572e-01,  2.0356e-01],
        grad_fn=<DivBackward0>),
 34403: tensor([ 0.9238,  0.0138,  0.0897, -0.0056, -0.0051,  0.2871,  0.2360, -0.0022,
         -0.0074, -0.0040], grad_fn=<DivBackward0>),
 3238: tensor([ 0.5844,  0.1118,  0.2255, -0.0047, -0.0016,  0.6616,  0.3966, -0.0041,
         -0.0011, -0.0009], grad_fn=<DivBackward0>),
 32840: tensor([ 4.5381e-01, -4.2713e-04,  9.7117e-02,  6.8263e-02, -5.7411e-03,
         -1.5957e-03,  8.8313e-01, -1.4483e-03, -1.0739e-03, -1.4975e-03],
        grad_fn=<DivBackward0>),
 1006: tensor([ 0.8356, -0.0029,  0.3361, -0.0058,  0.0682,  0.1744,  0.3916, -0.0076,
         -0.0018,  0.0144], grad_fn=<DivBackward0>),
 563: tensor([ 0.8232, -0.0010,  0.2261, -0.0049,  0.0495,  

In [22]:
model(
    list(set(batches[0].head + batches[0].pos_tail + batches[0].neg_tail)),
    mode='transR'
)

[{10465: tensor([ 6.6372e-01, -3.1680e-03,  4.7953e-01, -5.6688e-03,  1.0736e-01,
          -2.0546e-04,  5.1009e-01, -8.2295e-03, -3.0152e-03,  2.4020e-01],
         grad_fn=<DivBackward0>),
  14114: tensor([ 5.0055e-01, -7.8070e-04,  4.4055e-02, -1.2958e-03, -2.0540e-03,
           2.5683e-01,  8.0195e-01, -7.0739e-03,  8.4307e-02,  1.7680e-01],
         grad_fn=<DivBackward0>),
  34403: tensor([ 8.8753e-01, -4.3606e-04,  1.2105e-01, -5.4741e-03, -4.3328e-03,
           5.9953e-02,  4.4038e-01, -3.3498e-03, -6.1324e-03, -3.1645e-03],
         grad_fn=<DivBackward0>),
  3238: tensor([ 0.7059,  0.0119,  0.1919, -0.0016, -0.0048,  0.1958,  0.6530, -0.0057,
          -0.0017, -0.0041], grad_fn=<DivBackward0>),
  32840: tensor([ 6.5789e-01, -1.6645e-03, -4.0736e-04, -2.3901e-04, -7.2559e-03,
          -3.9195e-04,  7.5307e-01, -3.1051e-04, -1.4953e-03, -2.4724e-03],
         grad_fn=<DivBackward0>),
  1006: tensor([ 0.9047, -0.0027,  0.2836, -0.0049,  0.0329,  0.0682,  0.3075, -0.0066,
  

In [23]:
model.transR_loss(batches, time_outputs)

tensor(0.8795, grad_fn=<DivBackward0>)

# Recommender part

In [24]:
customer_indices = list(map(lambda x: knowledge_graph.entity_set.entity2idx[x], knowledge_graph.entity_set.customer))
product_indices = list(map(lambda x: knowledge_graph.entity_set.entity2idx[x], knowledge_graph.entity_set.product))

In [25]:
len(product_indices), len(customer_indices)

(23709, 30130)

In [26]:
customer_dataset = KgCustomers(
    splits=train_splits,
    customer_indices=customer_indices,
    product_indices=product_indices,
    purchase_relation_idx=knowledge_graph.relation_set.relation2idx['purchase']
)

In [27]:
len(customer_dataset.customers_to_iterate_over)

5791

In [28]:
customer_dataset.sample_pos_products_for_customer(splits[0], customer_indices[0], 4)

[32374, 32195, 32308, 32159]

In [29]:
customer_dataset.sample_neg_products_for_customer(splits[0], customer_indices[0], 4)

[34350, 55199, 44647, 40712]

In [30]:
batch = customer_dataset[:30]

In [31]:
batch

CustomerPosNegBatch(customer=[9940, 14483, 5665, 8140, 4501, 4065, 6676, 852, 1693, 8685, 981, 13512, 4415, 8947, 9366, 14807, 4550, 2711, 13917, 144, 31063, 6532, 11983, 7015, 4249, 4053, 3421, 11574, 529, 14047], pos_products=tensor([[32207, 47429],
        [35049, 35808],
        [34118, 44658],
        [39047, 43304],
        [32568, 50960],
        [37160, 36576],
        [34900, 36047],
        [39200, 34173],
        [32885, 35305],
        [38833, 35500],
        [40465, 34900],
        [41572, 36426],
        [34382, 33085],
        [33747, 54397],
        [33928, 34906],
        [49161, 49838],
        [33388, 33579],
        [35890, 36358],
        [42853, 47884],
        [35509, 34289],
        [44562, 34006],
        [32185, 32466],
        [42443, 36982],
        [33486, 33365],
        [32621, 46056],
        [32855, 33735],
        [39815, 35609],
        [39234, 37888],
        [36779, 32330],
        [36952, 46215]]), neg_products=tensor([[45779, 38700],
        [4175

In [32]:
batch.pos_products.T.numpy().tolist()[0]

[32207,
 35049,
 34118,
 39047,
 32568,
 37160,
 34900,
 39200,
 32885,
 38833,
 40465,
 41572,
 34382,
 33747,
 33928,
 49161,
 33388,
 35890,
 42853,
 35509,
 44562,
 32185,
 42443,
 33486,
 32621,
 32855,
 39815,
 39234,
 36779,
 36952]

In [33]:
batch.pos_products.T.numpy().tolist()[0]

[32207,
 35049,
 34118,
 39047,
 32568,
 37160,
 34900,
 39200,
 32885,
 38833,
 40465,
 41572,
 34382,
 33747,
 33928,
 49161,
 33388,
 35890,
 42853,
 35509,
 44562,
 32185,
 42443,
 33486,
 32621,
 32855,
 39815,
 39234,
 36779,
 36952]

In [34]:
model._embed_nodes(batch.pos_products.T.numpy().tolist()[0], concat_layers=True, time_start=0).size()

torch.Size([30, 3, 20])

In [35]:
model._embed_nodes(batch.pos_products.T.numpy().tolist()[0], concat_layers=False, time_start=0).size()

torch.Size([30, 3, 10])

In [36]:
model.recommender_task_loss(batch)

tensor(0.7091, grad_fn=<DivBackward0>)

# Get customer representations from the last lstm timestep

In [37]:
customer_embeddings = model(customer_dataset.customers_to_iterate_over, mode='customers')

In [38]:
customer_embeddings.size(), len(customer_dataset.customers_to_iterate_over)

(torch.Size([5791, 20]), 5791)

In [41]:
del customer_embeddings

# Get product representations from the last timestep

In [39]:
product_embeddings = model(product_indices, mode='products')

In [40]:
product_embeddings.size(), len(product_indices)

(torch.Size([23709, 20]), 23709)

In [42]:
del product_embeddings

# Get similarity scores for every user item pair

In [100]:
test_interactions = get_test_interactions(
    customer_indices, test_split, knowledge_graph.relation_set.relation2idx['purchase']
)

In [108]:
def precision_at_k(hits: np.ndarray, k: int) -> np.ndarray:
    return hits[:, :k].mean(axis=1)

def recall_at_k(hits: np.ndarray, k: int) -> np.ndarray:
    return hits[:, :k].sum(axis=1) / hits.sum(axis=1)

def ndcg_at_k(hits: np.ndarray, k: int) -> np.ndarray:
    def dcg_at_k(hits_k: np.ndarray) -> np.ndarray:
        return np.sum((2**hits_k - 1) / np.log2(np.arange(2, k + 2)), axis=1)

    hits_k = hits[:, :k]
    dcg = dcg_at_k(hits_k)
    sorted_hits_k = np.sort(hits_k)[::-1]
    
    idcg = dcg_at_k(sorted_hits_k)
    idcg[idcg == 0] = np.inf
    
    return dcg / idcg

In [109]:
def calculate_metrics_at_k(k: int, scores: torch.Tensor, interactions_list: list) -> tuple:
    
    interactions_matrix = np.zeros((scores.size(0), scores.size(1)))
    for idx, interactions in enumerate(interactions_list):
        interactions_matrix[idx, interactions] = 1
    
    _, sorted_items = torch.sort(scores, descending=True)
    sorted_items = sorted_items.numpy()
    
    binary_hits = np.zeros_like(interactions_matrix)
    for idx, items in enumerate(sorted_items):
        binary_hits[idx, :] = interactions_matrix[idx, items]

    precision = precision_at_k(binary_hits, k)
    recall = recall_at_k(binary_hits, k)
    ndcg = ndcg_at_k(binary_hits, k)
        
    return precision, recall, ndcg

In [112]:
def evaluate(model: Model, interactions_dict: dict, product_ids: list, batch_size: int, k: int) -> tuple:
    
    customer_ids = list(interactions_dict.keys())
    
    model.eval()
    
    precision_k = []
    recall_k = []
    ndcg_k = []
    
    with torch.no_grad():
        product_embeddings = model(product_ids, mode='products')
        
        product_to_idx = {product: idx for idx, product in enumerate(product_ids)}
        
        interactions_dict = valmap(
            lambda x_list: list(map(lambda item: product_to_idx[item], x_list)),
            interactions_dict
        )
        
        batch_idx = 0
        n_batches = len(customer_ids) // batch_size + 1
        for _ in range(n_batches):
            customer_batch = customer_ids[batch_idx: batch_idx + batch_size]
            interactions_list = [interactions_dict[customer] for customer in customer_batch] 
            scores = model(customer_batch, mode='customers') @ product_embeddings.T
            batch_precision, batch_recall, batch_ndcg = calculate_metrics_at_k(k, scores, interactions_list)
            precision_k.append(batch_precision)
            recall_k.append(batch_recall)
            ndcg_k.append(batch_ndcg)
            break
            
    precision_k = sum(np.concatenate(precision_k)) / len(customer_ids)
    recall_k = sum(np.concatenate(recall_k)) / len(customer_ids)
    ndcg_k = sum(np.concatenate(ndcg_k)) / len(customer_ids)
    
    return precision_k, recall_k, ndcg_k

In [114]:
evaluate(model, test_interactions, product_indices, 32, 20)

(3.017319413433106e-06, 3.017319413433106e-06, 0.0)