In [1]:
%cd ..
from relbench.datasets import get_dataset
from tqdm import tqdm
import numpy as np
from utils.data import StackDataset

/home/lingze/embedding_fusion


In [2]:
# Load the Stack dataset from the local relbench cache and get its Database object.
# The cache directory is derived from the current user's home directory instead of
# a hard-coded absolute path, so the notebook is not tied to one machine account.
import os

stack_cache_dir = os.path.expanduser("~/.cache/relbench/stack")
dataset = StackDataset(cache_dir=stack_cache_dir)
db = dataset.get_db()

Loading Database object from /home/lingze/.cache/relbench/stack/db...
Done in 9.84 seconds.


In [3]:
# Report how many rows each table of the database holds.
for table_name in db.table_dict:
    n = len(db.table_dict[table_name].df)
    print(f"Table {table_name} has {n} rows")

Table tags has 1597 rows
Table postHistory has 1175368 rows
Table comments has 623967 rows
Table badges has 463463 rows
Table postTag has 648577 rows
Table users has 255360 rows
Table postLinks has 77337 rows
Table votes has 1317876 rows
Table posts has 333893 rows


In [4]:
from utils.document import generate_document_given_table
from utils.builder import identify_entity_table
from utils.builder import generate_hop_matrix
# Start from the tables that identify_entity_table() reports, then register
# "posts" by hand (presumably it is not detected automatically — TODO confirm).
entity_tables = identify_entity_table(db)
entity_tables += ["posts"]
entity_tables

  from .autonotebook import tqdm as notebook_tqdm


['tags', 'badges', 'users', 'posts']

In [5]:
hop_matrix = generate_hop_matrix(db)
# Every ordered pair of distinct entity tables where the second table is not
# listed in hop_matrix.graph for the first one (i.e. "not one hop") is a
# candidate for retrieved cross-table edges.
edge_candidates_pairs = [
    (entity, entity2)
    for entity in entity_tables
    for entity2 in entity_tables
    if entity != entity2 and entity2 not in hop_matrix.graph[entity]
]
edge_candidates_pairs

[('tags', 'badges'),
 ('tags', 'users'),
 ('tags', 'posts'),
 ('badges', 'tags'),
 ('badges', 'posts'),
 ('users', 'tags'),
 ('posts', 'tags'),
 ('posts', 'badges')]

In [6]:
# Drop (tags, posts) and (posts, tags): both tables are referenced by the postTag
# relationship table, which is transferred to edges directly, so these pairs need
# no retrieved edges.
# Filtering into a new list (instead of list.remove) keeps the cell idempotent:
# re-running it no longer raises ValueError once the pairs are already gone.
relation_linked_pairs = {("tags", "posts"), ("posts", "tags")}
edge_candidates_pairs = [
    pair for pair in edge_candidates_pairs if pair not in relation_linked_pairs
]
edge_candidates_pairs

[('tags', 'badges'),
 ('tags', 'users'),
 ('badges', 'tags'),
 ('badges', 'posts'),
 ('users', 'tags'),
 ('posts', 'badges')]

In [7]:
# homoGraph: build the graph object from the database.
# With verbose=True the call logs one "table A -> table B has N edges" line
# per link (see the cell output).
from utils.builder import HomoGraph, make_homograph_from_db
homoGraph = make_homograph_from_db(db, verbose=True)

table postHistory -> table posts has 1175368 edges
table postHistory -> table users has 1100031 edges
table comments -> table users has 612288 edges
table comments -> table posts has 623962 edges
table badges -> table users has 463463 edges
table postTag -> table posts has 499164 edges
table postTag -> table tags has 648577 edges
table postLinks -> table posts has 61171 edges
table postLinks -> table posts has 75588 edges
table votes -> table posts has 1199831 edges
table votes -> table users has 5182 edges
table posts -> table users has 328648 edges
table posts -> table posts has 167355 edges
table posts -> table posts has 57714 edges


In [8]:
from utils.preprocess import infer_type_in_db
from utils.tokenize import tokenize_database
# Infer a type for the columns of every table in the database.
# The second positional argument (True) presumably enables the "[rule N]" log
# shown in the output — TODO confirm against infer_type_in_db's signature.
col_type_dict = infer_type_in_db(db, True)

[rule 0]: tags Inferred Id from numerical as categorical
[rule 0]: postHistory Inferred Id from numerical as categorical
[rule 0]: postHistory Inferred PostId from numerical as categorical
[rule 0]: postHistory Inferred UserId from numerical as categorical
[rule 0]: postHistory Inferred PostHistoryTypeId from numerical as categorical
[rule 0]: postHistory Inferred ContentLicense from categorical as text_embedded
[rule 1]: postHistory Inferred ContentLicense from text_embedded as categorical
[rule 0]: postHistory Inferred RevisionGUID from text_embedded as categorical
[rule 0]: comments Inferred Id from numerical as categorical
[rule 0]: comments Inferred PostId from numerical as categorical
[rule 0]: comments Inferred UserId from numerical as categorical
[rule 1]: comments Inferred Score from numerical as categorical
[rule 0]: comments Inferred ContentLicense from categorical as text_embedded
[rule 1]: comments Inferred ContentLicense from text_embedded as categorical
[rule 0]: badges 

In [9]:
# Tokenize every table using the inferred column types. './tmp_docs/rel-stack'
# is where the per-table .npy files live; the log shows them being loaded from
# there. The last argument (True) is presumably a verbose flag — TODO confirm.
tk_db = tokenize_database(db, col_type_dict, './tmp_docs/rel-stack', True)

----------------> Tokenizing tags each column
-> Load tokenized data from ./tmp_docs/rel-stack/tags.npy
----------------> Tokenizing postHistory each column
-> Load tokenized data from ./tmp_docs/rel-stack/postHistory.npy
----------------> Tokenizing comments each column
-> Load tokenized data from ./tmp_docs/rel-stack/comments.npy
----------------> Tokenizing badges each column
-> Load tokenized data from ./tmp_docs/rel-stack/badges.npy
----------------> Tokenizing postTag each column
-> Load tokenized data from ./tmp_docs/rel-stack/postTag.npy
----------------> Tokenizing users each column
-> Load tokenized data from ./tmp_docs/rel-stack/users.npy
----------------> Tokenizing postLinks each column
-> Load tokenized data from ./tmp_docs/rel-stack/postLinks.npy
----------------> Tokenizing votes each column
-> Load tokenized data from ./tmp_docs/rel-stack/votes.npy
----------------> Tokenizing posts each column
-> Load tokenized data from ./tmp_docs/rel-stack/posts.npy


In [None]:
# Generate the documents for each entity table; they are used to build the retrieval index.
# entity_to_docs = {}
# walk_length = 10
# round = 10
# for entity in entity_tables:
#    _, entity_to_docs[entity] = generate_document_given_table(
#         homoGraph, 
#         tk_db, 
#         entity, 
#         walk_length=walk_length, 
#         round = round, 
#         verbose=True
#     )

In [None]:
# # Build a BM25 retriever for each entity table and temporarily save the index to disk.
# import bm25s
# entity_to_retriver = {}
# for entity, docs in entity_to_docs.items():
#     retriever = bm25s.BM25(backend="numba")
#     retriever.index(docs)
#     retriever.activate_numba_scorer()
#     entity_to_retriver[entity] = retriever

# # save the retriever
# for entity, retriever in entity_to_retriver.items():
#     retriever.save(f"./tmp/stack/{entity}_retriever_bm25")

                                                                                           

In [12]:
import bm25s
# Load the BM25 retriever stored on disk for every entity table.
# (The dict was previously initialised twice and the comment said "save";
# this cell only loads.)
entity_to_retriver = {}
for entity in entity_tables:
    path = f"./tmp/stack/{entity}_retriever_bm25"
    retriever = bm25s.BM25.load(path)
    # enable the numba scorer on the loaded index before it is queried
    retriever.activate_numba_scorer()
    entity_to_retriver[entity] = retriever
    print(f"load {path}")

load ./tmp/stack/tags_retriever_bm25
load ./tmp/stack/badges_retriever_bm25
load ./tmp/stack/users_retriever_bm25
load ./tmp/stack/posts_retriever_bm25


In [14]:
# Resample candidate rows of every entity table and generate their walk-based
# documents; these documents are the queries sent to the BM25 retrievers.
walk_length = 10
# Named `num_rounds` rather than `round` so the builtin round() is not shadowed
# for the rest of the kernel session.
num_rounds = 10
entity_to_docs = {}
entity_candidate_pkys = {}
for entity in entity_tables:
    n = len(db.table_dict[entity].df)
    # request one fifth of the table, but never fewer than 4096 rows
    sample_size = max(n // 5, 4096)
    entity_candidate_pkys[entity], entity_to_docs[entity] = generate_document_given_table(
        homoGraph,
        tk_db,
        entity,
        walk_length=walk_length,
        round=num_rounds,
        sample_size=sample_size,
        verbose=True,
    )

- Walks for table tags - shape torch.Size([1597, 10, 10])


                                                     

- Walks for table badges - shape torch.Size([92692, 10, 10])


                                                       

- Walks for table users - shape torch.Size([51072, 10, 10])


                                                        

- Walks for table posts - shape torch.Size([66778, 10, 10])


                                                       

In [15]:
# Add the cross-table edges: for every candidate (source, target) table pair,
# query the target table's BM25 retriever with the source documents and keep
# the hits whose score is unusually high.
import numpy as np

topn = 20
# a hit becomes an edge when its score exceeds mean + score_std_factor * std
score_std_factor = 2
edge_dict = {}
# (src_table, des_table) -> edge 2-D array
for entity, retrieve_entity in edge_candidates_pairs:

    # retrieve the related docs
    entity_query_docs = entity_to_docs[entity]
    entity_query_pkys = np.array(entity_candidate_pkys[entity])  # shape [n]
    retriever = entity_to_retriver[retrieve_entity]

    related_pkys, scores = retriever.retrieve(entity_query_docs, k=topn, n_threads=24)

    score_np = np.array(scores)
    related_pkys_np = np.array(related_pkys)
    # Both statistics are taken from the converted array; the original mixed
    # `score_np.mean()` with `scores.std()`, which only works while `scores`
    # happens to be an ndarray already.
    threshold = score_np.mean() + score_std_factor * score_np.std()

    # Boolean mask of the hits that are above the threshold
    mask = score_np > threshold

    # Boolean indexing flattens row by row, so the kept targets come out
    # grouped by query, in query order.
    filtered_cols = related_pkys_np[mask]

    # Repeat each query key once per kept hit in its row so that the source
    # column lines up with filtered_cols.
    row_repeats = mask.sum(axis=1)
    filtered_rows = np.repeat(entity_query_pkys, row_repeats)

    filtered_edge = np.stack([filtered_rows, filtered_cols], axis=1)
    # added edge
    num_edges = filtered_rows.shape[0]
    edge_dict[(entity, retrieve_entity)] = filtered_edge
    print(f"Add cross table edges #{num_edges} between {entity} and {retrieve_entity}")
    

Add cross table edges #985 between tags and badges
Add cross table edges #838 between tags and users
Add cross table edges #71696 between badges and tags
Add cross table edges #90787 between badges and posts
Add cross table edges #33379 between users and tags
Add cross table edges #74825 between posts and badges


In [16]:
# Resample candidate rows of every entity table again and generate a fresh set
# of walk-based documents; they are the queries for the self-retrieval step.
walk_length = 10
# Named `num_rounds` rather than `round` so the builtin round() is not shadowed
# for the rest of the kernel session.
num_rounds = 10
entity_to_docs = {}
entity_candidate_pkys = {}
for entity in entity_tables:
    n = len(db.table_dict[entity].df)
    # request one fifth of the table, but never fewer than 4096 rows
    sample_size = max(n // 5, 4096)
    entity_candidate_pkys[entity], entity_to_docs[entity] = generate_document_given_table(
        homoGraph,
        tk_db,
        entity,
        walk_length=walk_length,
        round=num_rounds,
        sample_size=sample_size,
        verbose=True,
    )

- Walks for table tags - shape torch.Size([1597, 10, 10])


                                                     

- Walks for table badges - shape torch.Size([92692, 10, 10])


                                                       

- Walks for table users - shape torch.Size([51072, 10, 10])


                                                        

- Walks for table posts - shape torch.Size([66778, 10, 10])


                                                       

In [17]:
# Self-entity correlation: query each table's retriever with documents of the
# same table to collect positive pairs for contrastive learning.
# The best hit is expected to be the query row itself, hence topn + 1 results.
topn = 21
# entity -> positive candidates; hits that are filtered out are padded with -1
positive_pool_dict = {}
# a hit is kept when its score exceeds this fraction of the row's top score
threshold = 0.7
batch_size = 1024
for entity, retriever in entity_to_retriver.items():
    entity_query_docs = entity_to_docs[entity]
    # NOTE(review): looked up but never read below — the first retrieved column
    # is relied on to identify the query row; confirm that is intended.
    entity_query_pkys = entity_candidate_pkys[entity]
    print(f"--------> {entity}")

    # retrieve batch by batch and collect the per-batch results
    score_batches, pky_batches = [], []
    for batch_idx in tqdm(range(0, len(entity_query_docs), batch_size)):
        batch_query_docs = entity_query_docs[batch_idx:batch_idx + batch_size]
        related_pkys, scores = retriever.retrieve(batch_query_docs, k=topn, n_threads=-1)
        score_batches.append(np.array(scores))
        pky_batches.append(np.array(related_pkys))

    score_np = np.concatenate(score_batches, axis=0)
    related_pkys_np = np.concatenate(pky_batches, axis=0)

    # column 0 holds the top score of each row; compare every hit against it
    mask = score_np > (score_np[:, [0]] * threshold)
    # pad the hits that did not pass the filter
    related_pkys_np[~mask] = -1

    # keep only the rows that still have a hit besides the top one
    rows_mask = mask.sum(axis=1) > 1
    positive_pool = related_pkys_np[rows_mask]

    positive_pool_dict[entity] = positive_pool
    print(f"Generate positive pools #{len(positive_pool)}, original candidate {len(entity_query_docs)} in {entity} table")

--------> tags


100%|██████████| 2/2 [00:00<00:00,  7.48it/s]


Generate positive pools #154, original candidate 1597 in tags table
--------> badges


100%|██████████| 91/91 [23:14<00:00, 15.32s/it]


Generate positive pools #79752, original candidate 92692 in badges table
--------> users


100%|██████████| 50/50 [00:59<00:00,  1.19s/it]


Generate positive pools #768, original candidate 51072 in users table
--------> posts


100%|██████████| 66/66 [14:30<00:00, 13.19s/it]

Generate positive pools #44429, original candidate 66778 in posts table





In [18]:
# (src_table, des_table) -> edge 2-D array.
# The arrays are passed to np.savez as keyword arguments, which must be strings,
# so every pair is stored under the name "src-dst".
import os

npz_data = {
    f"{src}-{dst}": edge_array
    for (src, dst), edge_array in edge_dict.items()
}

path = "./edges/rel-stack-edges.npz"
# Create the output directory when it is missing so the save cannot fail with
# FileNotFoundError after the long retrieval run.
os.makedirs(os.path.dirname(path), exist_ok=True)
np.savez(path, **npz_data)

In [19]:
# Sanity check: list the (src_table, des_table) pairs that received cross-table edges.
edge_dict.keys()

dict_keys([('tags', 'badges'), ('tags', 'users'), ('badges', 'tags'), ('badges', 'posts'), ('users', 'tags'), ('posts', 'badges')])

In [20]:
# Save the positive pools, one array per entity table, keyed by table name.
import os

# path = "./samples/rel-avito-samples.npz"
path = "./samples/rel-stack-samples.npz"
# Create the output directory when it is missing so the save cannot fail with
# FileNotFoundError after the long retrieval run.
os.makedirs(os.path.dirname(path), exist_ok=True)
np.savez(path, **positive_pool_dict)