In [1]:
%cd ..
from relbench.datasets import get_dataset
from tqdm import tqdm
import numpy as np

/home/lingze/embedding_fusion


In [2]:
# Resolve the rel-avito dataset through RelBench (download=True is forwarded
# to get_dataset) and pull out the database object used by every cell below.
dataset = get_dataset(download=True, name="rel-avito")
db = dataset.get_db()

Loading Database object from /home/lingze/.cache/relbench/rel-avito/db...
Done in 5.33 seconds.


In [3]:
# Build the graph object (`homoGraph`) from the database; it is passed to the
# document-generation cells further down. With verbose=True the builder
# prints "table A -> table B has N edges" lines such as the ones shown in
# this cell's output.
from utils.builder import HomoGraph, make_homograph_from_db
homoGraph = make_homograph_from_db(db, verbose=True)

table PhoneRequestsStream -> table UserInfo has 243415 edges
table PhoneRequestsStream -> table AdsInfo has 243836 edges
table SearchInfo -> table UserInfo has 1961902 edges
table SearchInfo -> table Location has 1987154 edges
table SearchInfo -> table Category has 1987156 edges
table SearchStream -> table SearchInfo has 7107268 edges
table SearchStream -> table AdsInfo has 7107274 edges
table VisitStream -> table UserInfo has 5254710 edges
table VisitStream -> table AdsInfo has 5265422 edges
table AdsInfo -> table Location has 5935808 edges
table AdsInfo -> table Category has 5960526 edges


In [4]:
from utils.preprocess import infer_type_in_db
from utils.tokenize import tokenize_database

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Infer column types for the tables in `db`; the result is handed to the
# tokenizer in the next cell. The second positional argument (True) is
# presumably a verbose flag responsible for the "[rule N]: ... Inferred
# <column> from numerical as categorical" log lines -- TODO confirm against
# utils.preprocess.
col_type_dict = infer_type_in_db(db, True)

[rule 0]: PhoneRequestsStream Inferred UserID from numerical as categorical
[rule 0]: PhoneRequestsStream Inferred IPID from numerical as categorical
[rule 0]: PhoneRequestsStream Inferred AdID from numerical as categorical
[rule 0]: Location Inferred LocationID from numerical as categorical
[rule 1]: LocationInferred Level from numerical as categorical
[rule 0]: Location Inferred RegionID from numerical as categorical
[rule 0]: Location Inferred CityID from numerical as categorical
[rule 0]: SearchInfo Inferred UserID from numerical as categorical
[rule 0]: SearchInfo Inferred SearchID from numerical as categorical
[rule 0]: SearchInfo Inferred IPID from numerical as categorical
[rule 1]: SearchInfoInferred IsUserLoggedOn from numerical as categorical
[rule 0]: SearchInfo Inferred LocationID from numerical as categorical
[rule 0]: UserInfo Inferred UserID from numerical as categorical
[rule 0]: UserInfo Inferred UserAgentID from numerical as categorical
[rule 1]: UserInfoInferred User

In [6]:
# Tokenize every table using the inferred column types. The logged messages
# show per-table .npy files being loaded from './tmp_docs/rel-avito', so that
# directory appears to act as a cache of earlier tokenization runs.
# The trailing True is presumably a verbose flag -- TODO confirm against
# utils.tokenize.
tk_db = tokenize_database(db, col_type_dict, './tmp_docs/rel-avito', True)

----------------> Tokenizing PhoneRequestsStream each column
-> Load tokenized data from ./tmp_docs/rel-avito/PhoneRequestsStream.npy
----------------> Tokenizing Location each column
-> Load tokenized data from ./tmp_docs/rel-avito/Location.npy
----------------> Tokenizing SearchInfo each column
-> Load tokenized data from ./tmp_docs/rel-avito/SearchInfo.npy
----------------> Tokenizing UserInfo each column
-> Load tokenized data from ./tmp_docs/rel-avito/UserInfo.npy
----------------> Tokenizing SearchStream each column
-> Load tokenized data from ./tmp_docs/rel-avito/SearchStream.npy
----------------> Tokenizing VisitStream each column
-> Load tokenized data from ./tmp_docs/rel-avito/VisitStream.npy
----------------> Tokenizing AdsInfo each column
-> Load tokenized data from ./tmp_docs/rel-avito/AdsInfo.npy
----------------> Tokenizing Category each column
-> Load tokenized data from ./tmp_docs/rel-avito/Category.npy


In [8]:
from utils.document import generate_document_given_table
from utils.builder import identify_entity_table
from utils.builder import generate_hop_matrix

In [9]:
# Ask the builder which tables count as entity tables. Every later loop in
# this notebook (document generation, retrieval, edge mining) iterates over
# this list.
entity_tables = identify_entity_table(db)
# Bare expression so the notebook displays the list.
entity_tables

['Location', 'UserInfo', 'Category']

In [10]:
# Generate the walk-based documents for every entity table, build one BM25
# index per table over those documents, and persist each index to disk.
import bm25s

walk_length = 10
# Number of walk rounds. Deliberately not stored in a variable called
# `round`: that would shadow the builtin round() for the rest of the kernel
# session. The helper's keyword argument is still spelled `round`.
num_rounds = 8

entity_to_docs = {}
for entity in entity_tables:
    # Only the documents are kept; the helper's first return value is dropped.
    # NOTE(review): later cells treat the document indices returned by the
    # retriever as primary keys of this table. That only holds if the
    # documents produced here are ordered by primary key -- TODO confirm.
    _, entity_to_docs[entity] = generate_document_given_table(
        homoGraph,
        tk_db,
        entity,
        walk_length=walk_length,
        round=num_rounds,
        verbose=True,
    )

# temporarily save the index
entity_to_retriver = {}
for entity, docs in entity_to_docs.items():
    retriever = bm25s.BM25(backend="numba")
    retriever.index(docs)
    retriever.activate_numba_scorer()
    entity_to_retriver[entity] = retriever

# save the retriever (one directory per entity table; the reload cell below
# reads the same path pattern)
for entity, retriever in entity_to_retriver.items():
    retriever.save(f"./tmp/rel-avito/{entity}_retriever_bm25")

                                                                                         

In [11]:
# reload this retriever
# Restores the per-entity BM25 indexes written by the save step, so the cells
# below can run without regenerating documents and rebuilding the indexes.
import bm25s

entity_to_retriver = {}
for entity in entity_tables:
    # Same path pattern the save step used.
    path = f"./tmp/rel-avito/{entity}_retriever_bm25"
    retriever = bm25s.BM25.load(path)
    # Re-enable the numba scorer on the freshly loaded object, mirroring what
    # was done right after indexing.
    retriever.activate_numba_scorer()
    entity_to_retriver[entity] = retriever
    print(f"load {path}")

load ./tmp/rel-avito/Location_retriever_bm25
load ./tmp/rel-avito/UserInfo_retriever_bm25
load ./tmp/rel-avito/Category_retriever_bm25


In [13]:
# Cross-table edge candidates.
# Enumerate every ordered pair of distinct entity tables and keep the pairs
# whose second table is NOT listed under the first one in the hop matrix,
# i.e. pairs that are not one hop apart.
hop_matrix = generate_hop_matrix(db)
edge_candidates_pairs = [
    (src_table, dst_table)
    for src_table in entity_tables
    for dst_table in entity_tables
    if src_table != dst_table and dst_table not in hop_matrix.graph[src_table]
]
# Bare expression so the notebook displays the candidate list.
edge_candidates_pairs

[('Location', 'UserInfo'),
 ('Location', 'Category'),
 ('UserInfo', 'Location'),
 ('UserInfo', 'Category'),
 ('Category', 'Location'),
 ('Category', 'UserInfo')]

In [16]:
# Generate the QUERY documents for each entity table from a sample of its
# rows, keeping the primary keys the helper returns alongside the documents.
# (No index is built here; the retrievers loaded above are reused.)
walk_length = 10
# Named `num_rounds` rather than `round` so the builtin round() is not
# shadowed; the helper's keyword argument is still spelled `round`.
num_rounds = 8
entity_to_docs = {}
entity_candidate_pkys = {}
for entity in entity_tables:
    n = len(db.table_dict[entity].df)
    # Request half of the table, but never fewer than 4096 rows.
    # NOTE(review): for tables with fewer than 4096 rows this asks for more
    # rows than exist; the logged walk shapes (e.g. 68 for Category) suggest
    # the helper caps the sample at the table size -- TODO confirm.
    sample_size = max(n // 2, 4096)
    entity_candidate_pkys[entity], entity_to_docs[entity] = generate_document_given_table(
        homoGraph,
        tk_db,
        entity,
        walk_length=walk_length,
        round=num_rounds,
        sample_size=sample_size,
        verbose=True,
    )


- Walks for table Location - shape torch.Size([3512, 8, 9])


                                                     

- Walks for table UserInfo - shape torch.Size([49125, 8, 9])


                                                        

- Walks for table Category - shape torch.Size([68, 8, 9])


                                      

In [17]:
import numpy as np

# Mine cross-table edges: for every candidate (source, target) table pair,
# query the target table's BM25 index with the source table's documents and
# keep only the hits whose score is unusually high for that pair.
topn = 20
edge_dict = {}
# (src_table, des_table) -> edge 2-D array
for entity, retrieve_entity in edge_candidates_pairs:

    # Query side: documents of the source table and their primary keys.
    entity_query_docs = entity_to_docs[entity]
    entity_query_pkys = np.array(entity_candidate_pkys[entity])  # shape [n]
    # Index side: the retriever built over the target table's documents.
    retriever = entity_to_retriver[retrieve_entity]

    # NOTE(review): no corpus is passed to retrieve(), so the first return
    # value is presumably document indices into the target index; they are
    # used as primary keys below -- TODO confirm that index order == pkey.
    related_pkys, scores = retriever.retrieve(entity_query_docs, k = topn, n_threads = 24)

    score_np = np.array(scores)
    related_pkys_np = np.array(related_pkys)
    # Global cut-off for this table pair: mean + 2 standard deviations of all
    # retrieved scores. Both statistics are taken from the same converted
    # array (the original mixed `score_np.mean()` with `scores.std()`, which
    # breaks if retrieve() ever returns something without a .std() method).
    threshold = score_np.mean() + 2 * score_np.std()

    # Boolean matrix, one row per query: True where the hit clears the bar.
    mask = score_np > threshold

    # Boolean indexing flattens row by row, so the kept targets come out
    # grouped by query in query order.
    filtered_cols = related_pkys_np[mask]

    # Repeat each query key once per kept hit in its row so that
    # filtered_rows[i] is the source key of filtered_cols[i].
    row_repeats = mask.sum(axis=1)
    filtered_rows = np.repeat(entity_query_pkys, row_repeats)

    # Edge list with one (source key, target id) pair per row.
    filtered_edge = np.stack([filtered_rows, filtered_cols], axis=1)
    num_edges = filtered_rows.shape[0]
    edge_dict[(entity, retrieve_entity)] = filtered_edge
    print(f"Add cross table edges #{num_edges} between {entity} and {retrieve_entity}")
    

Add cross table edges #2607 between Location and UserInfo
Add cross table edges #3324 between Location and Category
Add cross table edges #22244 between UserInfo and Location
Add cross table edges #29904 between UserInfo and Category
Add cross table edges #24 between Category and Location
Add cross table edges #11 between Category and UserInfo


In [19]:
# Persist the mined cross-table edges as a single .npz archive.
import os

# (src_table, des_table) -> edge 2-D array, re-keyed as "src-dst" because
# np.savez takes the array names as string keywords.
npz_data = {
    f"{src}-{dst}": edge_array
    for (src, dst), edge_array in edge_dict.items()
}

path = "./edges/rel-avito-Ads-edges.npz"
# np.savez does not create missing parent directories; make sure the target
# folder exists so a fresh checkout does not fail at the very last step.
os.makedirs(os.path.dirname(path), exist_ok=True)
np.savez(path, **npz_data)

In [20]:
# Display which (src_table, des_table) pairs have an entry in edge_dict.
edge_dict.keys()

dict_keys([('Location', 'UserInfo'), ('Location', 'Category'), ('UserInfo', 'Location'), ('UserInfo', 'Category'), ('Category', 'Location'), ('Category', 'UserInfo')])

In [21]:
# self-entity correlation
# which can generate the positive pairs in the contrastive learning.
# This cell regenerates the sampled QUERY documents (and their primary keys)
# per entity table; the retrieval against each table's own index happens in
# the next cell.
walk_length = 10
# Named `num_rounds` rather than `round` so the builtin round() is not
# shadowed; the helper's keyword argument is still spelled `round`.
num_rounds = 8
entity_to_docs = {}
entity_candidate_pkys = {}

for entity in entity_tables:
    n = len(db.table_dict[entity].df)
    # Request half of the table, but never fewer than 4096 rows.
    # NOTE(review): for tables with fewer than 4096 rows this asks for more
    # rows than exist; the logged walk shapes (e.g. 68 for Category) suggest
    # the helper caps the sample at the table size -- TODO confirm.
    sample_size = max(n // 2, 4096)
    pkys, docs = generate_document_given_table(
        homoGraph,
        tk_db,
        entity,
        walk_length=walk_length,
        round=num_rounds,
        sample_size=sample_size,
        verbose=True,
    )
    entity_candidate_pkys[entity] = pkys
    entity_to_docs[entity] = docs

- Walks for table Location - shape torch.Size([3512, 8, 9])


                                                     

- Walks for table UserInfo - shape torch.Size([49125, 8, 9])


                                                        

- Walks for table Category - shape torch.Size([68, 8, 9])


                                      

In [22]:
# Self-retrieval: query each entity table's own index with that table's
# sampled documents and collect, per query, the hits that score close to the
# best hit.
# One more than the 20 used for cross-table retrieval, because the top hit is
# expected to be the query's own document.
topn = 21
# entity -> positive candidates, with filtered-out slots padded by -1
positive_pool_dict = {}
# A hit is kept when its score exceeds this fraction of its row's top score.
threshold = 0.7
batch_size = 1024
for entity, retriever in entity_to_retriver.items():
    entity_query_docs = entity_to_docs[entity]
    entity_query_pkys = entity_candidate_pkys[entity]
    print(f"--------> {entity}")

    # Retrieve in fixed-size batches and gather the per-batch results.
    score_np, related_pkys_np = [], []
    for batch_idx in tqdm(range(0, len(entity_query_docs), batch_size)):
        batch_query_docs = entity_query_docs[batch_idx:batch_idx + batch_size]
        related_pkys, scores = retriever.retrieve(batch_query_docs, k = topn, n_threads=-1)
        score_np.append(np.array(scores))
        related_pkys_np.append(np.array(related_pkys))
    score_np = np.concatenate(score_np, axis=0)
    related_pkys_np = np.concatenate(related_pkys_np, axis=0)

    # Compare every hit against its own row's first (best) score; that first
    # hit is assumed to be the query document itself.
    top_scores = score_np[:, :1]
    mask = score_np > (top_scores * threshold)
    # Overwrite the hits that did not pass with the padding value.
    related_pkys_np[~mask] = -1
    # Keep only the rows where something besides the first hit survived.
    rows_num = mask.sum(axis=1)
    rows_mask = rows_num > 1
    positive_pool = related_pkys_np[rows_mask]

    positive_pool_dict[entity] = positive_pool
    print(f"Generate positive pools #{len(positive_pool)}, original candidate {len(entity_query_docs)} in {entity} table")

--------> Location


100%|██████████| 4/4 [00:00<00:00,  5.04it/s]


Generate positive pools #162, original candidate 3512 in Location table
--------> UserInfo


100%|██████████| 48/48 [00:25<00:00,  1.90it/s]


Generate positive pools #17615, original candidate 49125 in UserInfo table
--------> Category


100%|██████████| 1/1 [00:00<00:00, 76.16it/s]

Generate positive pools #6, original candidate 68 in Category table





In [23]:
# Persist the positive pools (one array per entity table, keyed by table
# name) as a single .npz archive.
import os

# Alternative output name, kept for reference:
# path = "./samples/rel-avito-samples.npz"
path = "./samples/rel-avito-Ads-samples.npz"
# np.savez does not create missing parent directories; make sure the target
# folder exists before writing.
os.makedirs(os.path.dirname(path), exist_ok=True)
np.savez(path, **positive_pool_dict)