In [1]:
%cd ..
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
import pickle
import os

from torch_geometric.data import HeteroData
from relbench.datasets import get_dataset

from utils.data import preprocess_event_database

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

/home/lingze/embedding_fusion


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the 'rel-event' dataset via relbench and take its database object.
dataset = get_dataset('rel-event')
db = dataset.get_db()
# Project-specific preprocessing from utils.data. The return value is discarded,
# so this presumably modifies `db` in place — TODO confirm.
preprocess_event_database(db)

Loading Database object from /home/lingze/.cache/relbench/rel-event/db...
Done in 2.97 seconds.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  event_df["event_id"].replace(event_id2index, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  event_df["event_id"].replace(event_id2index, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] 

In [3]:
# describe data
def describe_df(df: pd.DataFrame):
    """Print one summary line per column of ``df``.

    Each line reports, for that column, the number of non-null entries, the
    number of distinct non-null values, and the number of null entries.
    Nothing is returned.
    """
    total_rows = len(df)
    for col in df.columns:
        series = df[col]
        missing = series.isnull().sum()
        distinct = series.nunique()
        present = total_rows - missing
        print(f'{col} Nonan {present} - {distinct}  unique values, {missing} NaN values')
        
# Print the per-column summary of every table, closing each one with a row of asterisks.
for table_name in db.table_dict:
    table = db.table_dict[table_name]
    print(f"Table {table_name}")
    describe_df(table.df)
    print("*" * 40)

Table event_attendees
id Nonan 49822 - 49822  unique values, 0 NaN values
event Nonan 49822 - 6823  unique values, 0 NaN values
status Nonan 49822 - 4  unique values, 0 NaN values
user_id Nonan 49822 - 9257  unique values, 0 NaN values
start_time Nonan 49822 - 2576  unique values, 0 NaN values
****************************************
Table user_friends
id Nonan 213703 - 213703  unique values, 0 NaN values
user Nonan 213703 - 28164  unique values, 0 NaN values
friend Nonan 213703 - 28184  unique values, 0 NaN values
****************************************
Table events
event_id Nonan 11465 - 11465  unique values, 0 NaN values
user_id Nonan 86 - 61  unique values, 11379 NaN values
start_time Nonan 11465 - 3667  unique values, 0 NaN values
city Nonan 6024 - 998  unique values, 5441 NaN values
state Nonan 4661 - 89  unique values, 6804 NaN values
zip Nonan 973 - 643  unique values, 10492 NaN values
country Nonan 6047 - 82  unique values, 5418 NaN values
lat Nonan 7396 - 3418  unique values

In [4]:
# Report how many rows each table holds.
for table_name in db.table_dict:
    table = db.table_dict[table_name]
    n = table.df.shape[0]
    print(f"Table {table_name} has {n} rows")

Table event_attendees has 49822 rows
Table user_friends has 213703 rows
Table events has 11465 rows
Table event_interest has 14135 rows
Table users has 37143 rows


In [5]:
from utils.preprocess import infer_type_in_db
from utils.tokenize import tokenize_database
# Infer a type for every column of every table in `db`; later cells iterate the
# result as a {table_name: {column: type}} mapping.
# NOTE(review): the positional `True` is presumably a verbose flag — confirm
# against utils.preprocess.infer_type_in_db.
col_type_dict = infer_type_in_db(db, True)

[rule 0]: event_attendees Inferred id from numerical as categorical
[rule 0]: event_attendees Inferred user_id from numerical as categorical
[rule 0]: user_friends Inferred id from numerical as categorical
[rule 0]: events Inferred event_id from numerical as categorical
[rule 0]: events Inferred user_id from numerical as categorical
[rule 1]: events Inferred c_1 from numerical as categorical
[rule 1]: events Inferred c_2 from numerical as categorical
[rule 1]: events Inferred c_3 from numerical as categorical
[rule 1]: events Inferred c_4 from numerical as categorical
[rule 1]: events Inferred c_5 from numerical as categorical
[rule 1]: events Inferred c_6 from numerical as categorical
[rule 1]: events Inferred c_7 from numerical as categorical
[rule 1]: events Inferred c_8 from numerical as categorical
[rule 1]: events Inferred c_9 from numerical as categorical
[rule 1]: events Inferred c_10 from numerical as categorical
[rule 1]: events Inferred c_11 from numerical as categorical
[ru

In [6]:
# Dump the inferred type of every column, one table at a time.
for table_name in col_type_dict:
    col_types = col_type_dict[table_name]
    print(f"Table {table_name}")
    for col in col_types:
        type_ = col_types[col]
        print(f"{col}: {type_}")
    print("*" * 40)

Table event_attendees
id: categorical
event: categorical
status: categorical
user_id: categorical
start_time: timestamp
****************************************
Table user_friends
id: categorical
user: categorical
friend: categorical
****************************************
Table events
event_id: categorical
user_id: categorical
start_time: timestamp
city: text_embedded
state: text_embedded
zip: text_embedded
country: text_embedded
lat: numerical
lng: numerical
c_1: categorical
c_2: categorical
c_3: categorical
c_4: categorical
c_5: categorical
c_6: categorical
c_7: categorical
c_8: categorical
c_9: categorical
c_10: categorical
c_11: categorical
c_12: categorical
c_13: categorical
c_14: categorical
c_15: categorical
c_16: categorical
c_17: categorical
c_18: categorical
c_19: categorical
c_20: categorical
c_21: categorical
c_22: categorical
c_23: categorical
c_24: categorical
c_25: categorical
c_26: categorical
c_27: categorical
c_28: categorical
c_29: categorical
c_30: categorical
c_3

In [7]:
# Preprocess each table: concatenate its short text columns into a single one.
#        /--- text_col_1 ---/ --- text_col_2 --- / --- text_col_3 --- /
# row 1  /------- A   -----/ ------- B   -----  / -----   C  ------- /
# -------> Generate a new text column:
# "text_col_1 is A, text_col_2 is B, text_col_3 is C"

# As a result, only this single text column has to be converted to a vector;
# the original text columns are dropped to save memory and computation.
from torch_frame import stype

def row_to_text(row):
    """Render one row of text columns as "col is value, col is value".

    Missing values are skipped. A row whose values are all missing yields
    None, so the compressed column stays null for that row.
    """
    present = row.dropna()
    if present.empty:
        return None
    return ", ".join(f"{key} is {value}" for key, value in present.items())


# For every table, merge its short text columns into one "text_compress" column
# and keep `col_type_dict` in sync with the new schema.
for table_name, type_dict in col_type_dict.items():
    # Collect the text columns.
    # The loop variable is deliberately NOT called `stype`: inside the
    # comprehension that name would shadow the imported `stype`, so
    # `.text_embedded` would be looked up on each dict value instead of on the
    # imported enum. That only works where an enum member may be reached
    # through another member, which not every Python version allows.
    text_cols = [
        col for col, col_stype in type_dict.items()
        if col_stype == stype.text_embedded
    ]

    # Long text columns are kept as separate columns; only short ones are merged.
    compress_cols = []
    for col in text_cols:
        avg_word_count = (
            db.table_dict[table_name].df[col]
            .dropna()
            .apply(lambda x: len(str(x).split()))
            .mean()
        )
        # 128 is half of the BERT max length (256) assumed by the original author.
        # An all-null column has a NaN average, fails this test, and is left alone.
        if avg_word_count < 128:
            compress_cols.append(col)

    if len(compress_cols) <= 1:
        # With zero or one short text column there is nothing to merge.
        continue

    print(f"----> Compressing {table_name} text columns: {compress_cols}")

    df = db.table_dict[table_name].df
    compress_text_df = df[compress_cols]
    text_list = compress_text_df.apply(row_to_text, axis=1).tolist()

    # Replace the merged columns with the single compressed column. This mutates
    # the table's DataFrame in place, so `db` itself is changed.
    df.drop(columns=compress_cols, inplace=True)
    df["text_compress"] = text_list

    # Update the type dict to match the new set of columns.
    for col in compress_cols:
        type_dict.pop(col)
    type_dict["text_compress"] = stype.text_embedded

----> Compressing events text columns: ['city', 'state', 'zip', 'country']


In [8]:
from utils.resource import get_text_embedder_cfg
# Build the text-embedder configuration for the chosen model on `device`.
text_embedder_cfg = get_text_embedder_cfg(
    # Alternative model, currently disabled:
    # model_name = "sentence-transformers/average_word_embeddings_glove.6B.300d", 
    model_name = "all-MiniLM-L12-v2",
    device = device)

In [9]:
from utils.builder import build_pyg_hetero_graph
# Directory passed to the builder; the column-type pickle is later saved here too.
cache_dir = "./data/rel-event-tensor-frame"
# Build from the database, the per-table column types and the text-embedder
# configuration. The call returns two values, bound here as `data` and
# `col_stats_dict`.
# NOTE(review): the trailing positional `True` is not self-explanatory —
# presumably a verbose flag; confirm against utils.builder.build_pyg_hetero_graph.
data, col_stats_dict = build_pyg_hetero_graph(
    db,
    col_type_dict,
    text_embedder_cfg,
    cache_dir,
    True,
)

-----> Materialize event_attendees Tensor Frame
-----> Build edge between users and users
-----> Materialize events Tensor Frame


Embedding raw data in mini-batch: 100%|██████████| 23/23 [00:09<00:00,  2.41it/s]


-----> Materialize event_interest Tensor Frame
-----> Materialize users Tensor Frame


Embedding raw data in mini-batch: 100%|██████████| 73/73 [00:24<00:00,  2.95it/s]


In [10]:
# Persist the column-type mapping inside the cache directory.
col_type_path = os.path.join(cache_dir, "col_type_dict.pkl")
with open(col_type_path, "wb") as f:
    f.write(pickle.dumps(col_type_dict))