In [1]:
import pandas as pd

# Read the three wiki_lite CSV tables. The item and statement tables are
# given a fresh default index (the previous one is discarded); the property
# table is used exactly as read.
wiki_items = pd.read_csv('./data/wiki_lite/wiki_items.csv').reset_index(drop=True)
statements = pd.read_csv('./data/wiki_lite/statements.csv').reset_index(drop=True)
properties = pd.read_csv('./data/wiki_lite/property.csv')

In [2]:
# Two-column lookup (item_id, wikipedia_title) reused for both title joins.
item_titles = wiki_items[['item_id', 'wikipedia_title']]

statements = (
    statements
    # attach the property columns: edge_property_id == property_id
    .merge(properties, left_on='edge_property_id', right_on='property_id')
    # title of the source item: source_item_id == item_id
    .merge(item_titles, left_on='source_item_id', right_on='item_id')
    .drop(columns='item_id')
    .rename(columns={'wikipedia_title': 'source_item_title'})
    # title of the target item: target_item_id == item_id
    .merge(item_titles, left_on='target_item_id', right_on='item_id')
    .drop(columns='item_id')
    .rename(columns={'wikipedia_title': 'target_item_title'})
)

In [3]:
# Keep one row per (source_item_id, edge_property_id, target_item_id) triple,
# then build the text snippet "<property label> <target title>" for each row.
# Missing labels/titles are replaced by '' first: NaN + str yields NaN, and a
# NaN (float) inside a group makes ' '.join raise TypeError.
statements = (
    statements
    .drop_duplicates(subset=['source_item_id', 'edge_property_id', 'target_item_id'])
    .assign(
        information=lambda df: df['en_label'].fillna('') + ' ' + df['target_item_title'].fillna('')
    )
)

# One row per source item: all of its snippets joined with single spaces,
# plus the first non-null source title found in the group.
statements_grouped = (
    statements
    .groupby('source_item_id')
    .agg({'information': ' '.join, 'source_item_title': 'first'})
    .reset_index()
)

In [4]:
# Replace missing source titles with the empty string.
statements_grouped = statements_grouped.fillna({'source_item_title': ''})

# Sanity check: number of source titles that are still missing.
statements_grouped['source_item_title'].isna().sum()

0

In [5]:
# Preview the grouped frame (it was built by grouping statements on source_item_id).
statements_grouped

Unnamed: 0,source_item_id,information,source_item_title
0,1,child astronomical body Sloan Great Wall child...,Universe
1,2,creator Big Bang described by source Brockhaus...,Earth
2,3,described by source Brockhaus and Efron Encycl...,Life
3,4,described by source Brockhaus and Efron Encycl...,Death
4,5,described by source Brockhaus and Efron Encycl...,Human
...,...,...,...
4885256,77042017,instance of Binary star constellation Centaurus,HD 102365
4885257,77043280,instance of Human member of sports team Ranger...,Charlie Johnstone
4885258,77231860,instance of Human instrument Saxophone family ...,Aldo Rossi (musician)
4885259,77240068,country United States located in the administr...,Ebenezer Baptist Church


In [6]:
aliases = pd.read_csv('./data/wiki_lite/item_aliases.csv')  # 2 columns: item_id,en_alias

# Assign the filled column back instead of calling fillna(inplace=True) on
# aliases['en_alias']: the in-place call acts on a temporary Series, which
# pandas 2.x warns about and which has no effect on the frame once
# Copy-on-Write is active. Any NaN left behind would break ' '.join below.
aliases['en_alias'] = aliases['en_alias'].fillna('')

# One row per item_id with all of its aliases joined by single spaces.
aliases_grouped = aliases.groupby('item_id')['en_alias'].agg(' '.join).reset_index()

# Sanity check: number of missing alias strings after grouping.
aliases_grouped.en_alias.isna().sum()

0

In [7]:

# Left-join the alias strings onto the grouped statements
# (source_item_id == item_id); rows without a matching alias entry are kept.
statements_grouped = pd.merge(
    statements_grouped,
    aliases_grouped,
    how='left',
    left_on='source_item_id',
    right_on='item_id',
)

In [8]:
# Rows with no alias match from the left join have NaN in en_alias; replace
# it with ''. The column is assigned back rather than filled in place:
# fillna(inplace=True) on an attribute-accessed column operates on a temporary
# Series, triggers a chained-assignment warning on pandas 2.x, and leaves the
# frame unchanged once Copy-on-Write is active.
statements_grouped['en_alias'] = statements_grouped['en_alias'].fillna('')

# Sanity check: both text columns should report zero missing values.
display(statements_grouped.en_alias.isna().sum(), statements_grouped.source_item_title.isna().sum())

statements_grouped

0

0

Unnamed: 0,source_item_id,information,source_item_title,item_id,en_alias
0,1,child astronomical body Sloan Great Wall child...,Universe,1.0,Our Universe The Universe The Cosmos cosmos
1,2,creator Big Bang described by source Brockhaus...,Earth,2.0,Blue Planet Terra Mater Terra Planet Earth Tel...
2,3,described by source Brockhaus and Efron Encycl...,Life,3.0,biota
3,4,described by source Brockhaus and Efron Encycl...,Death,4.0,bereft of life perish oblivion mortis fallen m...
4,5,described by source Brockhaus and Efron Encycl...,Human,5.0,human being humankind people homosapiens person
...,...,...,...,...,...
4885256,77042017,instance of Binary star constellation Centaurus,HD 102365,,
4885257,77043280,instance of Human member of sports team Ranger...,Charlie Johnstone,,
4885258,77231860,instance of Human instrument Saxophone family ...,Aldo Rossi (musician),,
4885259,77240068,country United States located in the administr...,Ebenezer Baptist Church,,


In [9]:
# Extend each source title with its alias string, separated by one space.
# NOTE(review): this overwrites source_item_title, so running the cell a
# second time appends the aliases again.
statements_grouped['source_item_title'] = statements_grouped['source_item_title'].str.cat(
    statements_grouped['en_alias'], sep=' '
)

In [10]:
# Give the frame a fresh default index, discarding the current one.
statements_grouped = statements_grouped.reset_index(drop=True)

display(statements_grouped.shape)

statements_grouped

(4885261, 5)

Unnamed: 0,source_item_id,information,source_item_title,item_id,en_alias
0,1,child astronomical body Sloan Great Wall child...,Universe Our Universe The Universe The Cosmos ...,1.0,Our Universe The Universe The Cosmos cosmos
1,2,creator Big Bang described by source Brockhaus...,Earth Blue Planet Terra Mater Terra Planet Ear...,2.0,Blue Planet Terra Mater Terra Planet Earth Tel...
2,3,described by source Brockhaus and Efron Encycl...,Life biota,3.0,biota
3,4,described by source Brockhaus and Efron Encycl...,Death bereft of life perish oblivion mortis fa...,4.0,bereft of life perish oblivion mortis fallen m...
4,5,described by source Brockhaus and Efron Encycl...,Human human being humankind people homosapiens...,5.0,human being humankind people homosapiens person
...,...,...,...,...,...
4885256,77042017,instance of Binary star constellation Centaurus,HD 102365,,
4885257,77043280,instance of Human member of sports team Ranger...,Charlie Johnstone,,
4885258,77231860,instance of Human instrument Saxophone family ...,Aldo Rossi (musician),,
4885259,77240068,country United States located in the administr...,Ebenezer Baptist Church,,


In [11]:
import pickle

# Map every source_item_id to the index label of its row in statements_grouped.
item_id_to_index = {
    item_id: row_label
    for item_id, row_label in zip(statements_grouped['source_item_id'], statements_grouped.index)
}

# Persist the mapping with the highest pickle protocol available.
with open('./data/wiki_lite/statement_bge_features_item_id_to_index.pickle', 'wb') as handle:
    pickle.dump(item_id_to_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
from sentence_transformers import SentenceTransformer

# Plain Python lists of the two text columns, in the row order of statements_grouped.
title_list = statements_grouped['source_item_title'].to_list()
info_list = statements_grouped['information'].to_list()

assert len(title_list) == len(info_list)

# Drop the names of frames that this cell no longer uses.
del aliases, properties

# BGE base English model, placed on the CUDA device.
model = SentenceTransformer('BAAI/bge-base-en', device='cuda')

In [13]:
# Embed every entry of info_list with the loaded model, showing a progress bar.
info_embeddings = model.encode(info_list, show_progress_bar=True)

Batches:   0%|          | 0/152665 [00:00<?, ?it/s]

In [14]:
import numpy as np

# Write the info embeddings to disk. Per the original note, this is a numpy
# array of shape (n, 768), where n is the number of items.
np.save(file='./bge_info_embeds.npy', arr=info_embeddings)

In [15]:
# The info embeddings are saved on disk; drop them and their source texts.
del info_embeddings, info_list

In [16]:
import gc

import torch

# Release the GPU memory cached by torch, then run a garbage-collection pass
# (its return value is the cell output).
torch.cuda.empty_cache()
gc.collect()

0

In [17]:
# Embed every entry of title_list with the same model, showing a progress bar.
title_embeddings = model.encode(title_list, show_progress_bar=True)

Batches:   0%|          | 0/152665 [00:00<?, ?it/s]

In [18]:
# Write the title embeddings to disk.
np.save(file='./bge_title_embeds.npy', arr=title_embeddings)

In [19]:
# The title embeddings were saved by the previous cell; drop the in-memory copy.
del title_embeddings

# Release torch's cached GPU memory, then run a garbage-collection pass.
torch.cuda.empty_cache()
gc.collect()

0

In [20]:
import pickle

wiki_items = (
    wiki_items
    # missing descriptions / titles become empty strings
    .fillna({'en_description': '', 'wikipedia_title': ''})
    # attach the per-item alias string; items without aliases are kept
    .merge(aliases_grouped, on='item_id', how='left')
    .assign(
        en_alias=lambda df: df['en_alias'].fillna(''),
        # title followed by its aliases, separated by one space
        wikipedia_title=lambda df: df['wikipedia_title'] + ' ' + df['en_alias'],
    )
    # only keep those wiki_items that are NOT in statements_grouped
    .loc[lambda df: ~df['item_id'].isin(statements_grouped['source_item_id'])]
    .reset_index(drop=True)
)

# Map each remaining item_id to the index label of its row, then pickle the mapping.
item_id_to_index = {
    item_id: row_label
    for item_id, row_label in zip(wiki_items['item_id'], wiki_items.index)
}
with open('./data/wiki_lite/wiki_features_bge_item_id_to_index.pickle', 'wb') as handle:
    pickle.dump(item_id_to_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [21]:
# These frames are not used by the remaining statements of this cell.
del aliases_grouped, statements, statements_grouped

# Texts to embed for the remaining wiki items, in row order.
title_list = wiki_items['wikipedia_title'].to_list()
description_list = wiki_items['en_description'].to_list()

assert len(title_list) == len(description_list)


In [23]:
# Embed the entries of title_list and write the result to disk.
wiki_title_embeddings = model.encode(title_list, show_progress_bar=True)
np.save(file='./bge_wiki_title_embeds.npy', arr=wiki_title_embeddings)

# Drop the in-memory array, release torch's cached GPU memory, and run a
# garbage-collection pass.
del wiki_title_embeddings
torch.cuda.empty_cache()
gc.collect()

Batches:   0%|          | 0/10343 [00:00<?, ?it/s]

0

In [25]:
# Embed the entries of description_list and write the result to disk.
description_embeddings = model.encode(description_list, show_progress_bar=True)
np.save(file='./bge_description_embeds.npy', arr=description_embeddings)

# Drop the in-memory array, release torch's cached GPU memory, and run a
# garbage-collection pass.
del description_embeddings
torch.cuda.empty_cache()
gc.collect()

Batches:   0%|          | 0/10343 [00:00<?, ?it/s]

0

## The rest: averaging saved info/title embeddings into statement features

In [1]:
import torch

# Load the saved info embeddings, mapping their storage onto the CPU.
info_embeddings = torch.load('./info_embeddings.pt', map_location='cpu')

info_embeddings

tensor([[-0.0488, -0.0118,  0.0525,  ..., -0.0292,  0.0058,  0.0687],
        [ 0.0305, -0.0525,  0.0397,  ..., -0.0116, -0.0489,  0.0142],
        [ 0.0094, -0.0168, -0.0829,  ...,  0.0381,  0.1065, -0.0130],
        ...,
        [-0.0413,  0.0022, -0.0617,  ..., -0.0125, -0.0042, -0.0875],
        [ 0.0514,  0.0171, -0.0427,  ..., -0.0174,  0.0310, -0.0132],
        [ 0.1285, -0.0306, -0.0131,  ..., -0.0252,  0.0280, -0.0690]])

In [2]:
# Load the saved title embeddings, mapping their storage onto the CPU.
title_embeddings = torch.load('./title_embeddings.pt', map_location='cpu')

title_embeddings

tensor([[ 3.9014e-02,  9.5367e-03,  1.0904e-02,  ..., -2.3632e-02,
          1.2907e-02,  2.0615e-02],
        [-6.0217e-02, -3.4879e-02,  8.7372e-02,  ..., -3.3114e-02,
         -6.0199e-02, -1.0293e-02],
        [-2.5279e-02,  6.6466e-02, -6.4570e-02,  ...,  3.3458e-03,
          7.2263e-02,  9.9293e-05],
        ...,
        [-3.2201e-02,  3.0584e-02, -3.4390e-02,  ..., -1.0032e-01,
          8.4057e-03, -9.9466e-02],
        [-9.2137e-03,  5.4107e-02, -4.0712e-02,  ..., -2.4230e-02,
         -8.2808e-03, -8.6116e-02],
        [-2.8107e-02,  2.5851e-02, -7.0126e-03,  ..., -7.7793e-03,
          8.9322e-02,  4.5101e-02]])

In [3]:
# Show both shapes side by side as a tuple.
(info_embeddings.shape, title_embeddings.shape)

(torch.Size([4885261, 384]), torch.Size([4885261, 384]))

In [4]:
# Element-wise mean of the info and title embeddings.
features = torch.add(info_embeddings, title_embeddings) / 2  # type: ignore

features

tensor([[-0.0049, -0.0011,  0.0317,  ..., -0.0264,  0.0094,  0.0446],
        [-0.0149, -0.0437,  0.0635,  ..., -0.0223, -0.0546,  0.0020],
        [-0.0080,  0.0248, -0.0737,  ...,  0.0207,  0.0894, -0.0065],
        ...,
        [-0.0367,  0.0164, -0.0481,  ..., -0.0564,  0.0021, -0.0935],
        [ 0.0211,  0.0356, -0.0417,  ..., -0.0208,  0.0114, -0.0496],
        [ 0.0502, -0.0024, -0.0100,  ..., -0.0165,  0.0587, -0.0119]])

In [5]:
import torch

# Serialize the averaged features to disk.
torch.save(obj=features, f='./data/wiki_lite/statement_features.pt')

## Loading BGE embeddings

In [26]:
# Load the saved wiki title and description embeddings, in that order.
bge_wiki_title_embeds, bge_description_embeds = (
    np.load(path)
    for path in ('./bge_wiki_title_embeds.npy', './bge_description_embeds.npy')
)

# Show both shapes side by side as a tuple.
(bge_wiki_title_embeds.shape, bge_description_embeds.shape)

((330975, 768), (330975, 768))

In [27]:
# Element-wise mean of the title and description embeddings.
bge_wiki_features = np.add(bge_wiki_title_embeds, bge_description_embeds) / 2

bge_wiki_features.shape

(330975, 768)

In [28]:
# Write the averaged wiki features to disk.
np.save(file='./data/wiki_lite/bge_wiki_features.npy', arr=bge_wiki_features)

In [2]:
import numpy as np

# Load the two saved arrays that are to be averaged:
# ./bge_title_embeds.npy and ./bge_info_embeds.npy
bge_title_embeds = np.load('./bge_title_embeds.npy')
bge_info_embeds = np.load('./bge_info_embeds.npy')

In [3]:
# Element-wise mean of the info and title embeddings. The result is stored
# back under the name bge_info_embeds, so this cell must run exactly once
# after the arrays are loaded.
bge_info_embeds = np.add(bge_info_embeds, bge_title_embeds) / 2

In [4]:
# Write the array currently bound to bge_info_embeds to disk as the statement features.
np.save(file='./bge_statement_features.npy', arr=bge_info_embeds)