In [1]:
import pandas as pd

def _attach_title(frame, item_titles, id_column, title_column):
    """Inner-join ``frame[id_column]`` with ``item_titles.item_id``.

    The matched ``wikipedia_title`` is exposed as ``title_column`` and the
    helper ``item_id`` join key is dropped again. Rows of ``frame`` whose id has
    no match in ``item_titles`` are discarded (default inner merge).
    """
    joined = frame.merge(item_titles, left_on=id_column, right_on='item_id')
    return joined.drop(columns='item_id').rename(columns={'wikipedia_title': title_column})


wiki_items = pd.read_csv('./data/wiki_lite/wiki_items.csv')
statements = pd.read_csv('./data/wiki_lite/statements.csv')
properties = pd.read_csv('./data/wiki_lite/property.csv')

# join statements(edge_property_id) with properties(property_id)
statements = statements.merge(properties, left_on='edge_property_id', right_on='property_id')

# resolve both endpoints of every statement to their Wikipedia titles
item_titles = wiki_items[['item_id', 'wikipedia_title']]
statements = _attach_title(statements, item_titles, 'source_item_id', 'source_item_title')
statements = _attach_title(statements, item_titles, 'target_item_id', 'target_item_title')

# keep one row per (source_item_id, edge_property_id, target_item_id) triple and
# build the "<property label> <target title>" text for it.
# Missing labels/titles are replaced by '' first: a NaN on either side would make
# the concatenation NaN and the ' '.join below would then raise a TypeError.
statements = (
    statements
    .drop_duplicates(subset=['source_item_id', 'edge_property_id', 'target_item_id'])
    .assign(information=lambda df: df['en_label'].fillna('') + ' ' + df['target_item_title'].fillna(''))
)

# one row per source item: all of its information strings joined by spaces,
# plus the (first non-null) title of the source item
statements_grouped = (
    statements
    .groupby('source_item_id')
    .agg({'information': ' '.join, 'source_item_title': 'first'})
    .reset_index()
)

In [2]:
aliases = pd.read_csv('./data/wiki_lite/item_aliases.csv') # 2 columns: item_id,en_alias
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form does not update `aliases` when
# pandas Copy-on-Write is active, and a surviving NaN would break the join below.
aliases['en_alias'] = aliases['en_alias'].fillna('')
# one row per item_id with all of its aliases joined into a single space-separated string
aliases_grouped = aliases.groupby('item_id')['en_alias'].agg(' '.join).reset_index()

# sanity check: no missing alias strings after grouping
aliases_grouped['en_alias'].isna().sum()

0

In [3]:
# Replace missing descriptions and titles with empty strings.
wiki_items = wiki_items.fillna(value={'en_description': '', 'wikipedia_title': ''})

In [4]:
# Left-join the per-item alias strings onto wiki_items via item_id;
# items without any alias end up with NaN in en_alias.
wiki_items = pd.merge(wiki_items, aliases_grouped, how='left', on='item_id')

wiki_items

Unnamed: 0,item_id,en_label,en_description,wikipedia_title,en_alias
0,1,Universe,totality of space and all contents,Universe,Our Universe The Universe The Cosmos cosmos
1,2,Earth,third planet from the Sun in the Solar System,Earth,Blue Planet Terra Mater Terra Planet Earth Tel...
2,3,life,matter capable of extracting energy from the e...,Life,biota
3,4,death,permanent cessation of vital functions,Death,bereft of life perish oblivion mortis fallen m...
4,5,human,"common name of Homo sapiens, unique extant spe...",Human,human being humankind people homosapiens person
...,...,...,...,...,...
5216231,77042017,HR 4523,,HD 102365,
5216232,77043280,Charlie Johnston,,Charlie Johnstone,
5216233,77231860,Aldo Rossi,musician,Aldo Rossi (musician),
5216234,77240068,Ebenezer Baptist Church,"church in Atlanta, Georgia, USA",Ebenezer Baptist Church,


In [5]:
# Items that had no alias row get an empty alias string.
wiki_items = wiki_items.assign(en_alias=wiki_items['en_alias'].fillna(''))

In [6]:
# Append the alias string to the title, separated by a single space.
# NOTE: this overwrites wikipedia_title, so re-running the cell appends the aliases again.
wiki_items['wikipedia_title'] = wiki_items['wikipedia_title'].str.cat(wiki_items['en_alias'], sep=' ')

In [7]:
# Discard every item that occurs as a statement source in statements_grouped;
# only the items without such statements remain.
appears_as_source = wiki_items['item_id'].isin(statements_grouped['source_item_id'])
wiki_items = wiki_items[~appears_as_source]

In [8]:
# Renumber the remaining rows 0..n-1 so that row positions are contiguous.
wiki_items = wiki_items.reset_index(drop=True)

# Lookup table: item_id -> row position in wiki_items.
item_id_to_index = {
    item_id: position
    for position, item_id in enumerate(wiki_items['item_id'])
}

In [10]:
import pickle
# Persist the item_id -> row-index lookup next to the other wiki_lite artifacts.
mapping_path = './data/wiki_lite/wiki_features_item_id_to_index.pickle'
with open(mapping_path, mode='wb') as sink:
    pickle.dump(item_id_to_index, sink, protocol=pickle.HIGHEST_PROTOCOL)

In [17]:
import torch
from sentence_transformers import SentenceTransformer

# Plain Python lists of the two text fields, in wiki_items row order.
description_list = wiki_items['en_description'].tolist()
title_list = wiki_items['wikipedia_title'].tolist()

assert len(description_list) == len(title_list)

# Drop the intermediate frames that the following cells no longer use.
del aliases, aliases_grouped, statements, statements_grouped, properties

# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

In [18]:
# The text columns were already copied into description_list / title_list,
# and the cells below only use those lists, so the DataFrame can be dropped.
del wiki_items

In [19]:
# Embed every "title + aliases" string; the result is returned as a single tensor.
title_embeddings = model.encode(
    title_list,
    convert_to_tensor=True,
    show_progress_bar=True,
)

Batches:   0%|          | 0/10343 [00:00<?, ?it/s]

In [21]:
# Inspect the title embeddings and their shape.
title_embeddings, title_embeddings.shape

(tensor([[-0.0756,  0.0253,  0.0096,  ...,  0.0604,  0.0012, -0.0978],
         [-0.0170,  0.0599,  0.0380,  ...,  0.0317,  0.0148,  0.0543],
         [-0.0678, -0.0499, -0.0075,  ...,  0.0457, -0.0852, -0.0856],
         ...,
         [ 0.0359,  0.0509, -0.0214,  ..., -0.0486, -0.0148,  0.0416],
         [-0.0617,  0.0226,  0.0066,  ..., -0.0251,  0.0160,  0.0221],
         [-0.0442,  0.0270, -0.0472,  ..., -0.0217,  0.0063, -0.0012]],
        device='cuda:0'),
 torch.Size([330975, 384]))

In [22]:
# Embed every description string; the result is returned as a single tensor.
description_embeddings = model.encode(
    description_list,
    convert_to_tensor=True,
    show_progress_bar=True,
)

Batches:   0%|          | 0/10343 [00:00<?, ?it/s]

In [23]:
# Inspect the description embeddings and their shape.
description_embeddings, description_embeddings.shape

(tensor([[-0.0411,  0.0417, -0.0184,  ..., -0.0524, -0.0478, -0.0581],
         [ 0.0250,  0.0604, -0.0156,  ...,  0.0565, -0.0314,  0.0322],
         [-0.0370,  0.0279, -0.0221,  ...,  0.0337, -0.0012, -0.0058],
         ...,
         [-0.1188,  0.0483, -0.0025,  ...,  0.1264,  0.0465, -0.0157],
         [-0.0577, -0.0065, -0.0429,  ..., -0.0150,  0.0573, -0.0145],
         [-0.0861,  0.0050,  0.0055,  ..., -0.0636,  0.0128,  0.0414]],
        device='cuda:0'),
 torch.Size([330975, 384]))

In [24]:
# Per-item feature vector: element-wise mean of the title and description embeddings.
wiki_items_features = title_embeddings.add(description_embeddings).div(2)  # type: ignore

wiki_items_features, wiki_items_features.shape

(tensor([[-0.0584,  0.0335, -0.0044,  ...,  0.0040, -0.0233, -0.0780],
         [ 0.0040,  0.0601,  0.0112,  ...,  0.0441, -0.0083,  0.0432],
         [-0.0524, -0.0110, -0.0148,  ...,  0.0397, -0.0432, -0.0457],
         ...,
         [-0.0415,  0.0496, -0.0120,  ...,  0.0389,  0.0159,  0.0130],
         [-0.0597,  0.0081, -0.0182,  ..., -0.0201,  0.0366,  0.0038],
         [-0.0652,  0.0160, -0.0208,  ..., -0.0426,  0.0095,  0.0201]],
        device='cuda:0'),
 torch.Size([330975, 384]))

In [25]:
import torch

# Serialize the averaged features. The tensor is written as-is, i.e. on whatever
# device it currently lives on (NOTE(review): loading on a machine without that
# device presumably needs torch.load(..., map_location=...) — confirm).
torch.save(obj=wiki_items_features, f='./data/wiki_lite/wiki_items_features.pt')