In [32]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import json
import time

In [33]:
# ls large-verbalized/

In [35]:
df_inference = pd.read_csv("~/bulatov/datasets/ilpc22/github/data/large/inference.txt", names=['head', 'relation', 'tail'], sep='\t')
df_inference

Unnamed: 0,head,relation,tail
0,Q220192,P161,Q379400
1,Q3350666,P106,Q43845
2,Q102127,P106,Q245068
3,Q742217,P27,Q15180
4,Q1020,P463,Q827525
...,...,...,...
77039,Q87513,P27,Q40
77040,Q5275089,P27,Q30
77041,Q327146,P106,Q2865819
77042,Q1696019,P19,Q78


In [36]:
# !ls ~/bulatov/datasets/ilpc22/large_2sep_enum


In [37]:
df_large_valid = pd.read_csv("~/bulatov/datasets/ilpc22/large_2sep_enum/large_valid.csv", index_col=0)
df_large_valid

Unnamed: 0,verbalization,head,tail,verbalized_tail
0,predict [SEP] Jane Goldman [SEP-2] English scr...,Q32661,Q6625963,novelist [SEP-2] writer of novels
1,predict [SEP] novelist [SEP-2] writer of novel...,Q6625963,Q32661,"Jane Goldman [SEP-2] English screenwriter, aut..."
2,predict [SEP] Rounder Records [SEP-2] American...,Q2164531,Q30,United States of America [SEP-2] federal repub...
3,predict [SEP] United States of America [SEP-2]...,Q30,Q2164531,Rounder Records [SEP-2] American record label
4,predict [SEP] Anna Gmeyner [SEP-2] Austrian wr...,Q89261,Q1741,Vienna [SEP-2] capital and one of nine States ...
...,...,...,...,...
20353,predict [SEP] arteriosclerosis [SEP-2] thicken...,Q184559,Q207359,Georges Bataille [SEP-2] French intellectual a...
20354,predict [SEP] Clement of Ohrid [SEP-2] Macedon...,Q158504,Q12544,Byzantine Empire [SEP-2] Eastern Roman Empire ...
20355,predict [SEP] Byzantine Empire [SEP-2] Eastern...,Q12544,Q158504,Clement of Ohrid [SEP-2] Macedonian scholar an...
20356,predict [SEP] Katharine Isabelle [SEP-2] Canad...,Q236826,Q33999,actor [SEP-2] person who acts in a dramatic or...


In [38]:
df_large_test = pd.read_csv("~/bulatov/datasets/ilpc22/large_2sep_enum/large_test.csv", index_col=0)
df_large_test

Unnamed: 0,verbalization,head,tail,verbalized_tail
0,predict [SEP] Karl Gebhardt [SEP-2] Nazi docto...,Q58626,Q183,Germany [SEP-2] federal parliamentary republic...
1,predict [SEP] Germany [SEP-2] federal parliame...,Q183,Q58626,Karl Gebhardt [SEP-2] Nazi doctor and war crim...
2,predict [SEP] Kurtis Blow [SEP-2] American rap...,Q961447,Q183945,record producer [SEP-2] individual who oversee...
3,predict [SEP] record producer [SEP-2] individu...,Q183945,Q961447,Kurtis Blow [SEP-2] American rapper
4,predict [SEP] Joesi Prokopetz [SEP-2] Austrian...,Q43864,Q40,Austria [SEP-2] federal republic in western-ce...
...,...,...,...,...
20363,predict [SEP] actor [SEP-2] person who acts in...,Q33999,Q451251,"Ovidie [SEP-2] French pornographic actress, di..."
20364,predict [SEP] Charles Richet [SEP-2] French ph...,Q214851,Q2370801,Academy of Sciences of the USSR [SEP-2] former...
20365,predict [SEP] Academy of Sciences of the USSR ...,Q2370801,Q214851,Charles Richet [SEP-2] French physiologist and...
20366,predict [SEP] Tito Schipa [SEP-2] Italian oper...,Q456827,Q33999,actor [SEP-2] person who acts in a dramatic or...


### Checking that everything from test and valid is in the inference graph

In [39]:
set(list(df_large_test['head']) + list(df_large_test['tail']) + list(df_large_valid['head']) + list(df_large_valid['tail'])) \
     - set(list(df_inference['head']) + list(df_inference['tail']))

set()

In [40]:
set(list(df_large_valid['head']) + list(df_large_valid['tail']))  \
     - set(list(df_inference['head']) + list(df_inference['tail']))

set()

## Verbalizing entities from inference - label + description 

In [41]:
with open('entities.json', 'r') as f:
    entities = json.load(f)

FileNotFoundError: [Errno 2] No such file or directory: 'entities.json'

In [42]:
inference_entities = list(set(list(df_inference['tail']) + list(df_inference['head'])))
len(inference_entities)

29246

In [43]:
verbalized_inference_entities = []

for entity in inference_entities:
    verb_ent = entities[entity]['label'] + " " + entities[entity]['description']
    verbalized_inference_entities.append(verb_ent)
verbalized_inference_entities[:3]

NameError: name 'entities' is not defined

## Embedding verbalizations of entities from inference graph

In [44]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [45]:
embeddings = model.encode(verbalized_inference_entities)

In [46]:
embeddings.shape

(0,)

## FAISS

In [71]:
inference_entities = [int(ent[1:]) for ent in inference_entities]

In [73]:
dim = embeddings.shape[1]
metric = faiss.METRIC_INNER_PRODUCT
index = faiss.index_factory(dim, 'IDMap,Flat', metric)

index.add_with_ids(embeddings, inference_entities)

In [74]:
print(index.is_trained)

True


In [84]:
distances, indices = index.search(embeddings[:3, :], 3)

In [79]:
indices

array([[3768363, 3766433, 3504610],
       [ 519273,  739636, 1175878],
       [ 861936,  861975,  862365]])

In [80]:
inference_entities[:3]

[3768363, 519273, 861936]

In [85]:
ks = [1, 3, 5, 10]


for k in ks:
    start_time = time.time()

    distances, indices = index.search(embeddings, k)
    print("--- It takes {} seconds on average to find {} neighbours---".format((time.time() - start_time) / embeddings.shape[0], k))

--- It takes 0.0007022369627765727 seconds on average to find 1 neighbours---
--- It takes 0.0006997868421750609 seconds on average to find 3 neighbours---
--- It takes 0.0007011300927431975 seconds on average to find 5 neighbours---
--- It takes 0.000700585625121607 seconds on average to find 10 neighbours---


In [87]:
faiss.write_index(index,"vector.index")  # save the index to disk
index = faiss.read_index("vector.index")  # load the index