In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

def extract(entsList):
    """Pull the ``'word'`` value out of every entity, keeping the row structure.

    Args:
        entsList: iterable of rows, where each row is an iterable of
            mappings that contain a ``'word'`` key.

    Returns:
        A list with one inner list per row, holding that row's ``'word'``
        values in their original order.
    """
    return [[entity['word'] for entity in row] for row in entsList]

# Load the "dslim/bert-large-NER" checkpoint and wrap it in a token-classification
# pipeline on device 0 with aggregation_strategy="max".
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")

ner = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    device=0,
    aggregation_strategy="max",
)

# Sample questions used to eyeball what the NER pipeline extracts.
sample_questions = [
    'What is the capital of Afghanistan?',
    'Who directed Pulp Fiction?',
    'What is the capital of France?',
    'What English settlement was John Smith associated with?',
    'Who were some key historical figures during the Victorian Era?',
    'Who is the current president of the Teamsters union?',
    'How do the populist platforms of Theodore Roosevelt and Andrew Jackson differ?',
    'How were the 1950s in New York City different from the 1980s?',
    'How large are the biggest species of tortoise?',
]
ents = extract(ner(sample_questions))

# One printed line per question, listing the extracted entity words.
print('')
print('Extracted Entity Words:')
for ent in ents:
    print(ent)

In [None]:
import pandas
# Load the articles table; later cells read its `title` and `sentences` columns
# and index it positionally with vertex ids.
adf = pandas.read_parquet('/mnt/bitgraph/data/rag/articles.parquet')
adf

In [None]:
import pandas
# Load the edge list; later cells read its `src` and `dst` columns.
edf = pandas.read_parquet('/mnt/bitgraph/data/rag/edgelist.parquet')
edf

In [None]:
# Vertex ids are 0-based, so the number of vertices needed to hold the largest
# edge endpoint is that id + 1 (the later graph build uses `eix.max() + 1` the
# same way). The graph must also have at least one vertex per row of `adf`.
max_vertex_id = max(edf.src.max(), edf.dst.max())
num_vertices = max(max_vertex_id + 1, len(adf))
num_vertices

In [8]:
import sys
sys.path.append('/mnt/bitgraph')
sys.path.append('/mnt/gremlin++')
from pybitgraph import BitGraph

# Create the graph, allocate `num_vertices` vertices, then add the edges given
# by the `src`/`dst` columns of `edf`, passing the string 'link' as the third
# argument (presumably the edge label — confirm against pybitgraph).
# NOTE(review): the meaning of the five BitGraph constructor arguments is not
# visible in this notebook — confirm against pybitgraph.
graph = BitGraph('int64', 'int64', 'DEVICE', 'DEVICE', 'DEVICE')

graph.add_vertices(num_vertices)
graph.add_edges(
    edf.src.values,
    edf.dst.values,
    'link',
)

In [None]:
import numpy as np
# Load the embedding matrix and append all-zero rows so that it has exactly
# `num_vertices` rows.
emb = np.load('/mnt/bitgraph/data/rag/emb.npy')
emb.shape  # not displayed: only the last expression of a cell is shown
missing_rows = num_vertices - emb.shape[0]
zero_padding = np.zeros((missing_rows, emb.shape[1]), dtype=emb.dtype)
emb = np.concatenate([emb, zero_padding])
emb.shape

In [10]:
# Register `emb` on the graph under the name "emb0" (the name later passed to
# `.like('emb0', ...)`).
# NOTE(review): the empty int64 array passed as the second argument presumably
# means "apply to all vertices in order" — confirm against pybitgraph.
graph.set_vertex_embeddings(
    "emb0",
    np.array([],dtype='int64'),
    emb, 
)

In [None]:
import pandas
# Load the question set; later cells read its `question` column.
qdf = pandas.read_json('/mnt/data/train.json')
qdf

In [None]:
# Inspect a single question (index label 168).
qdf.question[168]

In [None]:
# Query vertices by similarity to the vector of 'Margaret of Hungary' on the
# "emb0" embedding and show the matching article titles; the returned vertex
# ids are used as positional row indices into `adf`.
# NOTE(review): `nlp` is not defined anywhere in the visible notebook. It looks
# like a spaCy pipeline (it is called on text and exposes `.vector`/`.pipe`) —
# TODO confirm and load it explicitly so the notebook runs on a fresh kernel.
g = graph.traversal()
adf.title.iloc[g.V().like('emb0', [nlp('Margaret of Hungary').vector], .2).toArray()]

In [None]:
from pygremlinxx import GraphTraversal

g = graph.traversal()
# Factory for anonymous traversals used inside `_union`.
# Note: `__` is also the name IPython gives to the second-to-last cell output;
# this assignment shadows it.
__ = lambda : GraphTraversal()

# For three questions: extract entity words, look up vertices similar to each
# entity vector ('s'), follow outgoing edges to neighbours ('t'), and print the
# titles of the union of both sets.
for question in qdf.question[1000:1003]:
    ents = extract([ner(question)])[0]
    print(ents)

    vertex_ids = (
        g.V()
        .like('emb0', [z.vector for z in nlp.pipe(ents)], 3)
        ._as('s')
        .out()
        .limit(4)
        .dedup()
        ._as('t')
        ._union([__().select('s'), __().select('t')])
        .dedup()
        .toArray()
    )
    result = adf.title.iloc[vertex_ids]
    print(list(result), '\n', len(result), '\n')

In [None]:
# Find vertices similar to the vector of 'Afghanistan', follow their outgoing
# edges, keep at most 10 results, and show the corresponding rows of `adf`.
g = graph.traversal()
candidates = g.V().like('emb0', [nlp('Afghanistan').vector], 0.92).out().limit(10).toArray()
adf.iloc[candidates]

In [None]:
# Show the `sentences` field of the article at row position 2347966.
adf.iloc[2347966].sentences

In [None]:
# Titles of the vertices returned for the vector of 'Fix-up'.
# Relies on `g` created in an earlier cell.
adf.title.iloc[
    g.V().like('emb0', [nlp('Fix-up').vector], 10).toArray()
]

In [None]:
from transformers import AutoModel, AutoTokenizer
# Load the sentence-embedding checkpoint and its tokenizer.
# NOTE: this rebinds `model` and `tokenizer`, which earlier in the notebook
# referred to the "dslim/bert-large-NER" checkpoint. The `ner` pipeline was
# constructed with those earlier objects before this rebinding.
model = AutoModel.from_pretrained('sentence-transformers/all-roberta-large-v1')
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-roberta-large-v1')



In [2]:
import torch
import torch.nn.functional as F
class Sentence_Transformer(torch.nn.Module):
    """Wrap a transformer encoder to emit one L2-normalized vector per input.

    The wrapped model is called with ``input_ids`` and ``attention_mask``;
    its first output (per-token embeddings) is averaged over the positions
    selected by the attention mask and then normalized to unit L2 norm.
    """

    def __init__(self, bert_model):
        """Store the encoder that produces per-token embeddings."""
        super().__init__()
        self.bert_model = bert_model

    def mean_pooling(self, model_output, attention_mask):
        """Average token embeddings over the positions where the mask is set.

        Args:
            model_output: encoder output; element 0 holds the token embeddings.
            attention_mask: mask over token positions, expanded to the shape
                of the token embeddings before use.

        Returns:
            Masked sum of token embeddings divided by the mask sum along
            dimension 1.
        """
        hidden_states = model_output[0]
        # Broadcast the mask across the embedding dimension, in the same dtype.
        mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).to(hidden_states.dtype)
        masked_sum = torch.sum(hidden_states * mask, 1)
        # Clamp the denominator so an all-zero mask cannot divide by zero.
        mask_total = torch.clamp(mask.sum(1), min=1e-9)
        return masked_sum / mask_total

    def forward(self, input_ids, att_mask):
        """Encode the inputs, mean-pool, and L2-normalize along dimension 1."""
        encoder_output = self.bert_model(input_ids=input_ids, attention_mask=att_mask)
        pooled = self.mean_pooling(encoder_output, att_mask)
        return F.normalize(pooled, p=2, dim=1)

In [None]:
# Wrap the currently bound `model` in the pooling + normalization module and
# display its structure.
module = Sentence_Transformer(model)
module

In [None]:
# Embed the tortoise question; `a` is compared against `d` in the
# cosine-similarity cell.
t = tokenizer('How large are the biggest species of tortoise?', return_tensors='pt')
a = module(t.input_ids, t.attention_mask)
a

In [None]:
# Embed a sentence on the same topic as the tortoise question.
u = tokenizer('Sulcata tortoises are the largest continental species of tortoise.', return_tensors='pt')
b = module(u.input_ids, u.attention_mask)
b

In [None]:
# Embed the single word 'tortoise'.
v = tokenizer('tortoise', return_tensors='pt')
c = module(v.input_ids, v.attention_mask)
c

In [None]:
# Embed a sentence on a different topic; `d` is compared against the tortoise
# question embedding `a` in the cosine-similarity cell. The input text is
# spelled correctly so the tokenizer sees the intended word rather than a
# misspelling, which would alter the embedding being compared.
w = tokenizer('Barack Obama was once president of the United States.', return_tensors='pt')
d = module(w.input_ids, w.attention_mask)
d

In [None]:
from scipy.spatial.distance import cosine

# Cosine similarity (1 - cosine distance) between the first row of `a` and the
# first row of `d`; detach() drops the autograd graph before the NumPy conversion.
1 - cosine(a.detach().numpy()[0],d.detach().numpy()[0])

In [1]:
import cudf
# Load the JSON-lines paragraph dump into a cuDF frame; the displayed columns
# are id, title, sentences and mentions.
df = cudf.read_json('/mnt/para_with_hyperlink.jsonl', lines=True)
df

Unnamed: 0,id,title,sentences,mentions
0,17888798,The Circle (Wipers album),[The Circle is the sixth studio album by punk ...,"[{'id': 0, 'start': 40, 'end': 49, 'ref_url': ..."
1,17888807,Urgand,[Urgand is a village in Badakhshan Province in...,"[{'id': 0, 'start': 12, 'end': 19, 'ref_url': ..."
2,17888822,"Urup, Afghanistan",[Urup is a village in Badakhshan Province in n...,"[{'id': 0, 'start': 10, 'end': 17, 'ref_url': ..."
3,17888850,WMIA (AM),"[""For the Miami, Florida radio station, see WM...","[{'id': 0, 'start': 9, 'end': 23, 'ref_url': '..."
4,17888858,Guido of Acqui,[Saint Guido of Acqui( also Wido)( c. 1004 – 1...,"[{'id': 0, 'start': 62, 'end': 77, 'ref_url': ..."
...,...,...,...,...
5989842,12347579,Hebeclinium,[Hebeclinium is a genus of flowering plant in ...,"[{'id': 0, 'start': 26, 'end': 41, 'ref_url': ..."
5989843,12347585,Hebeclinium recreense,[Hebeclinium recreense is a species of floweri...,"[{'id': 0, 'start': 38, 'end': 53, 'ref_url': ..."
5989844,12347593,Helichrysum aciculare,[Helichrysum aciculare is a species of floweri...,"[{'id': 0, 'start': 38, 'end': 53, 'ref_url': ..."
5989845,12347598,Helichrysum arachnoides,[Helichrysum arachnoides is a species of flowe...,"[{'id': 0, 'start': 40, 'end': 55, 'ref_url': ..."


In [2]:
# Flatten the per-article mention lists to one struct per row (the article's
# index label is repeated), then drop mentions whose `sent_idx` or `ref_ids`
# field is missing.
mentions = df.mentions.explode()
has_sent_idx = ~mentions.struct.field('sent_idx').isna()
mentions = mentions[has_sent_idx]
has_ref_ids = ~mentions.struct.field('ref_ids').isna()
mentions = mentions[has_ref_ids]
mentions

0          {'id': 0, 'start': 40, 'end': 49, 'ref_url': '...
0          {'id': 1, 'start': 55, 'end': 61, 'ref_url': '...
0          {'id': 2, 'start': 75, 'end': 83, 'ref_url': '...
1          {'id': 0, 'start': 12, 'end': 19, 'ref_url': '...
1          {'id': 1, 'start': 23, 'end': 42, 'ref_url': '...
                                 ...                        
5989846    {'id': 0, 'start': 38, 'end': 53, 'ref_url': '...
5989846    {'id': 1, 'start': 68, 'end': 78, 'ref_url': '...
5989846    {'id': 2, 'start': 20, 'end': 25, 'ref_url': '...
5989846    {'id': 3, 'start': 12, 'end': 19, 'ref_url': '...
5989846    {'id': 4, 'start': 51, 'end': 57, 'ref_url': '...
Name: mentions, Length: 29530757, dtype: struct

In [3]:
import cupy
# For each article, the position of its first sentence in the flattened list of
# all sentences: an exclusive running total of per-article sentence counts
# (0 for the first article, then the cumulative sum shifted by one).
sentences_per_article = df.sentences.list.len()
shifted_totals = sentences_per_article.cumsum().values[:-1]
df['sentence_offsets'] = cupy.concatenate([cupy.array([0]), shifted_totals])
df

Unnamed: 0,id,title,sentences,mentions,sentence_offsets
0,17888798,The Circle (Wipers album),[The Circle is the sixth studio album by punk ...,"[{'id': 0, 'start': 40, 'end': 49, 'ref_url': ...",0
1,17888807,Urgand,[Urgand is a village in Badakhshan Province in...,"[{'id': 0, 'start': 12, 'end': 19, 'ref_url': ...",3
2,17888822,"Urup, Afghanistan",[Urup is a village in Badakhshan Province in n...,"[{'id': 0, 'start': 10, 'end': 17, 'ref_url': ...",4
3,17888850,WMIA (AM),"[""For the Miami, Florida radio station, see WM...","[{'id': 0, 'start': 9, 'end': 23, 'ref_url': '...",5
4,17888858,Guido of Acqui,[Saint Guido of Acqui( also Wido)( c. 1004 – 1...,"[{'id': 0, 'start': 62, 'end': 77, 'ref_url': ...",10
...,...,...,...,...,...
5989842,12347579,Hebeclinium,[Hebeclinium is a genus of flowering plant in ...,"[{'id': 0, 'start': 26, 'end': 41, 'ref_url': ...",23245675
5989843,12347585,Hebeclinium recreense,[Hebeclinium recreense is a species of floweri...,"[{'id': 0, 'start': 38, 'end': 53, 'ref_url': ...",23245676
5989844,12347593,Helichrysum aciculare,[Helichrysum aciculare is a species of floweri...,"[{'id': 0, 'start': 38, 'end': 53, 'ref_url': ...",23245680
5989845,12347598,Helichrysum arachnoides,[Helichrysum arachnoides is a species of flowe...,"[{'id': 0, 'start': 40, 'end': 55, 'ref_url': ...",23245683


In [4]:
import torch
# Mention edges: one edge per mention, from the sentence that contains the
# mention to the first id listed in the mention's `ref_ids`.
# NOTE(review): destinations are taken verbatim from `ref_ids`. The recorded
# outputs show eix.max()+1 = 62717570 while len(sentences) + len(df) = 29323301,
# so these ids do not appear to be row positions of `df` — confirm that they
# share the vertex id space used for the article/sentence vertices.
destinations_m = mentions.struct.field('ref_ids').list.get(0).astype('int64').values
# Sentence vertex id = sentence index within its article + the article's offset
# into the flattened sentence list + len(df). Ids 0..len(df)-1 are left for the
# articles, matching how `sources_s` is built from the exploded sentences.
sources_m = mentions.struct.field('sent_idx').values + df.sentence_offsets[mentions.index].values + len(df)
# Stack into a 2 x E tensor on the GPU: row 0 = sources, row 1 = destinations.
eim = torch.stack([
    torch.as_tensor(sources_m, device='cuda'),
    torch.as_tensor(destinations_m, device='cuda'),
])
eim,eim.shape

(tensor([[ 5989847,  5989847,  5989847,  ..., 29235534, 29235535, 29235535],
         [   23037,   939352,  1864203,  ...,   350939,  1596317,    11090]],
        device='cuda:0'),
 torch.Size([2, 29530757]))

In [5]:
# One row per sentence; the former index of `df` becomes the `article` column
# and the new default index numbers the sentences consecutively.
sentences = df.sentences.explode().reset_index().rename({"index": 'article'},axis=1)
sentences

Unnamed: 0,article,sentences
0,0,The Circle is the sixth studio album by punk r...
1,0,The album received positive reviews.
2,0,"""The Rough Guide to Rock"" wrote that ""jazzy di..."
3,1,Urgand is a village in Badakhshan Province in ...
4,2,Urup is a village in Badakhshan Province in no...
...,...,...
23333449,5989845,It is found only in Yemen.
23333450,5989845,Its natural habitat is subtropical or tropical...
23333451,5989846,Helichrysum balfourii is a species of flowerin...
23333452,5989846,It is found only in Yemen.


In [6]:
# Sentence -> article edges: each sentence vertex (flattened sentence index
# shifted by len(df)) points to the article row it came from.
sources_s = sentences.index.values + len(df)
destinations_s = sentences.article.values
# Stack into a 2 x E tensor on the GPU: row 0 = sources, row 1 = destinations.
eis = torch.stack([
    torch.as_tensor(sources_s, device='cuda'),
    torch.as_tensor(destinations_s, device='cuda'),
])
eis,eis.shape

(tensor([[ 5989847,  5989848,  5989849,  ..., 29323298, 29323299, 29323300],
         [       0,        0,        0,  ...,  5989846,  5989846,  5989846]],
        device='cuda:0'),
 torch.Size([2, 23333454]))

In [7]:
# Combined edge index: mention edges followed by sentence -> article edges,
# joined along the edge dimension (columns).
eix = torch.concatenate([eim,eis],axis=1)
eix,eix.shape

(tensor([[ 5989847,  5989847,  5989847,  ..., 29323298, 29323299, 29323300],
         [   23037,   939352,  1864203,  ...,  5989846,  5989846,  5989846]],
        device='cuda:0'),
 torch.Size([2, 52864211]))

In [8]:
# Largest source (sentence) vertex id in each edge set.
eis[0].max(), eim[0].max()

(tensor(29323300, device='cuda:0'), tensor(29235535, device='cuda:0'))

In [9]:
# Smallest source (sentence) vertex id in each edge set.
eis[0].min(), eim[0].min()

(tensor(5989847, device='cuda:0'), tensor(5989847, device='cuda:0'))

In [10]:
# Sanity check: no mention edge starts from a sentence id beyond the largest
# sentence id produced from the exploded sentences.
assert eis[0].max() >= eim[0].max()

In [11]:
# Total number of vertices, including those with no embedding:
# one more than the largest id appearing anywhere in `eix` (sources or destinations).
eix.max()+1

tensor(62717570, device='cuda:0')

In [12]:
# Number of vertices with embedding: one per exploded sentence plus one per row of `df`.
len(sentences) + len(df)

29323301

In [11]:
import sys
sys.path.append('/mnt/bitgraph')
sys.path.append('/mnt/gremlin++')
from pybitgraph import BitGraph

In [14]:
import cupy

# Build the article + sentence graph from the combined edge index.
# NOTE(review): the meaning of the five BitGraph constructor arguments (and why
# the last one is "PINNED" here but 'DEVICE' in the earlier graph) is not
# visible in this notebook — confirm against pybitgraph.
graph = BitGraph(
    "int64",
    "int64",
    "DEVICE",
    "DEVICE",
    "PINNED",
)

# Row 0 of `eix` holds sources, row 1 holds destinations.
src, dst = eix
# NOTE(review): `eix.max() + 1` is a 0-dim CUDA tensor (see the recorded output
# of the "Total number of vertices" cell), not a Python int — confirm that
# add_vertices accepts it, or convert explicitly.
graph.add_vertices(eix.max() + 1)
graph.add_edges(src, dst, 'link')

In [13]:
import os
import re

import torch

# Shard files are named ``part_<a>_<b>.pt``. The dot is escaped so that it only
# matches a literal '.', and the pattern is applied to the whole file name.
ex = re.compile(r'part_([0-9]+)_([0-9]+)\.pt')
def fname_to_key(s):
    """Return the numeric sort key ``(a, b)`` for a shard named ``part_<a>_<b>.pt``.

    Sorting by this key orders shards numerically (``part_0_10`` after
    ``part_0_9``), which a plain string sort would not.

    Args:
        s: file name (no directory component).

    Returns:
        Tuple ``(a, b)`` of ints parsed from the name.

    Raises:
        ValueError: if ``s`` is not exactly of the form ``part_<a>_<b>.pt``
            (e.g. a stray file in the shard directory).
    """
    m = ex.fullmatch(s)
    if m is None:
        raise ValueError(f'unexpected embedding shard file name: {s!r}')
    return int(m[1]), int(m[2])

directory = '/mnt/bitgraph/data/rag/n2v'

# Expected number of embedding rows; equals the value shown for
# len(sentences) + len(df) in the "Number of vertices with embedding" cell.
num_embedded = 29323301
emb_dim = 300

# Preallocate the full matrix on GPU 1. torch.empty does NOT initialize memory,
# so every row must be overwritten by a shard below.
emb = torch.empty((num_embedded, emb_dim), dtype=torch.float32, device=1)
ix = 0  # next row of `emb` to fill

# Title shards are written first, then sentence shards, each group in numeric
# shard order.
for emb_type in ['titles', 'sentences']:
    path = os.path.join(directory, emb_type)
    files = sorted(os.listdir(path), key=fname_to_key)
    for f in files:
        e = torch.load(os.path.join(path, f), weights_only=True, map_location='cuda:1').reshape((-1, emb_dim))
        emb[ix : (ix + e.shape[0])] = e
        ix += e.shape[0]
        del e  # release the shard before loading the next one

# Fail loudly if the shards did not fill the buffer: any remaining rows would
# hold uninitialized memory and silently corrupt similarity queries.
if ix != emb.shape[0]:
    raise RuntimeError(
        'embedding shards provided %d rows, expected %d' % (ix, emb.shape[0])
    )

emb

part_0_0.pt 0
part_0_1.pt 1
part_0_2.pt 1000001
part_0_3.pt 2000001
part_1_0.pt 2994924
part_1_1.pt 2994925
part_1_2.pt 3994925
part_1_3.pt 4994925
part_0_0.pt 5989847
part_0_1.pt 5989848
part_0_2.pt 6989848
part_0_3.pt 7989848
part_0_4.pt 8989848
part_0_5.pt 9989848
part_0_6.pt 10989848
part_0_7.pt 11989848
part_0_8.pt 12989848
part_0_9.pt 13989848
part_0_10.pt 14989848
part_0_11.pt 15989848
part_0_12.pt 16989848
part_1_0.pt 17786973
part_1_1.pt 17786974
part_1_2.pt 18786974
part_1_3.pt 19786974
part_1_4.pt 20786974
part_1_5.pt 21786974
part_1_6.pt 22786974
part_1_7.pt 23786974
part_1_8.pt 24786974
part_1_9.pt 25786974
part_1_10.pt 26786974
part_1_11.pt 27786974
part_1_12.pt 28786974


tensor([[-0.0005,  0.0742,  0.0331,  ...,  0.1338,  0.1362, -0.0740],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0574,  0.0383,  0.0260,  ...,  0.0435,  0.4102,  0.0381],
        ...,
        [ 0.0639,  0.1371,  0.0456,  ...,  0.0092,  0.1628,  0.0573],
        [-0.0064,  0.0178,  0.0919,  ...,  0.0089,  0.0992, -0.0496],
        [ 0.0265,  0.0418, -0.0059,  ..., -0.0184,  0.2299,  0.0544]],
       device='cuda:1')

In [10]:
# Spot-check a single embedding row.
emb[1000002]

tensor([ 3.9173e-02,  4.8354e-02,  1.2332e-01,  8.9205e-03, -4.6352e-02,
         2.7588e-02, -2.6639e-02, -1.1987e-01,  6.3561e-02,  1.0618e-01,
        -2.4808e-02, -9.3252e-02, -8.2684e-02, -1.0388e-02, -6.2595e-02,
         4.3333e-02, -7.6571e-02,  9.2945e-02, -3.9889e-02, -1.7240e-02,
        -8.0801e-02, -1.0599e-02,  1.5339e-02, -3.6979e-03,  5.4256e-02,
        -1.2803e-01, -1.8577e-01,  1.0210e-01,  2.2244e-02, -6.2216e-02,
         5.0269e-02, -5.6969e-02, -1.1328e-01,  8.0480e-02, -7.2021e-03,
        -7.8710e-02, -7.9064e-03, -7.9090e-02,  6.3223e-02,  8.0111e-02,
         4.7315e-02,  7.9599e-02, -7.1599e-03,  2.2402e-02,  1.6193e-02,
        -3.7174e-02, -5.6641e-02, -1.2562e-02, -3.5851e-02, -3.4649e-02,
         3.7208e-02, -1.6771e-02,  7.5308e-03, -9.9628e-02, -1.6771e-02,
         3.4339e-02, -3.5053e-02,  8.3313e-03, -3.3724e-02, -4.3594e-02,
        -2.2768e-02,  3.9600e-02, -5.7918e-02, -1.4754e-01, -3.5565e-02,
        -1.2630e-02,  1.1738e-02,  2.8757e-02, -3.7

In [None]:
# Register the loaded embedding matrix on the graph under the name "emb".
# NOTE(review): `emb` has 29323301 rows while the graph was created with
# eix.max() + 1 vertices (62717570 in the recorded output). The earlier "emb0"
# cell padded its matrix with zeros up to the vertex count; confirm whether the
# same padding is needed here.
# NOTE(review): the empty int64 array as the second argument presumably means
# "apply to all vertices in order" — confirm against pybitgraph.
# Relies on `np` imported in an earlier cell.
graph.set_vertex_embeddings(
    "emb",
    np.array([],dtype='int64'),
    emb
)

In [15]:
# Create a traversal source on the article + sentence graph.
g = graph.traversal()
