In [1]:
# Setup cell: imports plus GPU allocator wiring. The statements between the
# imports are intentionally interleaved -- keep this order when editing.
import argparse
import sys, os
import re
import warnings

import numpy as np

import rmm
from rmm.allocators.torch import rmm_torch_allocator
from rmm.allocators.cupy import rmm_cupy_allocator

# Initialize shared allocator to prevent fragmentation
# (RMM is reinitialized on device 0 without a pool and without managed memory).
rmm.reinitialize(devices=0, pool_allocator=False, managed_memory=False)

import cupy
# Route CuPy allocations through RMM.
cupy.cuda.set_allocator(rmm_cupy_allocator)

import torch
# Route PyTorch CUDA allocations through RMM as well.
torch.cuda.change_current_allocator(rmm_torch_allocator)

import cudf

# pybitgraph is imported from local checkouts rather than an installed package.
sys.path.append('/mnt/bitgraph')
sys.path.append('/mnt/gremlin++')
from pybitgraph import BitGraph

from preprocess import Sentence_Transformer, Word2Vec_Transformer
from transformers import AutoModel, AutoTokenizer
torch.set_float32_matmul_precision('high')

def read_wiki_data(fname, skip_empty=True):
    """Load the hyperlinked-paragraph dump and build the graph's edge list.

    Vertex ids are laid out in one shared range:
      * article ``i`` (row ``i`` of the file) is vertex ``i``;
      * sentences follow the articles, so the sentence at global position ``p``
        (position after exploding every article's sentence list) is vertex
        ``len(df) + p``.

    Two edge families are produced, both starting at a sentence vertex:
      * mention edges: sentence -> first entry of the mention's ``ref_ids``
        (used directly as a vertex id);
      * containment edges: sentence -> the article row it belongs to.

    Args:
        fname: Path of the JSON-lines file to read. It must provide the
            ``mentions``, ``sentences`` and ``title`` columns.
        skip_empty: When true, drop mention edges whose destination id is not
            smaller than the number of articles (those vertices have no
            embedding).

    Returns:
        A tuple ``(eix, titles, sentences)`` where ``eix`` is a CUDA tensor with
        two rows (sources, destinations), ``titles`` is the article titles as a
        pandas Series, and ``sentences`` is the exploded sentences as a pandas
        Series.
    """
    # Read the file the caller asked for (previously the path was hard-coded
    # and ``fname`` was silently ignored).
    df = cudf.read_json(fname, lines=True)

    # One row per mention; keep only mentions that carry both a sentence index
    # and at least a ref_ids value.
    mentions = df.mentions.explode()
    mentions = mentions[~mentions.struct.field('sent_idx').isna()]
    mentions = mentions[~mentions.struct.field('ref_ids').isna()]

    # Exclusive prefix sum of sentence counts: global position of each
    # article's first sentence.
    df['sentence_offsets'] = cupy.concatenate([
        cupy.array([0]),
        df.sentences.list.len().cumsum().values[:-1]
    ])

    destinations_m = mentions.struct.field('ref_ids').list.get(0).astype('int64').values
    # mentions.index still holds the originating article row after explode().
    sources_m = mentions.struct.field('sent_idx').values + df.sentence_offsets[mentions.index].values + len(df)

    if skip_empty:
        # Does not add vertices/edges for vertices with no embedding
        f = destinations_m < len(df)
        destinations_m = destinations_m[f]
        sources_m = sources_m[f]
        del f

    eim = torch.stack([
        torch.as_tensor(sources_m, device='cuda'),
        torch.as_tensor(destinations_m, device='cuda'),
    ])

    # One row per sentence; the former index becomes the 'article' column.
    sentences = df.sentences.explode().reset_index().rename({"index": 'article'},axis=1)

    sources_s = sentences.index.values + len(df)
    destinations_s = sentences.article.values
    eis = torch.stack([
        torch.as_tensor(sources_s, device='cuda'),
        torch.as_tensor(destinations_s, device='cuda'),
    ])

    # Mention edges first, then containment edges.
    eix = torch.concatenate([eim,eis],axis=1)
    del eis
    del eim

    return eix, df.title.to_pandas(), sentences.sentences.to_pandas()


def read_embeddings(graph, directory, td):
    """Load precomputed embedding shards into the graph's ``'emb'`` property.

    Shards are read from ``<directory>/titles`` and then
    ``<directory>/sentences``. Within each folder, files named
    ``part_<a>_<b>.pt`` are processed in ascending ``(a, b)`` order and written
    to consecutive vertex ids, continuing the running offset across both
    folders. Directory entries that do not follow that naming scheme are
    ignored.

    Args:
        graph: Object exposing ``set_vertex_embeddings(name, first, last, tensor)``.
        directory: Folder containing the ``titles`` and ``sentences`` subfolders.
        td: Embedding width; every shard is reshaped to ``(-1, td)``.
    """
    # Escaped dot + fullmatch: only exact 'part_<a>_<b>.pt' names qualify.
    ex = re.compile(r'part_([0-9]+)_([0-9]+)\.pt')

    def fname_to_key(s):
        # Numeric sort key so that e.g. part_0_10 sorts after part_0_2.
        m = ex.fullmatch(s)
        return int(m[1]), int(m[2])

    ix = 0

    for emb_type in ['titles', 'sentences']:
        path = os.path.join(directory, emb_type)
        # Filter before sorting so stray files (checkpoints, notes, temp files)
        # cannot crash the sort key or be fed to torch.load.
        files = sorted(
            (f for f in os.listdir(path) if ex.fullmatch(f)),
            key=fname_to_key,
        )
        for f in files:
            e = torch.load(os.path.join(path, f), weights_only=True, map_location='cuda').reshape((-1, td))

            print(ix, e.shape)
            # The end index handed to the graph is inclusive.
            graph.set_vertex_embeddings('emb', ix, ix + e.shape[0] - 1, e)

            ix += e.shape[0]
            del e


def getem_roberta(model, tokenizer, text):
    """Embed ``text`` with a tokenizer/model pair, truncating over-long input.

    The text is tokenized with ``return_tensors='pt'``. While the encoding is
    longer than 512 tokens, the last 10 characters of the text are dropped and
    it is tokenized again.

    Args:
        model: Callable taking ``(input_ids, attention_mask)``.
        tokenizer: Callable returning an object with ``input_ids`` and
            ``attention_mask`` attributes.
        text: The string to embed.

    Returns:
        Whatever ``model`` returns for the (possibly truncated) encoding.
    """
    t = tokenizer(text, return_tensors='pt')
    while t.input_ids.shape[1] > 512:
        # Shrink the text itself; the original referenced an undefined name
        # here and raised UnboundLocalError on any over-long input.
        text = text[:-10]
        t = tokenizer(text, return_tensors='pt')
    return model(t.input_ids, t.attention_mask)


def getem_w2v(model, text):
    """Embed ``text`` by calling the word2vec encoder ``model`` on it."""
    embedding = model(text)
    return embedding


# Run configuration. 'embedding_type' selects both the width used when loading
# the stored embeddings (300 for 'w2v', 1024 otherwise) and the query encoder.
args = {
    'skip_empty_vertices': True,
    'property_storage': 'managed',
    'fname': '/mnt/para_with_hyperlink.jsonl',
    'embeddings_dir': '/mnt/bitgraph/data/rag/w2v/',
    'embedding_type': 'w2v',
    'w2v_path': '/mnt/GoogleNews-vectors-negative300.bin.gz',
}

# eix: 2-row edge tensor; titles / sentences: pandas Series used to turn
# vertex ids back into text in later cells.
eix, titles, sentences = read_wiki_data(
    args['fname'],
    args['skip_empty_vertices']
)
print('read wiki data')

# NOTE(review): positional arguments presumably are id dtypes followed by
# storage locations, the last one being property storage -- confirm against
# the pybitgraph constructor.
graph = BitGraph(
    'int64',
    'int64',
    'DEVICE',
    'DEVICE',
    args['property_storage'].upper(),
)

# Allocate ids 0..max id that appears in any edge, then add all edges with
# the single label 'link'.
graph.add_vertices(eix.max() + 1)
graph.add_edges(eix[0], eix[1], 'link')

# Fill the 'emb' vertex property from the precomputed shards on disk.
read_embeddings(
    graph,
    args['embeddings_dir'],
    td=300 if args['embedding_type'] == 'w2v' else 1024,
)    
print('read embeddings into graph')

# `g` is the traversal source used by every query below.
g = graph.traversal()
print('constructed graph')

# Choose the query encoder matching the stored embeddings and expose it as
# `getem(text)`.
#
# `getem` is built with functools.partial so that the encoder (and tokenizer)
# selected here are captured immediately. A lambda would look the global names
# up again on every call and silently pick up whatever a later cell assigned
# to `tokenizer` or `module`.
from functools import partial

if args['embedding_type'] == 'w2v':
    import gensim
    warnings.warn("Word2Vec encoder is for testing/debugging purposes only!")
    module = Word2Vec_Transformer(
        gensim.models.KeyedVectors.load_word2vec_format(args['w2v_path'], binary=True),
        dim=300,
    )
    getem = partial(getem_w2v, module)
elif args['embedding_type'] == 'roberta':
    model = AutoModel.from_pretrained('sentence-transformers/all-roberta-large-v1')
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-roberta-large-v1')

    mod = Sentence_Transformer(model).cuda()
    # Clear any previous dynamo state before compiling the wrapped model.
    import torch._dynamo
    torch._dynamo.reset()

    module = torch.compile(mod, fullgraph=True)
    getem = partial(getem_roberta, module, tokenizer)
else:
    raise ValueError("Expected 'w2v' or 'roberta' for embedding type")


  from .autonotebook import tqdm as notebook_tqdm


read wiki data
0 torch.Size([1, 300])
1 torch.Size([1000000, 300])
1000001 torch.Size([1000000, 300])
2000001 torch.Size([994923, 300])
2994924 torch.Size([1, 300])
2994925 torch.Size([1000000, 300])
3994925 torch.Size([1000000, 300])
4994925 torch.Size([994922, 300])
5989847 torch.Size([1, 300])
5989848 torch.Size([1000000, 300])
6989848 torch.Size([1000000, 300])
7989848 torch.Size([1000000, 300])
8989848 torch.Size([1000000, 300])
9989848 torch.Size([1000000, 300])
10989848 torch.Size([1000000, 300])
11989848 torch.Size([1000000, 300])
12989848 torch.Size([1000000, 300])
13989848 torch.Size([1000000, 300])
14989848 torch.Size([1000000, 300])
15989848 torch.Size([1000000, 300])
16989848 torch.Size([797125, 300])
17786973 torch.Size([1, 300])
17786974 torch.Size([1000000, 300])
18786974 torch.Size([1000000, 300])
19786974 torch.Size([1000000, 300])
20786974 torch.Size([1000000, 300])
21786974 torch.Size([1000000, 300])
22786974 torch.Size([1000000, 300])
23786974 torch.Size([1000000, 



In [2]:
def query(search_query, lim=4):
    """Print the titles and sentences of the vertices most similar to a query.

    The query string is embedded with the notebook-level ``getem`` and matched
    against the ``'emb'`` property through the traversal source ``g``. Returned
    ids below ``len(titles)`` are reported as articles; the remaining ids are
    shifted down by ``len(titles)`` and reported as sentences.

    Args:
        search_query: Text to search for.
        lim: Limit forwarded to the ``like`` step.
    """
    query_embedding = getem(search_query)
    hits = g.V().like('emb', [query_embedding], lim).toArray()

    is_article = hits < len(titles)
    article_rows = hits[is_article].get()
    sentence_rows = (hits[~is_article] - len(titles)).get()

    print('articles:', titles.iloc[article_rows])
    print('sentences:', sentences.iloc[sentence_rows])


In [3]:
import pandas
# Ground-truth question/answer records; later cells read the `question` and
# `answer` columns.
truth_df = pandas.read_json('/mnt/data/train.json')
truth_df

Unnamed: 0,_id,type,question,context,supporting_facts,evidences,answer
0,13f5ad2c088c11ebbd6fac1f6bf848b6,bridge_comparison,Are director of film Move (1970 Film) and dire...,"[[Stuart Rosenberg, [Stuart Rosenberg (August ...","[[Move (1970 film), 0], [Méditerranée (1963 fi...","[[Move (1970 film), director, Stuart Rosenberg...",no
1,3057c6c4086111ebbd5dac1f6bf848b6,bridge_comparison,Do both films The Falcon (Film) and Valentin T...,"[[The Falcon Takes Over, [The Falcon Takes Ove...","[[The Falcon (film), 0], [Valentin the Good, 0...","[[The Falcon (film), director, Vatroslav Mimic...",no
2,89bc944808a111ebbd79ac1f6bf848b6,bridge_comparison,"Which film whose director is younger, Charge I...","[[Danger: Diabolik, [Danger:, Diabolik is a 1...","[[Charge It to Me, 1], [Danger: Diabolik, 1], ...","[[Charge It to Me, director, Roy William Neill...",Danger: Diabolik
3,633f80660bdd11eba7f7acde48001122,compositional,What is the date of birth of Mina Gerhardsen's...,"[[Pamela Jain, [Pamela Jain is an Indian playb...","[[Mina Gerhardsen, 1], [Rune Gerhardsen, 0]]","[[Mina Gerhardsen, father, Rune Gerhardsen], [...",13 June 1946
4,2dc3f9740bda11eba7f7acde48001122,compositional,What nationality is the director of film Weddi...,"[[Weekend in Paradise (1931 film), [Weekend in...","[[Wedding Night in Paradise (1950 film), 0], [...","[[Wedding Night in Paradise, director, Géza vo...",Hungarian
...,...,...,...,...,...,...,...
167449,56100d300bdc11eba7f7acde48001122,compositional,What is the place of birth of the director of ...,"[[S. N. Mathur, [S.N. Mathur was the Director ...","[[Rolling in Money, 0], [Albert Parker (direct...","[[Rolling in Money, director, Albert Parker], ...",New York
167450,3df1a97108ad11ebbd83ac1f6bf848b6,comparison,"Who was born first, Dušan Ninić or Eszter Balint?","[[Tom Dickinson, [Thomas Eastwood Dickinson( 1...","[[Dušan Ninić, 0], [Eszter Balint, 0]]","[[Dušan Ninić, date of birth, September 6, 195...",Dušan Ninić
167451,8be4ef3e0bdc11eba7f7acde48001122,compositional,When did the director of film Morchha die?,"[[Thomas Scott (diver), [Thomas Scott( 1907- d...","[[Morchha, 0], [Ravikant Nagaich, 0]]","[[Morchha, director, Ravikant Nagaich], [Ravik...",6 January 1991
167452,12357df20bdc11eba7f7acde48001122,compositional,What is the date of birth of the director of f...,"[[Peter Levin, [Peter Levin is an American dir...","[[Double Cross (1951 film), 0], [Riccardo Fred...","[[Double Cross, director, Riccardo Freda], [Ri...",24 February 1909


In [4]:
# Retrieve neighbours for one full question from the ground-truth set
# (positional row 167453).
query(truth_df.question.iloc[167453])

articles: Series([], Name: title, dtype: object)
sentences: 12946956    "The Blinding of Isaac Woodard" is a song writ...
2866679     The club is mentioned in the Motion City Sound...
23180998    The Place Where the Black Stars Hang is the fo...
9104057     "My Songs Know What You Did in the Dark (Light...
Name: sentences, dtype: object


In [5]:
# Sanity check: search with a literal article title.
query('White Noise (Linkin Park song)')

articles: 4353291    White Noise (Linkin Park song)
3173141               White Noise (novel)
2597077         White Noise (Pvris album)
2306136     White Noise (Disclosure song)
Name: title, dtype: object
sentences: Series([], Name: sentences, dtype: object)


In [6]:
# Similarity of two hard-coded vertex ids to the embedding of a title string.
# NOTE(review): which articles/sentences these ids refer to is not shown here.
g.V([5013434, 374345]).similarity('emb', [getem('Move (1970 film)')]).toArray()



array([0.99999982, 0.99999982])

In [7]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

def extract(entsList):
    """Return the ``'word'`` of every entity, preserving the per-input grouping.

    Args:
        entsList: Iterable of entity lists; each entity is a mapping with a
            ``'word'`` key.

    Returns:
        A list with one inner list of words per entry of ``entsList``.
    """
    return [[entity['word'] for entity in group] for group in entsList]

# Named-entity recogniser used to pull entity strings out of questions.
# NER-specific variable names are used on purpose: the generic globals
# `tokenizer` / `model` belong to the sentence encoder set up earlier, and
# overwriting them here would change what the encoder sees.
ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")

ner = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, device=0, aggregation_strategy="max")

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
import numpy as np
# For every entity the NER pipeline finds in the question, fetch the 4 vertices
# whose 'emb' is closest to the entity text, and pool all hits into `vids`.
# `vids` is reused by later cells as the seed set.
vids = np.concatenate([
    g.V().like('emb', [getem(ent['word'])], 4).toArray()
    for ent in ner(truth_df.question.iloc[167453])
])

print(vids)

# Ids below len(titles) are articles; the rest are sentences offset by len(titles).
f = (vids < len(titles))
print('articles:', titles.iloc[vids[f].get()])
print('sentences:', sentences.iloc[vids[~f].get() - len(titles)])

[3173141 2597077 2306136 8684283 2196066 1930796   27608 5932958]
articles: 3173141              White Noise (novel)
2597077        White Noise (Pvris album)
2306136    White Noise (Disclosure song)
2196066                      Linkin Park
1930796                      King's Song
27608                          Song Defu
5932958                    Song Jong-sun
Name: title, dtype: object
sentences: 2694436    White Noise
Name: sentences, dtype: object


In [9]:
from pygremlinxx import GraphTraversal
# `__()` creates a fresh anonymous traversal for use inside steps such as `_union`.
__ = lambda : GraphTraversal()

# The subgraph step does not work due to nanobind limitations, so use this way instead
# Reading the steps: edges touching the seeds are labelled 'h0'; from their
# in-vertices a second set of touching edges is collected; the union of both
# edge sets (deduplicated) is passed to subgraph_coo.
out = graph.subgraph_coo(
    g.V(vids).bothE().dedup()._as('h0').inV().bothE().dedup()._union([__().select('h0'), __().identity()]).dedup().toArray()
)

In [10]:
from torch_geometric.data import Data

def coo_to_data(coo, emb_dim=300):
    """Convert a ``subgraph_coo`` result into a single-graph PyG ``Data`` object.

    Args:
        coo: Mapping with ``'src'``, ``'dst'`` and ``'vid'`` arrays.
        emb_dim: Width of the stored ``'emb'`` vectors. Defaults to 300 (the
            word2vec setup); pass the matching width when the graph holds
            embeddings of another size.

    Returns:
        ``Data`` with ``edge_index`` (2 x E, int64), ``x`` (N x emb_dim) and an
        all-zero ``batch`` vector of length N, all on CUDA.
    """
    data = Data()
    # NOTE(review): row 0 is 'dst' and row 1 is 'src', i.e. edges are stored
    # reversed relative to the coo naming -- confirm this is intentional.
    data.edge_index = torch.stack([
        torch.as_tensor(coo['dst'].astype('int64'), device='cuda'),
        torch.as_tensor(coo['src'].astype('int64'), device='cuda'),
    ])
    # Node features: the 'emb' property of each subgraph vertex, one row each.
    data.x = torch.as_tensor(
        g.V(coo['vid']).encode('emb').toArray(),
        device='cuda'
    ).reshape((-1, emb_dim))
    # Every node belongs to graph 0 of the batch.
    data.batch = torch.zeros((data.x.shape[0],), dtype=torch.int64, device='cuda')

    return data

In [11]:
from torch_geometric.nn import GRetriever, GAT
from torch_geometric.nn.nlp import LLM

# Language model wrapped by the retriever.
llm = LLM(
    model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1',
    num_params=1,
)

# Graph encoder. in_channels matches the 300-wide node features produced by
# coo_to_data.
gnn = GAT(
    in_channels=300,
    hidden_channels=256,
    out_channels=300,
    num_layers=4,
    heads=4,
)

# NOTE(review): mlp_out_channels=2048 presumably matches the LLM's hidden
# size -- confirm. This assignment rebinds the notebook-level name `model`.
model = GRetriever(llm=llm, gnn=gnn, mlp_out_channels=2048)

Setting up 'TinyLlama/TinyLlama-1.1B-Chat-v0.1' with configuration: {'revision': 'main', 'max_memory': {0: '9GiB', 1: '45GiB'}, 'low_cpu_mem_usage': True, 'device_map': 'auto', 'torch_dtype': torch.bfloat16}


  self.autocast_context = torch.cuda.amp.autocast(dtype=dtype)


In [19]:
# Number of nearest vertices fetched per recognised entity / per whole question.
ent_fanout = 4 # TODO train this
que_fanout = 4 # TODO train this

# For the first three ground-truth rows: build a seed set, expand it into a
# subgraph, and evaluate the model's loss on the (question, answer) pair.
# No backward pass or optimizer step happens here.
for i in range(3):
    question = truth_df.question.iloc[i]
    answer = truth_df.answer.iloc[i]
    emb_q = getem(question)

    # Seeds = neighbours of each entity string + neighbours of the full question.
    vids_q = np.concatenate(
        [
            g.V().like('emb', [getem(ent['word'])], ent_fanout).toArray()
            for ent in ner(question)
        ] + [
            g.V().like('emb', [emb_q], que_fanout).toArray()
        ]
    )
    
    # TODO control hops
    # TODO use order step to pick closest to question (like pcst but better?)
    # Requires `__` (the anonymous-traversal factory) from an earlier cell.
    out = graph.subgraph_coo(
        g.V(vids_q).bothE().dedup()._as('h0').inV().bothE().dedup()._union([__().select('h0'), __().identity()]).dedup().toArray()
    )

    data = coo_to_data(out)
    print(data)

    loss = model(
        question=[f'question: {question}\nanswer:'],
        x=data.x,
        edge_index=data.edge_index,
        batch=data.batch,
        label=[answer],
        edge_attr=None, # edge features
        additional_text_context=None # additional context
    )
    print(loss)

Data(edge_index=[2, 2803], x=[2783, 300], batch=[2783])
tensor(3.9530, device='cuda:0', grad_fn=<ToCopyBackward0>)
Data(edge_index=[2, 55], x=[67, 300], batch=[67])
tensor(3.8345, device='cuda:0', grad_fn=<ToCopyBackward0>)
Data(edge_index=[2, 33926], x=[33715, 300], batch=[33715])
tensor(1.9914, device='cuda:0', grad_fn=<ToCopyBackward0>)


In [20]:
# Inspect the Data object left over from the last loop iteration.
data

Data(edge_index=[2, 33926], x=[33715, 300], batch=[33715])

In [None]:
# Vertices reached from the seed ids in `vids` via the `out()` step.
g.V(vids).out().toArray()

In [1]:
import torch

import sys
sys.path.append('/mnt/bitgraph')
sys.path.append('/mnt/gremlin++')
from pybitgraph import BitGraph


# Small standalone graph for experimenting with traversals. This rebinds the
# notebook-level names `graph` and `g`.
# NOTE(review): the storage arguments are in a different order than in the
# main graph above ('DEVICE', 'MANAGED', 'DEVICE') -- confirm their meaning
# against the pybitgraph constructor.
graph = BitGraph(
    'uint64',
    'uint64',
    'DEVICE',
    'MANAGED',
    'DEVICE',
)

# Ten directed edges over vertex ids 0..5, given as parallel endpoint lists.
src = torch.tensor([5, 4, 1, 0, 2, 3, 5, 1, 2, 0], dtype=torch.uint64)
dst = torch.tensor([1, 3, 2, 5, 1, 5, 4, 4, 4, 1], dtype=torch.uint64)

graph.add_vertices(6)
graph.add_edges(src, dst, 'e')

g = graph.traversal()


In [None]:
# All edges of the toy graph.
g.E().toArray()

In [None]:
# Call subgraph_coo with three hard-coded ids.
# NOTE(review): the main notebook passes edge results to subgraph_coo, so
# these are presumably edge ids -- confirm.
graph.subgraph_coo(torch.tensor([0, 2, 4], dtype=torch.uint64))

In [None]:
# Edges touching vertex 2 in either direction.
g.V(2).bothE().toArray()

In [None]:
from pygremlinxx import GraphTraversal
# `__()` creates a fresh anonymous traversal for use inside `_union`.
__ = lambda : GraphTraversal()

# Same edge-collection traversal as used for retrieval, run from vertex 0 of
# the toy graph so the result can be checked by hand.
g.V([0, ]).bothE().dedup()._as('h0').inV().bothE().dedup()._union([__().select('h0'), __().identity()]).dedup().toArray()