In [4]:
import pickle
# import graphvite as gv
import dataset
import nltk
import re
import spacy
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')



%reload_ext autoreload
%autoreload

[nltk_data] Downloading package punkt to /Users/rbanerjee/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rbanerjee/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/rbanerjee/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /Users/rbanerjee/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/rbanerjee/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [7]:
file = open("simple_wikidata5m.pkl", "rb")
model = pickle.load(file)
entity2id = model.graph.entity2id
relation2id = model.graph.relation2id
entity_embeddings = model.solver.entity_embeddings
relation_embeddings = model.solver.relation_embeddings

In [12]:
alias2entity = dataset.wikidata5m.load_alias("entity.txt.gz")

In [13]:
print(entity2id[alias2entity["invented"]])

2580138


In [15]:
nlp = spacy.load('en_core_web_sm') 
sentence = "Who invented machine learning, was it Steve Jobs"
doc = nlp(sentence) 
print(doc)
for ent in doc.ents: 
    print(ent.text, ent.label_) 

Who invented machine learning, was it Steve Jobs
Steve Jobs PERSON


In [16]:
def clean_text(text):
    # Extract named entities
    nlp = spacy.load('en_core_web_sm') 
    named_entities = nlp(text).ents

    # Tokenize the text
    words = word_tokenize(text)

    # Remove punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.lower() not in stop_words]

    # Part-of-speech tagging
    tagged_words = pos_tag(words)

    cleaned_text = ' '.join(words).lower()
    
    return cleaned_text, named_entities

# Example usage:
text = "Who invented machine learning, was it Steve Jobs?"
cleaned_text, named_entities = clean_text(text)
print("Cleaned Text:", cleaned_text)
print("Named Entities:", named_entities)

Cleaned Text: invented machine learning steve jobs
Named Entities: (Steve Jobs,)


In [17]:
def extract_continuous_word_sets(text, window_size):
    words = text.split()
    word_sets = []
    
    for i in range(len(words) - window_size + 1):
        word_set = ' '.join(words[i:i + window_size])
        word_sets.append(word_set)

    return word_sets

In [18]:
def query_to_kg_embeddings(query):
    wiki_embeddings = {}
    cleaned_text, named_entities = clean_text(query)
    for ne in named_entities: 
        try:
            ne = ne.orth_.lower()
            wiki_embeddings[ne] = entity_embeddings[entity2id[alias2entity[ne]]]
            cleaned_text = cleaned_text.replace(ne, "")
        except Exception as e:
            print("KeyError: ", e)
    num_of_words = len(word_tokenize(cleaned_text))
    for i in range(num_of_words, 0, -1):
        word_sets = extract_continuous_word_sets(cleaned_text, i)
        for word in word_sets:
            try:
                wiki_embeddings[word] = entity_embeddings[entity2id[alias2entity[word]]]
                cleaned_text = cleaned_text.replace(word, "")
            except Exception as e:
                print("KeyError: ", e)

    return wiki_embeddings

In [82]:
def query_to_node_id(query):
    node_ids = {}
    cleaned_text, named_entities = clean_text(query)
    for ne in named_entities: 
        try:
            ne = ne.orth_.lower()
            node_ids[ne] = alias2entity[ne]
            cleaned_text = cleaned_text.replace(ne, "")
        except Exception as e:
            pass
            #print("KeyError: ", e)
    num_of_words = len(word_tokenize(cleaned_text))
    for i in range(num_of_words, 0, -1):
        word_sets = extract_continuous_word_sets(cleaned_text, i)
        for word in word_sets:
            try:
                node_ids[word] = alias2entity[word]
                cleaned_text = cleaned_text.replace(word, "")
            except Exception as e:
                pass
                #print("KeyError: ", e)
    return node_ids    

In [84]:
node_ids = (query_to_node_id("Who invented machine learning, was it Isaac Newton?"))
node_ids

{'isaac newton': 'Q935', 'machine learning': 'Q2539', 'invented': 'Q18119757'}

In [32]:
import os
halueval_path = os.path.join(os.getcwd(), "submodules", "HaluEval", "data", "qa_data.json")
print(halueval_path)

/Users/rbanerjee/Documents/Projects/KG-LLM-Hallucination/submodules/HaluEval/data/qa_data.json


In [49]:
import json
with(open(halueval_path, "r")) as f:
    data_lst = f.readlines()

data_lst = [json.loads(data) for data in data_lst]

In [50]:
RELEVANT_KEYS = ['knowledge', 'question']

In [51]:
import tqdm

In [85]:
def process_data(data):
    result = {}
    for k in RELEVANT_KEYS:
        node_vals = query_to_node_id(data[k])
        result.update(node_vals)
        
    return result

In [86]:
processed_data = [process_data(i) for i in data_lst[:10]]

In [87]:
len(processed_data)

10

In [99]:
from tqdm.contrib.concurrent import process_map
import multiprocessing as mp
import time

In [90]:
mp.cpu_count()

8

In [92]:
start = time.time()
#result = process_map(process_data, data_lst, max_workers=5)
result = []
for data in tqdm.tqdm(data_lst):
    result.append(process_data(data))
end = time.time()
print(end-start)

  4%|▍         | 406/10000 [07:14<2:51:10,  1.07s/it]


KeyboardInterrupt: 

In [100]:
from tqdm.contrib.concurrent import thread_map
start = time.time()
if __name__ == "__main__":
    result = process_map(process_data, data_lst)
#result = []
#for data in tqdm.tqdm(data_lst):
#    result.append(process_data(data))
end = time.time()
print(end-start)

  result = process_map(process_data, data_lst)


Process SpawnProcess-51:
Traceback (most recent call last):
  File "/Users/rbanerjee/opt/miniconda3/envs/MLG/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/rbanerjee/opt/miniconda3/envs/MLG/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/rbanerjee/opt/miniconda3/envs/MLG/lib/python3.12/concurrent/futures/process.py", line 246, in _process_worker
    call_item = call_queue.get(block=True)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/rbanerjee/opt/miniconda3/envs/MLG/lib/python3.12/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'process_data' on <module '__main__' (<class '_frozen_importlib.BuiltinImporter'>)>
Process SpawnProcess-52:
Traceback (most recent call last):
  File "/Users/rbanerjee/opt/miniconda3/envs/MLG/lib/python3.12/multiprocess

BrokenProcessPool: A child process terminated abruptly, the process pool is not usable anymore