In [1]:
# needed to load the REBEL model
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import math
import torch

# wrapper for wikipedia API
import wikipedia

# scraping of web articles
from newspaper import Article
# google news scraping
from GoogleNews import GoogleNews

# graph visualization
from pyvis.network import Network

# show HTML in notebook
import IPython

from keybert import KeyBERT

from sentence_transformers import SentenceTransformer
import numpy as np
import nltk
import re
from typing import List, Tuple, Union

import spacy

import inspect
import os

import ollama


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nltk.download('stopwords')
nlp = spacy.load("en_core_web_sm")


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/niamatzawad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")

In [12]:
def extract_relations_from_model_output(text): #From REBEL model code
    relations = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    text_replaced = text.replace("<s>", "").replace("<pad>", "").replace("</s>", "")
    for token in text_replaced.split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                relations.append({
                    'head': subject.strip(),
                    'type': relation.strip(),
                    'tail': object_.strip()
                })
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                relations.append({
                    'head': subject.strip(),
                    'type': relation.strip(),
                    'tail': object_.strip()
                })
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        relations.append({
            'head': subject.strip(),
            'type': relation.strip(),
            'tail': object_.strip()
        })
    return relations

In [3]:
class KeywordExtractor():
    def __init__(self):
        self.pipe = pipeline("token-classification", model="eventdata-utd/conflibert-named-entity-recognition", tokenizer="eventdata-utd/conflibert-named-entity-recognition")
        self.kw_model = KeyBERT()
        self.stopwords = set(nltk.corpus.stopwords.words('english'))
        self.semantic_matcher = None
        
    def sanitize_and_repair(func):
        def wrapper(self, *args, **kwargs):
            phrases = func(self, *args, **kwargs)
            for i, phrase in enumerate(phrases):
                phrase = phrase.strip()
                phrase = phrase.lower()
                phrase = re.sub(r'[,\.!?]', ' ', phrase)
                phrase = re.sub(r'[^a-z0-9\s]', ' ', phrase)
                phrase = re.sub(r"^\s*(?:the|a)\s+", "", phrase, flags=re.IGNORECASE)
                phrase = re.sub(r'\s+', ' ', phrase)
                phrases[i] = phrase
            return phrases
        return wrapper

    def remove_stopwords(func):
        def wrapper(self, *args, **kwargs):
            phrases = func(self, *args, **kwargs)
            filtered_words = []
            for phrase in phrases:
                if phrase.lower() not in self.stopwords:
                    filtered_words.append(" ".join([token for token in phrase.split() if token.lower() not in self.stopwords]))
                   
            return filtered_words
        return wrapper

    @remove_stopwords 
    @sanitize_and_repair
    def extract_named_entities(self, doc):
        results = self.pipe(doc)
        named_entities = []
        for i, result in enumerate(results):
            if result["entity"] != "O" and result["entity"].split("-")[1] not in ["Quantity","Temporal","Money"]:
                if "B-" in result["entity"]:
                    j = i + 1
                    while j < len(results):
                        if "B-" in results[j]["entity"]:
                            break
                        j +=1
                    named_entities.append(" ".join(results[x]["word"] for x in range(i,j)))
        return named_entities
        
   

    @sanitize_and_repair
    @remove_stopwords
    def extract_keywords(self, doc):
        return [k[0] for k in self.kw_model.extract_keywords(doc)]
    
    
    

In [89]:
kwe = KeywordExtractor()
doc = """Several hundred residents of a drug-torn neighborhood marched through the
    streets Sunday in an protest over crack cocaine spurred by weekend shootings
    that left 2 people dead and 10 injured.
    
    ''We are going to march and shout and sing and pray till this crack cocaine is
    completely eradicated from our community,'' said the Rev. Cecil Williams, who
    led the march through the Ocean View neighborhood.
    
    Although the march and rally had been planned for several weeks, a drive-by
    ambush and another shooting less than two miles away in the Bayview-Hunters
    Point district gave people another reason to participate, Mr. Williams said."""
named_entities = (kwe.extract_named_entities(doc))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [94]:
doc = """Students at Columbia University Law School are protesting what they fear may be
the closing of a popular clinic that offers legal help to victims of AIDS
discrimination and practical experience for credit to the students who represent
them.

The students are also rallying to support the clinic's instructor, who does not
have tenure and who the students fear may not be rehired by the university.

At the heart of the dispute is a larger debate between advocates of such
practical studies in the education of a lawyer and traditionalists, who believe
the clinics detract from the school's academic image.

On Thursday, about 200 students occupied part of the law school's main building,
alternately chanting and studying.

No Decision on Clinic

''One-two-three-four, do not shut the clinic door!'' the students chanted,
clutching heavy textbooks with titles like ''Gratuitous Transfers.'' They also
sang a student-written song called ''A Kinder, Gentler Law School.''"""
named_entities = (kwe.extract_named_entities(doc))

In [4]:
class SemanticSimilarity():
    def __init__(self):
        self.threshold_value = 0.6
        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        self.embedded_reference = []
        self.disambiguated_reference_list = []
        self.rels = []
        self.embedded_rels = []

    def generate_reference(self, reference_list):
        temp = self._populate_embedded_reference(reference_list)
        x, y = self._disambiguate_reference_list_entities(reference_list, temp)
        self.disambiguated_reference_list, self.embedded_reference = x, y       

    def _populate_embedded_reference(self, reference_list : str) -> List[List[float]] :
        embedded_reference = []
        for i, e in enumerate(reference_list):
            embedded_reference.append(self._get_embedding_token(e))
        return embedded_reference
        
    def _get_embedding_token(self, phrase):
        return self.model.encode([phrase])[0]
        
    def _cosine(self, u, v) -> float:
        return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

    def add_entity(self,e):
        if not self.get_most_similar_entity(e):
            self.disambiguated_reference_list.append(e)
            self.embedded_reference.append(self._get_embedding_token(e))

    def add_relation(self,r):
        if not self.get_most_similar_relation(r):
            self.rels.append(r)
            self.embedded_rels.append(self._get_embedding_token(r))

    def get_most_similar_entity(self, phrase):
        return self.get_most_similar_word(phrase, task="entity")

    def get_most_similar_relation(self, phrase):
        return self.get_most_similar_word(phrase, task="relation")

    def get_most_similar_word(self, phrase, task="entity") -> Union[None, str]:
        query = self._get_embedding_token(phrase)
        max_score = -999
        max_index = -1

        reference, embedded_reference = [],[]
        if task=="entity":
            reference = self.disambiguated_reference_list
            embedded_reference = self.embedded_reference
        elif task=="relation":
            reference = self.rels
            embedded_reference = self.embedded_rels

        
        for i, ref in enumerate(embedded_reference):
            score = self._cosine(query, ref)
            if score > max_score:
                max_score = score
                max_index = i
        if max_score > self.threshold_value:
            return reference[max_index]

        return None
    
    def _get_all_similarity_scores(self, embedded_reference : List):
        return np.dot(np.array(embedded_reference), np.array(embedded_reference).T)

    def _get_similar_entities(self, exclude_index : int, l : List[float]) -> List[Tuple[float,int]]:
        results = []
        for i,e in enumerate(l):
            if i != exclude_index and l[i] >= self.threshold_value:
                results.append((l[i],i))
        return results
        
    def _disambiguate_reference_list_entities(self, reference_list : List, embedded_reference_list : List) -> List[str]:
        reference_list = self.disambiguated_reference_list + reference_list
        embedded_reference_list = self.embedded_reference + embedded_reference_list
        similarity_scores = self._get_all_similarity_scores(embedded_reference_list)
        dissimilar,dissimilar_embedded, ignore = [],[],[]
        i = 0
        while i < len(similarity_scores):
            if i not in ignore:
                current = similarity_scores[i]
                max_score_indices = self._get_similar_entities(i, current)
                ignore.extend([i for v,i in max_score_indices])
                dissimilar_index = min([i] + [j for v,j in max_score_indices]) #Get the lowest index. This is because when we are parsing the second article, we want to keep any similar entities we found in the first article and remove entities any subsequent articles
                dissimilar.append(reference_list[dissimilar_index])
                dissimilar_embedded.append(embedded_reference_list[dissimilar_index])
                # dissimilar.append(max([reference_list[i]] + [reference_list[j] for v,j in max_score_indices])) #assuming larger text carries more information
            i += 1

        return dissimilar, dissimilar_embedded



In [95]:
named_entities

['students',
 'columbia university law school',
 'clinic',
 'victims',
 'students',
 'students',
 'clinic instructor',
 'students',
 'university',
 'advocates',
 'traditional ists',
 'school',
 '200',
 'students',
 'part',
 'law school main building',
 'clinic',
 'students',
 'student',
 'law']

In [91]:
s = SemanticSimilarity()
s.generate_reference(named_entities)
s.disambiguated_reference_list

['residents',
 'drug torn neighborhood',
 'streets',
 'people',
 'community',
 'rev cecil williams',
 'ocean view neighborhood',
 'bay view hunters point district']

In [93]:
s.generate_reference(named_entities)
s.disambiguated_reference_list

['residents',
 'drug torn neighborhood',
 'streets',
 'people',
 'community',
 'rev cecil williams',
 'ocean view neighborhood',
 'bay view hunters point district',
 'students',
 'columbia university law school',
 'clinic',
 'victims',
 'university',
 'advocates',
 'traditional ists',
 '200',
 'part',
 'law']

In [104]:
s.get_most_similar_relation("street")

In [5]:
class KB():
    def __init__(self):
        self.relations = []

    def are_relations_equal(self, r1, r2):
        return all(r1[attr] == r2[attr] for attr in ["head", "type", "tail"])

    def exists_relation(self, r1):
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)

    def add_relation(self, r):
        if not self.exists_relation(r):
            self.relations.append(r)

    def print(self):
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")

NameError: name 'named_entities' is not defined

In [40]:
def rebel_get_relations(text):
    model_inputs = tokenizer(text, max_length=512, padding=True, truncation=True,
                            return_tensors='pt')
    # Generate
    gen_kwargs = {
        "max_length": 216,obj = re.sub("\/","_", obj)
        "length_penalty": 0,
        "num_beams": 3,
        "num_return_sequences": 3
    }

    
    generated_tokens = model.generate(
        **model_inputs,
        **gen_kwargs,
    )
    decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)

    for sentence_pred in decoded_preds:
        relations = extract_relations_from_model_output(sentence_pred)
        print(relations)


In [307]:
# build a knowledge base from text
def from_small_text_to_kb(text, verbose=False):
    kb = KB()

    # Tokenizer text
    model_inputs = tokenizer(text, max_length=512, padding=True, truncation=True,
                            return_tensors='pt')
    print(model_inputs)

    if verbose:
        print(f"Num tokens: {len(model_inputs['input_ids'][0])}")

    # Generate
    gen_kwargs = {
        "max_length": 216,
        "length_penalty": 0,
        "num_beams": 3,
        "num_return_sequences": 3
    }

    
    generated_tokens = model.generate(
        **model_inputs,
        **gen_kwargs,
    )
    decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)

    # create kb
    for sentence_pred in decoded_preds:
        relations = extract_relations_from_model_output(sentence_pred)
        for r in relations:
            kb.add_relation(r)

    return kb

In [50]:
# test the `from_small_text_to_kb` function

    text = """Several hundred residents of a drug-torn neighborhood marched through the
    streets Sunday in an protest over crack cocaine spurred by weekend shootings
    that left 2 people dead and 10 injured.
    
    ''We are going to march and shout and sing and pray till this crack cocaine is
    completely eradicated from our community,'' said the Rev. Cecil Williams, who
    led the march through the Ocean View neighborhood.
    
    Although the march and rally had been planned for several weeks, a drive-by
    ambush and another shooting less than two miles away in the Bayview-Hunters
    Point district gave people another reason to participate, Mr. Williams said."""

kb = from_small_text_to_kb(text, verbose=True)
kb.print()

{'input_ids': tensor([[    0, 29182,  6317,  1196,     9,    10,  1262,    12, 25566,  3757,
         15199,   149,     5, 50118, 24769,  2580,   395,    11,    41,  2790,
            81,  7009,  9890, 18272,    30,   983,  9272, 50118,  6025,   314,
           132,    82,  1462,     8,   158,  1710,     4, 50118, 50118, 17809,
           170,    32,   164,     7,  6674,     8, 18066,     8,  7884,     8,
         10745,  6612,    42,  7009,  9890,    16, 50118, 28655, 25193,  5554,
            31,    84,   435, 10559,    26,     5,  7161,     4, 28703,  1604,
             6,    54, 50118,  1329,     5,  6674,   149,     5,  5860,  3756,
          3757,     4, 50118, 50118, 13863,     5,  6674,     8,  2669,    56,
            57,  1904,    13,   484,   688,     6,    10,  1305,    12,  1409,
         50118,  3146,  3810,     8,   277,  1094,   540,    87,    80,  1788,
           409,    11,     5,  1501,  5877,    12, 38831,  2696, 50118, 20416,
          1418,   851,    82,   277,  

In [53]:
import ollama


template = f"""Task:Generate triplets from the following text.
    Instructions'
    The triplet should be in the format (<subject>, <relation type>, <object>)
    The subject and object should refer to specific entities.
    The relation type should refer to an action
    
    The text is:
    Several hundred residents of a drug-torn neighborhood marched through the
    streets Sunday in an protest over crack cocaine spurred by weekend shootings
    that left 2 people dead and 10 injured.
    
    ''We are going to march and shout and sing and pray till this crack cocaine is
    completely eradicated from our community,'' said the Rev. Cecil Williams, who
    led the march through the Ocean View neighborhood.
    
    Although the march and rally had been planned for several weeks, a drive-by
    ambush and another shooting less than two miles away in the Bayview-Hunters
    Point district gave people another reason to participate, Mr. Williams said"""


response = ollama.generate(model='mistral', prompt=template)
print(response)

{'model': 'mistral', 'created_at': '2024-04-18T19:30:56.99330289Z', 'response': ' 1. (Several hundred residents, marched, through the streets)\n2. (Residents of a drug-torn neighborhood, protested, crack cocaine)\n3. (Rev. Cecil Williams, led, march and shout and sing and pray)\n4. (People, participated, in the march and rally)\n5. (Weekend shootings, left, 2 people dead and 10 injured)\n6. (Drive-by ambush and another shooting, gave, people another reason to participate)\n7. (Rev. Cecil Williams, said, eradicated from our community)', 'done': True, 'context': [733, 16289, 28793, 28705, 10290, 28747, 23342, 22212, 1074, 477, 272, 2296, 2245, 28723, 13, 2287, 3133, 8373, 28742, 13, 2287, 415, 2629, 7081, 1023, 347, 297, 272, 5032, 325, 28789, 16566, 6550, 523, 21380, 1212, 6550, 523, 2814, 12970, 13, 2287, 415, 3817, 304, 1928, 1023, 3295, 298, 2948, 19810, 28723, 13, 2287, 415, 9378, 1212, 1023, 3295, 298, 396, 2992, 13, 260, 13, 2287, 415, 2245, 349, 28747, 13, 2287, 15223, 4682, 1130

In [150]:
# def get_key_phrase(phrase):
#     response = ollama.generate("mistral", prompt=f"Given the sentence {phrase}, extract the key phrase. Answer with just the key phrase. Don't give any explanations. For example  for the sentence 'in weekend shootings' answer with just 'weekend shootings'")
#     return (response["response"])
# get_key_phrase("the weekend shootings")

In [34]:
response['response']

' (Several hundred residents of a drug-torn neighborhood, protest, crack cocaine)\n(Residents, march, through the streets)\n(Protesters, shout, crack cocaine eradication)\n(Rev. Cecil Williams, lead, march and rally)\n(People, participate, in the march and rally)\n(Weeks-long planning, interrupted, drive-by ambush)\n(Shootings, leave, 2 people dead and 10 injured)\n(Community members, pray, complete eradication of crack cocaine)\n(Rev. Cecil Williams, say, shootings gave people another reason to participate)'

In [13]:
s.get_most_similar_word("Several hundred residents")

'residents'

In [28]:

import instructor
from pydantic import BaseModel, Field
from openai import OpenAI
from typing import List

# Define your desired output structure
class Character(BaseModel):
    name: str
    age: int
    fact: List[str] = Field(..., description="A list of facts about the character")

class Triplet(BaseModel):
    subject: str = Field(description='The subject of the triplet. It is an entity')
    relation: str = Field(description='The relation of the triplet. It is an action')
    object: str = Field(description='The object of the triplet. It it an entity')
    
client = instructor.from_openai(
    OpenAI(
        base_url="http://localhost:11434/v1",
        api_key="ollama",  # required, but unused
    ),
    mode=instructor.Mode.JSON,
)



In [33]:

# resp = client.chat.completions.create(
#     model="mistral",
#     messages=[
#         {
#             "role": "user",
#             "content": "Tell me about the Harry Potter",
#         }
#     ],
#     response_model=Character,
# )


template = f"""Task:Generate triplets from the following text.
    Instructions'
    The triplet should be in the format (<subject>, <relation>, <object>)

    The text is:
    Several hundred residents of a drug-torn neighborhood marched through the
    streets Sunday in an protest over crack cocaine spurred by weekend shootings
    that left 2 people dead and 10 injured.
    
    ''We are going to march and shout and sing and pray till this crack cocaine is
    completely eradicated from our community,'' said the Rev. Cecil Williams, who
    led the march through the Ocean View neighborhood.
    
    Although the march and rally had been planned for several weeks, a drive-by
    ambush and another shooting less than two miles away in the Bayview-Hunters
    Point district gave people another reason to participate, Mr. Williams said"""

     
# template = f"""Task:Decompose the triplet -> {triplet}
#     ##Instructions
#     ##If the triplet can be decomposed,return the new triplet. Else, return an empty string ""
#     ##The triplet should be in the format (<subject>, <relation type>, <object>)
#     ##The subject and object should refer to specific entities.
#     ##The relation type should refer to an action
#     ##Return only the triplets seperated by new lines. Do not explain the reason for returning the results
    
#     ##Example Given the triplet "People - participate - in the march and rally", return the 2 triplets
#     "People - participate - march"
#     "People - participate - rally"

#     ##Example Given the triplet "Several hundred residents of a drug-torn neighborhood -  protest -  crack cocaine", return the triplets
#     "residents - live in - drug-torn neighbourhood"
#     "residents - protest -  crack cocaine"

#     ##Example Given the triplet "residents -  march - streets", return an empty string "" as this triplet cannot be further decomposed
#     """

resp = client.chat.completions.create(
    model="mistral",
    messages=[
        {
            "role": "user",
            "content": template,
        }
    ],
    response_model=Triplet,
)


# resp = client.completions.create(
#     model="mistral",
#     prompt=template,
#     response_model=Triplet,
# )

print(resp.model_dump_json(indent=2))

APITimeoutError: Request timed out.

In [6]:
class KG:
    def __init__(self):
        self.triplets = []

    def add_triplets(self, triplets : List):
        self.triplets.extend(triplets)

In [7]:
class TripletGenerator:

    def __init__(self, text : str, s : SemanticSimilarity, e : KeywordExtractor, doc_name : str):
        self.s = s
        self.e = e
        self.doc_name = doc_name
        self.text = text
        self.pattern = r"\((.*?)\)"
        self.model_name = "mistral"
        self.template = f"""Task:Generate triplets from the following text.
            Instructions'
            The triplet should be in the format (<subject>, <relation type>, <object>)
            The subject and object should refer to specific entities.
            The relation type should refer to an action
            
            The text is:
            {self.text}"""
        
        self.triplets = self._extract_triplets()

    def _get_entities_in_phrase(self, phrase) -> List:
        results = self.e.extract_named_entities(phrase)
        return results if results else [phrase]

        
    def _link_entities(self,subj,rel,obj):
        available_subj = self.s.get_most_similar_word(subj)
        if available_subj:
            subj = available_subj

        available_rel = self.s.get_most_similar_relation(rel)
        if available_rel:
            rel = available_rel
        else:
            self.s.add_relation(rel)
            
        available_obj = self.s.get_most_similar_word(obj)
        if available_obj:
            obj = available_obj

        return subj, rel, obj

    def refine(self, phrase):
        phrase = phrase.strip()
        phrase = phrase.lower()
        phrase = re.sub(r'[,\.!?]', '', phrase)
        phrase = re.sub(r'[^a-z0-9\s]', ' ', phrase)
        phrase = re.sub(r"^\s*(?:the|a)\s+", "", phrase, flags=re.IGNORECASE)
        phrase = re.sub(r'\s+', '_', phrase)
        return phrase
        
    def _refine_triplets(self, subj, rel, obj): #for example for a triplet Bill and John - played - Baseball, it will be split into two triplets with subjects being Bill and John 
        subj_nouns = self._get_entities_in_phrase(subj)
        obj_nouns = self._get_entities_in_phrase(obj)
        results = []
        for s in subj_nouns:
            for o in obj_nouns:
                # s = re.sub(r'\s', '_', s.strip())
                # rel = re.sub(r'\s', '_', rel.strip())
                # o = re.sub(r'\s', '_', o.strip())
                s,rel,o = self.refine(s),self.refine(rel),self.refine(o)
                results.append((s,rel,o))
        return results

    def _add_doc_name_to_triplets(self, triplets):
        
        return [(w,x,y,z) for (w,x,y),z in zip(triplets,([self.doc_name] * len(triplets)))]

    def _add_doc_node(self, triplets):
        doc_triplets = []
        # doc_name = re.sub(r'[\/]', '_', self.doc_name)
        # doc_name = self.refine(doc_name)
        for t in triplets:
            # doc_triplets.append((t[0],"filepath",doc_name))
            doc_triplets.append((t[0],"filepath",self.doc_name))
        triplets.extend(doc_triplets)
        return triplets
        
    def _extract_triplets(self):
        triplet_results = []
        
        response = ollama.generate(model=self.model_name, prompt=self.template)
        
        for line in response['response'].split("\n"):
            matches = re.findall(self.pattern, line)
            if not matches:
                continue
            triplet = matches[0].split(",")
            if len(triplet) < 3:
                continue
            subj,rel,obj = triplet[0]," ".join(triplet[1:-1]), triplet[-1]
            subj,rel,obj = self._link_entities(subj, rel, obj)
            refined_triplets = self._refine_triplets(subj,rel,obj)
            # refined_triplets = self._add_doc_name_to_triplets(refined_triplets)
            refined_triplets = self._add_doc_node(refined_triplets)
            triplet_results.extend(refined_triplets)
            # triplet_results.append((subj.strip(),rel.strip(),obj.strip()))
        
        return triplet_results
    

        
    def get_triplets(self):
        return self.triplets
        
        
        
        

In [8]:
folder_path = "/home/niamatzawad/niamatzawad/Datasets/UTDBox/NYT Files/1-1000"
s = SemanticSimilarity()
kwe = KeywordExtractor()
kg = KG()

for filename in os.listdir(folder_path):
    
    if os.path.isfile(os.path.join(folder_path, filename)):
        with open(os.path.join(folder_path, filename), 'r') as file:
            text = file.read()[:2100]
            named_entities = kwe.extract_named_entities(text)
            s.generate_reference(named_entities)
            t = TripletGenerator(text, s, kwe, os.path.join(folder_path, filename))
            triplets = t.get_triplets()
            kg.add_triplets(triplets)
            print(triplets)
            print("---")
kg.triplets

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[('rose_dale_parents', 'resist', 'schools_chancellor_frank_j_mac_chi_aro_la'), ('rose_dale_parents', 'filepath', '/home/niamatzawad/niamatzawad/Datasets/UTDBox/NYT Files/1-1000/NYT-8102099-8103008.TXT'), ('charles', 'lead', 'rose_dale_parents'), ('charles', 'filepath', '/home/niamatzawad/niamatzawad/Datasets/UTDBox/NYT Files/1-1000/NYT-8102099-8103008.TXT'), ('sandra_pet_ker_school', 'encourage', 'rose_dale_parents'), ('sandra_pet_ker_school', 'encourage', 'new_school'), ('sandra_pet_ker_school', 'filepath', '/home/niamatzawad/niamatzawad/Datasets/UTDBox/NYT Files/1-1000/NYT-8102099-8103008.TXT'), ('sandra_pet_ker_school', 'filepath', '/home/niamatzawad/niamatzawad/Datasets/UTDBox/NYT Files/1-1000/NYT-8102099-8103008.TXT'), ('rose_dale_parents', 'vow', 'closed_annex_building'), ('rose_dale_parents', 'filepath', '/home/niamatzawad/niamatzawad/Datasets/UTDBox/NYT Files/1-1000/NYT-8102099-8103008.TXT'), ('schools_chancellor_frank_j', 'order', 'students'), ('schools_chancellor_frank_j', 'o

[('rose_dale_parents', 'resist', 'schools_chancellor_frank_j_mac_chi_aro_la'),
 ('rose_dale_parents',
  'filepath',
  '/home/niamatzawad/niamatzawad/Datasets/UTDBox/NYT Files/1-1000/NYT-8102099-8103008.TXT'),
 ('charles', 'lead', 'rose_dale_parents'),
 ('charles',
  'filepath',
  '/home/niamatzawad/niamatzawad/Datasets/UTDBox/NYT Files/1-1000/NYT-8102099-8103008.TXT'),
 ('sandra_pet_ker_school', 'encourage', 'rose_dale_parents'),
 ('sandra_pet_ker_school', 'encourage', 'new_school'),
 ('sandra_pet_ker_school',
  'filepath',
  '/home/niamatzawad/niamatzawad/Datasets/UTDBox/NYT Files/1-1000/NYT-8102099-8103008.TXT'),
 ('sandra_pet_ker_school',
  'filepath',
  '/home/niamatzawad/niamatzawad/Datasets/UTDBox/NYT Files/1-1000/NYT-8102099-8103008.TXT'),
 ('rose_dale_parents', 'vow', 'closed_annex_building'),
 ('rose_dale_parents',
  'filepath',
  '/home/niamatzawad/niamatzawad/Datasets/UTDBox/NYT Files/1-1000/NYT-8102099-8103008.TXT'),
 ('schools_chancellor_frank_j', 'order', 'students'),
 ('

In [13]:
def retrieve_text_from_filepath(filepath):
    with open(filepath,'r') as f:
        t = f.read()
        return t


In [9]:
a = "`"+"hi"+"`"
a

'`hi`'

In [17]:
from py2neo import Graph, Node, Relationship



# Connect to the Neo4j database
graph = Graph("neo4j://localhost:7689", auth=("neo4j", "password"))
graph.delete_all()

triplets = kg.triplets

# Create the nodes and relationships
for subject, predicate, obj in triplets:
    # Create the subject node
    try:
        n = graph.run(f"MATCH (n:{subject}) RETURN n;").data()
    except Exception as e:
        print(f"Problem with subject - {subject}")
        continue

    if n:
        subject_node = n[0]["n"]
    else:
        subject_node = Node(subject, id=subject)
        graph.create(subject_node)

    if predicate == "filepath":
        obj = re.sub("NYT Files", "NYT\sFiles",obj)
        obj = "`"+obj+"`"
        n = graph.run(f"MATCH (n:{obj}) RETURN n;").data()
        if n:
            object_node = n[0]["n"]
        else:
            txt = retrieve_text_from_filepath(obj) 
            object_node = Node(obj, text=txt)
            graph.create(object_node)
    else:
        try:
            n = graph.run(f"MATCH (n:{obj}) RETURN n;").data()
        except Exception as e:
            print(f"Problem with object - {obj}")
            continue
            
        if n:
            object_node = n[0]["n"]
        else:
            object_node = Node(obj, id=obj)
            graph.create(object_node)

    relationship = Relationship(subject_node, predicate, object_node)
    graph.merge(relationship)

print("Triplets added to the Neo4j database.")

  obj = re.sub("NYT Files", "NYT\sFiles",obj)
  obj = re.sub("NYT Files", "NYT\sFiles",obj)


error: bad escape \s at position 3