In [None]:
# NOTE: throughout this notebook, every comment that I added is prefixed
# with the marker #Andrew#. Comments without that marker are not mine.

# Install the relevant libraries

In [None]:
!pip install transformers wikipedia newspaper3k GoogleNews pyvis

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl (211 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.1/211.1 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting GoogleNews
  Downloading GoogleNews-1.6.8-py3-none-any.whl (8.1 kB)
Collecting pyvis
  Downloading pyvis-0.3.2-py3-none-any.whl (756 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m756.0/756.0 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import math
import torch
import wikipedia
from newspaper import Article, ArticleException
from GoogleNews import GoogleNews
import IPython
from pyvis.network import Network

# Load the REBEL model

In [None]:
# Checkpoint identifier shared by the tokenizer and the seq2seq model, kept in
# one place so the two can never point at different checkpoints.
REBEL_CHECKPOINT = "Babelscape/rebel-large"

# Both objects are module-level globals used by the extraction functions below.
tokenizer = AutoTokenizer.from_pretrained(REBEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(REBEL_CHECKPOINT)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/344 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

# From short text to KB

In [None]:
#Andrew# Function to extract relations from a generated text
#Andrew# It identifies triplets using special tokens and extracts the subject, relation, and object

def extract_relations_from_model_output(text):
    """Parse linearized triplets from decoded model output.

    The text is read as a stream of whitespace-separated tokens driven by
    three markers:

    * ``<triplet>`` starts a new head entity,
    * ``<subj>`` starts a tail entity for the current head,
    * ``<obj>`` starts the relation type linking the current head and tail.

    so one head may be followed by several ``<subj> tail <obj> type`` groups.
    The wrapper tokens ``<s>``, ``</s>`` and ``<pad>`` are discarded.

    Only complete triplets are returned: a group is emitted when the next
    ``<triplet>``/``<subj>`` marker (or the end of the text) is reached and
    head, type and tail are all non-empty. Partial groups, e.g. from an
    output that was cut off mid-triplet, are dropped instead of being paired
    with values left over from an earlier group.

    Args:
        text (str): Decoded model output, including the marker tokens.

    Returns:
        list[dict]: One ``{'head': ..., 'type': ..., 'tail': ...}`` dict per
        complete triplet, in order of appearance.
    """
    relations = []
    subject, relation, object_ = '', '', ''
    # Field that plain tokens are currently appended to:
    # 't' -> head, 's' -> tail, 'o' -> relation type, None -> ignore.
    current = None

    def store_if_complete():
        # Emit the triplet being assembled, but only if no part is missing.
        head, rel_type, tail = subject.strip(), relation.strip(), object_.strip()
        if head and rel_type and tail:
            relations.append({'head': head, 'type': rel_type, 'tail': tail})

    cleaned = text.strip()
    for wrapper in ("<s>", "<pad>", "</s>"):
        cleaned = cleaned.replace(wrapper, "")
    # Pad the markers so they are recognised even when glued to a neighbouring
    # word (e.g. "<triplet>Napoleon").
    for marker in ("<triplet>", "<subj>", "<obj>"):
        cleaned = cleaned.replace(marker, f" {marker} ")

    for token in cleaned.split():
        if token == "<triplet>":
            # A new head closes whatever triplet was in progress.
            store_if_complete()
            subject, relation, object_ = '', '', ''
            current = 't'
        elif token == "<subj>":
            # A new tail for the same head: flush the previous group and clear
            # the relation so it cannot leak into an unfinished next group.
            store_if_complete()
            relation, object_ = '', ''
            current = 's'
        elif token == "<obj>":
            relation = ''
            current = 'o'
        elif current == 't':
            subject += ' ' + token
        elif current == 's':
            object_ += ' ' + token
        elif current == 'o':
            relation += ' ' + token

    # Flush the last group, if it is complete.
    store_if_complete()
    return relations

In [None]:
#Andrew# Class to represent the Knowledge Base (KB)

class KB():
    """Minimal knowledge base: an ordered list of unique relations.

    A relation is a dict with at least the keys ``head``, ``type`` and
    ``tail``; two relations are the same when those three values match.
    """

    def __init__(self):
        # Relations in the order they were first added.
        self.relations = []

    def are_relations_equal(self, r1, r2):
        """Return True when r1 and r2 agree on head, type and tail."""
        for key in ("head", "type", "tail"):
            if not r1[key] == r2[key]:
                return False
        return True

    def exists_relation(self, r1):
        """Return True when a relation equal to r1 is already stored."""
        for stored in self.relations:
            if self.are_relations_equal(r1, stored):
                return True
        return False

    def add_relation(self, r):
        """Store r unless an equal relation is already present."""
        if self.exists_relation(r):
            return
        self.relations.append(r)

    def print(self):
        """Print every stored relation, one per line, under a header."""
        print("Relations:")
        for relation in self.relations:
            print(f"  {relation}")

In [None]:
#Andrew# Function to create a Knowledge Base from a short text input
def from_small_text_to_kb(text, verbose=False):
    """Build a KB from a text short enough to be processed in one pass.

    Relies on the module-level ``tokenizer`` and ``model`` and on the ``KB``
    class that is current when the function is called.

    Args:
        text (str): Input text. It is tokenized with truncation at 512 tokens.
        verbose (bool, optional): If True, print the number of input tokens.

    Returns:
        KB: Knowledge base holding the relations parsed from every returned
        generation.
    """
    encoded = tokenizer(
        text,
        max_length=512,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
    if verbose:
        token_count = len(encoded['input_ids'][0])
        print(f"Num tokens: {token_count}")

    # Beam search with 3 beams, returning all 3 sequences; the relations of
    # every returned sequence are added to the KB.
    generated = model.generate(
        **encoded,
        max_length=216,
        length_penalty=0,
        num_beams=3,
        num_return_sequences=3,
    )

    # Special tokens must be kept: the relation parser relies on the
    # <triplet>/<subj>/<obj> markers.
    predictions = tokenizer.batch_decode(generated, skip_special_tokens=False)

    kb = KB()
    for prediction in predictions:
        for extracted in extract_relations_from_model_output(prediction):
            kb.add_relation(extracted)
    return kb

In [None]:
#Andrew# Short sample passage about Napoleon Bonaparte, used as input for the single-pass extraction below
text = "Napoleon Bonaparte (born Napoleone di Buonaparte; 15 August 1769 – 5 " \
"May 1821), and later known by his regnal name Napoleon I, was a French military " \
"and political leader who rose to prominence during the French Revolution and led " \
"several successful campaigns during the Revolutionary Wars. He was the de facto " \
"leader of the French Republic as First Consul from 1799 to 1804. As Napoleon I, " \
"he was Emperor of the French from 1804 until 1814 and again in 1815. Napoleon's " \
"political and cultural legacy has endured, and he has been one of the most " \
"celebrated and controversial leaders in world history."

#Andrew# Build a Knowledge Base (KB) from the passage with from_small_text_to_kb
#Andrew# With verbose=True the function also prints the number of input tokens

kb = from_small_text_to_kb(text, verbose=True)

#Andrew# Print the Knowledge Base: a "Relations:" header followed by one
#Andrew# {'head': ..., 'type': ..., 'tail': ...} dict per extracted relation

kb.print()
#Andrew# Expected output is below:
# Num tokens: 133
# Relations:
#   {'head': 'Napoleon Bonaparte', 'type': 'date of birth', 'tail': '15 August 1769'}
#   {'head': 'Napoleon Bonaparte', 'type': 'date of death', 'tail': '5 May 1821'}
#   {'head': 'Napoleon Bonaparte', 'type': 'participant in', 'tail': 'French Revolution'}
#   {'head': 'Napoleon Bonaparte', 'type': 'conflict', 'tail': 'Revolutionary Wars'}
#   {'head': 'Revolutionary Wars', 'type': 'part of', 'tail': 'French Revolution'}
#   {'head': 'French Revolution', 'type': 'participant', 'tail': 'Napoleon Bonaparte'}
#   {'head': 'Revolutionary Wars', 'type': 'participant', 'tail': 'Napoleon Bonaparte'}

Num tokens: 133
Relations:
  {'head': 'Napoleon Bonaparte', 'type': 'date of birth', 'tail': '15 August 1769'}
  {'head': 'Napoleon Bonaparte', 'type': 'date of death', 'tail': '5 May 1821'}
  {'head': 'Napoleon Bonaparte', 'type': 'participant in', 'tail': 'French Revolution'}
  {'head': 'Napoleon Bonaparte', 'type': 'conflict', 'tail': 'Revolutionary Wars'}
  {'head': 'Revolutionary Wars', 'type': 'part of', 'tail': 'French Revolution'}
  {'head': 'French Revolution', 'type': 'participant', 'tail': 'Napoleon Bonaparte'}
  {'head': 'Revolutionary Wars', 'type': 'participant', 'tail': 'Napoleon Bonaparte'}


# Split spans: from long text to KB

In [None]:
class KB():
    """Knowledge base of unique relations that tracks the spans they came from.

    Each relation is a dict with ``head``, ``type``, ``tail`` and a
    ``meta`` entry of the form ``{"spans": [...]}``. Adding a relation that
    is already known extends the span list of the stored relation instead of
    creating a duplicate.
    """

    def __init__(self):
        # Relations in the order they were first added.
        self.relations = []

    def are_relations_equal(self, r1, r2):
        """Return True when r1 and r2 agree on head, type and tail."""
        for key in ("head", "type", "tail"):
            if not r1[key] == r2[key]:
                return False
        return True

    def exists_relation(self, r1):
        """Return True when a relation equal to r1 is already stored."""
        for stored in self.relations:
            if self.are_relations_equal(r1, stored):
                return True
        return False

    def merge_relations(self, r1):
        """Append the spans of r1 that the stored equal relation lacks.

        Raises IndexError when no stored relation is equal to r1.
        """
        matching = [stored for stored in self.relations
                    if self.are_relations_equal(r1, stored)]
        target = matching[0]
        # Collect first, extend afterwards, so membership is checked against
        # the spans the stored relation had before this merge.
        missing_spans = []
        for span in r1["meta"]["spans"]:
            if span not in target["meta"]["spans"]:
                missing_spans.append(span)
        target["meta"]["spans"] += missing_spans

    def add_relation(self, r):
        """Store r, or merge its spans into the equal relation already stored."""
        if self.exists_relation(r):
            self.merge_relations(r)
        else:
            self.relations.append(r)

    def print(self):
        """Print every stored relation, one per line, under a header."""
        print("Relations:")
        for relation in self.relations:
            print(f"  {relation}")

In [None]:
def from_text_to_kb(text, span_length=128, verbose=False):
    """Build a KB from a text of arbitrary length by processing token spans.

    The text is tokenized once and cut into equally long, overlapping spans
    of ``span_length`` tokens. All spans are sent to the model as one batch,
    and every extracted relation is tagged with the boundaries of the span it
    was generated from.

    Relies on the module-level ``tokenizer`` and ``model`` and on the ``KB``
    class that is current when the function is called.

    Args:
        text (str): Input text to be processed.
        span_length (int, optional): Number of tokens per span.
        verbose (bool, optional): If True, print the token count, the number
            of spans and the span boundaries.

    Returns:
        KB: Knowledge base whose relations carry
        ``meta = {"spans": [[start, end], ...]}``.
    """
    # Tokenize the whole input text once; spans are slices of this sequence.
    inputs = tokenizer([text], return_tensors="pt")

    num_tokens = len(inputs["input_ids"][0])
    if verbose:
        print(f"Input has {num_tokens} tokens")
    num_spans = math.ceil(num_tokens / span_length)
    if verbose:
        print(f"Input has {num_spans} spans")

    # Spread the span starts evenly between 0 and (num_tokens - span_length).
    # Integer arithmetic keeps the last span ending exactly at num_tokens, so
    # no trailing tokens are skipped, and consecutive starts are at most
    # span_length apart, so there are no gaps. All spans have the same length,
    # which torch.stack below requires.
    if num_spans > 1:
        last_start = num_tokens - span_length
        starts = [(i * last_start) // (num_spans - 1) for i in range(num_spans)]
    else:
        # A single span starts at 0; slicing clamps its end to num_tokens.
        starts = [0] * num_spans
    spans_boundaries = [[start, start + span_length] for start in starts]
    if verbose:
        print(f"Span boundaries are {spans_boundaries}")

    # Turn the single token sequence into a batch with one row per span.
    tensor_ids = [inputs["input_ids"][0][begin:end]
                  for begin, end in spans_boundaries]
    tensor_masks = [inputs["attention_mask"][0][begin:end]
                    for begin, end in spans_boundaries]
    inputs = {
        "input_ids": torch.stack(tensor_ids),
        "attention_mask": torch.stack(tensor_masks)
    }

    # Generate num_return_sequences candidate outputs for every span.
    num_return_sequences = 3
    gen_kwargs = {
        "max_length": 256,
        "length_penalty": 0,
        "num_beams": 3,
        "num_return_sequences": num_return_sequences
    }
    generated_tokens = model.generate(
        **inputs,
        **gen_kwargs,
    )

    # Special tokens must be kept: the relation parser relies on the markers.
    decoded_preds = tokenizer.batch_decode(generated_tokens,
                                           skip_special_tokens=False)

    kb = KB()
    for pred_index, sentence_pred in enumerate(decoded_preds):
        # Predictions are mapped back to spans in consecutive groups of
        # num_return_sequences.
        current_span_index = pred_index // num_return_sequences
        for relation in extract_relations_from_model_output(sentence_pred):
            relation["meta"] = {
                "spans": [spans_boundaries[current_span_index]]
            }
            kb.add_relation(relation)

    return kb

In [None]:
#Andrew# Given input text containing information about Napoleon Bonaparte
text = """
Napoleon Bonaparte (born Napoleone di Buonaparte; 15 August 1769 – 5 May 1821), and later known by his regnal name Napoleon I, was a French military and political leader who rose to prominence during the French Revolution and led several successful campaigns during the Revolutionary Wars. He was the de facto leader of the French Republic as First Consul from 1799 to 1804. As Napoleon I, he was Emperor of the French from 1804 until 1814 and again in 1815. Napoleon's political and cultural legacy has endured, and he has been one of the most celebrated and controversial leaders in world history. Napoleon was born on the island of Corsica not long after its annexation by the Kingdom of France.[5] He supported the French Revolution in 1789 while serving in the French army, and tried to spread its ideals to his native Corsica. He rose rapidly in the Army after he saved the governing French Directory by firing on royalist insurgents. In 1796, he began a military campaign against the Austrians and their Italian allies, scoring decisive victories and becoming a national hero. Two years later, he led a military expedition to Egypt that served as a springboard to political power. He engineered a coup in November 1799 and became First Consul of the Republic. Differences with the British meant that the French faced the War of the Third Coalition by 1805. Napoleon shattered this coalition with victories in the Ulm Campaign, and at the Battle of Austerlitz, which led to the dissolving of the Holy Roman Empire. In 1806, the Fourth Coalition took up arms against him because Prussia became worried about growing French influence on the continent. Napoleon knocked out Prussia at the battles of Jena and Auerstedt, marched the Grande Armée into Eastern Europe, annihilating the Russians in June 1807 at Friedland, and forcing the defeated nations of the Fourth Coalition to accept the Treaties of Tilsit. 
Two years later, the Austrians challenged the French again during the War of the Fifth Coalition, but Napoleon solidified his grip over Europe after triumphing at the Battle of Wagram. Hoping to extend the Continental System, his embargo against Britain, Napoleon invaded the Iberian Peninsula and declared his brother Joseph King of Spain in 1808. The Spanish and the Portuguese revolted in the Peninsular War, culminating in defeat for Napoleon's marshals. Napoleon launched an invasion of Russia in the summer of 1812. The resulting campaign witnessed the catastrophic retreat of Napoleon's Grande Armée. In 1813, Prussia and Austria joined Russian forces in a Sixth Coalition against France. A chaotic military campaign resulted in a large coalition army defeating Napoleon at the Battle of Leipzig in October 1813. The coalition invaded France and captured Paris, forcing Napoleon to abdicate in April 1814. He was exiled to the island of Elba, between Corsica and Italy. In France, the Bourbons were restored to power. However, Napoleon escaped Elba in February 1815 and took control of France.[6][7] The Allies responded by forming a Seventh Coalition, which defeated Napoleon at the Battle of Waterloo in June 1815. The British exiled him to the remote island of Saint Helena in the Atlantic, where he died in 1821 at the age of 51. Napoleon had an extensive impact on the modern world, bringing liberal reforms to the many countries he conquered, especially the Low Countries, Switzerland, and parts of modern Italy and Germany. He implemented liberal policies in France and Western Europe.
"""

#Andrew# Build a Knowledge Base (KB) from the long text with the span-based from_text_to_kb
#Andrew# span_length is left at its default of 128 tokens; with verbose=True the function prints
#Andrew# the token count, the number of spans and the span boundaries

kb = from_text_to_kb(text, verbose=True)

#Andrew# Print the Knowledge Base
#Andrew# Each printed relation includes 'meta': {'spans': [...]}, the boundaries of the span(s)
#Andrew# it was extracted from

kb.print()

#Andrew# Expected output:

# Input has 726 tokens
# Input has 6 spans
# Span boundaries are [[0, 128], [119, 247], [238, 366], [357, 485], [476, 604], [595, 723]]
# Relations:
#   {'head': 'Napoleon Bonaparte', 'type': 'date of birth',
#    'tail': '15 August 1769', 'meta': {'spans': [[0, 128]]}}
#   ...
#   {'head': 'Napoleon', 'type': 'place of birth',
#    'tail': 'Corsica', 'meta': {'spans': [[119, 247]]}}
#   ...
#   {'head': 'Fourth Coalition', 'type': 'start time',
#    'tail': '1806', 'meta': {'spans': [[238, 366]]}}
#   ...

Input has 726 tokens
Input has 6 spans
Span boundaries are [[0, 128], [119, 247], [238, 366], [357, 485], [476, 604], [595, 723]]
Relations:
  {'head': 'Napoleon Bonaparte', 'type': 'date of birth', 'tail': '15 August 1769', 'meta': {'spans': [[0, 128]]}}
  {'head': 'Napoleon Bonaparte', 'type': 'date of death', 'tail': '5 May 1821', 'meta': {'spans': [[0, 128]]}}
  {'head': 'Napoleon Bonaparte', 'type': 'participant in', 'tail': 'French Revolution', 'meta': {'spans': [[0, 128]]}}
  {'head': 'Napoleon Bonaparte', 'type': 'conflict', 'tail': 'Revolutionary Wars', 'meta': {'spans': [[0, 128]]}}
  {'head': 'Revolutionary Wars', 'type': 'part of', 'tail': 'French Revolution', 'meta': {'spans': [[0, 128]]}}
  {'head': 'French Revolution', 'type': 'participant', 'tail': 'Napoleon Bonaparte', 'meta': {'spans': [[0, 128]]}}
  {'head': 'Revolutionary Wars', 'type': 'participant', 'tail': 'Napoleon Bonaparte', 'meta': {'spans': [[0, 128]]}}
  {'head': 'French Revolution', 'type': 'country', 'tai

# Filter and normalize entities with Wikipedia

- remove all entities that don't have a page on Wikipedia
- merge entities if they have the same Wikipedia page

In [None]:
class KB():
    """Knowledge base whose entities are filtered and normalized via Wikipedia.

    A relation is kept only when both its head and its tail resolve to a
    Wikipedia page. Head and tail are then renamed to the page titles, so
    different surface forms that resolve to the same page collapse into one
    entity.

    Attributes are documented where they are initialised in ``__init__``.
    """

    def __init__(self):
        # { page_title: {"url": ..., "summary": ...} }
        self.entities = {}
        # [ {"head": page_title, "type": ..., "tail": page_title,
        #    "meta": {"spans": [...]}} ]
        self.relations = []

    def are_relations_equal(self, r1, r2):
        """Return True when r1 and r2 agree on head, type and tail."""
        return all(r1[attr] == r2[attr] for attr in ["head", "type", "tail"])

    def exists_relation(self, r1):
        """Return True when a relation equal to r1 is already stored."""
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)

    def merge_relations(self, r1):
        """Append the spans of r1 that the stored equal relation lacks.

        Raises IndexError when no stored relation is equal to r1.
        """
        r2 = [r for r in self.relations
              if self.are_relations_equal(r1, r)][0]
        spans_to_add = [span for span in r1["meta"]["spans"]
                        if span not in r2["meta"]["spans"]]
        r2["meta"]["spans"] += spans_to_add

    def get_wikipedia_data(self, candidate_entity):
        """Look up candidate_entity on Wikipedia.

        Returns:
            dict | None: ``{"title", "url", "summary"}`` of the page, or None
            when the lookup raises any ordinary exception (for instance when
            no usable page is found).
        """
        try:
            page = wikipedia.page(candidate_entity, auto_suggest=False)
            entity_data = {
                "title": page.title,
                "url": page.url,
                "summary": page.summary
            }
            return entity_data
        except Exception:
            # Best effort: a failed lookup means "not an entity". Catching
            # Exception rather than everything lets KeyboardInterrupt and
            # SystemExit through, so a long run can still be stopped.
            return None

    def add_entity(self, e):
        """Store entity data under its title, replacing any previous entry."""
        self.entities[e["title"]] = {k:v for k,v in e.items() if k != "title"}

    def add_relation(self, r):
        """Add r if both of its ends resolve to Wikipedia pages.

        r is modified in place: ``head`` and ``tail`` are replaced by the
        titles of the resolved pages. If an equal relation is already stored,
        only the spans are merged.
        """
        # Resolve the head first; a failed head lookup makes the tail lookup
        # pointless.
        head_data = self.get_wikipedia_data(r["head"])
        if head_data is None:
            return
        tail_data = self.get_wikipedia_data(r["tail"])
        if tail_data is None:
            return

        # manage new entities
        for e in (head_data, tail_data):
            self.add_entity(e)

        # rename relation entities with their wikipedia titles
        r["head"] = head_data["title"]
        r["tail"] = tail_data["title"]

        # manage new relation
        if not self.exists_relation(r):
            self.relations.append(r)
        else:
            self.merge_relations(r)

    def print(self):
        """Print all entities, then all relations, one per line."""
        print("Entities:")
        for e in self.entities.items():
            print(f"  {e}")
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")

In [None]:
text = """
Napoleon Bonaparte (born Napoleone di Buonaparte; 15 August 1769 – 5 May 1821), and later known by his regnal name Napoleon I, was a French military and political leader who rose to prominence during the French Revolution and led several successful campaigns during the Revolutionary Wars. He was the de facto leader of the French Republic as First Consul from 1799 to 1804. As Napoleon I, he was Emperor of the French from 1804 until 1814 and again in 1815. Napoleon's political and cultural legacy has endured, and he has been one of the most celebrated and controversial leaders in world history. Napoleon was born on the island of Corsica not long after its annexation by the Kingdom of France.[5] He supported the French Revolution in 1789 while serving in the French army, and tried to spread its ideals to his native Corsica. He rose rapidly in the Army after he saved the governing French Directory by firing on royalist insurgents. In 1796, he began a military campaign against the Austrians and their Italian allies, scoring decisive victories and becoming a national hero. Two years later, he led a military expedition to Egypt that served as a springboard to political power. He engineered a coup in November 1799 and became First Consul of the Republic. Differences with the British meant that the French faced the War of the Third Coalition by 1805. Napoleon shattered this coalition with victories in the Ulm Campaign, and at the Battle of Austerlitz, which led to the dissolving of the Holy Roman Empire. In 1806, the Fourth Coalition took up arms against him because Prussia became worried about growing French influence on the continent. Napoleon knocked out Prussia at the battles of Jena and Auerstedt, marched the Grande Armée into Eastern Europe, annihilating the Russians in June 1807 at Friedland, and forcing the defeated nations of the Fourth Coalition to accept the Treaties of Tilsit. 
Two years later, the Austrians challenged the French again during the War of the Fifth Coalition, but Napoleon solidified his grip over Europe after triumphing at the Battle of Wagram. Hoping to extend the Continental System, his embargo against Britain, Napoleon invaded the Iberian Peninsula and declared his brother Joseph King of Spain in 1808. The Spanish and the Portuguese revolted in the Peninsular War, culminating in defeat for Napoleon's marshals. Napoleon launched an invasion of Russia in the summer of 1812. The resulting campaign witnessed the catastrophic retreat of Napoleon's Grande Armée. In 1813, Prussia and Austria joined Russian forces in a Sixth Coalition against France. A chaotic military campaign resulted in a large coalition army defeating Napoleon at the Battle of Leipzig in October 1813. The coalition invaded France and captured Paris, forcing Napoleon to abdicate in April 1814. He was exiled to the island of Elba, between Corsica and Italy. In France, the Bourbons were restored to power. However, Napoleon escaped Elba in February 1815 and took control of France.[6][7] The Allies responded by forming a Seventh Coalition, which defeated Napoleon at the Battle of Waterloo in June 1815. The British exiled him to the remote island of Saint Helena in the Atlantic, where he died in 1821 at the age of 51. Napoleon had an extensive impact on the modern world, bringing liberal reforms to the many countries he conquered, especially the Low Countries, Switzerland, and parts of modern Italy and Germany. He implemented liberal policies in France and Western Europe.
"""

# from_text_to_kb looks up `KB` when it is called, so this run uses the most
# recently defined KB class. With the Wikipedia-backed KB defined above,
# adding a relation triggers Wikipedia lookups for its head and tail.
kb = from_text_to_kb(text)
# Prints the "Entities:" section first, then the "Relations:" section.
kb.print()
# Entities:
#  ('Napoleon', {'url': 'https://en.wikipedia.org/wiki/Napoleon',
#   'summary': "Napoleon Bonaparte (born Napoleone di Buonaparte; 15 August ..."})
#  ('French Revolution', {'url': 'https://en.wikipedia.org/wiki/French_Revolution',
#   'summary': 'The French Revolution (French: Révolution française..."})
#  ...
# Relations:
#  {'head': 'Napoleon', 'type': 'participant in', 'tail': 'French Revolution',
#   'meta': {'spans': [[0, 128], [119, 247]]}}
#  {'head': 'French Revolution', 'type': 'participant', 'tail': 'Napoleon',
#   'meta': {'spans': [[0, 128]]}}
#  ...



  lis = BeautifulSoup(html).find_all('li')


Entities:
  ('Napoleon', {'url': 'https://en.wikipedia.org/wiki/Napoleon', 'summary': "Napoleon Bonaparte (born Napoleone Buonaparte; 15 August 1769 – 5 May 1821), later known by his regnal name Napoleon I, was a French military commander and political leader who rose to prominence during the French Revolution and led successful campaigns during the Revolutionary Wars. He was the leader of the French Republic as First Consul from 1799 to 1804, then of the French Empire as Emperor of the French from 1804 until 1814 and again in 1815. Napoleon's political and cultural legacy endures as a celebrated and controversial leader. He initiated many liberal reforms that have persisted, and is considered one of the greatest ever military commanders. His campaigns are still studied at military academies worldwide. Between three and six million civilians and soldiers died in the Napoleonic Wars.Napoleon was born on the island of Corsica to a native family descending from Italian nobility. He suppor

# Extract KB from web article

In [None]:
def from_text_to_kb(text, article_url, span_length=128, article_title=None,
                    article_publish_date=None, verbose=False):
    """Build a KB from an article's text, recording the article as the source.

    The text is tokenized once and cut into equally long, overlapping spans
    of ``span_length`` tokens. All spans are sent to the model as one batch,
    and every extracted relation is tagged with the article URL and the
    boundaries of the span it was generated from.

    Relies on the module-level ``tokenizer`` and ``model`` and on the ``KB``
    class that is current when the function is called; that class must accept
    ``add_relation(relation, article_title, article_publish_date)``.

    Args:
        text (str): Input text to be processed.
        article_url (str): URL of the article the text comes from; used as
            the key of each relation's ``meta`` dict.
        span_length (int, optional): Number of tokens per span.
        article_title (str, optional): Title of the article, passed on to
            ``KB.add_relation``.
        article_publish_date (str, optional): Publish date of the article,
            passed on to ``KB.add_relation``.
        verbose (bool, optional): If True, print the token count, the number
            of spans and the span boundaries.

    Returns:
        KB: Knowledge base whose relations carry
        ``meta = {article_url: {"spans": [[start, end], ...]}}``.
    """
    # Tokenize the whole input text once; spans are slices of this sequence.
    inputs = tokenizer([text], return_tensors="pt")

    num_tokens = len(inputs["input_ids"][0])
    if verbose:
        print(f"Input has {num_tokens} tokens")
    num_spans = math.ceil(num_tokens / span_length)
    if verbose:
        print(f"Input has {num_spans} spans")

    # Spread the span starts evenly between 0 and (num_tokens - span_length).
    # Integer arithmetic keeps the last span ending exactly at num_tokens, so
    # no trailing tokens are skipped, and consecutive starts are at most
    # span_length apart, so there are no gaps. All spans have the same length,
    # which torch.stack below requires.
    if num_spans > 1:
        last_start = num_tokens - span_length
        starts = [(i * last_start) // (num_spans - 1) for i in range(num_spans)]
    else:
        # A single span starts at 0; slicing clamps its end to num_tokens.
        starts = [0] * num_spans
    spans_boundaries = [[start, start + span_length] for start in starts]
    if verbose:
        print(f"Span boundaries are {spans_boundaries}")

    # Turn the single token sequence into a batch with one row per span.
    tensor_ids = [inputs["input_ids"][0][begin:end]
                  for begin, end in spans_boundaries]
    tensor_masks = [inputs["attention_mask"][0][begin:end]
                    for begin, end in spans_boundaries]
    inputs = {
        "input_ids": torch.stack(tensor_ids),
        "attention_mask": torch.stack(tensor_masks)
    }

    # Generate num_return_sequences candidate outputs for every span.
    num_return_sequences = 3
    gen_kwargs = {
        "max_length": 256,
        "length_penalty": 0,
        "num_beams": 3,
        "num_return_sequences": num_return_sequences
    }
    generated_tokens = model.generate(
        **inputs,
        **gen_kwargs,
    )

    # Special tokens must be kept: the relation parser relies on the markers.
    decoded_preds = tokenizer.batch_decode(generated_tokens,
                                           skip_special_tokens=False)

    kb = KB()
    for pred_index, sentence_pred in enumerate(decoded_preds):
        # Predictions are mapped back to spans in consecutive groups of
        # num_return_sequences.
        current_span_index = pred_index // num_return_sequences
        for relation in extract_relations_from_model_output(sentence_pred):
            relation["meta"] = {
                article_url: {
                    "spans": [spans_boundaries[current_span_index]]
                }
            }
            kb.add_relation(relation, article_title, article_publish_date)

    return kb

In [None]:
class KB():
    """In-memory knowledge base of entities, relations and article sources.

    Attributes:
        entities: ``{entity_title: {"url": ..., "summary": ...}}`` for every
            entity that was resolved to a Wikipedia page.
        relations: list of ``{"head", "type", "tail", "meta"}`` dicts, where
            ``meta`` maps an article URL to ``{"spans": [...]}``.
        sources: ``{article_url: {"article_title": ...,
            "article_publish_date": ...}}``.
    """

    def __init__(self):
        """Create an empty knowledge base."""
        self.entities = {} # { entity_title: {...} }
        self.relations = [] # [ head: entity_title, type: ..., tail: entity_title,
          # meta: { article_url: { spans: [...] } } ]
        self.sources = {} # { article_url: {...} }

    def merge_with_kb(self, kb2):
        """Add every relation of ``kb2`` (with its sources) to this KB.

        A relation may carry metadata for several article URLs. Each URL is
        replayed through ``add_relation`` separately so that every source is
        registered with its own title and publish date, not just the first.
        ``kb2`` itself is left unmodified.

        Raises:
            KeyError: if a relation references a URL missing from
                ``kb2.sources``.
        """
        for r in kb2.relations:
            for article_url, article_meta in r["meta"].items():
                source_data = kb2.sources[article_url]
                single_source_relation = {**r, "meta": {article_url: article_meta}}
                self.add_relation(single_source_relation,
                                  source_data["article_title"],
                                  source_data["article_publish_date"])

    def are_relations_equal(self, r1, r2):
        """Return True when both relations share head, type and tail."""
        return all(r1[attr] == r2[attr] for attr in ["head", "type", "tail"])

    def exists_relation(self, r1):
        """Return True if a relation equal to ``r1`` is already stored."""
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)

    def merge_relations(self, r2):
        """Merge the article metadata of ``r2`` into the equal stored relation.

        For a URL the stored relation does not know yet, the metadata object
        of ``r2`` is attached as-is (by reference). For a known URL, only the
        spans that are not already recorded are appended.

        Raises:
            IndexError: if no stored relation equals ``r2``.
        """
        r1 = [r for r in self.relations
              if self.are_relations_equal(r2, r)][0]

        for article_url, article_meta in r2["meta"].items():
            if article_url not in r1["meta"]:
                # Relation seen in a new article: add its metadata.
                r1["meta"][article_url] = article_meta
            else:
                # Relation seen again in a known article: add unseen spans.
                known_spans = r1["meta"][article_url]["spans"]
                known_spans += [span for span in article_meta["spans"]
                                if span not in known_spans]

    def get_wikipedia_data(self, candidate_entity):
        """Look up ``candidate_entity`` as a Wikipedia page title.

        Returns:
            ``{"title", "url", "summary"}`` of the page, or None when the
            lookup fails for any ordinary reason (best-effort).
        """
        try:
            page = wikipedia.page(candidate_entity, auto_suggest=False)
            entity_data = {
                "title": page.title,
                "url": page.url,
                "summary": page.summary
            }
            return entity_data
        except Exception:
            # Still best-effort, but no longer swallows KeyboardInterrupt /
            # SystemExit the way a bare ``except:`` did.
            return None

    def add_entity(self, e):
        """Store entity ``e`` under its title; the title itself is the key."""
        self.entities[e["title"]] = {k:v for k,v in e.items() if k != "title"}

    def add_relation(self, r, article_title, article_publish_date):
        """Resolve, register and store relation ``r``.

        The relation is dropped silently when its head or tail cannot be
        resolved through ``get_wikipedia_data``. Otherwise both entities are
        added, the relation's head/tail are replaced by the resolved titles,
        the source (first URL in ``r["meta"]``) is registered, and the
        relation is appended or merged into an equal existing one.

        ``r`` is never modified and never stored directly: the KB keeps its
        own deep copy, so later merges cannot leak into the caller's dict or
        into another KB that holds the same relation.

        Args:
            r: ``{"head", "type", "tail", "meta": {article_url: {"spans"}}}``.
            article_title: title recorded for the relation's source article.
            article_publish_date: publish date recorded for that source.
        """
        # Local import: the notebook's import cell is outside this block.
        import copy

        # Check Wikipedia for the head and tail entities of the relation.
        candidate_entities = [r["head"], r["tail"]]
        entities = [self.get_wikipedia_data(ent) for ent in candidate_entities]

        # If one or both entities do not exist on Wikipedia, skip the relation.
        if any(ent is None for ent in entities):
            return

        for e in entities:
            self.add_entity(e)

        # Work on a private copy, renamed with the resolved Wikipedia titles.
        relation = copy.deepcopy(r)
        relation["head"] = entities[0]["title"]
        relation["tail"] = entities[1]["title"]

        # add source if not in kb
        article_url = list(relation["meta"].keys())[0]
        if article_url not in self.sources:
            self.sources[article_url] = {
                "article_title": article_title,
                "article_publish_date": article_publish_date
            }

        # Store the relation, or merge it with an equal existing relation.
        if not self.exists_relation(relation):
            self.relations.append(relation)
        else:
            self.merge_relations(relation)

    def print(self):
        """Print all entities, relations and sources, one per line."""
        print("Entities:")
        for e in self.entities.items():
            print(f"  {e}")
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")
        print("Sources:")
        for s in self.sources.items():
            print(f"  {s}")

In [None]:
def get_article(url):
    """Download and parse the article at ``url`` with newspaper3k.

    Args:
        url (str): address of the article to fetch.

    Returns:
        Article: the article object after ``download()`` and ``parse()``
        have been called on it.
    """
    fetched = Article(url)
    fetched.download()
    fetched.parse()
    return fetched

def from_url_to_kb(url):
    """Build a Knowledge Base (KB) from the article found at ``url``.

    The article is fetched with ``get_article``; its text and URL, together
    with its title and publish date, are handed to ``from_text_to_kb``.

    Args:
        url (str): address of the article to convert.

    Returns:
        Whatever ``from_text_to_kb`` returns for the article.
    """
    parsed = get_article(url)
    return from_text_to_kb(
        parsed.text,
        parsed.url,
        article_title=parsed.title,
        article_publish_date=parsed.publish_date,
    )

In [None]:
#Andrew# Define the URL of the article to process

url = "https://finance.yahoo.com/news/microstrategy-bitcoin-millions-142143795.html"

#Andrew# Convert the content of the article from the URL into a Knowledge Base (KB) object
#Andrew#  using the from_url_to_kb function

kb = from_url_to_kb(url)

#Andrew# Print the entities, relations, and sources stored in the Knowledge Base (KB) object

kb.print()

#Andrew# Output will show entities with their URLs and summaries, relations with their types, and the associated spans
#Andrew# The sources section displays the URL of the article, its title, and the publish date (if available)



# Entities:
#   ('MicroStrategy', {'url': 'https://en.wikipedia.org/wiki/MicroStrategy',
#     'summary': "MicroStrategy Incorporated is an American company that ..."})
#   ('Michael J. Saylor', {'url': 'https://en.wikipedia.org/wiki/Michael_J._Saylor',
#     'summary': "Michael J. Saylor (born February 4, 1965) is an American ..."})
#   ...
# Relations:
#   {'head': 'MicroStrategy', 'type': 'founded by', 'tail': 'Michael J. Saylor',
#    'meta': {'https://finance.yahoo.com/news/microstrategy-bitcoin-millions-142143795.html':
#      {'spans': [[0, 128]]}}}
#   {'head': 'Michael J. Saylor', 'type': 'employer', 'tail': 'MicroStrategy',
#    'meta': {'https://finance.yahoo.com/news/microstrategy-bitcoin-millions-142143795.html':
#      {'spans': [[0, 128]]}}}
#   ...
# Sources:
#   ('https://finance.yahoo.com/news/microstrategy-bitcoin-millions-142143795.html',
#     {'article_title': "Microstrategy chief: 'Bitcoin is going to go into the millions'",
#      'article_publish_date': None})

Entities:
  ('MicroStrategy', {'url': 'https://en.wikipedia.org/wiki/MicroStrategy', 'summary': "MicroStrategy Incorporated is an American company that provides business intelligence (BI), mobile software, and cloud-based services. Founded in 1989 by Michael J. Saylor, Sanju Bansal, and Thomas Spahr, the firm develops software to analyze internal and external data in order to make business decisions and to develop mobile apps. It is a public company headquartered in Tysons Corner, Virginia, in the Washington metropolitan area. Its primary business analytics competitors include SAP AG Business Objects, IBM Cognos, and Oracle Corporation's BI Platform. Saylor is the Executive Chairman and, from 1989 to 2022, was the CEO.\n\n"})
  ('Michael J. Saylor', {'url': 'https://en.wikipedia.org/wiki/Michael_J._Saylor', 'summary': "Michael J. Saylor (born February 4, 1965) is an American entrepreneur and business executive. He is the executive chairman and a co-founder of MicroStrategy, a company t

# Google News: extract KB from multiple articles

In [None]:
def get_news_links(query, lang="en", region="US", pages=1, max_links=100000):
    """Collect unique article links from Google News for ``query``.

    Args:
        query (str): search terms.
        lang (str): language passed to ``GoogleNews``.
        region (str): region passed to ``GoogleNews``.
        pages (int): number of result pages to read; values below 1 yield
            an empty list.
        max_links (int): upper bound on the number of links returned.

    Returns:
        list[str]: de-duplicated links in the order they were first seen,
        truncated to ``max_links``.
    """
    if pages < 1:
        return []

    googlenews = GoogleNews(lang=lang, region=region)
    # Per the GoogleNews README, search() already loads result page 1 and
    # get_page() is 1-indexed, so only pages 2..pages are requested here.
    # (The previous range(pages) asked for a page 0 and skipped the last one.)
    googlenews.search(query)
    all_urls = list(googlenews.get_links())
    for page in range(2, pages + 1):
        googlenews.get_page(page)
        all_urls += googlenews.get_links()

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # max_links cut is reproducible (set() order varies between runs).
    return list(dict.fromkeys(all_urls))[:max_links]

def from_urls_to_kb(urls, verbose=False):
    """Build one KB out of the articles behind ``urls``.

    Each URL is converted with ``from_url_to_kb`` and merged into a single
    ``KB``. A URL whose processing raises ``ArticleException`` is skipped.

    Args:
        urls: article URLs to visit.
        verbose (bool): when True, print progress and skipped URLs.

    Returns:
        KB: the merged knowledge base.
    """
    merged_kb = KB()
    if verbose:
        print(f"{len(urls)} links to visit")

    for link in urls:
        if verbose:
            print(f"Visiting {link}...")
        try:
            merged_kb.merge_with_kb(from_url_to_kb(link))
        except ArticleException:
            if verbose:
                print(f"  Couldn't download article at url {link}")

    return merged_kb

In [None]:
import pickle

def save_kb(kb, filename):
    """Pickle ``kb`` into the file at ``filename`` (overwriting it).

    Args:
        kb: object to serialize.
        filename: path of the file to write, opened in binary mode.
    """
    with open(filename, "wb") as handle:
        pickle.dump(kb, handle)

def load_kb(filename):
    """Read back an object previously pickled to ``filename``.

    Unpickling can execute arbitrary code, so only load files you created
    yourself or otherwise trust.

    Args:
        filename: path of the pickle file, opened in binary mode.

    Returns:
        The unpickled object.
    """
    with open(filename, "rb") as handle:
        return pickle.load(handle)

In [None]:
# Small end-to-end check: take at most 3 Google News links for "Google"
# (one result page), merge the per-article KBs and print the result.
# The commented lines that follow show example output.
news_links = get_news_links("Google", pages=1, max_links=3)
kb = from_urls_to_kb(news_links, verbose=True)
kb.print()
# 3 links to visit
# Visiting https://www.hindustantimes.com/india-news/google-doodle-celebrates-india-s-gama-pehlwan-the-undefeated-wrestling-champion-101653180853982.html...
# Visiting https://tech.hindustantimes.com/tech/news/google-doodle-today-celebrates-gama-pehlwan-s-144th-birth-anniversary-know-who-he-is-71653191916538.html...
# Visiting https://www.moneycontrol.com/news/trends/current-affairs-trends/google-doodle-celebrates-gama-pehlwan-the-amritsar-born-wrestling-champ-who-inspired-bruce-lee-8552171.html...
# Entities:
#   ('Google', {'url': 'https://en.wikipedia.org/wiki/Google',
#     'summary': 'Google LLC is an American ...'})
#   ...
# Relations:
#   {'head': 'Google', 'type': 'owner of', 'tail': 'Google Doodle',
#     'meta': {'https://tech.hindustantimes.com/tech/news/google-doodle-today-celebrates-gama-pehlwan-s-144th-birth-anniversary-know-who-he-is-71653191916538.html':
#       {'spans': [[0, 128]]}}}
#   ...
# Sources:
#   ('https://www.hindustantimes.com/india-news/google-doodle-celebrates-india-s-gama-pehlwan-the-undefeated-wrestling-champion-101653180853982.html',
#     {'article_title': "Google Doodle celebrates India's Gama Pehlwan, the undefeated wrestling champion",
#     'article_publish_date': datetime.datetime(2022, 5, 22, 6, 59, 56, tzinfo=tzoffset(None, 19800))})
#   ('https://tech.hindustantimes.com/tech/news/google-doodle-today-celebrates-gama-pehlwan-s-144th-birth-anniversary-know-who-he-is-71653191916538.html',
#     {'article_title': "Google Doodle today celebrates Gama Pehlwan's 144th birth anniversary; know who he is",
#     'article_publish_date': datetime.datetime(2022, 5, 22, 9, 32, 38, tzinfo=tzoffset(None, 19800))})
#   ('https://www.moneycontrol.com/news/trends/current-affairs-trends/google-doodle-celebrates-gama-pehlwan-the-amritsar-born-wrestling-champ-who-inspired-bruce-lee-8552171.html',
#     {'article_title': 'Google Doodle celebrates Gama Pehlwan, the Amritsar-born wrestling champ who inspired Bruce Lee',
#     'article_publish_date': None})

3 links to visit
Visiting https://fortune.com/2023/07/26/google-cfo-search-ruth-porat-president-cio/...


Token indices sequence length is longer than the specified maximum sequence length for this model (1479 > 1024). Running this sequence through the model will result in indexing errors


Visiting https://www.wipro.com/partner-ecosystem/strategic-google/cloud-migration-modernization-with-wipro-fullstride-cloud-and-google-ramp/...
Visiting https://www.morningstar.com/stocks/alphabet-earnings-google-search-youtube-growth-doubts-are-subsiding-stock-remains-attractive...
Entities:
  ('Google', {'url': 'https://en.wikipedia.org/wiki/Google', 'summary': 'Google LLC ( (listen)) is an American multinational technology company focusing on artificial intelligence, online advertising, search engine technology, cloud computing, computer software, quantum computing, e-commerce, and consumer electronics. It has been referred to as "the most powerful company in the world" and as one of the world\'s most valuable brands due to its market dominance, data collection, and technological advantages in the field of artificial intelligence. Google\'s parent company Alphabet Inc. is one of the Big Tech companies, alongside Amazon, Apple Inc., Meta Platforms, and Microsoft.\nGoogle was founded 

# Visualize KB

In [None]:
def save_network_html(kb, filename="network.html"):
    """Render the KB as a directed pyvis graph and write it to ``filename``.

    Every key of ``kb.entities`` becomes a green circular node; every item of
    ``kb.relations`` becomes an edge from its head to its tail, with the
    relation type as both hover title and label.

    Args:
        kb: object exposing ``entities`` (iterable of titles) and
            ``relations`` (dicts with "head", "type", "tail").
        filename (str): path handed to ``Network.show``.
    """
    graph = Network(directed=True, width="700px", height="700px", bgcolor="#eeeeee")

    # Nodes: one per entity title, all drawn the same way.
    node_style = {"shape": "circle", "color": "#00FF00"}
    for entity_title in kb.entities:
        graph.add_node(entity_title, **node_style)

    # Edges: head -> tail, annotated with the relation type.
    for relation in kb.relations:
        head, tail, kind = relation["head"], relation["tail"], relation["type"]
        graph.add_edge(head, tail, title=kind, label=kind)

    # Layout physics, then write the HTML file.
    physics = {
        "node_distance": 200,
        "central_gravity": 0.2,
        "spring_length": 200,
        "spring_strength": 0.05,
        "damping": 0.09,
    }
    graph.repulsion(**physics)
    graph.set_edge_smooth('dynamic')
    graph.show(filename, notebook=False)

In [None]:
# Build a KB from at most 20 Google News links for "Google" (5 result pages
# requested), write the graph to network_3_google.html and display that file
# inline.
news_links = get_news_links("Google", pages=5, max_links=20)
kb = from_urls_to_kb(news_links, verbose=True)
filename = "network_3_google.html"
save_network_html(kb, filename=filename)
IPython.display.HTML(filename=filename)

20 links to visit
Visiting https://www.investopedia.com/dow-jones-today-07252023-7565039...
Visiting https://www.cnbc.com/2023/07/25/google-shows-it-can-prevail-despite-ai-threats-as-cloud-business-booms.html...
Visiting https://www.reuters.com/technology/ai-lesson-microsoft-google-spend-money-make-money-2023-07-25/...
Visiting https://www.cnn.com/2023/07/26/tech/ai-industry-group/index.html...
Visiting https://www.ft.com/content/709f4375-83bf-4037-878d-964d1ead8858...
Visiting https://www.reuters.com/technology/alphabet-rallies-google-search-unfazed-by-challenge-microsofts-bing-2023-07-26/...
Visiting https://www.cnbc.com/2023/07/25/three-green-flags-on-your-resume-according-to-a-former-google-recruiter.html...
Visiting https://www.nytimes.com/2023/07/25/technology/alphabet-google-earnings-second-quarter.html...
Visiting https://fortune.com/2023/07/26/google-cfo-search-ruth-porat-president-cio/...
Visiting https://fortune.com/2023/07/25/alphabet-google-cfo-ruth-porat-promoted-presiden

In [None]:
# Same pipeline for the query "Amazon": build the KB, write the graph to
# network_3_amazon.html, pickle the KB to network_3_amazon.p and display the
# graph inline.
# NOTE(review): the saved output of this cell is a NameError; presumably one
# of the helper cells (e.g. the one defining save_kb) had not been executed
# in that session -- confirm with Restart Kernel -> Run All.
news_links = get_news_links("Amazon", pages=5, max_links=20)
kb = from_urls_to_kb(news_links, verbose=True)
filename = "network_3_amazon.html"
save_network_html(kb, filename=filename)
save_kb(kb, filename.split(".")[0] + ".p")
IPython.display.HTML(filename=filename)

NameError: ignored

In [None]:
# Same pipeline for the query "Apple": graph goes to network_3_apple.html,
# the pickled KB to network_3_apple.p.
news_links = get_news_links("Apple", pages=5, max_links=20)
kb = from_urls_to_kb(news_links, verbose=True)
filename = "network_3_apple.html"
save_network_html(kb, filename=filename)
save_kb(kb, filename.split(".")[0] + ".p")
IPython.display.HTML(filename=filename)

In [None]:
# Same pipeline for the query "Elon Musk": graph goes to network_3_musk.html,
# the pickled KB to network_3_musk.p.
news_links = get_news_links("Elon Musk", pages=5, max_links=20)
kb = from_urls_to_kb(news_links, verbose=True)
filename = "network_3_musk.html"
save_network_html(kb, filename=filename)
save_kb(kb, filename.split(".")[0] + ".p")
IPython.display.HTML(filename=filename)

In [None]:
# Same pipeline for the query "Kobe Bryant": graph goes to
# network_3_bryant.html, the pickled KB to network_3_bryant.p.
news_links = get_news_links("Kobe Bryant", pages=5, max_links=20)
kb = from_urls_to_kb(news_links, verbose=True)
filename = "network_3_bryant.html"
save_network_html(kb, filename=filename)
save_kb(kb, filename.split(".")[0] + ".p")
IPython.display.HTML(filename=filename)

In [None]:
# Build a KB directly from a text snippet (an empty string is passed as the
# article URL), then save the graph to network_1_napoleon.html and the
# pickled KB to network_1_napoleon.p, and display the graph inline.
text = "Napoleon Bonaparte (born Napoleone di Buonaparte; 15 August 1769 – 5 May 1821), and later known by his regnal name Napoleon I, was a French military and political leader who rose to prominence during the French Revolution and led several successful campaigns during the Revolutionary Wars. He was the de facto leader of the French Republic as First Consul from 1799 to 1804. As Napoleon I, he was Emperor of the French from 1804 until 1814 and again in 1815. Napoleon's political and cultural legacy has endured, and he has been one of the most celebrated and controversial leaders in world history."
kb = from_text_to_kb(text, "", verbose=True)
filename = "network_1_napoleon.html"
save_network_html(kb, filename=filename)
save_kb(kb, filename.split(".")[0] + ".p")
IPython.display.HTML(filename=filename)

In [None]:
# Text-snippet variant for Kobe Bryant (empty string as article URL): graph
# goes to network_1_bryant.html, the pickled KB to network_1_bryant.p.
text = "Kobe Bean Bryant (August 23, 1978 – January 26, 2020) was an American professional basketball player. A shooting guard, he spent his entire 20-year career with the Los Angeles Lakers in the National Basketball Association (NBA). Widely regarded as one of the greatest basketball players of all time, Bryant won five NBA championships, was an 18-time All-Star, a 15-time member of the All-NBA Team, a 12-time member of the All-Defensive Team, the 2008 NBA Most Valuable Player (MVP), and a two-time NBA Finals MVP. Bryant also led the NBA in scoring twice, and ranks fourth in league all-time regular season and postseason scoring. He was posthumously voted into the Naismith Memorial Basketball Hall of Fame in 2020 and named to the NBA 75th Anniversary Team in 2021."
kb = from_text_to_kb(text, "", verbose=True)
filename = "network_1_bryant.html"
save_network_html(kb, filename=filename)
save_kb(kb, filename.split(".")[0] + ".p")
IPython.display.HTML(filename=filename)

In [None]:
# Text-snippet variant for Google (empty string as article URL): graph goes
# to network_1_google.html, the pickled KB to network_1_google.p.
text = "Originally known as BackRub. Google is a search engine that started development in 1996 by Sergey Brin and Larry Page as a research project at Stanford University to find files on the Internet. Larry and Sergey later decided the name of their search engine needed to change and chose Google, which is inspired from the term googol. The company is headquartered in Mountain View, California."
kb = from_text_to_kb(text, "", verbose=True)
filename = "network_1_google.html"
save_network_html(kb, filename=filename)
save_kb(kb, filename.split(".")[0] + ".p")
IPython.display.HTML(filename=filename)

In [None]:
# Single-article variant: build the KB from one URL, save the graph to
# network_2_crypto.html and the pickled KB to network_2_crypto.p, then
# display the graph inline.
url = "https://www.investopedia.com/terms/c/cryptocurrency.asp"
kb = from_url_to_kb(url)
filename = "network_2_crypto.html"
save_network_html(kb, filename=filename)
save_kb(kb, filename.split(".")[0] + ".p")
IPython.display.HTML(filename=filename)

In [None]:
# Single-article variant for the Britannica page on Johnny Depp: graph goes
# to network_2_depp.html, the pickled KB to network_2_depp.p.
url = "https://www.britannica.com/biography/Johnny-Depp"
kb = from_url_to_kb(url)
filename = "network_2_depp.html"
save_network_html(kb, filename=filename)
save_kb(kb, filename.split(".")[0] + ".p")
IPython.display.HTML(filename=filename)

In [None]:
# Single-article variant for a Time Out page about Rome: graph goes to
# network_2_rome.html, the pickled KB to network_2_rome.p.
url = "https://www.timeout.com/rome/things-to-do/best-things-to-do-in-rome"
kb = from_url_to_kb(url)
filename = "network_2_rome.html"
save_network_html(kb, filename=filename)
save_kb(kb, filename.split(".")[0] + ".p")
IPython.display.HTML(filename=filename)