## Importing the packages and models

In [1]:
import numpy as np 
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import math
import torch
import wikipedia
from newspaper import Article, ArticleException
from GoogleNews import GoogleNews
import IPython
from pyvis.network import Network

In [15]:
'''
We are using the hugging face pre-trained models for the text summarization and knowledge graph creation. The below 
code initializes a tokenizer and a model for sequence-to-sequence learning tasks using a pre-trained model provided 
by Babelscape. These pre-trained components can be further fine-tuned or used directly for various NLP tasks 
such as translation, summarization, or text generation
'''

tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")

## From short text to knowledge base

In [2]:
def extract_relations_from_model_output(text):
    '''
    From a given text, the goal is to identify the relation, subject and object of the piece of text that is obtained.
    Input : Text
    Output : Relations
    Process :
    1. Special characters of the text are first removed (Tags)
    2. For every token in the text that is split, it checks whether the token is a subject, object or relation
    3. If it is a triplet, it sets the current variable to t and checks whether a relation exists. If exists -> append
    4. If the token is a subject, it indicates start of the subject. Append if relation exists. object_ -> ''
    5. If it is an object, set relation to blank and current to 'o'
    6. If it is neither of the three, it is assumed to be a part of one and is append to the corresponding current value
    7. If it is a last relation, it is appended

    Code can be found on the REBEL model card
    '''
    triplets = [] 
    subject, relation, object_ = '', '', '' 
    text = text.strip()
    current = 'x'
    text_replaced = text.replace("<s>", "").replace("<pad>", "").replace("</s>", "")
    for token in text_replaced.split():
        if token == "<triplet>": 
            current = 't'
            if relation != '':
                triplets.append({
                    'head': subject.strip(),
                    'type': relation.strip(),
                    'tail': object_.strip()
                })
                relation = ''
            subject = ''
        elif token == "<subj>": 
            current = 's'
            if relation != '':
                triplets.append({
                    'head': subject.strip(),
                    'type': relation.strip(),
                    'tail': object_.strip()
                })
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append({
            'head': subject.strip(),
            'type': relation.strip(),
            'tail': object_.strip()
        })
    return triplets

In [22]:
class knowledgebase():
    '''
    Methods -
    1. exists_relation(r1)
        a. If a new relation is identified, it is checked whether it exists or not. If the new relation is r2, it is
        checked against existing relations whether it exists
    2. are_relations_equal(r1,r2)
        a. If there are two relations r1 and r2, check is done to see if they are same
        b. Every attribute in the relation (subject, object and type) is checked for similarity
    3. add_relation(r)
        a. If a relation does not exist, append it to the relations list
    4. print()
        a. Print all the relations
    
    '''
    def __init__(self):
        self.relations = []

    def relation_equality_check(self, r1, r2): 
        return all(r1[attr] == r2[attr] for attr in ["head", "type", "tail"])

    def relation_existence(self, r1):
        return any(self.relation_equality_check(r1, r2) for r2 in self.relations)

    def relation_add(self, r): 
        if not self.relation_existence(r):
            self.relations.append(r) 

    def print(self): #printing the relations
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")

In [20]:
def from_small_text_to_kb(text, verbose=False):
    '''
    Code present in REBEL documentation
    This Python function, from_small_text_to_kb, takes a text input, processes it through a pre-trained model 
    to generate predictions, and extracts relations from the generated predictions to populate a knowledge base (KB) 
    instance.
    The tokenizer is controlled by the max length, padding and truncation. Return pytorch tensors
    '''
    kb = knowledgebase()

    model_inputs = tokenizer(text, max_length=512, padding=True, truncation=True,
                            return_tensors='pt') 

    if verbose:
        print(f"Num tokens: {len(model_inputs['input_ids'][0])}")

    gen_kwargs = {
        "max_length": 216,
        "length_penalty": 0,
        "num_beams": 3,
        "num_return_sequences": 3
    }
    generated_tokens = model.generate(
        **model_inputs,
        **gen_kwargs,
    )
    decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)

    for sentence_pred in decoded_preds:
        
        relations = extract_relations_from_model_output(sentence_pred)
        for r in relations:
            print(r)
            kb.relation_add(r)

    return kb

In [23]:
text = "Napoleon Bonaparte (born Napoleone di Buonaparte; 15 August 1769 – 5 " \
"May 1821), and later known by his regnal name Napoleon I, was a French military " \
"and political leader who rose to prominence during the French Revolution and led " \
"several successful campaigns during the Revolutionary Wars. He was the de facto " \
"leader of the French Republic as First Consul from 1799 to 1804. As Napoleon I, " \
"he was Emperor of the French from 1804 until 1814 and again in 1815. Napoleon's " \
"political and cultural legacy has endured, and he has been one of the most " \
"celebrated and controversial leaders in world history."

kb = from_small_text_to_kb(text, verbose=True)
kb.print()

Num tokens: 133
{'head': 'Napoleon Bonaparte', 'type': 'date of birth', 'tail': '15 August 1769'}
{'head': 'Napoleon Bonaparte', 'type': 'date of death', 'tail': '5 May 1821'}
{'head': 'Napoleon Bonaparte', 'type': 'participant in', 'tail': 'French Revolution'}
{'head': 'Napoleon Bonaparte', 'type': 'conflict', 'tail': 'Revolutionary Wars'}
{'head': 'Revolutionary Wars', 'type': 'part of', 'tail': 'French Revolution'}
{'head': 'Napoleon Bonaparte', 'type': 'date of birth', 'tail': '15 August 1769'}
{'head': 'Napoleon Bonaparte', 'type': 'date of death', 'tail': '5 May 1821'}
{'head': 'Napoleon Bonaparte', 'type': 'participant in', 'tail': 'French Revolution'}
{'head': 'Napoleon Bonaparte', 'type': 'conflict', 'tail': 'Revolutionary Wars'}
{'head': 'French Revolution', 'type': 'participant', 'tail': 'Napoleon Bonaparte'}
{'head': 'Revolutionary Wars', 'type': 'participant', 'tail': 'Napoleon Bonaparte'}
{'head': 'Napoleon Bonaparte', 'type': 'date of birth', 'tail': '15 August 1769'}
{'

In [8]:
text = "Napoleon Bonaparte (born Napoleone di Buonaparte; 15 August 1769 – 5 " \
"May 1821), and later known by his regnal name Napoleon I, was a French military " \
"and political leader who rose to prominence during the French Revolution and led " \
"several successful campaigns during the Revolutionary Wars. He was the de facto " \
"leader of the French Republic as First Consul from 1799 to 1804. As Napoleon I, " \
"he was Emperor of the French from 1804 until 1814 and again in 1815. Napoleon's " \
"political and cultural legacy has endured, and he has been one of the most " \
"celebrated and controversial leaders in world history."

kb = from_small_text_to_kb(text, verbose=True)
kb.print()

Num tokens: 133
Relations:
  {'head': 'Napoleon Bonaparte', 'type': 'date of birth', 'tail': '15 August 1769'}
  {'head': 'Napoleon Bonaparte', 'type': 'date of death', 'tail': '5 May 1821'}
  {'head': 'Napoleon Bonaparte', 'type': 'participant in', 'tail': 'French Revolution'}
  {'head': 'Napoleon Bonaparte', 'type': 'conflict', 'tail': 'Revolutionary Wars'}
  {'head': 'Revolutionary Wars', 'type': 'part of', 'tail': 'French Revolution'}
  {'head': 'French Revolution', 'type': 'participant', 'tail': 'Napoleon Bonaparte'}
  {'head': 'Revolutionary Wars', 'type': 'participant', 'tail': 'Napoleon Bonaparte'}


## From long text to knowledge base

In [9]:
class knowledgebase():
    '''
    Methods -
    1. exists_relation(r1)
        a. If a new relation is identified, it is checked whether it exists or not. If the new relation is r2, it is
        checked against existing relations whether it exists
    2. are_relations_equal(r1,r2)
        a. If there are two relations r1 and r2, check is done to see if they are same
        b. Every attribute in the relation (subject, object and type) is checked for similarity
    3. add_relation(r)
        a. If a relation does not exist, append it to the relations list
    4. print()
        a. Print all the relations
    5. relation_merge(r1) - If two relations are pretty much same, we add spans to it which is essentially the meta
    data information
    
    '''
    def __init__(self):
        self.relations = []

    def relation_equality_check(self, r1, r2): 
        return all(r1[attr] == r2[attr] for attr in ["head", "type", "tail"])

    def relation_existence(self, r1):
        return any(self.relation_equality_check(r1, r2) for r2 in self.relations)

    def relation_add(self, r): 
        if not self.exists_relation(r):
            self.relations.append(r)
        else:
            self.relation_merge(r)

    def relation_merge(self, r1): 
        r2 = [r for r in self.relations
              if self.relation_equality_check(r1, r)][0] 
        spans_to_add = [span for span in r1["meta"]["spans"]
                        if span not in r2["meta"]["spans"]]
        r2["meta"]["spans"] += spans_to_add

    def print(self): 
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")

In [26]:
def from_text_to_kb(text, length_of_span=128, verbose=False):
    '''
    If there is a need to extract information that is greater than 1000 words, we can leverage the concept of spans.
    The term "spans" likely refers to specific segments or sections within the text that are relevant to or associated 
    with each relation. In this case the span length is 128. We will first see how many chunks there are.
    Process:
    1. Tokenize the text and find the number of tokens
    2. Based on the span length, find the number of spans
    3. Find the span chunks and identify the overlapping regions
    4. For each span, it transforms the input into tensors
    '''
    inputs = tokenizer([text], return_tensors="pt")

    # compute span boundaries
    num_tokens = len(inputs["input_ids"][0])
    if verbose:
        print(f"Input has {num_tokens} tokens")
    num_spans = math.ceil(num_tokens / length_of_span)
    if verbose:
        print(f"Input has {num_spans} spans")
    overlap = math.ceil((num_spans * length_of_span - num_tokens) / 
                        max(num_spans - 1, 1))
    spans_boundaries = []
    start = 0
    for i in range(num_spans):
        spans_boundaries.append([start + length_of_span * i,
                                 start + length_of_span * (i + 1)])
        start -= overlap
    if verbose:
        print(f"Span boundaries are {spans_boundaries}")

    # transform input with spans
    tensor_ids = [inputs["input_ids"][0][boundary[0]:boundary[1]]
                  for boundary in spans_boundaries]
    tensor_masks = [inputs["attention_mask"][0][boundary[0]:boundary[1]]
                    for boundary in spans_boundaries]
                    
    inputs = {
        "input_ids": torch.stack(tensor_ids),
        "attention_mask": torch.stack(tensor_masks)
    }

    # generate relations
    num_return_sequences = 3
    gen_kwargs = {
        "max_length": 256,
        "length_penalty": 0,
        "num_beams": 3,
        "num_return_sequences": num_return_sequences
    }
    generated_tokens = model.generate(
        **inputs,
        **gen_kwargs,
    )

    # decode relations
    decoded_preds = tokenizer.batch_decode(generated_tokens,
                                           skip_special_tokens=False)

    # create kb
    kb = knowledgebase()
    i = 0
    for sentence_pred in decoded_preds:
        current_span_index = i // num_return_sequences
        relations = extract_relations_from_model_output(sentence_pred)
        for relation in relations:
            relation["meta"] = {
                "spans": [spans_boundaries[current_span_index]]
            }
            kb.relation_add(relation)
        i += 1

    return kb

In [33]:
# Using wikipedia API to get HTML data
import requests
response = requests.get(
    'https://en.wikipedia.org/w/api.php',
     params={
         'action': 'query',
       'format': 'json',
        'titles': 'Sachin Tendulkar',
   'prop': 'extracts',
         'exintro': True,
        'explaintext': True,
    }
).json()
page = next(iter(response['query']['pages'].values()))
text_value = page['extract']
print(text_value)

kb = from_text_to_kb(text_value, verbose=True)
kb.print()

Sachin Ramesh Tendulkar, ( ; pronounced [sətɕin teːɳɖulkəɾ]; born 24 April 1973) is an Indian former international cricketer who captained the Indian national team. He is widely regarded as one of the greatest batsmen in the history of cricket. Hailed as the world's most prolific batsman of all time, he is the all-time highest run-scorer in both ODI and Test cricket with more than 18,000 runs and 15,000 runs, respectively. He also holds the record for receiving the most player of the match awards in international cricket. Tendulkar was a Member of Parliament, Rajya Sabha by nomination from 2012 to 2018.Tendulkar took up cricket at the age of eleven, made his Test match debut on 15 November 1989 against Pakistan in Karachi at the age of sixteen, and went on to represent Mumbai domestically and India internationally for over 24 years. In 2002, halfway through his career, Wisden ranked him the second-greatest Test batsman of all time, behind Don Bradman, and the second-greatest ODI batsma

## From article at URL to knowledge base

In [35]:
#Article URl
class knowledgebase():
    '''
    Methods -
    1. exists_relation(r1)
        a. If a new relation is identified, it is checked whether it exists or not. If the new relation is r2, it is
        checked against existing relations whether it exists
    2. are_relations_equal(r1,r2)
        a. If there are two relations r1 and r2, check is done to see if they are same
        b. Every attribute in the relation (subject, object and type) is checked for similarity
    3. add_relation(r)
        a. If a relation does not exist, append it to the relations list
    4. print()
        a. Print all the relations
    5. relation_merge(r1) - If two relations are pretty much same, we add spans to it which is essentially the meta
    data information
    6. add_relation(r, article title, article publish date) -  check if wikipedia data exists for this entity. Add the
    article publish date and source to the same relation so that the required information is captured
    '''
    def __init__(self):
        self.entities = {} # { entity_title: {...} }
        self.relations = [] # [ head: entity_title, type: ..., tail: entity_title,
          # meta: { article_url: { spans: [...] } } ]
        self.sources = {} # { article_url: {...} }

    def relation_equality_check(self, r1, r2): 
        return all(r1[attr] == r2[attr] for attr in ["head", "type", "tail"])

    def relation_existence(self, r1):
        return any(self.relation_equality_check(r1, r2) for r2 in self.relations)
    
    def merge_with_kb(self, kb2):
        for r in kb2.relations:
            article_url = list(r["meta"].keys())[0]
            source_data = kb2.sources[article_url]
            self.add_relation(r, source_data["article_title"],
                              source_data["article_publish_date"])
            
    def get_wikipedia_data(self, candidate_entity):
        try:
            page = wikipedia.page(candidate_entity, auto_suggest=False)
            entity_data = {
                "title": page.title,
                "url": page.url,
                "summary": page.summary
            }
            return entity_data
        except:
            return None

    def add_entity(self, e):
        self.entities[e["title"]] = {k:v for k,v in e.items() if k != "title"}

    def add_relation(self, r, article_title, article_publish_date): 
        candidate_entities = [r["head"], r["tail"]] 
        entities = [self.get_wikipedia_data(ent) for ent in candidate_entities]

        if any(ent is None for ent in entities):
            return
        
        for e in entities:
            self.add_entity(e) 
       
        r["head"] = entities[0]["title"]
        r["tail"] = entities[1]["title"]

        # add source if not in kb
        article_url = list(r["meta"].keys())[0]
        if article_url not in self.sources:
            self.sources[article_url] = {
                "article_title": article_title,
                "article_publish_date": article_publish_date
            }

        if not self.relation_existence(r):
            self.relations.append(r) 
        else:
            self.relation_merge(r)

    def relation_merge(self, r2): 
        r1 = [r for r in self.relations
              if self.relation_equality_check(r2, r)][0]

        article_url = list(r2["meta"].keys())[0]
        if article_url not in r1["meta"]:
            r1["meta"][article_url] = r2["meta"][article_url]

        else:
            spans_to_add = [span for span in r2["meta"][article_url]["spans"] 
                            if span not in r1["meta"][article_url]["spans"]]
            r1["meta"][article_url]["spans"] += spans_to_add

    def print(self):
        print("Entities:")
        for e in self.entities.items():
            print(f"  {e}")
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")
        print("Sources:")
        for s in self.sources.items():
            print(f"  {s}")

In [37]:
def from_text_to_kb(text, article_url, length_of_span=128, article_title=None,
                    article_publish_date=None, verbose=False):
    '''
    If there is a need to extract information that is greater than 1000 words, we can leverage the concept of spans.
    The term "spans" likely refers to specific segments or sections within the text that are relevant to or associated 
    with each relation. In this case the span length is 128. We will first see how many chunks there are.
    Process:
    1. Tokenize the text and find the number of tokens
    2. Based on the span length, find the number of spans
    3. Find the span chunks and identify the overlapping regions
    4. For each span, it transforms the input into tensors
    '''
    inputs = tokenizer([text], return_tensors="pt")

    # compute span boundaries
    num_tokens = len(inputs["input_ids"][0])
    if verbose:
        print(f"Input has {num_tokens} tokens")
    num_spans = math.ceil(num_tokens / length_of_span)
    if verbose:
        print(f"Input has {num_spans} spans")
    overlap = math.ceil((num_spans * length_of_span - num_tokens) / 
                        max(num_spans - 1, 1))
    spans_boundaries = []
    start = 0
    for i in range(num_spans):
        spans_boundaries.append([start + length_of_span * i,
                                 start + length_of_span * (i + 1)])
        start -= overlap
    if verbose:
        print(f"Span boundaries are {spans_boundaries}")

    # transform input with spans
    tensor_ids = [inputs["input_ids"][0][boundary[0]:boundary[1]]
                  for boundary in spans_boundaries]
    tensor_masks = [inputs["attention_mask"][0][boundary[0]:boundary[1]]
                    for boundary in spans_boundaries]
                    
    inputs = {
        "input_ids": torch.stack(tensor_ids),
        "attention_mask": torch.stack(tensor_masks)
    }

    # generate relations
    num_return_sequences = 3
    gen_kwargs = {
        "max_length": 256,
        "length_penalty": 0,
        "num_beams": 3,
        "num_return_sequences": num_return_sequences
    }
    generated_tokens = model.generate(
        **inputs,
        **gen_kwargs,
    )

    # decode relations
    decoded_preds = tokenizer.batch_decode(generated_tokens,
                                           skip_special_tokens=False)

    # create kb
    kb = knowledgebase()
    i = 0
    for sentence_pred in decoded_preds:
        current_span_index = i // num_return_sequences
        relations = extract_relations_from_model_output(sentence_pred)
        for relation in relations:
            relation["meta"] = {
                article_url: {
                    "spans": [spans_boundaries[current_span_index]]
                }
            }
            kb.add_relation(relation, article_title, article_publish_date)
        i += 1

    return kb

def get_article(url):
    article = Article(url)
    article.download() #downloads html content
    article.parse() #parse to extract relevant data
    return article

def from_url_to_kb(url):
    article = get_article(url)
    config = {
        "article_title": article.title,
        "article_publish_date": article.publish_date
    }
    kb = from_text_to_kb(article.text, article.url, **config)
    return kb

## Network Development

In [40]:
url = "https://finance.yahoo.com/news/microstrategy-bitcoin-millions-142143795.html"

kb = from_url_to_kb(url)
kb.print()

Entities:
  ('MicroStrategy', {'url': 'https://en.wikipedia.org/wiki/MicroStrategy', 'summary': "MicroStrategy Incorporated is an American company that provides business intelligence (BI), mobile software, and cloud-based services. Founded in 1989 by Michael J. Saylor, Sanju Bansal, and Thomas Spahr, the firm develops software to analyze internal and external data in order to make business decisions and to develop mobile apps. It is a public company headquartered in Tysons Corner, Virginia, in the Washington metropolitan area. Its primary business analytics competitors include SAP AG Business Objects, IBM Cognos, and Oracle Corporation's BI Platform. Saylor is the Executive Chairman and, from 1989 to 2022, was the CEO.\n\n"})
  ('Michael J. Saylor', {'url': 'https://en.wikipedia.org/wiki/Michael_J._Saylor', 'summary': "Michael J. Saylor (born February 4, 1965) is an American entrepreneur and business executive. He is the executive chairman and co-founder of MicroStrategy, a company tha

In [43]:
def save_network_html(kb, filename="network.html"):
    # create network
    net = Network(directed=True, width="700px", height="700px", bgcolor="#eeeeee")

    # nodes
    color_entity = "#00FF00"
    for e in kb.entities:
        net.add_node(e, shape="circle", color=color_entity)

    # edges
    for r in kb.relations:
        net.add_edge(r["head"], r["tail"],
                    title=r["type"], label=r["type"])
        
    # save network
    net.repulsion(
        node_distance=200,
        central_gravity=0.2,
        spring_length=200,
        spring_strength=0.05,
        damping=0.09
    )
    net.set_edge_smooth('dynamic')
    net.write_html(filename)

filename = "url_kb.html"
save_network_html(kb, filename=filename)

## From multiple articles

In [44]:
#Google News
def get_news_links(query, lang="en", region="US", pages=1, max_links=100000):
    googlenews = GoogleNews(lang=lang, region=region)
    googlenews.search(query)
    all_urls = []
    for page in range(pages):
        googlenews.get_page(page)
        all_urls += googlenews.get_links()
    return list(set(all_urls))[:max_links]

def from_urls_to_kb(urls, verbose=False):
    kb = knowledgebase()
    if verbose:
        print(f"{len(urls)} links to visit")
    for url in urls:
        if verbose:
            print(f"Visiting {url}")
        try:
            kb_url = from_url_to_kb(url)
            kb.merge_with_kb(kb_url)
        except ArticleException:
            if verbose:
                print(f"  Couldn't download article at url {url}")
    return kb

In [48]:
news_links = get_news_links("Sachin Tendulkar", pages=5, max_links=20)
kb = from_urls_to_kb(news_links, verbose=True)

20 links to visit
Visiting https://www.indiatvnews.com/web-stories/sports/top-10-oldest-players-to-score-odi-hundred-mohammad-nabi-surpasses-sachin-tendulkar-gavaskar-gayle-sl-vs-afg-nabi-century-afghanistan-odi-records-2024-02-11-916240&ved=2ahUKEwierYLq0KKEAxVHmmoFHcZ8AmUQxfQBegQIAhAC&usg=AOvVaw0UdGrPF3lyS9qiO5OQ2Sij...
  Couldn't download article at url https://www.indiatvnews.com/web-stories/sports/top-10-oldest-players-to-score-odi-hundred-mohammad-nabi-surpasses-sachin-tendulkar-gavaskar-gayle-sl-vs-afg-nabi-century-afghanistan-odi-records-2024-02-11-916240&ved=2ahUKEwierYLq0KKEAxVHmmoFHcZ8AmUQxfQBegQIAhAC&usg=AOvVaw0UdGrPF3lyS9qiO5OQ2Sij
Visiting https://www.mensxp.com/ampstories/sports/cricket/154230-under-19-world-cup-sachin-dhas-named-after-sachin-tendulkar-fan-of-virat-kohli.html&ved=2ahUKEwjBrPDr0KKEAxXglGoFHRTWAG04HhDF9AF6BAgBEAI&usg=AOvVaw09WAmMyVS05FqGVRU3AS4Q...
  Couldn't download article at url https://www.mensxp.com/ampstories/sports/cricket/154230-under-19-world-cup



  lis = BeautifulSoup(html).find_all('li')


Visiting https://www.telegraphindia.com/sports/cricket/solid-in-crisis-19-year-old-sachin-dhas-of-beed-maharashtras-bid-to-emulate-sachin-tendulkar/cid/1999029&ved=2ahUKEwjBrPDr0KKEAxXglGoFHRTWAG04HhDF9AF6BAgEEAI&usg=AOvVaw2nU5XVSXGxuM07fAZnjiOj...
  Couldn't download article at url https://www.telegraphindia.com/sports/cricket/solid-in-crisis-19-year-old-sachin-dhas-of-beed-maharashtras-bid-to-emulate-sachin-tendulkar/cid/1999029&ved=2ahUKEwjBrPDr0KKEAxXglGoFHRTWAG04HhDF9AF6BAgEEAI&usg=AOvVaw2nU5XVSXGxuM07fAZnjiOj
Visiting https://www.timesnownews.com/web-stories/sports/4-indians-to-win-player-of-the-tournament-award-in-ipl/photostory/107578073.cms&ved=2ahUKEwi2v_Dq0KKEAxWplmoFHam0Ago4ChDF9AF6BAgBEAI&usg=AOvVaw3axo8fWzf-uajPo2eoq_S_...
  Couldn't download article at url https://www.timesnownews.com/web-stories/sports/4-indians-to-win-player-of-the-tournament-award-in-ipl/photostory/107578073.cms&ved=2ahUKEwi2v_Dq0KKEAxWplmoFHam0Ago4ChDF9AF6BAgBEAI&usg=AOvVaw3axo8fWzf-uajPo2eoq_S_
Visi

In [49]:
kb.print()

Entities:
  ('World Heritage Site', {'url': 'https://en.wikipedia.org/wiki/World_Heritage_Site', 'summary': 'A World Heritage Site is a landmark or area with legal protection by an international convention administered by the UNESCO. World Heritage Sites are designated by UNESCO for having cultural, historical, scientific or other forms of significance. The sites are judged to contain "cultural and natural heritage around the world considered to be of outstanding value to humanity".To be selected, a World Heritage Site is nominated by their host country and determined by the international committee to be a unique landmark which is geographically and historically identifiable and having a special cultural or physical significance. For example, World Heritage Sites might be ancient ruins or historical structures, buildings, cities, deserts, forests, islands, lakes, monuments, mountains, or wilderness areas.A World Heritage Site may signify a remarkable accomplishment of humanity, and ser

In [40]:
import pickle

def save_kb(kb, filename):
    with open(filename, "wb") as f:
        pickle.dump(kb, f)

def load_kb(filename):
    res = None
    with open(filename, "rb") as f:
        res = pickle.load(f)
    return res

In [50]:
filename = "sachin_kb.html"
save_network_html(kb, filename=filename)