In [1]:
from newspaper import Article
import IPython
from IPython.display import clear_output
clear_output()

In [2]:
# needed to load the REBEL model
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import math
import torch

# wrapper for wikipedia API
import wikipedia

# scraping of web articles
from newspaper import Article, ArticleException

# google news scraping
from GoogleNews import GoogleNews

# graph visualization
from pyvis.network import Network

# show HTML in notebook
import IPython

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")

tokenizer_config.json: 100%|██████████| 1.23k/1.23k [00:00<?, ?B/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 990kB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 753kB/s]
tokenizer.json: 100%|██████████| 1.36M/1.36M [00:01<00:00, 1.32MB/s]
added_tokens.json: 100%|██████████| 123/123 [00:00<?, ?B/s] 
special_tokens_map.json: 100%|██████████| 344/344 [00:00<?, ?B/s] 
config.json: 100%|██████████| 1.42k/1.42k [00:00<00:00, 258kB/s]
model.safetensors: 100%|██████████| 1.63G/1.63G [00:54<00:00, 30.1MB/s]


In [4]:
# from https://huggingface.co/Babelscape/rebel-large
def extract_relations_from_model_output(text):
    relations = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    text_replaced = text.replace("<s>", "").replace("<pad>", "").replace("</s>", "")
    for token in text_replaced.split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                relations.append({
                    'head': subject.strip(),
                    'type': relation.strip(),
                    'tail': object_.strip()
                })
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                relations.append({
                    'head': subject.strip(),
                    'type': relation.strip(),
                    'tail': object_.strip()
                })
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        relations.append({
            'head': subject.strip(),
            'type': relation.strip(),
            'tail': object_.strip()
        })
    return relations

In [5]:
# knowledge base class
class KB():
    def __init__(self):
        self.entities = {} # { entity_title: {...} }
        self.relations = [] # [ head: entity_title, type: ..., tail: entity_title,
          # meta: { article_url: { spans: [...] } } ]
        self.sources = {} # { article_url: {...} }

    def merge_relations(self, r2):
        r1 = [r for r in self.relations
              if self.are_relations_equal(r2, r)][0]

        # if different article
        article_url = list(r2["meta"].keys())[0]
        if article_url not in r1["meta"]:
            r1["meta"][article_url] = r2["meta"][article_url]

        # if existing article
        else:
            spans_to_add = [span for span in r2["meta"][article_url]["spans"]
                            if span not in r1["meta"][article_url]["spans"]]
            r1["meta"][article_url]["spans"] += spans_to_add

    def get_wikipedia_data(self, candidate_entity):
        try:
            page = wikipedia.page(candidate_entity, auto_suggest=False)
            entity_data = {
                "title": page.title,
                "url": page.url,
                "summary": page.summary
            }
            return entity_data
        except:
            return None

    def add_entity(self, e):
        self.entities[e["title"]] = {k:v for k,v in e.items() if k != "title"}


    def are_relations_equal(self, r1, r2):
        return all(r1[attr] == r2[attr] for attr in ["head", "type", "tail"])

    def exists_relation(self, r1):
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)


    def merge_with_kb(self, kb2):
        for r in kb2.relations:
            article_url = list(r["meta"].keys())[0]
            source_data = kb2.sources[article_url]
            self.add_relation(r, source_data["article_title"],
                              source_data["article_publish_date"])

    def add_relation(self, r, article_title, article_publish_date):
        # check on wikipedia
        candidate_entities = [r["head"], r["tail"]]
        entities = [self.get_wikipedia_data(ent) for ent in candidate_entities]

        # if one entity does not exist, stop
        if any(ent is None for ent in entities):
            return

        # manage new entities
        for e in entities:
            self.add_entity(e)

        # rename relation entities with their wikipedia titles
        r["head"] = entities[0]["title"]
        r["tail"] = entities[1]["title"]

        # add source if not in kb
        article_url = list(r["meta"].keys())[0]
        if article_url not in self.sources:
            self.sources[article_url] = {
                "article_title": article_title,
                "article_publish_date": article_publish_date
            }

        # manage new relation
        if not self.exists_relation(r):
            self.relations.append(r)
        else:
            self.merge_relations(r)

    def print(self):
        print("Entities:")
        for e in self.entities.items():
            print(f"  {e}")
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")
        print("Sources:")
        for s in self.sources.items():
            print(f"  {s}")

In [6]:
# extract relations for each span and put them together in a knowledge base
def from_text_to_kb(text, article_url, span_length=128, article_title=None,
                    article_publish_date=None, verbose=False):
    # tokenize whole text
    inputs = tokenizer([text], return_tensors="pt")

    # compute span boundaries
    num_tokens = len(inputs["input_ids"][0])
    if verbose:
        print(f"Input has {num_tokens} tokens")
    num_spans = math.ceil(num_tokens / span_length)
    if verbose:
        print(f"Input has {num_spans} spans")
    overlap = math.ceil((num_spans * span_length - num_tokens) /
                        max(num_spans - 1, 1))
    spans_boundaries = []
    start = 0
    for i in range(num_spans):
        spans_boundaries.append([start + span_length * i,
                                 start + span_length * (i + 1)])
        start -= overlap
    if verbose:
        print(f"Span boundaries are {spans_boundaries}")

    # transform input with spans
    tensor_ids = [inputs["input_ids"][0][boundary[0]:boundary[1]]
                  for boundary in spans_boundaries]
    tensor_masks = [inputs["attention_mask"][0][boundary[0]:boundary[1]]
                    for boundary in spans_boundaries]
    inputs = {
        "input_ids": torch.stack(tensor_ids),
        "attention_mask": torch.stack(tensor_masks)
    }

    # generate relations
    num_return_sequences = 3
    gen_kwargs = {
        "max_length": 256,
        "length_penalty": 0,
        "num_beams": 3,
        "num_return_sequences": num_return_sequences
    }
    generated_tokens = model.generate(
        **inputs,
        **gen_kwargs,
    )

    # decode relations
    decoded_preds = tokenizer.batch_decode(generated_tokens,
                                           skip_special_tokens=False)

    # create kb
    kb = KB()
    i = 0
    for sentence_pred in decoded_preds:
        current_span_index = i // num_return_sequences
        relations = extract_relations_from_model_output(sentence_pred)
        for relation in relations:
            relation["meta"] = {
                article_url: {
                    "spans": [spans_boundaries[current_span_index]]
                }
            }
            kb.add_relation(relation, article_title, article_publish_date)
        i += 1

    return kb

In [7]:
# parse an article with newspaper3k
def get_article(url):
    article = Article(url)
    article.download()
    article.parse()
    return article

# extract the article from the url (along with metadata), extract relations and populate a KB
def from_url_to_kb(url):
    article = get_article(url)
    config = {
        "article_title": article.title,
        "article_publish_date": article.publish_date
    }
    kb = from_text_to_kb(article.text, article.url, **config)
    return kb

In [8]:
# get news links from google news
def get_news_links(query, lang="en", region="US", pages=1, max_links=100000):
    googlenews = GoogleNews(lang=lang, region=region)
    googlenews.search(query)
    all_urls = []
    for page in range(pages):
        googlenews.get_page(page)
        all_urls += googlenews.get_links()
    return list(set(all_urls))[:max_links]

# build a KB from multiple news links
def from_urls_to_kb(urls, verbose=False):
    kb = KB()
    if verbose:
        print(f"{len(urls)} links to visit")
    for url in urls:
        if verbose:
            print(f"Visiting {url}...")
        try:
            kb_url = from_url_to_kb(url)
            kb.merge_with_kb(kb_url)
        except ArticleException:
            if verbose:
                print(f"  Couldn't download article at url {url}")
    return kb

In [9]:
# test the `from_urls_to_kb` function
news_links = get_news_links("Google", pages=1, max_links=3)
kb = from_urls_to_kb(news_links, verbose=True)
kb.print()

3 links to visit
Visiting https://www.investors.com/market-trend/stock-market-today/dow-jones-futures-stocks-sell-off-google-tesla-give-up-buy-points-micron-earnings-strong/&ved=2ahUKEwiThZPHm6GDAxWBbmwGHbzcDP0QxfQBegQIABAC&usg=AOvVaw2akZ6zRIARS6Y7w-erL9Fu...
  Couldn't download article at url https://www.investors.com/market-trend/stock-market-today/dow-jones-futures-stocks-sell-off-google-tesla-give-up-buy-points-micron-earnings-strong/&ved=2ahUKEwiThZPHm6GDAxWBbmwGHbzcDP0QxfQBegQIABAC&usg=AOvVaw2akZ6zRIARS6Y7w-erL9Fu
Visiting https://9to5google.com/2023/12/21/android-find-my-device-unwanted-tracker-spec/&ved=2ahUKEwjL89PEm6GDAxXjSmwGHZBQBSUQxfQBegQIARAC&usg=AOvVaw1KWRZoSy1YJ0FQy-GpFNNQ...
  Couldn't download article at url https://9to5google.com/2023/12/21/android-find-my-device-unwanted-tracker-spec/&ved=2ahUKEwjL89PEm6GDAxXjSmwGHZBQBSUQxfQBegQIARAC&usg=AOvVaw1KWRZoSy1YJ0FQy-GpFNNQ
Visiting https://www.techradar.com/computing/chrome/dont-fall-prey-to-this-worrying-google-chrome-exp



  lis = BeautifulSoup(html).find_all('li')


Entities:
  ('Google Chrome', {'url': 'https://en.wikipedia.org/wiki/Google_Chrome', 'summary': 'Google Chrome is a cross-platform web browser developed by Google. It was first released in 2008 for Microsoft Windows, built with free software components from Apple WebKit and Mozilla Firefox. Versions were later released for Linux, macOS, iOS, and also for Android, where it is the default browser. The browser is also the main component of ChromeOS, where it serves as the platform for web applications.\nMost of Chrome\'s source code comes from Google\'s free and open-source software project Chromium, but Chrome is licensed as proprietary freeware. WebKit was the original rendering engine, but Google eventually forked it to create the Blink engine; all Chrome variants except iOS used Blink as of 2017.As of October 2022, StatCounter estimates that Chrome has a 67% worldwide browser market share (after peaking at 72.38% in November 2018) on personal computers (PC), is most used on tablets (h

In [10]:
# test the `from_url_to_kb` function
url = "https://finance.yahoo.com/news/microstrategy-bitcoin-millions-142143795.html"
kb = from_url_to_kb(url)
kb.print()

Entities:
  ('MicroStrategy', {'url': 'https://en.wikipedia.org/wiki/MicroStrategy', 'summary': "MicroStrategy Incorporated is an American company that provides business intelligence (BI), mobile software, and cloud-based services. Founded in 1989 by Michael J. Saylor, Sanju Bansal, and Thomas Spahr, the firm develops software to analyze internal and external data in order to make business decisions and to develop mobile apps. It is a public company headquartered in Tysons Corner, Virginia, in the Washington metropolitan area. Its primary business analytics competitors include SAP AG Business Objects, IBM Cognos, and Oracle Corporation's BI Platform. Saylor is the Executive Chairman and, from 1989 to 2022, was the CEO."})
  ('Michael J. Saylor', {'url': 'https://en.wikipedia.org/wiki/Michael_J._Saylor', 'summary': "Michael J. Saylor (born February 4, 1965) is an American entrepreneur and business executive. He is the executive chairman and co-founder of MicroStrategy, a company that pr

In [11]:
# from KB to HTML visualization
def save_network_html(kb, filename="network.html"):
    # create network
    net = Network(directed=True, width="auto", height="700px", bgcolor="#eeeeee")

    # nodes
    color_entity = "#00FF00"
    for e in kb.entities:
        net.add_node(e, shape="circle", color=color_entity)

    # edges
    for r in kb.relations:
        net.add_edge(r["head"], r["tail"],
                    title=r["type"], label=r["type"])

    # save network
    net.repulsion(
        node_distance=200,
        central_gravity=0.2,
        spring_length=200,
        spring_strength=0.05,
        damping=0.09
    )
    net.set_edge_smooth('dynamic')
    net.show(filename)

In [12]:
# extract KB from news about Google and visualize it
news_links = get_news_links("Google", pages=5, max_links=20)
kb = from_urls_to_kb(news_links, verbose=True)
filename = "network_3_google.html"
save_network_html(kb, filename=filename)

20 links to visit
Visiting https://www.cnet.com/tech/mobile/pixel-fold-vs-oneplus-open-a-very-expensive-camera-phone-comparison/&ved=2ahUKEwj4z8nhnKGDAxWjZ2wGHfAxBjM4ChDF9AF6BAgCEAI&usg=AOvVaw3bxxe0hCMACD1ghZwtqKwI...
  Couldn't download article at url https://www.cnet.com/tech/mobile/pixel-fold-vs-oneplus-open-a-very-expensive-camera-phone-comparison/&ved=2ahUKEwj4z8nhnKGDAxWjZ2wGHfAxBjM4ChDF9AF6BAgCEAI&usg=AOvVaw3bxxe0hCMACD1ghZwtqKwI
Visiting https://arstechnica.com/gadgets/2023/12/report-google-ads-restructure-could-replace-some-sales-jobs-with-ai/&ved=2ahUKEwjf6ezfnKGDAxX0e2wGHUVtDEMQxfQBegQICRAC&usg=AOvVaw01lpYpHuqyINdemd7F50Jp...
Visiting https://www.independent.co.uk/business/google-rolls-out-upgrade-to-ai-chatbot-bard-across-uk-b2466270.html&ved=2ahUKEwjf6ezfnKGDAxX0e2wGHUVtDEMQxfQBegQIBhAC&usg=AOvVaw2Gsb2JYbq04K_Wrzj8Nyho...
Visiting https://crypto.news/malicious-script-in-google-and-x-stole-58m-in-crypto-from-over-63000-users/&ved=2ahUKEwjf6ezfnKGDAxX0e2wGHUVtDEMQxfQBegQIARA

AttributeError: 'NoneType' object has no attribute 'render'