In [1]:
import spacy
from spacy import displacy
from collections import Counter # Counter stores elements as dictionary keys, and their counts are stored as dictionary values.

import en_core_web_sm
# Load the en_core_web_sm pipeline once; later cells call `nlp(...)` on raw text.
nlp = en_core_web_sm.load()

In [2]:
from bs4 import BeautifulSoup #used for pulling data out of HTML and XML files
import requests
import re

In [3]:
def url_to_string(url, timeout=10):
    """Download a web page and return its visible text as one string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The page text with ``<script>``, ``<style>`` and ``<aside>`` elements
        removed and every run of newlines/tabs replaced by a single space.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of analysing the text of an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Remove non-content elements so their text never reaches get_text().
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return re.sub(r'[\n\t]+', ' ', soup.get_text())

In [4]:
# Fetch the article text with url_to_string, run it through the `nlp`
# pipeline, and display how many entities the resulting document holds.
covid_news = url_to_string('https://www.bbc.com/news/world-52748894')
article = nlp(covid_news)  # reused by the cells below
len(article.ents)

151

In [5]:
# Tally how often each entity label occurs in the article.
labels = [entity.label_ for entity in article.ents]
Counter(labels)

Counter({'DATE': 24,
         'ORG': 38,
         'GPE': 29,
         'PERSON': 19,
         'LOC': 13,
         'EVENT': 1,
         'CARDINAL': 17,
         'WORK_OF_ART': 1,
         'TIME': 2,
         'MONEY': 1,
         'NORP': 3,
         'ORDINAL': 2,
         'PRODUCT': 1})

There are 151 entities in the article, and they are represented by 13 unique labels.

In [6]:
# Collect the text of every entity, then show the three most common ones.
items = [entity.text for entity in article.ents]
Counter(items).most_common(3)

[('US', 7), ('WHO', 6), ('BBC', 4)]

The three most frequent entities are: US, WHO, BBC

In [7]:
# Materialise the sentence spans so they can be indexed by position.
sentences = list(article.sents)
print(sentences[70])

In charts: Tracking the global outbreak All 50 US states move toward reopening Trump's claims against WHO fact-checked Dr Tedros


In [8]:
# Re-parse the sample sentence on its own and highlight its entities inline.
displacy.render(
    nlp(str(sentences[70])),
    style='ent',
    jupyter=True,
)

In [9]:
# Draw the dependency parse of the sample sentence.
displacy.render(
    nlp(str(sentences[70])),
    jupyter=True,
    style='dep',
    options={'distance': 120},
)

In [10]:
# (text, part of speech, lemma) for every token of the sample sentence that
# is neither a stop word nor punctuation.
[
    (token.orth_, token.pos_, token.lemma_)
    for token in nlp(str(sentences[70]))
    if not token.is_stop and token.pos_ != 'PUNCT'
]

[('charts', 'NOUN', 'chart'),
 ('Tracking', 'VERB', 'track'),
 ('global', 'ADJ', 'global'),
 ('outbreak', 'NOUN', 'outbreak'),
 ('50', 'NUM', '50'),
 ('states', 'NOUN', 'state'),
 ('reopening', 'VERB', 'reopen'),
 ('Trump', 'PROPN', 'Trump'),
 ('claims', 'NOUN', 'claim'),
 ('fact', 'NOUN', 'fact'),
 ('checked', 'VERB', 'check'),
 ('Dr', 'PROPN', 'Dr'),
 ('Tedros', 'PROPN', 'Tedros')]

In [11]:
# Map each entity's text in the sample sentence to its label.
{str(entity): entity.label_ for entity in nlp(str(sentences[70])).ents}

{'50': 'CARDINAL', 'US': 'GPE', 'Trump': 'ORG', 'WHO': 'ORG'}

In [12]:
# Per-token view of the sample sentence: token, its ent_iob_ and ent_type_.
print([
    (token, token.ent_iob_, token.ent_type_)
    for token in sentences[70]
])

[(In, 'O', ''), (charts, 'O', ''), (:, 'O', ''), (Tracking, 'O', ''), (the, 'O', ''), (global, 'O', ''), (outbreak, 'O', ''), (All, 'O', ''), (50, 'B', 'CARDINAL'), (US, 'B', 'GPE'), (states, 'O', ''), (move, 'O', ''), (toward, 'O', ''), (reopening, 'O', ''), (Trump, 'B', 'ORG'), ('s, 'O', ''), (claims, 'O', ''), (against, 'O', ''), (WHO, 'B', 'ORG'), (fact, 'O', ''), (-, 'O', ''), (checked, 'O', ''), (Dr, 'O', ''), (Tedros, 'O', '')]


In [15]:
# Highlight every entity across the whole article.
displacy.render(
    article,
    style='ent',
    jupyter=True,
)