In [1]:
# A short demonstration of some useful features of newspaper3k
import newspaper
from newspaper import Article

import time

In [2]:
# Sample articles used throughout this demo, keyed by a short display label.
# The first three labels name the publisher; the last two name the quirk that
# page is meant to exercise instead.
urls = {
    "CNN": "https://www.cnn.com/2020/06/18/politics/daca-immigration-supreme-court/index.html",
    "Fox News": "https://www.foxnews.com/politics/biden-calls-supreme-court-daca-ruling-a-victory-vows-to-make-program-permanent",
    "NBC News": "https://www.nbcnews.com/politics/donald-trump/trump-lashes-out-supreme-court-after-daca-ruling-doesn-t-n1231438",
    "no author listed": "https://www.aljazeera.com/news/2020/06/supreme-court-rules-trump-capricious-daca-case-200618135613615.html",
    "poor formatting": "https://www.natlawreview.com/article/daca-back-now-supreme-court-reverses-trump-administration-s-refusal-to-honor",
}

## Loading and processing articles

In [3]:
# Wrap every URL in an Article object, keyed by the same display label.
articles = {k: Article(v) for k, v in urls.items()}
print(f"newspaper3k articles have type {type(articles['CNN'])}")

# Time the fetch of all pages.
start = time.time()
for article in articles.values():
    # Call download() on the instance, the same way parse() and nlp() are invoked.
    article.download()
end = time.time()
print(f"It took {end - start} seconds to download {len(articles)} articles.")

# NOTE(review): newspaper3k appears to record a failed fetch on the article
# (download_exception_msg) rather than raising -- confirm. Report any such
# failures here so a dead URL is noticed before the parsing step.
failed = {
    name: fetched.download_exception_msg
    for name, fetched in articles.items()
    if fetched.download_exception_msg
}
for name, reason in failed.items():
    print(f"WARNING: download failed for {name}: {reason}")

newspaper3k articles have type <class 'newspaper.article.Article'>
It took 3.504413366317749 seconds to download 5 articles.


In [4]:
# Parsing step: Article.parse() processes each article, chiefly to locate and
# characterize its meta-data.
# Fields that become useful once parsing is done:
#   title, authors, tags, and every field prefixed with 'meta_'
#   (the description, for instance, is exposed as meta_description)

parse_started = time.time()
for article in articles.values():
    article.parse()
parse_elapsed = time.time() - parse_started
print(f"It took {parse_elapsed} seconds to parse {len(articles)} articles.")

It took 0.5623083114624023 seconds to parse 5 articles.


In [5]:
# NLP step: Article.nlp() runs natural-language processing on each article.
# Two more fields become available afterwards:
#   keywords, summary

nlp_started = time.time()
for article in articles.values():
    article.nlp()
nlp_elapsed = time.time() - nlp_started
print(f"It took {nlp_elapsed} seconds to perform NLP on {len(articles)} articles.")

It took 0.6322817802429199 seconds to perform NLP on 5 articles.


## Meta-Data

In [6]:
# Print each meta-data entry collected from the HTML of one page (the CNN article),
# one "name : value" pair per entry, separated by blank lines.
cnn_article = articles['CNN']
for field, value in cnn_article.meta_data.items():
    print(f"{field} : {value}\n")

viewport : width=device-width, initial-scale=1.0, minimum-scale=1.0

section : politics

referrer : unsafe-url

og : {'pubdate': '2020-06-18T14:07:23Z', 'url': 'https://www.cnn.com/2020/06/18/politics/daca-immigration-supreme-court/index.html', 'title': 'Supreme Court blocks Trump from ending DACA', 'description': "The Supreme Court on Thursday blocked the Trump administration's attempt to end Deferred Action for Childhood Arrivals, an Obama-era program that protects hundreds of thousands of immigrants brought to the US as children from deportation.", 'site_name': 'CNN', 'type': 'article', 'image': {'identifier': 'https://cdn.cnn.com/cnnnext/dam/assets/191112122425-02-daca-scotus-rally-1112-super-tease.jpg', 'width': 1100, 'height': 619}}

pubdate : 2020-06-18T14:07:23Z

lastmod : 2020-06-18T22:52:13Z

author : Ariane de Vogue, Devan Cole and Jamie Ehrlich, CNN

article : {'author': 'https://www.facebook.com/Ariane-de-Vogue-1005310649532336', 'opinion': 'false', 'content-tier': 'free'}

### Authors

In [7]:
# Authors identified for each article, printed as "label : [names]"
for label, article in articles.items():
    print(f"{label} : {article.authors}\n")

CNN : ['Ariane De Vogue', 'Devan Cole', 'Jamie Ehrlich']

Fox News : ['Paul Steinhauser', 'Paul Steinhauser Is A Politics Reporter Based In New Hampshire.']

NBC News : ['Allan Smith', 'Allan Smith Is A Political Reporter For Nbc News.']

no author listed : []

poor formatting : ['Article By']



#### Issues:
* Sometimes an author's bio is found instead of their name
* Nonsense strings are sometimes fetched from poorly-formatted articles

### Keywords

In [8]:
# Keywords
# newspaper3k takes the most-used words from the article body AND from the title,
# excluding a custom list of stopwords.
# NOTE(review): the original note read ">= 10 from each"; that figure looks more
# like an upper bound per source than a minimum -- confirm against the library.
for label, article in articles.items():
    print(f"{label} : {article.keywords}\n")

CNN : ['court', 'blocks', 'decision', 'ending', 'program', 'dreamers', 'congress', 'daca', 'roberts', 'supreme', 'administration', 'trump']

Fox News : ['work', 'calls', 'court', 'president', 'young', 'trump', 'program', 'justices', 'daca', 'ruling', 'victory', 'supreme', 'biden', 'vows', 'permanent']

NBC News : ['lashes', 'court', 'doesnt', 'president', 'decision', 'way', 'united', 'daca', 'ruling', 'tweeted', 'supreme', 'trump', 'legal']

no author listed : ['capricious', 'court', 'decision', 'immigration', 'trumps', 'rules', 'voters', 'case', 'dreamers', 'daca', 'end', 'supreme', 'trump']

poor formatting : ['undocumented', 'federal', 'opinion', 'immigrants', 'tcpa', 'makes', 'honor', 'waste', 'daca', 'supreme', 'deferred', 'refusal', 'reverses', 'various', 'v', 'univ', 'work', 'entered', 'trump']



#### Issues:
* A bug causes the keywords NOT to be sorted by frequency, but we can fix that. I've been looking into it: the pipeline is a bit of an unmaintained mess, but there are easy ways to hack around it.

### Summaries

In [11]:
# Summaries
# newspaper3k builds a summary by scoring sentences and keeping up to 5 of them.
# A sentence is favored when it:
# * shares many words with the title
# * sits close to the beginning of the article
# * contains approximately 20 words
# * contains many keywords
# * contains keywords in close proximity to each other
for label, article in articles.items():
    print(f"{label} : {article.summary}\n")

CNN : The opinion is the second time in a week when the Supreme Court -- bolstered with two of President Donald Trump's nominees -- has ruled against the Trump administration.
Luz Chavez, a DACA recipient based in Maryland, was at the steps of the Supreme Court when the decision came down Thursday.
"We want to be in the Supreme Court on DACA," Trump said.
"I'm happy the Supreme Court upheld DACA to protect Dreamers from the crisis Trump created.
"Today's Supreme Court decision to uphold DACA is a happy end to the cruel uncertainty the Trump administration put these young people through.

Fox News : Former Vice President Joe Biden applauded a Supreme Court ruling Thursday that rejected the push by President Trump and his administration to scrap the Obama-era program that offers legal protections to young immigrants who entered the U.S. illegally as children.
SUPREME COURT RULES AGAINST TRUMP PUSH TO END DACA PROGRAMAnd Biden vowed that if elected to the White House, he would “immediatel

#### Issues:
* Overall, this seems like the most robust and intelligent feature in the library! We can tweak the weights and scoring algorithm if we want to, though.

## Other methods & fields

In [14]:
# List every attribute and method name exposed by one Article object
dir(articles["CNN"])

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'additional_data',
 'article_html',
 'authors',
 'build',
 'build_resource_path',
 'canonical_link',
 'clean_doc',
 'clean_top_node',
 'config',
 'doc',
 'download',
 'download_exception_msg',
 'download_state',
 'extractor',
 'fetch_images',
 'get_parse_candidate',
 'get_resource_path',
 'has_top_image',
 'html',
 'images',
 'imgs',
 'is_media_news',
 'is_parsed',
 'is_valid_body',
 'is_valid_url',
 'keywords',
 'link_hash',
 'meta_data',
 'meta_description',
 'meta_favicon',
 'meta_img',
 'meta_keywords',
 'meta_lang',
 'movies',
 'nlp',
 'parse',
 'publish_date',
 'release_resources',
 'set_article_html',
 'set_authors