In [13]:
# Here I try to fix some of the problems I've identified with newspaper3k
import newspaper
from newspaper import Article

import re
from os import path
import string

from newspaper3k_fixes import get_authors, get_keywords

In [14]:
# Problem 1: newspaper3k does not extract the authors of this article correctly
# (demonstrated in ./newspaper3k-demo.ipynb): it returns the author's bio line
# as though it were a second author.
url = "https://www.foxnews.com/politics/biden-calls-supreme-court-daca-ruling-a-victory-vows-to-make-program-permanent"
article = Article(url)

# Fetch the page (network access required), then parse it before reading
# `article.authors`. Call the methods on the instance rather than through the class.
article.download()
article.parse()
print(f"Original: {article.authors}\n")

Original: ['Paul Steinhauser', 'Paul Steinhauser Is A Politics Reporter Based In New Hampshire.']



In [15]:
# Problem 1, fixed: get_authors (imported from newspaper3k_fixes) takes the already
# parsed article and no longer reports the author's bio as an author.
print(f"Fix : {get_authors(article)}")

Fix : ['Paul Steinhauser']


In [16]:
# Problem 2: newspaper3k does not sort keywords by frequency (it tries to, but that
# code is out of date and no longer works).
# It also hides the fact that the keywords come from both the headline AND the article
# body: up to 10 are selected from each, but there is no way of telling which keyword
# came from which source.
# Run newspaper3k's NLP step first, then read `article.keywords`.
article.nlp()
print(f"Original: {article.keywords}\n")

Original: ['young', 'supreme', 'daca', 'ruling', 'biden', 'program', 'permanent', 'calls', 'work', 'justices', 'victory', 'vows', 'trump', 'court', 'president']



In [17]:
# Problem 2, fixed: keywords are now sorted in descending order of frequency
# (most frequent -> least frequent) and are reported separately for the article
# body ('text') and the headline ('title').
print(f"Fix : {get_keywords(article)}")

Fix : {'text': ['president', 'court', 'trump', 'supreme', 'daca', 'ruling', 'program', 'justices', 'young', 'work'], 'title': ['vows', 'victory', 'supreme', 'ruling', 'program', 'permanent', 'daca', 'court', 'calls', 'biden']}
