In [92]:
import newspaper
from newspaper import Article

from newspaper3k_fixes import get_authors, get_keywords

import math

## Here I'm just experimenting with the text in articles, seeing what can be done with simple algorithms/APIs

In [14]:
# A variety of articles with similar/different viewpoints and topics.
# Every URL below covers the June 2020 Supreme Court DACA decision; the
# comment above each entry names the outlet and summarises the angle taken.
urls = [
    # (CNN) Breaking news report on the Supreme Court's DACA decision
    'https://www.cnn.com/2020/06/18/politics/daca-immigration-supreme-court/index.html',

    # (Fox News) Report on Biden's statement on the decision, which he strongly celebrated
    'https://www.foxnews.com/politics/biden-calls-supreme-court-daca-ruling-a-victory-vows-to-make-program-permanent',

    # (Fox News) Fox News columnist calling the decision a 'travesty'
    'https://www.foxnews.com/media/guy-benson-supreme-court-daca-decision-travesty',

    # (The Heritage Foundation) From a conservative foundation, arguing that DACA is unconstitutional
    'https://www.heritage.org/courts/commentary/its-time-end-daca-its-unconstitutional-unless-approved-congress',

    # (NBC News) Report on Trump's reaction, described as 'lashing out' after it doesn't 'go his way'
    'https://www.nbcnews.com/politics/donald-trump/trump-lashes-out-supreme-court-after-daca-ruling-doesn-t-n1231438',
]

# Build one Article per URL, then fetch and parse each of them up front so
# that the cells below can work with the title/text without re-downloading.
articles = [Article(url) for url in urls]
for article in articles:
    article.download()
    article.parse()

# Index : title listing, used to pick articles by position later on.
print("Titles:\n")
for i, article in enumerate(articles):
    print(f"{i} : {article.title}")

Titles:

0 : Supreme Court blocks Trump from ending DACA
1 : Biden calls Supreme Court DACA ruling a ‘victory,’ vows to make program ‘permanent’
2 : Guy Benson rips Supreme Court DACA decision as a 'travesty that could set a bad precedent'
3 : It’s Time to End DACA – It’s Unconstitutional Unless Approved by Congress
4 : Trump lashes out at Supreme Court after DACA ruling doesn't go his way


In [17]:
# Keyword lookup, keyed by the Article object itself.
keywords = {article: get_keywords(article) for article in articles}

In [26]:
# Split title on keywords:
# every keyword becomes its own chunk, and the non-keyword words between two
# keywords are glued back together into a single chunk.
for article in articles:
    split_title = []
    acc = []  # run of consecutive non-keyword words seen so far

    for word in article.title.lower().split(" "):
        if word not in keywords[article]['text']:
            acc.append(word)
            continue

        # Keyword hit: emit the pending run (if any) before the keyword itself.
        if acc:
            split_title.append(" ".join(acc))
            acc = []
        split_title.append(word)

    # Whatever is still pending after the last keyword closes the title.
    if acc:
        split_title.append(" ".join(acc))

    print(split_title)

['supreme', 'court', 'blocks', 'trump', 'from ending', 'daca']
['biden calls', 'supreme', 'court', 'daca', 'ruling', 'a ‘victory,’ vows to make', 'program', '‘permanent’']
['guy', 'benson', 'rips supreme court', 'daca', 'decision', "as a 'travesty that could set a bad precedent'"]
['it’s time to end', 'daca', '– it’s unconstitutional unless approved by', 'congress']
['trump', 'lashes out at', 'supreme', 'court', 'after', 'daca', 'ruling', "doesn't go his way"]


### Title stuff and proper nouns

In [199]:
# Finding proper nouns
article = articles[1]
title_words = article.title.split(" ")
print(f"words in the title: {title_words}\n")

# A plain "begins with a capital letter" check is too greedy: it also
# picks up the first word of a headline/sentence, proper noun or not.
uppers = [w for w in title_words if w[0].isupper()]
print(f"start w/ uppercase in title: {uppers}\n")

# So maybe look in the body and see if the first word in the headline
# is usually capitalized or not. Exclude words at the start of sentences.
#
# word_counts maps each body word to
#   {'total': times seen, 'starts_with_cap': times seen capitalised}
# and only counts words that are NOT the first word of a '.'-delimited chunk.
word_counts = {}
for sentence in article.text.split('.'):
    # Drop *every* empty token. Splitting on a single space yields '' for the
    # space that follows a period and for any doubled space; removing only
    # the first one left '' tokens behind, which crashed on word[0] and
    # shifted which token was skipped as the sentence start.
    tokens = [
        token
        for token in sentence.strip('\n').replace(',', '').split(' ')
        if token
    ]

    # tokens[0] starts the sentence, so its capital letter tells us nothing.
    for word in tokens[1:]:
        counts = word_counts.setdefault(word, {'total': 0, 'starts_with_cap': 0})
        counts['total'] += 1
        if word[0].isupper():
            counts['starts_with_cap'] += 1

# A word is treated as proper when it is capitalised strictly more than
# ceil(total / 2) times. Note that a word seen only once can therefore
# never qualify (1 > 1 is false).
propers = [
    word
    for word, counts in word_counts.items()
    if counts['starts_with_cap'] > math.ceil(counts['total'] / 2)
]

print(f"usually start w/ uppercase in body: {propers}\n")

# Combining multiple proper nouns in a row
# There are cases where this totally fails
# (every adjacent proper noun is merged, related or not).
split_title = []
combo = []  # current run of consecutive proper nouns
for word in title_words:
    if word in propers:
        combo.append(word)
        continue

    # A non-proper word ends the current run: emit the run, then the word.
    if combo:
        split_title.append(" ".join(combo))
        combo = []
    split_title.append(word)

# Flush a run that reaches the end of the title. The old in-loop check
# `i == len(title_words)` could never be true (enumerate stops at len - 1),
# so titles ending in a proper noun lost their final chunk.
if combo:
    split_title.append(" ".join(combo))
    combo = []

print(f"naive combining propers: {split_title}\n")

# Partial solution:
# only combine propers which are almost always seen in that order together
# e.g. 'Supreme Court'
#
# combos maps a tuple of consecutive proper nouns found in the article body
# to the number of times that exact run was encountered.
combos = {}

def remove_punct(s):
    """Strip the punctuation that stops a body token matching a bare word.

    Removes every comma and full stop, drops a trailing possessive ('s) and
    then removes any apostrophes / single quotes that are left. Typographic
    single quotes (‘ and ’) are treated exactly like the ASCII apostrophe.

    The possessive is only stripped as a suffix. Deleting the two characters
    apostrophe + s anywhere in the token also ate the first letter of a
    quoted word beginning with "s" (the token 'set became et).

    Args:
        s: a single token, typically one space-delimited word.

    Returns:
        The token with the punctuation above removed (possibly empty).
    """
    # Fold curly single quotes into "'" so the steps below cover both forms.
    s = s.replace('‘', "'").replace('’', "'")

    # Order matters: commas/periods first so a possessive followed by
    # punctuation ("Trump's,") still ends in 's; closing quotes next so a
    # quoted possessive does too; blanket apostrophe removal goes last.
    s = s.replace(',', '').replace('.', '')
    s = s.rstrip("'")
    if s.endswith("'s"):
        s = s[:-2]
    return s.replace("'", '')

# Walk the body and, for every proper noun, collect the proper nouns that
# immediately follow it; each run of two or more is tallied in combos.
body_words = article.text.split(" ")
for i, word in enumerate(body_words):
    word = remove_punct(word)
    combo = []
    if word not in propers:
        continue

    for next_word in body_words[i+1:]:
        next_word = remove_punct(next_word)
        if next_word not in propers:
            # The run ends at the first non-proper word.
            break
        if word not in combo:
            # First follower found: the run starts with the current word.
            combo = [word]
        combo.append(next_word)

    if combo:
        combo = tuple(combo)
        combos[combo] = combos.get(combo, 0) + 1

# This part is a work in progress
# The math.floor(.../5) is totally arbitrary, and should be changed
#
# A combo is kept when, for at least one of its words, the combo was seen
# more often than a fifth (rounded down) of that word's total count;
# otherwise it is scheduled for removal.
to_remove = [
    combo
    for combo, cnt in combos.items()
    if all(cnt <= math.floor(word_counts[word]['total'] / 5) for word in combo)
]

for combo in to_remove:
    combos.pop(combo)

# From here on only the surviving runs matter, not their counts.
combos = list(combos)

print(f"commonly paired propers: {combos}\n")

# Combining multiple proper nouns in a row (take 2)
# Only using the combos in combos
split_title = []
pieces = []  # each piece is a list of (title index, word) pairs

for i, word in enumerate(title_words):
    if word not in propers:
        # Ordinary words always stand alone.
        pieces.append([(i, word)])
        continue

    if any((i, word) in p for p in pieces):
        # Already swallowed by a combo that started at an earlier index.
        continue

    # Gather the full run of consecutive proper nouns starting here.
    piece = [(i, word)]
    for j, next_word in enumerate(title_words[i+1:]):
        if next_word not in propers:
            break
        piece.append((i + 1 + j, next_word))

    # Trim from the right until the run is a known combo; if none of its
    # prefixes is, this leaves just the first word on its own.
    while len(piece) > 1 and tuple(w for _, w in piece) not in combos:
        del piece[-1]
    pieces.append(piece)

# Put the pieces back together
split_title2 = [" ".join(w for _, w in p) for p in pieces]
print(f"smarter combining propers: {split_title2}\n")

# Next: combine non-proper nouns?

words in the title: ['Biden', 'calls', 'Supreme', 'Court', 'DACA', 'ruling', 'a', '‘victory,’', 'vows', 'to', 'make', 'program', '‘permanent’']

start w/ uppercase in title: ['Biden', 'Supreme', 'Court', 'DACA']

usually start w/ uppercase in body: ['President', 'Biden', 'Supreme', 'Court', 'Trump', 'DACA', 'COURT', 'I']

naive combining propers: ['Biden', 'calls', 'Supreme Court DACA', 'ruling', 'a', '‘victory,’', 'vows', 'to', 'make', 'program', '‘permanent’']

commonly paired propers: [('Supreme', 'Court'), ('President', 'Trump')]

smarter combining propers: ['Biden', 'calls', 'Supreme Court', 'DACA', 'ruling', 'a', '‘victory,’', 'vows', 'to', 'make', 'program', '‘permanent’']



### NLTK (Natural Language Toolkit) Library

In [200]:
import nltk
from nltk.corpus import wordnet as wn
# Fetch the 'wordnet' corpus package used through `wn` in the cells below.
# The captured log shows the call reporting "already up-to-date" when the
# package is present locally.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/blake/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [42]:
# Identifying type of words
# Only the first synset WordNet returns for each word is consulted, so the
# tag printed is that synset's part of speech (the sample output reports
# 'amazing' and 'interesting' as verbs).
words = ['amazing', 'interesting', 'great', 'blocks']

for w in words:
    first_synset = wn.synsets(w)[0]
    print(f"{w}:{first_synset.pos()}")

amazing:v
interesting:v
great:n
blocks:n
