In [92]:
import newspaper
from newspaper import Article

from newspaper3k_fixes import get_authors, get_keywords

import math

## Here I'm just experimenting with the text in articles, seeing what can be done with simple algorithms/APIs

In [14]:
# Sample set: coverage of the June 2020 Supreme Court DACA decision,
# chosen so that some articles agree and others clash in viewpoint.
urls = [
    # (CNN) Breaking news report on the Supreme Court's DACA decision
    'https://www.cnn.com/2020/06/18/politics/daca-immigration-supreme-court/index.html',

    # (Fox News) Report on Biden's statement on the decision, which he strongly celebrated
    'https://www.foxnews.com/politics/biden-calls-supreme-court-daca-ruling-a-victory-vows-to-make-program-permanent',

    # (Fox News) Fox News columnist calling the decision a 'travesty'
    'https://www.foxnews.com/media/guy-benson-supreme-court-daca-decision-travesty',

    # (The Heritage Foundation) From a conservative foundation, arguing that DACA is unconstitutional
    'https://www.heritage.org/courts/commentary/its-time-end-daca-its-unconstitutional-unless-approved-congress',

    # (NBC News) Report on Trump's reaction, described as 'lashing out' after it doesn't 'go his way'
    'https://www.nbcnews.com/politics/donald-trump/trump-lashes-out-supreme-court-after-daca-ruling-doesn-t-n1231438',
]

# One Article per URL; each must be downloaded and parsed before
# attributes such as .title and .text are read below.
articles = [Article(url) for url in urls]
for article in articles:
    article.download()
    article.parse()

print("Titles:\n")
for i, article in enumerate(articles):
    print(f"{i} : {article.title}")

Titles:

0 : Supreme Court blocks Trump from ending DACA
1 : Biden calls Supreme Court DACA ruling a ‘victory,’ vows to make program ‘permanent’
2 : Guy Benson rips Supreme Court DACA decision as a 'travesty that could set a bad precedent'
3 : It’s Time to End DACA – It’s Unconstitutional Unless Approved by Congress
4 : Trump lashes out at Supreme Court after DACA ruling doesn't go his way


In [17]:
# Map every Article object to whatever the local get_keywords helper returns for it.
keywords = {article: get_keywords(article) for article in articles}

In [26]:
# Split each lower-cased title into segments: every keyword is its own segment,
# and each run of non-keyword words between keywords is joined into one segment.
for article in articles:
    split_title = []
    pending = []  # non-keyword words collected since the last keyword

    for word in article.title.lower().split(" "):
        if word not in keywords[article]['text']:
            pending.append(word)
            continue

        # Hit a keyword: emit the words gathered so far, then the keyword itself.
        if pending:
            split_title.append(" ".join(pending))
            pending = []
        split_title.append(word)

    # Trailing non-keyword words after the final keyword.
    if pending:
        split_title.append(" ".join(pending))

    print(split_title)

['supreme', 'court', 'blocks', 'trump', 'from ending', 'daca']
['biden calls', 'supreme', 'court', 'daca', 'ruling', 'a ‘victory,’ vows to make', 'program', '‘permanent’']
['guy', 'benson', 'rips supreme court', 'daca', 'decision', "as a 'travesty that could set a bad precedent'"]
['it’s time to end', 'daca', '– it’s unconstitutional unless approved by', 'congress']
['trump', 'lashes out at', 'supreme', 'court', 'after', 'daca', 'ruling', "doesn't go his way"]


In [338]:
# Debug view of the list itself; as the recorded output shows, each Article
# prints with the default object repr (class name + memory address only).
print(articles)

[<newspaper.article.Article object at 0x7fd62d0f3520>, <newspaper.article.Article object at 0x7fd62d0f3c10>, <newspaper.article.Article object at 0x7fd62d0f3490>, <newspaper.article.Article object at 0x7fd62d0f3f70>, <newspaper.article.Article object at 0x7fd62d0f3970>]


### Title stuff and proper nouns

In [363]:
# Finding proper nouns
article = articles[1]
print(f"url: {article.url}\n")
title_words = article.title.split(" ")
print(f"words in the title: {title_words}\n")

# Just looking for anything that starts with a capital letter will
# also catch the first word in a headline/sentence.
# w[:1] instead of w[0]: a doubled space in the title yields an empty token,
# and indexing it would raise IndexError.
uppers = [w for w in title_words if w[:1].isupper()]
print(f"start w/ uppercase in title: {uppers}\n")

# So maybe look in the body and see if the first word in the headline
# is usually capitalized or not. Exclude words at the start of sentences.
#
# word_counts maps word -> {'total': occurrences outside sentence-initial
# position, 'starts_with_cap': how many of those looked like a capitalised
# (but not ALL-CAPS) word}.
word_counts = {}
for sentence in article.text.split('.'):
    # Drop *every* empty token, not just the first one: list.remove('') only
    # deletes one occurrence, so doubled spaces used to leave '' behind, which
    # either crashed on word[0] below or shifted which word was treated as the
    # sentence-initial one.
    sentence_words = [
        w for w in sentence.strip('\n').replace(',', '').split(' ') if w
    ]

    # Skip index 0: sentence-initial words are capitalised regardless.
    for word in sentence_words[1:]:
        counts = word_counts.setdefault(word, {'total': 0, 'starts_with_cap': 0})
        counts['total'] += 1
        if (word[0].isupper()
                and not all(letter.isupper() for letter in word)
                and "\n" not in word):
            counts['starts_with_cap'] += 1

# A word is treated as a proper noun when it is capitalised in more than
# half (rounded down) of its non-sentence-initial occurrences.
propers = [
    word for word, counts in word_counts.items()
    if counts['starts_with_cap'] > math.floor(counts['total'] / 2)
]

print(f"usually start w/ uppercase in body: {propers}\n")

# Combining multiple proper nouns in a row
# There are cases where this totally fails
split_title = []
combo = []
for word in title_words:
    if word in propers:
        combo.append(word)
    else:
        # A non-proper word ends the current run of proper nouns.
        if combo:
            split_title.append(" ".join(combo))
            combo = []
        split_title.append(word)

# Flush a run that reaches the end of the title; previously a title ending
# in proper nouns silently lost them (the old `i == len(title_words)` check
# could never be true inside enumerate()).
if combo:
    split_title.append(" ".join(combo))
    combo = []

print(f"naive combining propers: {split_title}\n")

# Partial solution:
# only combine propers which are almost always seen in that order together
# i.e. 'Supreme Court'
combos = {}

def remove_punct(s):
    """Return ``s`` with commas, periods, possessive suffixes and apostrophes removed.

    The fragments are deleted one after another in a fixed order: the
    possessive forms ("'s" and "’s") are removed before the bare apostrophes,
    so that a possessive does not leave a stray "s" behind.
    """
    cleaned = s
    for fragment in (',', '.', "'s", "’s", "'", "’"):
        cleaned = cleaned.replace(fragment, '')
    return cleaned

# Count, for every proper noun in the body, the run of proper nouns that
# immediately follows it (punctuation stripped before comparing).
body_words = article.text.split(" ")
for i, raw_word in enumerate(body_words):
    head = remove_punct(raw_word)
    if head not in propers:
        continue

    run = []
    for raw_next in body_words[i + 1:]:
        follower = remove_punct(raw_next)
        if follower not in propers:
            break
        # The run is seeded with the head word the first time a follower matches.
        if head not in run:
            run = [head]
        run.append(follower)

    if run:
        run = tuple(run)
        combos[run] = combos.get(run, 0) + 1

# This part is a work in progress
# The math.floor(.../5) is totally arbitrary, and should be changed
# A run is kept only if, for at least one of its words, the run was seen
# more often than a fifth (rounded down) of that word's total count.
to_remove = [
    run for run, cnt in combos.items()
    if not any(cnt > math.floor(word_counts[w]['total'] / 5) for w in run)
]

for run in to_remove:
    del combos[run]

# From here on only the surviving runs matter, not their counts.
combos = list(combos)

print(f"commonly paired propers: {combos}\n")

# Combining multiple proper nouns in a row (take 2)
# Only runs that appear in `combos` are merged; anything else stays a single word.
split_title = []
pieces = []  # each piece is a list of (title index, word) pairs

for i, word in enumerate(title_words):
    if word not in propers:
        pieces.append([(i, word)])
        continue

    # Skip words already absorbed into an earlier multi-word piece.
    if any((i, word) in p for p in pieces):
        continue

    # Gather the full run of proper nouns starting at this word.
    piece = [(i, word)]
    for pos, next_word in enumerate(title_words[i + 1:], start=i + 1):
        if next_word not in propers:
            break
        piece.append((pos, next_word))

    # Trim from the right until the run is a known combo or a single word.
    while len(piece) > 1 and tuple(w for _, w in piece) not in combos:
        piece.pop()
    pieces.append(piece)

# Put the pieces back together
split_title2 = [" ".join(w for _, w in p) for p in pieces]
print(f"smarter combining propers: {split_title2}\n")

# Next: combine non-proper nouns?

url: https://www.foxnews.com/politics/biden-calls-supreme-court-daca-ruling-a-victory-vows-to-make-program-permanent

words in the title: ['Biden', 'calls', 'Supreme', 'Court', 'DACA', 'ruling', 'a', '‘victory,’', 'vows', 'to', 'make', 'program', '‘permanent’']

start w/ uppercase in title: ['Biden', 'Supreme', 'Court', 'DACA']

usually start w/ uppercase in body: ['Vice', 'President', 'Joe', 'Biden', 'Supreme', 'Court', 'Thursday', 'Trump', 'Obama-era', 'Court’s', 'Democratic', 'White', 'House', 'Congress', 'Chief', 'Justice', 'John', 'Roberts', 'Deferred', 'Action', 'Childhood', 'Arrivals', 'Administrative', 'Procedure', 'Act', 'Barack', 'Obama', 'American', "I'm", 'Obama’s', 'November', 'Monday', 'Republicans', 'Conservatives”', 'Election', 'Day', 'Justices', 'United', 'States', 'News’', 'Bill', 'Mears', 'Ronn', 'Blitzer', 'Madeleine', 'Rivera']

naive combining propers: ['Biden', 'calls', 'Supreme Court', 'DACA', 'ruling', 'a', '‘victory,’', 'vows', 'to', 'make', 'program', '‘perma

### NLTK (Natural Language Toolkit) Library

In [420]:
import nltk
from nltk.corpus import wordnet as wn
# WordNet data backing the `wn` corpus reader imported above
# (presumably also what WordNetLemmatizer relies on - confirm).
nltk.download('wordnet')

from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 

from nltk.stem import WordNetLemmatizer 

import nltk.corpus
# Stopword lists, read later via nltk.corpus.stopwords.words('english').
nltk.download('stopwords')

# NOTE(review): presumably the model behind nltk.pos_tag, which is used in the
# headline-splitting cell further down - confirm.
nltk.download('averaged_perceptron_tagger')

from nltk.tokenize.treebank import TreebankWordDetokenizer

[nltk_data] Downloading package wordnet to /home/blake/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/blake/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/blake/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [371]:
# Identifying type of words: print the part of speech of each word's first WordNet synset
words = ['amazing', 'interesting', 'great', 'blocks']
for w in words:
    first_synset = wn.synsets(w)[0]
    print(f"{w}:{first_synset.pos()}")

n
amazing:v
interesting:v
great:n
blocks:n


#### NLTK Stemmer

In [299]:
ps = PorterStemmer()

# Stem only single-word pieces that are not proper nouns;
# multi-word pieces and proper nouns are printed untouched.
for p in split_title2:
    if " " in p or p in propers:
        print(p)
    else:
        print(p, " : ", ps.stem(p))

Biden
calls  :  call
Supreme Court
DACA
ruling  :  rule
a  :  a
‘victory,’  :  ‘victory,’
vows  :  vow
to  :  to
make  :  make
program  :  program
‘permanent’  :  ‘permanent’


#### NLTK Lemmatizer

In [300]:
lemmatizer = WordNetLemmatizer()

# Lemmatize the plain single-word pieces; everything else
# (multi-word pieces, proper nouns) is echoed as-is.
for p in split_title2:
    if " " not in p and p not in propers:
        print(p, " : ", lemmatizer.lemmatize(p))
        continue
    print(p)

Biden
calls  :  call
Supreme Court
DACA
ruling  :  ruling
a  :  a
‘victory,’  :  ‘victory,’
vows  :  vow
to  :  to
make  :  make
program  :  program
‘permanent’  :  ‘permanent’


### Using Wikipedia articles to get info on proper nouns

In [None]:
# Wikipedia!
import wikipedia

In [303]:
# Still using the same article from the blocks above.
# Terms worth looking up: multi-word pieces plus single-word proper nouns.
terms = [p for p in split_title2 if " " in p or p in propers]

stopwords = nltk.corpus.stopwords.words('english')

# not stopping at 'and' is sometimes necessary to get a meaningful description
stopwords.remove('and')

# find the bit of text between the first two stopwords
# i.e. "an American politician who served for 41 years in the Senate" -> "American politician"
def condense_desc(desc, stop_words=None):
    """Return the words lying between the first two stopwords of ``desc``.

    Example: "an American politician who served ..." -> "American politician".

    Args:
        desc: Description text; it is split on single spaces.
        stop_words: Collection of words treated as stopwords. Defaults to the
            notebook-level ``stopwords`` list when omitted, so existing
            one-argument calls behave as before.

    Returns:
        The space-joined words between the first and the second stopword
        (an empty string when the two are adjacent), or ``None`` when
        ``desc`` contains fewer than two stopwords.
    """
    if stop_words is None:
        stop_words = stopwords

    desc_words = desc.split(" ")
    start = None  # index just past the first stopword, once one is seen
    for i, word in enumerate(desc_words):
        if word not in stop_words:
            continue
        if start is None:
            start = i + 1
        else:
            return " ".join(desc_words[start:i])

    # Fewer than two stopwords: nothing to extract.
    return None

# The first sentence of most Wikipedia articles usually has the following structure:
# (topic/title of article) (is/are/was/were) (description).
# We want the (description) part of this structure, so we split on the first is/are/was/were.
splits = ["is", "are", "was", "were"]
for term in terms:
    short_description = None

    # Guard against an empty result list instead of indexing [0] blindly.
    results = wikipedia.search(term)
    if results:
        article_name = results[0]
        summary = wikipedia.summary(article_name)

        # Locate the verb as a whole *word*. Splitting the raw string on e.g.
        # "is" also matched the substring inside other words ("this",
        # "Mississippi") and kept only the text up to its next occurrence;
        # with no verb at all it called split('') and raised ValueError.
        summary_words = summary.split(" ")
        verb_index = next(
            (idx for idx, w in enumerate(summary_words) if w in splits), None
        )

        if verb_index is not None:
            description = " ".join(summary_words[verb_index + 1:])
            short_description = condense_desc(description)

    print(f"{term} : {short_description}")

Biden : American politician
Supreme Court : highest court within
DACA : United States immigration policy


In [302]:
# Re-display the segmented title built in the proper-noun cell, for reference.
print(split_title2)

['Biden', 'calls', 'Supreme Court', 'DACA', 'ruling', 'a', '‘victory,’', 'vows', 'to', 'make', 'program', '‘permanent’']


In [307]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/blake/nltk_data...


True

#### Sentiment analysis

In [337]:
# Sentiment scorer from nltk.sentiment.vader; the 'vader_lexicon' resource
# is downloaded in the cell above.
sid = SentimentIntensityAnalyzer()

# must remove quotation marks for the analyzer to work
# (NOTE: should do this earlier? proper nouns may show up in quotes...)
def remove_quotes(text):
    """Return ``text`` with quotation marks removed.

    Strips curly single quotes (‘ ’), curly double quotes (“ ”) and the
    straight double quote ("). The straight apostrophe (') is deliberately
    left alone so contractions such as "doesn't" survive.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without the characters listed above.
    """
    # Curly double quotes were previously missed even though they occur in the
    # article text; str.translate drops all listed characters in a single pass.
    return text.translate(str.maketrans('', '', '‘’“”"'))

# Rebuild the headline from title_words and strip quotation marks before scoring.
title_noquotes = remove_quotes(" ".join(title_words))
print(title_noquotes)

# polarity_scores returns a dict; the recorded output shows the keys
# 'neg', 'neu', 'pos' and 'compound'.
ss1 = sid.polarity_scores(title_noquotes)
print(ss1)

Biden calls Supreme Court DACA ruling a victory, vows to make program permanent
{'neg': 0.0, 'neu': 0.753, 'pos': 0.247, 'compound': 0.5574}


#### Random stuff

In [435]:
# News headlines sometimes use a format like this:
# [subject] [present tense verb] [...], [another present tense verb] [...]
# where the subject is not repeated, and no 'and' is included
 
# To make it easier to compare with other articles, we can try to turn headlines
#  with this format into two cohesive sentences.

# Find present-tense 3rd person verbs
def verb_indices(tokens):
    """Return the positions in ``tokens`` tagged as 3rd-person singular present verbs.

    Tagging is done with NLTK's parts-of-speech tagger. A useful list of all
    the tags:
    https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    """
    # "VBZ" = Verb, 3rd person singular present
    wanted_tag = "VBZ"
    tagged = nltk.pos_tag(tokens)
    return [
        position
        for position, entry in enumerate(tagged)
        if entry[1] == wanted_tag
    ]

text = remove_quotes(article.title)
print(text)

# Only headlines with exactly one comma are candidates for the
# "[subject] [verb] ..., [verb] ..." pattern.
if text.count(',') == 1:
    tokens = nltk.word_tokenize(text)
    v_indices = verb_indices(tokens)
    comma_index = tokens.index(',')

    # The second part must start with a verb (the comma is its own token,
    # hence the +1), and the first verb must come before the comma.
    if comma_index + 1 in v_indices and min(v_indices) < comma_index:
        first_index = min(v_indices)

        detokenizer = TreebankWordDetokenizer()
        before_first_verb = detokenizer.detokenize(tokens[:first_index])

        # Both approaches build the same sentence: whatever precedes the first
        # verb, followed by the text after the comma.
        second_sentence = before_first_verb + text.split(",")[1]

        # Approach #1 : Just copy paste everything that comes before the first verb and put it
        # at the start of the second sentence.
        print(f"Approach 1: {second_sentence}")

        # Approach #2 : Test if what comes before the first verb is a proper noun (i.e. 'President Trump')
        if before_first_verb != split_title2[0]:
            print("Approach 2 Failed")
        else:
            print(f"Approach 2: {second_sentence}")

Biden calls Supreme Court DACA ruling a victory, vows to make program permanent
Approach 1: Biden vows to make program permanent
Approach 2: Biden vows to make program permanent


In [433]:
# Show split_title2 again to compare against the Approach 2 check above,
# which matches the pre-verb text against split_title2[0].
print(split_title2)

['Biden', 'calls', 'Supreme Court', 'DACA', 'ruling', 'a', '‘victory,’', 'vows', 'to', 'make', 'program', '‘permanent’']
