### Adapted from Tutorial
https://www.kdnuggets.com/2019/11/getting-started-automated-text-summarization.html

In [1]:
from collections import Counter 
from string import punctuation
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as stop_words


In [2]:
# https://www.cnn.com/2019/11/26/politics/judiciary-committee-hearing/index.html

text = """
The House Judiciary Committee has invited President Donald Trump or his counsel to participate in the panel's first impeachment hearing next week as the House moves another step closer to impeaching the President. 
The committee announced that it would hold a hearing December 4 on the "constitutional grounds for presidential impeachment," with a panel of expert witnesses testifying.
House Judiciary Chairman Jerry Nadler sent a letter to Trump on Tuesday notifying him of the hearing and inviting the President or his counsel to participate, including asking questions of the witnesses.
"I write to ask if you or your counsel plan to attend the hearing or make a request to question the witness panel," the New York Democrat wrote.
In the letter, Nadler said the hearing would "serve as an opportunity to discuss the historical and constitutional basis of impeachment, as well as the Framers' intent and understanding of terms like 'high crimes and misdemeanors.' "
"We expect to discuss the constitutional framework through which the House may analyze the evidence gathered in the present inquiry," Nadler added. "We will also discuss whether your alleged actions warrant the House's exercising its authority to adopt articles of impeachment."
The Judiciary Committee hearing is the latest sign that House Democrats are moving forward with impeachment proceedings against the President following the two-month investigation led by the House Intelligence Committee into allegations that Trump pushed Ukraine to investigate his political rivals while a White House meeting and $400 million in security aid were withheld from Kiev.
The hearing announcement comes as the Intelligence Committee plans to release its report summarizing the findings of its investigation to the House Judiciary Committee soon after Congress returns from its Thanksgiving recess next week.
Democratic aides declined to say what additional hearings they will schedule as part of the impeachment proceedings.
The Judiciary Committee is expected to hold multiple hearings related to impeachment, and the panel would debate and approve articles of impeachment before a vote on the House floor.
The aides said the first hearing was a "legal hearing" that would include some history of impeachment, as well as evaluating the seriousness of the allegations and the evidence against the President.
Nadler asked Trump to respond by Sunday on whether the White House wanted to participate in the hearings, as well as who would act as the President's counsel for the proceedings. The letter was copied to White House Counsel Pat Cipollone.
"""

In [3]:
def tokenizer(s):
    tokens = []
    for word in s.split(' '):
        tokens.append(word.strip().lower())
    return tokens

def sent_tokenizer(s):
    sents = []
    for sent in s.split('.'):
        sents.append(sent.strip())
    return sents

In [4]:
# For tokens
tokens = tokenizer(text)
print(tokens)

['the', 'house', 'judiciary', 'committee', 'has', 'invited', 'president', 'donald', 'trump', 'or', 'his', 'counsel', 'to', 'participate', 'in', 'the', "panel's", 'first', 'impeachment', 'hearing', 'next', 'week', 'as', 'the', 'house', 'moves', 'another', 'step', 'closer', 'to', 'impeaching', 'the', 'president.', 'the', 'committee', 'announced', 'that', 'it', 'would', 'hold', 'a', 'hearing', 'december', '4', 'on', 'the', '"constitutional', 'grounds', 'for', 'presidential', 'impeachment,"', 'with', 'a', 'panel', 'of', 'expert', 'witnesses', 'testifying.\nhouse', 'judiciary', 'chairman', 'jerry', 'nadler', 'sent', 'a', 'letter', 'to', 'trump', 'on', 'tuesday', 'notifying', 'him', 'of', 'the', 'hearing', 'and', 'inviting', 'the', 'president', 'or', 'his', 'counsel', 'to', 'participate,', 'including', 'asking', 'questions', 'of', 'the', 'witnesses.\n"i', 'write', 'to', 'ask', 'if', 'you', 'or', 'your', 'counsel', 'plan', 'to', 'attend', 'the', 'hearing', 'or', 'make', 'a', 'request', 'to', 

In [5]:
# For sentences
sents = sent_tokenizer(text)
print(sents)

["The House Judiciary Committee has invited President Donald Trump or his counsel to participate in the panel's first impeachment hearing next week as the House moves another step closer to impeaching the President", 'The committee announced that it would hold a hearing December 4 on the "constitutional grounds for presidential impeachment," with a panel of expert witnesses testifying', 'House Judiciary Chairman Jerry Nadler sent a letter to Trump on Tuesday notifying him of the hearing and inviting the President or his counsel to participate, including asking questions of the witnesses', '"I write to ask if you or your counsel plan to attend the hearing or make a request to question the witness panel," the New York Democrat wrote', 'In the letter, Nadler said the hearing would "serve as an opportunity to discuss the historical and constitutional basis of impeachment, as well as the Framers\' intent and understanding of terms like \'high crimes and misdemeanors', '\' "\n"We expect to d

In [6]:
# Frequency of tokens
def count_words(tokens):
    word_counts = {}
    for token in tokens:
        if token not in stop_words and token not in punctuation:
            if token not in word_counts.keys():
                word_counts[token] = 1
            else:
                word_counts[token] += 1
    return word_counts

word_counts = count_words(tokens)
word_counts

{'house': 10,
 'judiciary': 5,
 'committee': 7,
 'invited': 1,
 'president': 3,
 'donald': 1,
 'trump': 4,
 'counsel': 5,
 'participate': 2,
 "panel's": 1,
 'impeachment': 4,
 'hearing': 8,
 'week': 1,
 'moves': 1,
 'step': 1,
 'closer': 1,
 'impeaching': 1,
 'president.': 1,
 'announced': 1,
 'hold': 2,
 'december': 1,
 '4': 1,
 '"constitutional': 1,
 'grounds': 1,
 'presidential': 1,
 'impeachment,"': 1,
 'panel': 2,
 'expert': 1,
 'witnesses': 1,
 'testifying.\nhouse': 1,
 'chairman': 1,
 'jerry': 1,
 'nadler': 3,
 'sent': 1,
 'letter': 2,
 'tuesday': 1,
 'notifying': 1,
 'inviting': 1,
 'participate,': 1,
 'including': 1,
 'asking': 1,
 'questions': 1,
 'witnesses.\n"i': 1,
 'write': 1,
 'ask': 1,
 'plan': 1,
 'attend': 1,
 'make': 1,
 'request': 1,
 'question': 1,
 'witness': 1,
 'panel,"': 1,
 'new': 1,
 'york': 1,
 'democrat': 1,
 'wrote.\nin': 1,
 'letter,': 1,
 'said': 2,
 '"serve': 1,
 'opportunity': 1,
 'discuss': 3,
 'historical': 1,
 'constitutional': 2,
 'basis': 1,
 'imp

In [7]:
# Relative frequency
def word_freq_distribution(word_counts):
    freq_dist = {}
    max_freq = max(word_counts.values())
    for word in word_counts.keys():  
        freq_dist[word] = (word_counts[word]/max_freq)
    return freq_dist

freq_dist = word_freq_distribution(word_counts)
freq_dist

{'house': 1.0,
 'judiciary': 0.5,
 'committee': 0.7,
 'invited': 0.1,
 'president': 0.3,
 'donald': 0.1,
 'trump': 0.4,
 'counsel': 0.5,
 'participate': 0.2,
 "panel's": 0.1,
 'impeachment': 0.4,
 'hearing': 0.8,
 'week': 0.1,
 'moves': 0.1,
 'step': 0.1,
 'closer': 0.1,
 'impeaching': 0.1,
 'president.': 0.1,
 'announced': 0.1,
 'hold': 0.2,
 'december': 0.1,
 '4': 0.1,
 '"constitutional': 0.1,
 'grounds': 0.1,
 'presidential': 0.1,
 'impeachment,"': 0.1,
 'panel': 0.2,
 'expert': 0.1,
 'witnesses': 0.1,
 'testifying.\nhouse': 0.1,
 'chairman': 0.1,
 'jerry': 0.1,
 'nadler': 0.3,
 'sent': 0.1,
 'letter': 0.2,
 'tuesday': 0.1,
 'notifying': 0.1,
 'inviting': 0.1,
 'participate,': 0.1,
 'including': 0.1,
 'asking': 0.1,
 'questions': 0.1,
 'witnesses.\n"i': 0.1,
 'write': 0.1,
 'ask': 0.1,
 'plan': 0.1,
 'attend': 0.1,
 'make': 0.1,
 'request': 0.1,
 'question': 0.1,
 'witness': 0.1,
 'panel,"': 0.1,
 'new': 0.1,
 'york': 0.1,
 'democrat': 0.1,
 'wrote.\nin': 0.1,
 'letter,': 0.1,
 'sai

In [8]:
# Scoring sentences by occurence of frequent words
def score_sentences(sents, freq_dist, max_len=40):
    sent_scores = {}  
    for sent in sents:
        words = sent.split(' ')
        for word in words:
            if word.lower() in freq_dist.keys():
                if len(words) < max_len:
                    if sent not in sent_scores.keys():
                        sent_scores[sent] = freq_dist[word.lower()]
                    else:
                        sent_scores[sent] += freq_dist[word.lower()]
    return sent_scores

sent_scores = score_sentences(sents, freq_dist)
sent_scores

{"The House Judiciary Committee has invited President Donald Trump or his counsel to participate in the panel's first impeachment hearing next week as the House moves another step closer to impeaching the President": 6.899999999999999,
 'The committee announced that it would hold a hearing December 4 on the "constitutional grounds for presidential impeachment," with a panel of expert witnesses testifying': 2.8000000000000007,
 'House Judiciary Chairman Jerry Nadler sent a letter to Trump on Tuesday notifying him of the hearing and inviting the President or his counsel to participate, including asking questions of the witnesses': 5.099999999999999,
 '"I write to ask if you or your counsel plan to attend the hearing or make a request to question the witness panel," the New York Democrat wrote': 2.5000000000000004,
 'In the letter, Nadler said the hearing would "serve as an opportunity to discuss the historical and constitutional basis of impeachment, as well as the Framers\' intent and u

In [9]:
# Produce summary using top k sentences
def summarize(sent_scores, k):
    top_sents = Counter(sent_scores) 
    summary = ''
    scores = []
    
    top = top_sents.most_common(k)
    for t in top: 
        summary += t[0].strip() + '. '
        scores.append((t[1], t[0]))
    return summary[:-1], scores

In [10]:
# Summary
summary, summary_sent_scores = summarize(sent_scores, 3)
print(summary)

The House Judiciary Committee has invited President Donald Trump or his counsel to participate in the panel's first impeachment hearing next week as the House moves another step closer to impeaching the President. The hearing announcement comes as the Intelligence Committee plans to release its report summarizing the findings of its investigation to the House Judiciary Committee soon after Congress returns from its Thanksgiving recess next week. House Judiciary Chairman Jerry Nadler sent a letter to Trump on Tuesday notifying him of the hearing and inviting the President or his counsel to participate, including asking questions of the witnesses.


In [11]:
# Verifying sentence scores
for score in summary_sent_scores: print(score[0], '->', score[1], '\n')

6.899999999999999 -> The House Judiciary Committee has invited President Donald Trump or his counsel to participate in the panel's first impeachment hearing next week as the House moves another step closer to impeaching the President 

5.399999999999999 -> The hearing announcement comes as the Intelligence Committee plans to release its report summarizing the findings of its investigation to the House Judiciary Committee soon after Congress returns from its Thanksgiving recess next week 

5.099999999999999 -> House Judiciary Chairman Jerry Nadler sent a letter to Trump on Tuesday notifying him of the hearing and inviting the President or his counsel to participate, including asking questions of the witnesses 



### Try for Noble text
Reference: https://github.com/biplav-s/course-nl/blob/master/common-data/nobel-2020/dataset-nyt-nobel2020.txt

In [12]:
# Doing this for a new example
text = """
2020 Nobel Prize Winners: Full List
Nobel Prize season begins every October as committees in Sweden and Norway name laureates in a variety of prizes in the sciences, literature and economics, as well as peace work. The announcements started last week with the awarding of the prize in Physiology or Medicine. They wrapped up on Monday, when the Sveriges Riksbank Prize in Economic Sciences in Memory of Alfred Nobel was announced.
The Nobel Prizes most years are presented to recipients in Stockholm and Oslo in December. Because of the coronavirus pandemic, the committees are changing their approaches. Some of the events in Stockholm will be canceled in favor of a digital ceremony for the Nobelists, and medals and diplomas are to be distributed to the recipients? embassies and handed over in their home countries. Recipients may be invited to the award ceremony for 2021, if possible.
The Oslo ceremony for the peace prize will be smaller than in most years, with a limited audience.
The Nobel committee also announced another change last month: Each prize will rise to 10 million Swedish krona, 1 million more than in the previous year. That?s a hike in the prize value of about $112,000 in current exchange rates.
The 2020 Winners
Physiology or Medicine
Drs. Harvey J. Alter, Michael Houghton and Charles M. Rice on Monday received the prize for their discovery of the hepatitis C virus. The Nobel committee said the three scientists had ?made possible blood tests and new medicines that have saved millions of lives.?
Physics
Roger Penrose, Reinhard Genzel and Andrea Ghez are the recipients of the Nobel Prize in Physics for 2020.Credit...Pool photo by Fredrik Sandberg
Roger Penrose, Reinhard Genzel and Andrea Ghez received the prize on Tuesday for their discoveries that have improved understanding of the universe, including work on black holes.
Chemistry
The Nobel Prize in Chemistry was jointly awarded on Wednesday to Emmanuelle Charpentier and Jennifer A. Doudna for their work on the development of Crispr-Cas9, a method for genome editing.
Literature
The Nobel Prize in Literature was awarded on Thursday to Louise Glack, one of America?s most celebrated poets, ?for her unmistakable poetic voice that with austere beauty makes individual existence universal.?
Peace Prize
The Nobel Peace Prize was awarded on Friday to the World Food Program for its efforts to combat a surge in global hunger amid the coronavirus pandemic, which has swept around the world with devastating impact.
Economic Science
Paul R. Milgrom and Robert B. Wilson were awarded the Nobel in economic science on Monday for improvements to auction theory and inventions of new auction formats.
"""

In [13]:
# Process for tokens and word frequencey
tokens = tokenizer(text)
word_counts = count_words(tokens)
freq_dist = word_freq_distribution(word_counts)


In [14]:
# Process for sentences based on word frequency
sents = sent_tokenizer(text)
sent_scores = score_sentences(sents, freq_dist)
summary, summary_sent_scores = summarize(sent_scores, 3)

In [15]:
# Get summary
print(summary)

2020 Nobel Prize Winners: Full List
Nobel Prize season begins every October as committees in Sweden and Norway name laureates in a variety of prizes in the sciences, literature and economics, as well as peace work. ?
Peace Prize
The Nobel Peace Prize was awarded on Friday to the World Food Program for its efforts to combat a surge in global hunger amid the coronavirus pandemic, which has swept around the world with devastating impact. Literature
The Nobel Prize in Literature was awarded on Thursday to Louise Glack, one of America?s most celebrated poets, ?for her unmistakable poetic voice that with austere beauty makes individual existence universal.


In [16]:
# Verifying sentence scores
for score in summary_sent_scores: print(score[0], '->', score[1], '\n')

4.615384615384617 -> 2020 Nobel Prize Winners: Full List
Nobel Prize season begins every October as committees in Sweden and Norway name laureates in a variety of prizes in the sciences, literature and economics, as well as peace work 

3.846153846153848 -> ?
Peace Prize
The Nobel Peace Prize was awarded on Friday to the World Food Program for its efforts to combat a surge in global hunger amid the coronavirus pandemic, which has swept around the world with devastating impact 

3.3846153846153872 -> Literature
The Nobel Prize in Literature was awarded on Thursday to Louise Glack, one of America?s most celebrated poets, ?for her unmistakable poetic voice that with austere beauty makes individual existence universal 

