In [48]:
from article_urls import simple_article_urls

In [49]:
from newspaper import Article

def get_data(article_urls: list):
    """Fetch and parse one newspaper ``Article`` per URL.

    Args:
        article_urls: URLs to fetch; each one is handed to ``Article``.

    Returns:
        A list of ``Article`` objects, in the same order as ``article_urls``,
        each of which has had ``download()`` and then ``parse()`` called.
    """
    def _fetch(url):
        # parse() is called right after download(), for every URL in turn.
        fetched = Article(url)
        fetched.download()
        fetched.parse()
        return fetched

    return [_fetch(url) for url in article_urls]


In [50]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest

_PIPELINE_CACHE = {}


def _get_pipeline(model):
    """Return the spaCy pipeline for ``model``, calling ``spacy.load`` only once per name."""
    if model not in _PIPELINE_CACHE:
        _PIPELINE_CACHE[model] = spacy.load(model)
    return _PIPELINE_CACHE[model]


def _is_content_token(token):
    """Return True when ``token`` should count towards the word frequencies."""
    # Whitespace tokens (e.g. paragraph breaks) and punctuation carry no
    # content and would otherwise dominate the frequency table.
    if token.is_space or token.is_punct:
        return False
    key = token.text.lower()
    # The substring test against ``punctuation`` additionally drops ASCII
    # symbols such as "$" or "+".
    return key not in STOP_WORDS and key not in punctuation


def summarize(text, per, model):
    """Build an extractive summary of ``text`` from its highest-scoring sentences.

    Words are counted case-insensitively (stop words, punctuation and
    whitespace excluded), the counts are normalised by the largest count, and
    each sentence is scored by the sum of the normalised counts of its words.

    Args:
        text: The text to summarise.
        per: Fraction of the sentences to keep (e.g. ``0.1``). When positive,
            at least one sentence is kept.
        model: Name of the spaCy pipeline, as accepted by ``spacy.load``.

    Returns:
        The selected sentences, stripped and joined by single spaces, ordered
        from highest to lowest score. An empty string is returned when the
        text contains no countable words.
    """
    doc = _get_pipeline(model)(text)

    # Key by the lowercased text so that the sentence-scoring lookup below,
    # which also lowercases, finds capitalised words too.
    word_frequencies = {}
    for token in doc:
        if _is_content_token(token):
            key = token.text.lower()
            word_frequencies[key] = word_frequencies.get(key, 0) + 1

    # Nothing to score (empty text or only stop words/punctuation).
    if not word_frequencies:
        return ''

    max_frequency = max(word_frequencies.values())
    word_frequencies = {
        word: count / max_frequency for word, count in word_frequencies.items()
    }

    sentence_tokens = list(doc.sents)
    sentence_scores = {}
    for sent in sentence_tokens:
        for token in sent:
            weight = word_frequencies.get(token.text.lower())
            if weight is not None:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + weight

    select_length = int(len(sentence_tokens) * per)
    if per > 0:
        # Short texts would otherwise round down to an empty summary.
        select_length = max(1, select_length)

    summary = nlargest(select_length, sentence_scores, key=sentence_scores.get)
    # Strip and space-join so adjacent sentences are not glued together.
    return ' '.join(sent.text.strip() for sent in summary)

In [51]:
def print_summary(article, summaries):
    """Print both model summaries followed by the full article text.

    Args:
        article: Object whose ``text`` attribute is printed under ARTICLE.
        summaries: Mapping with the keys ``"sm_summary"`` and ``"lg_summary"``.
    """
    labelled_keys = (
        ('en_core_web_sm', 'sm_summary'),
        ('en_core_web_lg', 'lg_summary'),
    )

    print('SUMMARIES\n\n')
    for model_label, summary_key in labelled_keys:
        print(model_label)
        print(f'{summaries[summary_key]}\n\n')

    print('ARTICLE\n\n')
    print(f'{article.text}\n\n')

In [52]:
# Fetch every article once, then summarise each with both spaCy models.
articles = get_data(simple_article_urls)

for article in articles:
    # Keep 10% of the sentences for each model's summary.
    summaries = {
        'sm_summary': summarize(article.text, 0.1, 'en_core_web_sm'),
        'lg_summary': summarize(article.text, 0.1, 'en_core_web_lg'),
    }
    print_summary(article, summaries)

SUMMARIES


en_core_web_sm
The team divided species into either nondiving birds, or one of three diving types: foot-propelled pursuit (such as loons and grebes), wing-propelled pursuit (like penguins and auks) and the plunge divers.

Among 236 diving bird species, 75, or 32 percent, were part of lineages that are experiencing 0.02 more species extinctions per million years than the generation of new species.


en_core_web_lg
The team divided species into either nondiving birds, or one of three diving types: foot-propelled pursuit (such as loons and grebes), wing-propelled pursuit (like penguins and auks) and the plunge divers.

Among 236 diving bird species, 75, or 32 percent, were part of lineages that are experiencing 0.02 more species extinctions per million years than the generation of new species.


ARTICLE


Birds that dive underwater — such as penguins, loons and grebes — may be more likely to go extinct than their nondiving kin, a new study finds.

Many water birds have evolved