### Prototype code to fetch the top headlines from News API and scrape the corresponding article text from each headline's URL

In [26]:
import pandas as pd
import json

In [27]:
import pprint
import requests

In [28]:
# API key for News API.
# The key is a credential, so it is read from the NEWS_API_KEY environment
# variable instead of being stored in the notebook.
# NOTE(review): the key that used to be hard-coded here should be treated as
# leaked and rotated.
import os
import warnings


def read_api_key(environ=None, var_name='NEWS_API_KEY'):
    """Return the API key stored under `var_name`, stripped of whitespace.

    Parameters
    ----------
    environ : mapping, optional
        Mapping to read the key from; defaults to ``os.environ``.
    var_name : str
        Name of the variable that holds the key.

    Returns
    -------
    str
        The key, or an empty string when the variable is missing or blank.
    """
    source = os.environ if environ is None else environ
    return source.get(var_name, '').strip()


secret = read_api_key()
if not secret:
    # Warn rather than raise so that cells which do not talk to the API can
    # still be run without a key.
    warnings.warn('NEWS_API_KEY is not set; News API requests need a valid key.')

In [29]:
# Define the endpoint to extract all the top headlines.
# Only the endpoint is stored here; the query parameters are kept in a separate
# dict and handed to `requests` when the request is made.
url = 'https://newsapi.org/v2/top-headlines?'

In [30]:
# Query sent with the request: business headlines for the US only (for now),
# asking for a page size of 100, authenticated with the API key.
parameters = dict(
    country='us',
    category='business',
    pageSize=100,
    apiKey=secret,
)

In [31]:
# Make the request. The timeout keeps the cell from hanging indefinitely when
# the server does not answer.
response = requests.get(url, params=parameters, timeout=30)

# Stop here on an HTTP error instead of letting an error payload break the
# dataframe cells further down.
response.raise_for_status()

# Convert the response to JSON format
response_json = response.json()

# The payload carries a 'status' field ('ok' in a successful run); anything
# else is treated as a failed request.
if response_json.get('status') != 'ok':
    raise RuntimeError('News API request failed: ' + str(response_json))

# Pretty print the payload
pprint.pprint(response_json)

{'articles': [{'author': 'Jeff Cox',
               'content': 'Amid historically aggressive policy moves from the '
                          'U.S. central bank and Congress, Dallas Federal '
                          'Reserve President Robert Kaplan said the most '
                          'important thing for the economy now is wearing '
                          'protecti… [+1714 chars]',
               'description': '"If we all wore a mask, it would substantially '
                              'mute the transmission of this disease and we '
                              'would grow faster," he added.',
               'publishedAt': '2020-07-10T14:16:00Z',
               'source': {'id': None, 'name': 'CNBC'},
               'title': 'Wearing masks is now the most important thing for the '
                        "economy, Fed's Kaplan says - CNBC",
               'url': 'https://www.cnbc.com/2020/07/10/wearing-masks-is-now-the-most-important-thing-for-the-economy-feds-kaplan-sa

In [32]:
# Build a dataframe straight from the JSON payload (one column per top-level key)
df = pd.DataFrame(response_json)

In [33]:
# Show the raw frame: one row per article, with the article itself still a dict
# in the `articles` column.
print(df)

   status  totalResults                                           articles
0      ok            70  {'source': {'id': None, 'name': 'CNBC'}, 'auth...
1      ok            70  {'source': {'id': 'the-hill', 'name': 'The Hil...
2      ok            70  {'source': {'id': 'techcrunch', 'name': 'TechC...
3      ok            70  {'source': {'id': None, 'name': 'MarketWatch'}...
4      ok            70  {'source': {'id': None, 'name': 'Motley Fool'}...
..    ...           ...                                                ...
65     ok            70  {'source': {'id': 'the-wall-street-journal', '...
66     ok            70  {'source': {'id': None, 'name': 'Motley Fool'}...
67     ok            70  {'source': {'id': None, 'name': 'CNBC'}, 'auth...
68     ok            70  {'source': {'id': None, 'name': 'The Japan Tim...
69     ok            70  {'source': {'id': 'cbs-news', 'name': 'CBS New...

[70 rows x 3 columns]


In [34]:
# Expand the dict stored in each `articles` cell into one column per key and
# put those columns in place of the original `articles` column.
# The frame is built from the list of dicts in one call instead of calling
# pd.Series once per row. The guard makes the cell a no-op when it is re-run
# after `articles` has already been expanded (it used to raise KeyError).
if 'articles' in df.columns:
    article_fields = pd.DataFrame(df['articles'].tolist(), index=df.index)
    df = pd.concat([df.drop(columns=['articles']), article_fields], axis=1)

In [35]:
# Show the frame after the `articles` dicts have been expanded into columns
print(df)

   status  totalResults                                             source  \
0      ok            70                       {'id': None, 'name': 'CNBC'}   
1      ok            70             {'id': 'the-hill', 'name': 'The Hill'}   
2      ok            70         {'id': 'techcrunch', 'name': 'TechCrunch'}   
3      ok            70                {'id': None, 'name': 'MarketWatch'}   
4      ok            70                {'id': None, 'name': 'Motley Fool'}   
..    ...           ...                                                ...   
65     ok            70  {'id': 'the-wall-street-journal', 'name': 'The...   
66     ok            70                {'id': None, 'name': 'Motley Fool'}   
67     ok            70                       {'id': None, 'name': 'CNBC'}   
68     ok            70            {'id': None, 'name': 'The Japan Times'}   
69     ok            70             {'id': 'cbs-news', 'name': 'CBS News'}   

                       author  \
0                    Jeff Cox 

In [36]:
# Dimensions of the expanded frame as (rows, columns)
df.shape

(70, 10)

In [37]:
# Preview up to the first 50 rows of the expanded frame
df.head(50)

Unnamed: 0,status,totalResults,source,author,title,description,url,urlToImage,publishedAt,content
0,ok,70,"{'id': None, 'name': 'CNBC'}",Jeff Cox,Wearing masks is now the most important thing ...,"""If we all wore a mask, it would substantially...",https://www.cnbc.com/2020/07/10/wearing-masks-...,https://image.cnbcfm.com/api/v1/image/10610962...,2020-07-10T14:16:00Z,Amid historically aggressive policy moves from...
1,ok,70,"{'id': 'the-hill', 'name': 'The Hill'}",J. Edward Moreno,"WaWa, Giant begin asking for 'exact change' as...",Businesses are begging to ask customers to pay...,https://thehill.com/homenews/news/506722-wawa-...,https://thehill.com/sites/default/files/wawa.jpg,2020-07-10T13:44:51Z,Businesses are begging to ask customers to pay...
2,ok,70,"{'id': 'techcrunch', 'name': 'TechCrunch'}",Kirsten Korosec,Rivian raises $2.5 billion as it pushes to bri...,"Rivian, the electric vehicle company aiming to...",http://techcrunch.com/2020/07/10/rivian-raises...,https://techcrunch.com/wp-content/uploads/2020...,2020-07-10T13:41:15Z,"Rivian, the electric vehicle company aiming to..."
3,ok,70,"{'id': None, 'name': 'MarketWatch'}",Mark DeCambre,Dow and broader stock-market futures turn posi...,,https://www.marketwatch.com/story/dow-and-broa...,https://s.wsj.net/public/resources/MWimages/MW...,2020-07-10T13:10:00Z,U.S. stock-index futures turned positive on Fr...
4,ok,70,"{'id': None, 'name': 'Motley Fool'}",Bram Berkowitz,How Much Will Wells Fargo Cut Its Dividend in ...,The bank announced it would cut its cash payou...,https://www.fool.com/investing/2020/07/10/how-...,https://g.foolcdn.com/editorial/images/581578/...,2020-07-10T12:57:00Z,After the Federal Reserve announced it would b...
5,ok,70,"{'id': None, 'name': 'Benzinga'}",Globe Newswire,Insights on the Public Cloud Global Market to ...,"Dublin, July 10, 2020 (GLOBE NEWSWIRE) -- The ...",https://www.benzinga.com/pressreleases/20/07/g...,,2020-07-10T12:33:00Z,"Dublin, July 10, 2020 (GLOBE NEWSWIRE) -- The ..."
6,ok,70,"{'id': None, 'name': 'CNBC'}",Reuters,U.S. producer prices unexpectedly fall in June...,U.S. producer prices unexpectedly fell in June...,https://www.cnbc.com/2020/07/10/producer-price...,https://image.cnbcfm.com/api/v1/image/10661000...,2020-07-10T12:32:00Z,U.S. producer prices unexpectedly fell in June...
7,ok,70,"{'id': None, 'name': 'The Points Guy'}",Gene Sloan,15 ways that cruising newbies waste money on t...,"It's easy to overspend on a cruise, particular...",http://thepointsguy.com/guide/first-time-cruis...,https://i0.wp.com/thepointsguy.com/wp-content/...,2020-07-10T12:22:21Z,I can still remember my first rookie mistake o...
8,ok,70,"{'id': 'the-wall-street-journal', 'name': 'The...",Saabira Chaudhuri,"Soap Makers Are Cleaning Up Amid the Pandemic,...","Companies are redesigning packaging, pushing r...",https://www.wsj.com/articles/soap-makers-are-c...,https://images.wsj.net/im-207789/social,2020-07-10T12:16:00Z,A surge in demand for soap and hand sanitizer ...
9,ok,70,"{'id': None, 'name': 'fox8.com'}",fox8.com,Bed Bath & Beyond plans to close 200 stores ov...,Bed Bath & Beyond has announced their plans to...,https://fox8.com/news/coronavirus/bed-bath-bey...,https://fox8.com/wp-content/uploads/sites/12/2...,2020-07-10T11:34:00Z,(CNN) — Bed Bath &amp; Beyond has announced th...


In [38]:
# List the title and URL of every top headline, then report how many there are
headline_entries = response_json['articles']
for entry in headline_entries:
    print(entry['title'], entry['url'], sep='\n')

count = len(headline_entries)
print('Total number of articles: ' + str(count))

Wearing masks is now the most important thing for the economy, Fed's Kaplan says - CNBC
https://www.cnbc.com/2020/07/10/wearing-masks-is-now-the-most-important-thing-for-the-economy-feds-kaplan-says.html
WaWa, Giant begin asking for 'exact change' as coronavirus leads to nationwide coin shortage | TheHill - The Hill
https://thehill.com/homenews/news/506722-wawa-giant-begin-asking-for-exact-change-as-coronavirus-leads-to-nationwide
Rivian raises $2.5 billion as it pushes to bring its electric RT1 pickup, R1S SUV to market - TechCrunch
http://techcrunch.com/2020/07/10/rivian-raises-massive-2-5-billion-as-it-pushes-to-bring-its-electric-pickup-suv-to-market/
Dow and broader stock-market futures turn positive amid upbeat report on Gilead's experimental coronavirus treatment - MarketWatch
https://www.marketwatch.com/story/dow-and-broader-stock-market-futures-turn-positive-amid-upbeat-report-on-gileads-experimental-coronavirus-treatment-2020-07-10
How Much Will Wells Fargo Cut Its Dividend i

In [39]:
# Keep the article URLs on their own; they drive the scraping loops below
headline_urls = df.loc[:, 'url']
print(headline_urls)

0     https://www.cnbc.com/2020/07/10/wearing-masks-...
1     https://thehill.com/homenews/news/506722-wawa-...
2     http://techcrunch.com/2020/07/10/rivian-raises...
3     https://www.marketwatch.com/story/dow-and-broa...
4     https://www.fool.com/investing/2020/07/10/how-...
                            ...                        
65    https://www.wsj.com/articles/meatpackers-covid...
66    https://www.fool.com/investing/2020/07/09/is-i...
67    https://www.cnbc.com/2020/07/09/walgreens-wba-...
68    https://www.japantimes.co.jp/news/2020/07/09/b...
69    https://www.cbsnews.com/news/toxic-hand-saniti...
Name: url, Length: 70, dtype: object


# Trying to use the newsdatascraper package
# NOTE(review): third-party package; what the calls below do is inferred from
# their names only — confirm against the package documentation.
from newsdatascraper import Scraper

# `secret` is the News API key defined earlier in the notebook
new_scraper = Scraper(secret, mode = 'NEWSPAPER')
# NOTE(review): the first argument has "&language=en" appended to what looks
# like a date — presumably it ends up verbatim in a query string; verify.
articles = new_scraper.fetch_all_articles("2020-07-07&language=en", page_size = 100)

# helper functions to serialize the data

articles.to_csv('test.csv')
# Other serializers, currently disabled
#articles.toPickle('test.pickle')
#articles.toJson()

# Extract the complete article text and the NLP summary for each of the top
# headlines, using the URLs collected above.
from newspaper import Article
from newspaper import Config
# Imported by name: the bare name `newspaper` is never bound in this notebook,
# so `newspaper.article.ArticleException` raised NameError inside the handler.
from newspaper.article import ArticleException

# Each list receives exactly one entry per URL in `headline_urls`, in order.
article_title = []
article_authors = []
article_text = []
article_summary = []
article_date = []
article_top_image = []

for h in headline_urls:
    news_article = Article(url=h)
    try:
        news_article.download()
        news_article.parse()
        news_article.nlp()
    except ArticleException:
        # Best effort: report the failure and fall through, so every list still
        # gets an entry for this URL and stays aligned with `headline_urls`.
        # NOTE(review): the entries recorded for a failed article are whatever
        # the unparsed Article object holds (presumably empty values) — confirm.
        print("Unable to download the article")

    # Append to the result lists (they used to be overwritten on each pass,
    # and the authors list was given the title).
    article_title.append(news_article.title)
    article_authors.append(news_article.authors)

    article_text.append(news_article.text)
    print(news_article.text)

    article_summary.append(news_article.summary)
    print(news_article.summary)

    article_date.append(news_article.publish_date)
    article_top_image.append(news_article.top_image)

In [46]:
# Extract the complete article text and a `Summarizer` summary for each of the
# top headlines, using the URLs collected above. URLs that cannot be processed
# are printed, collected in `failed_url`, and their rows are removed from `df`.
from newspaper import Article
from newspaper import Config
from summarizer import Summarizer

article_title = []
article_authors = []
article_text = []
article_summary = []
article_date = []
article_top_image = []
failed_url = []      # every URL that could not be downloaded, parsed or summarised
full_summary = ''    # summary of the article currently being processed

# Loop-invariant setup: build the download configuration and the summariser
# once instead of once per article.
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
config = Config()
config.browser_user_agent = user_agent
model = Summarizer()

for h in headline_urls:
    page = Article(h, config=config)
    try:
        page.download()
        page.parse()
        page.nlp()
        result = model(page.text, min_length=60)
        full_summary = ''.join(result)
    except Exception:
        # `Exception` rather than a bare `except`, so that interrupting the
        # kernel (KeyboardInterrupt) is not swallowed.
        failed_url.append(h)
        print(h)
        continue

    # Record text and summary together, only after both steps succeeded, so
    # the two lists always have the same length.
    article_text.append(page.text)
    article_summary.append(full_summary)

# Drop the rows of every URL that failed, leaving one row per collected article
if failed_url:
    df = df[~df.url.isin(failed_url)]

https://www.washingtonpost.com/business/2020/07/09/uber-postmates-merger-will-only-deliver-bad-tiding-restaurants/


In [47]:
# Drop the row(s) in the dataframe containing the URL that failed to download
print(df.shape)

(69, 11)


In [48]:
# Create new columns in the dataframe for the text and summary of each article.
# The lists are matched to the rows by position, so check the lengths first to
# get a readable error if the scraping cell and the dataframe are out of sync.
if not (len(article_text) == len(article_summary) == len(df)):
    raise ValueError(
        'Expected one text and one summary per row, got '
        + str(len(article_text)) + ' texts and '
        + str(len(article_summary)) + ' summaries for '
        + str(len(df)) + ' rows'
    )

# `df` may be a filtered slice of the original frame; `assign` returns a new
# frame, which avoids pandas' SettingWithCopyWarning on item assignment.
df = df.assign(articleText=article_text, articleSummary=article_summary)

In [20]:
# Preview the first rows of the frame.
# NOTE(review): this cell's execution count (In [20]) is lower than that of the
# surrounding cells, so its saved output may come from an earlier run — re-run
# the notebook top to bottom to refresh it.
print(df.head())

  status  totalResults                                  source  \
0     ok            70     {'id': None, 'name': 'Motley Fool'}   
1     ok            70     {'id': None, 'name': 'MarketWatch'}   
2     ok            70     {'id': None, 'name': 'MarketWatch'}   
3     ok            70  {'id': 'nbc-news', 'name': 'NBC News'}   
4     ok            70     {'id': None, 'name': 'Motley Fool'}   

                          author  \
0                  Daniel Sparks   
1              Barbara Kollmeyer   
2                Steve Goldstein   
3  Benjy Sarlin, Stephanie Ruhle   
4                  Dan Caplinger   

                                               title  \
0  Is Tesla Stock a Buy Ahead of Earnings? - Motl...   
1  European stocks rise, but focus remains on cor...   
2  Why one strategist is wondering whether the tr...   
3  Views on recovery are impacting the debate ove...   
4  3 Things NIO Stock Bulls Need to Happen Soon -...   

                                         descript

In [49]:
# Full text of the article in the row labelled 0.
# `.at` is the label-based scalar accessor that replaces the deprecated
# `DataFrame.get_value` (removed in pandas 1.0).
df.at[0, 'articleText']

  """Entry point for launching an IPython kernel.


'Amid historically aggressive policy moves from the U.S. central bank and Congress, Dallas Federal Reserve President Robert Kaplan said the most important thing for the economy now is wearing protective face coverings.\n\nMasks in public, Kaplan said, are key to stopping the coronavirus spread, which is increasing in record numbers and threatening to roll back the progress made since the U.S. went into lockdown in mid-March.\n\n"The main message I\'d have today about the economy from here and how to grow it probably has to do with managing this virus," he told Fox Business\'s Maria Bartiromo in an interview Friday morning. "While monetary and fiscal policy are very important, they\'re not as important right now as us doing a good job flattening the curve on this virus. If we do that, we\'ll grow faster."\n\nThe Fed has instituted programs that could provide $2.3 trillion in liquidity and lending while taking its key interest rate down to near zero. At the same time, Congress has provid

In [50]:
# Summary of the article in the row labelled 0.
# `.at` is the label-based scalar accessor that replaces the deprecated
# `DataFrame.get_value` (removed in pandas 1.0).
df.at[0, 'articleSummary']

  """Entry point for launching an IPython kernel.


'Amid historically aggressive policy moves from the U.S. central bank and Congress, Dallas Federal Reserve President Robert Kaplan said the most important thing for the economy now is wearing protective face coverings. Masks in public, Kaplan said, are key to stopping the coronavirus spread, which is increasing in record numbers and threatening to roll back the progress made since the U.S. went into lockdown in mid-March.'

In [51]:
# Serialise the dataframe to a JSON string and show it.
# The string is bound to `df_json` rather than `json`, so the `json` module
# imported at the top of the notebook is no longer shadowed.
df_json = df.to_json()
print(df_json)



In [52]:
# Save the dataframe as JSON, keyed by row label (orient='index')
df.to_json(path_or_buf='top_headline_business_data.json', orient='index')

In [53]:
# Save only the article text column as CSV, without the row index
df[['articleText']].to_csv('article_business_text.csv', index=False)