### Prototype code to scrape the top headlines and their corresponding article text from News API

In [1]:
import pandas as pd
import json

In [2]:
import pprint
import requests

In [3]:
# API Key for News API
secret = 'e3eca7fe616e4895bbd8c162ec9d567d'

In [4]:
# Define the endpoint to extract all the top headlines 
url = 'https://newsapi.org/v2/top-headlines?'

In [5]:
# Specify the query and number of returns - Limit the headlines to country US, for now
parameters = {
    'country': 'us',
    'category': 'business',
    'pageSize': 100,
    'apiKey': secret 
}

In [6]:
# Make the request
response = requests.get(url, params=parameters)

# Convert the response to JSON format and pretty print it
response_json = response.json()
pprint.pprint(response_json)

{'articles': [{'author': 'Francis Scarcella fscarcella@dailyitem.com',
               'content': 'HUMMELS WHARF Two people are dead following a '
                          'shooting, but there is no threat to the public at '
                          'this time, according to Pennsylvania state '
                          'police.\r\n'
                          'Trooper Mark Reasner, a state police spokesman '
                          'from… [+791 chars]',
               'description': 'HUMMELS WHARF — Two people are dead following a '
                              'shooting, but there is no threat to the public '
                              'at this time, according to Pennsylvania state '
                              'police.',
               'publishedAt': '2020-07-11T02:00:00Z',
               'source': {'id': None, 'name': 'Sunbury Daily Item'},
               'title': 'UPDATE Two people reported dead; no threat to public; '
                        'Monroe Marketplace evacuated 

In [7]:
# Convert the json response to a dataframe
df = pd.DataFrame.from_dict(response_json)

In [8]:
print(df)

   status  totalResults                                           articles
0      ok            70  {'source': {'id': None, 'name': 'Sunbury Daily...
1      ok            70  {'source': {'id': 'cnn', 'name': 'CNN'}, 'auth...
2      ok            70  {'source': {'id': 'usa-today', 'name': 'USA To...
3      ok            70  {'source': {'id': None, 'name': 'Gizmodo.com'}...
4      ok            70  {'source': {'id': None, 'name': 'OregonLive'},...
..    ...           ...                                                ...
65     ok            70  {'source': {'id': None, 'name': 'New York Time...
66     ok            70  {'source': {'id': None, 'name': 'Chicago Tribu...
67     ok            70  {'source': {'id': None, 'name': 'Lifehacker.co...
68     ok            70  {'source': {'id': None, 'name': 'Triangle Busi...
69     ok            70  {'source': {'id': None, 'name': 'CNBC'}, 'auth...

[70 rows x 3 columns]


In [9]:
df = pd.concat([df.drop(['articles'], axis=1), df['articles'].apply(pd.Series)], axis=1) 

In [10]:
print(df)

   status  totalResults                                             source  \
0      ok            70         {'id': None, 'name': 'Sunbury Daily Item'}   
1      ok            70                       {'id': 'cnn', 'name': 'CNN'}   
2      ok            70           {'id': 'usa-today', 'name': 'USA Today'}   
3      ok            70                {'id': None, 'name': 'Gizmodo.com'}   
4      ok            70                 {'id': None, 'name': 'OregonLive'}   
..    ...           ...                                                ...   
65     ok            70             {'id': None, 'name': 'New York Times'}   
66     ok            70            {'id': None, 'name': 'Chicago Tribune'}   
67     ok            70             {'id': None, 'name': 'Lifehacker.com'}   
68     ok            70  {'id': None, 'name': 'Triangle Business Journal'}   
69     ok            70                       {'id': None, 'name': 'CNBC'}   

                                        author  \
0   Francis S

In [11]:
df.shape

(70, 10)

In [12]:
df.head(50)

Unnamed: 0,status,totalResults,source,author,title,description,url,urlToImage,publishedAt,content
0,ok,70,"{'id': None, 'name': 'Sunbury Daily Item'}",Francis Scarcella fscarcella@dailyitem.com,UPDATE Two people reported dead; no threat to ...,HUMMELS WHARF — Two people are dead following ...,https://www.dailyitem.com/news/update-state-po...,https://bloximages.chicago2.vip.townnews.com/d...,2020-07-11T02:00:00Z,HUMMELS WHARF Two people are dead following a ...
1,ok,70,"{'id': 'cnn', 'name': 'CNN'}","Lauren del Valle and Amir Vera, CNN",3 people test positive for Covid-19 after taki...,The airline says it is following the guidance ...,https://www.cnn.com/2020/07/10/us/delta-air-li...,https://cdn.cnn.com/cnnnext/dam/assets/2003201...,2020-07-11T00:25:00Z,(CNN)Three people have tested positive for cor...
2,ok,70,"{'id': 'usa-today', 'name': 'USA Today'}",Josh Rivera,Five Guys says employees who refused to serve ...,A group of Five Guys employees who refused to ...,https://www.usatoday.com/story/money/food/2020...,https://www.gannett-cdn.com/-mm-/1e50282dc070f...,2020-07-11T00:13:20Z,A group of Five Guys employees who refused to ...
3,ok,70,"{'id': None, 'name': 'Gizmodo.com'}",Catie Keck,I Have Good News and Bad News About NBC's Stre...,"Well, which do you want first?",https://gizmodo.com/i-have-good-news-and-bad-n...,https://i.kinja-img.com/gawker-media/image/upl...,2020-07-11T00:10:00Z,"Well, which do you want first?\r\nLets start w..."
4,ok,70,"{'id': None, 'name': 'OregonLive'}",The Oregonian/OregonLive.com,"MUJI U.S.A. files for bankruptcy, blames coron...","The retail outlet has a Portland store, which ...",https://www.oregonlive.com/business/2020/07/mu...,https://www.oregonlive.com/resizer/pjarGAcVw1t...,2020-07-10T23:20:00Z,"MUJI U.S.A., a division of the Japanese retail..."
5,ok,70,"{'id': None, 'name': 'CNET'}",Steven Ewing,2021 Ford Bronco leaks ahead of Monday's debut...,Here's our best look yet at Ford's new two-doo...,https://www.cnet.com/roadshow/news/2021-ford-b...,https://cnet3.cbsistatic.com/img/fiXO1pTD_I5pq...,2020-07-10T23:08:00Z,"Holy moly, does this thing look cool.\r\nBronc..."
6,ok,70,"{'id': None, 'name': 'Greeley Tribune'}",Cuyler Meade,JBS employees walk off the job over pay disput...,Employees of all levels reportedly walked off ...,https://www.greeleytribune.com/jbs-employees-w...,https://www.greeleytribune.com/wp-content/uplo...,2020-07-10T22:59:27Z,Employees of all levels reportedly walked off ...
7,ok,70,"{'id': None, 'name': 'Seeking Alpha'}",Stone Fox Capital,Wells Fargo: Fed Overreacts On Dividend Cut - ...,Wells Fargo passed the Fed stress test easily....,https://seekingalpha.com/article/4357892-wells...,https://static2.seekingalpha.com/uploads/2020/...,2020-07-10T22:58:00Z,After the Fed stress test results were release...
8,ok,70,"{'id': 'cnn', 'name': 'CNN'}","Allison Morrow, CNN Business",Elon Musk just became richer than Warren Buffe...,Tesla CEO Elon Musk just zoomed soared past Wa...,https://www.cnn.com/2020/07/10/business/elon-m...,https://cdn.cnn.com/cnnnext/dam/assets/2006301...,2020-07-10T22:49:00Z,
9,ok,70,"{'id': None, 'name': 'The Mercury News'}",Wes Goldberg,Santa Clara County opening new free coronaviru...,The sites will provide free COVID-19 nasal swa...,https://www.mercurynews.com/santa-clara-county...,https://www.mercurynews.com/wp-content/uploads...,2020-07-10T22:39:00Z,Santa Clara County will open new coronavirus t...


In [13]:
# Extract the URL for each of the top headlines and the title
count = 0
for i in response_json['articles']:
    print(i['title'])
    print(i['url'])
    count = count + 1

print('Total number of articles: '+str(count))

UPDATE Two people reported dead; no threat to public; Monroe Marketplace evacuated - Sunbury Daily Item
https://www.dailyitem.com/news/update-state-police-two-dead-after-shooting-no-threat-to-public/article_22b78758-c30a-11ea-a658-57cb54b1c099.html
3 people test positive for Covid-19 after taking Delta flight from Atlanta to Albany, airline says - CNN
https://www.cnn.com/2020/07/10/us/delta-air-lines-coronavirus-passengers/index.html
Five Guys says employees who refused to serve police have been fired, suspended - USA TODAY
https://www.usatoday.com/story/money/food/2020/07/10/five-guys-fires-employees-after-they-refusing-police-service/5418419002/
I Have Good News and Bad News About NBC's Streaming Service Peacock - Gizmodo
https://gizmodo.com/i-have-good-news-and-bad-news-about-nbcs-streaming-serv-1844343727
MUJI U.S.A. files for bankruptcy, blames coronavirus - OregonLive
https://www.oregonlive.com/business/2020/07/muji-usa-files-for-bankruptcy-blames-coronavirus.html
2021 Ford Bronc

In [14]:
headline_urls = df['url']
print(headline_urls)

0     https://www.dailyitem.com/news/update-state-po...
1     https://www.cnn.com/2020/07/10/us/delta-air-li...
2     https://www.usatoday.com/story/money/food/2020...
3     https://gizmodo.com/i-have-good-news-and-bad-n...
4     https://www.oregonlive.com/business/2020/07/mu...
                            ...                        
65    https://www.nytimes.com/2020/07/09/health/anti...
66    https://www.chicagotribune.com/marijuana/sns-t...
67    https://lifehacker.com/find-out-who-got-a-ppp-...
68    https://www.bizjournals.com/denver/news/2020/0...
69    https://www.cnbc.com/2020/07/09/walgreens-wba-...
Name: url, Length: 70, dtype: object


# Trying to use the newsdatascraper package
from newsdatascraper import Scraper

new_scraper = Scraper(secret, mode = 'NEWSPAPER')
articles = new_scraper.fetch_all_articles("2020-07-07&language=en", page_size = 100)

# helper functions to serialize the data

articles.to_csv('test.csv')
#articles.toPickle('test.pickle')
#articles.toJson()

# Extract the complete article text and and the NLP summary, for each of the top headline using the URL collected above
from newspaper import Article
from newspaper import Config


article_title = []
article_authors = []
article_text = []
article_summary = []
article_date = []
article_top_image = []

for h in headline_urls:
#news_article = Article(url=h)

    news_article = Article(url=h)
    try:
        news_article.download()
        #print(first_article.html)
        news_article.parse()
        news_article.nlp()
    except newspaper.article.ArticleException:
        print("Unable to download the article")
        pass            
    article_title = news_article.title
    #print(article_title)
    article_authors = news_article.title
    #print(article_authors)
    article_text.append(news_article.text)
    print(news_article.text)
    
    article_summary.append(news_article.summary)
    print(news_article.summary)
    article_publish_date = news_article.publish_date
    #print(article_publish_date)
    article_top_image = news_article.top_image
    #print(article_top_image)

In [15]:
# Extract the complete article text and and the NLP summary, for each of the top headline using the URL collected above
from newspaper import Article
from newspaper import Config
from summarizer import Summarizer

article_title = []
article_authors = []
article_text = []
article_summary = []
article_date = []
article_top_image = []
failed_url = []
full_summary = []
for h in headline_urls:
        user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        config = Config()
        config.browser_user_agent = user_agent
        page = Article(h, config=config)
        try :
            page.download()
            page.parse()
            page.nlp()
            article_text.append(page.text)
            #print(page.text)
            #article_summary.append(page.summary)
            #print(page.summary)
            model = Summarizer()
            result = model(page.text, min_length=60)
            full_summary = ''.join(result)
            article_summary.append(full_summary)
            #print(full_summary)
        except :
            failed_url = h
            print(failed_url)
            df = df[df.url != failed_url]
            pass

ModuleNotFoundError: No module named 'newspaper'

In [16]:
# Drop the row(s) in the dataframe containing the URL that failed to download
print(df.shape)

(70, 10)


In [17]:
# Create new columns in dataframe to for the text and summary of the article
df['articleText'] = article_text
df['articleSummary'] = article_summary

NameError: name 'article_text' is not defined

In [18]:
print(df.head())

  status  totalResults                                      source  \
0     ok            70  {'id': None, 'name': 'Sunbury Daily Item'}   
1     ok            70                {'id': 'cnn', 'name': 'CNN'}   
2     ok            70    {'id': 'usa-today', 'name': 'USA Today'}   
3     ok            70         {'id': None, 'name': 'Gizmodo.com'}   
4     ok            70          {'id': None, 'name': 'OregonLive'}   

                                       author  \
0  Francis Scarcella fscarcella@dailyitem.com   
1         Lauren del Valle and Amir Vera, CNN   
2                                 Josh Rivera   
3                                  Catie Keck   
4                The Oregonian/OregonLive.com   

                                               title  \
0  UPDATE Two people reported dead; no threat to ...   
1  3 people test positive for Covid-19 after taki...   
2  Five Guys says employees who refused to serve ...   
3  I Have Good News and Bad News About NBC's Stre...   
4  M

In [19]:
df.get_value(0, 'articleText') 

AttributeError: 'DataFrame' object has no attribute 'get_value'

In [20]:
df.get_value(0, 'articleSummary') 

AttributeError: 'DataFrame' object has no attribute 'get_value'

In [21]:
json = df.to_json() 
print(json) 

{"status":{"0":"ok","1":"ok","2":"ok","3":"ok","4":"ok","5":"ok","6":"ok","7":"ok","8":"ok","9":"ok","10":"ok","11":"ok","12":"ok","13":"ok","14":"ok","15":"ok","16":"ok","17":"ok","18":"ok","19":"ok","20":"ok","21":"ok","22":"ok","23":"ok","24":"ok","25":"ok","26":"ok","27":"ok","28":"ok","29":"ok","30":"ok","31":"ok","32":"ok","33":"ok","34":"ok","35":"ok","36":"ok","37":"ok","38":"ok","39":"ok","40":"ok","41":"ok","42":"ok","43":"ok","44":"ok","45":"ok","46":"ok","47":"ok","48":"ok","49":"ok","50":"ok","51":"ok","52":"ok","53":"ok","54":"ok","55":"ok","56":"ok","57":"ok","58":"ok","59":"ok","60":"ok","61":"ok","62":"ok","63":"ok","64":"ok","65":"ok","66":"ok","67":"ok","68":"ok","69":"ok"},"totalResults":{"0":70,"1":70,"2":70,"3":70,"4":70,"5":70,"6":70,"7":70,"8":70,"9":70,"10":70,"11":70,"12":70,"13":70,"14":70,"15":70,"16":70,"17":70,"18":70,"19":70,"20":70,"21":70,"22":70,"23":70,"24":70,"25":70,"26":70,"27":70,"28":70,"29":70,"30":70,"31":70,"32":70,"33":70,"34":70,"35":70,"36"

In [26]:
# Write the dataframe to a json file with orient set to index
df.to_json('daily_news_data/JSON/top_headline_business_data.json', orient='index')

In [35]:
# Write the dataframe to a csv file with index set to false
df.to_csv('daily_news_data/CSV/top_headline_business_data.csv', index=False)

In [36]:
df.head()

Unnamed: 0,status,totalResults,source,author,title,description,url,urlToImage,publishedAt,content
0,ok,70,"{'id': None, 'name': 'Sunbury Daily Item'}",Francis Scarcella fscarcella@dailyitem.com,UPDATE Two people reported dead; no threat to ...,HUMMELS WHARF — Two people are dead following ...,https://www.dailyitem.com/news/update-state-po...,https://bloximages.chicago2.vip.townnews.com/d...,2020-07-11T02:00:00Z,HUMMELS WHARF Two people are dead following a ...
1,ok,70,"{'id': 'cnn', 'name': 'CNN'}","Lauren del Valle and Amir Vera, CNN",3 people test positive for Covid-19 after taki...,The airline says it is following the guidance ...,https://www.cnn.com/2020/07/10/us/delta-air-li...,https://cdn.cnn.com/cnnnext/dam/assets/2003201...,2020-07-11T00:25:00Z,(CNN)Three people have tested positive for cor...
2,ok,70,"{'id': 'usa-today', 'name': 'USA Today'}",Josh Rivera,Five Guys says employees who refused to serve ...,A group of Five Guys employees who refused to ...,https://www.usatoday.com/story/money/food/2020...,https://www.gannett-cdn.com/-mm-/1e50282dc070f...,2020-07-11T00:13:20Z,A group of Five Guys employees who refused to ...
3,ok,70,"{'id': None, 'name': 'Gizmodo.com'}",Catie Keck,I Have Good News and Bad News About NBC's Stre...,"Well, which do you want first?",https://gizmodo.com/i-have-good-news-and-bad-n...,https://i.kinja-img.com/gawker-media/image/upl...,2020-07-11T00:10:00Z,"Well, which do you want first?\r\nLets start w..."
4,ok,70,"{'id': None, 'name': 'OregonLive'}",The Oregonian/OregonLive.com,"MUJI U.S.A. files for bankruptcy, blames coron...","The retail outlet has a Portland store, which ...",https://www.oregonlive.com/business/2020/07/mu...,https://www.oregonlive.com/resizer/pjarGAcVw1t...,2020-07-10T23:20:00Z,"MUJI U.S.A., a division of the Japanese retail..."


In [30]:
#df.to_csv('article_business_text.csv', columns=['articleText'],index=False)