In [45]:
import requests
import json
from contextlib import closing

# get API key for newsapi.org
APIKEY_FILE = '../newsapi_key.txt'

def read_api_key(api_key_fname):
    """
        read in api key from file. relative path, file just contains api key. return key as string
    """
    with open(api_key_fname) as f:
        api_key = f.read()
    
    return api_key

api_key = read_api_key(APIKEY_FILE)

In [46]:
def get_news_source_name_id_map(api_key):
    # Get news source id : name map
    # parameter api key
    url_template = "https://newsapi.org/v2/sources?apiKey=%s"
    
    # create get request url for news source names and ids
    rurl = url_template % api_key
    
    # make request
    with closing(requests.get(rurl)) as r:
        response = r.json()

    # make sure results were returned
    assert response['status'] == 'ok'
    
    # create python dictonary key: news source name, value: news source id (used in request for articles)
    name_id_map = {}
    sources = response['sources']
    for s in sources:
        name_id_map[s['name']] = s['id']
    
    return name_id_map
    
#get_news_source_name_id_map(api_key)

In [100]:
# or by using python package
#--> pip install newsapi-python
from newsapi import NewsApiClient

# Init
newsapi = NewsApiClient(api_key=api_key)

# get sources
sources = newsapi.get_sources()

{'status': 'ok', 'sources': [{'id': 'abc-news', 'name': 'ABC News', 'description': 'Your trusted source for breaking news, analysis, exclusive interviews, headlines, and videos at ABCNews.com.', 'url': 'http://abcnews.go.com', 'category': 'general', 'language': 'en', 'country': 'us'}, {'id': 'abc-news-au', 'name': 'ABC News (AU)', 'description': "Australia's most trusted source of local, national and world news. Comprehensive, independent, in-depth analysis, the latest business, sport, weather and more.", 'url': 'http://www.abc.net.au/news', 'category': 'general', 'language': 'en', 'country': 'au'}, {'id': 'aftenposten', 'name': 'Aftenposten', 'description': 'Norges ledende nettavis med alltid oppdaterte nyheter innenfor innenriks, utenriks, sport og kultur.', 'url': 'https://www.aftenposten.no', 'category': 'general', 'language': 'no', 'country': 'no'}, {'id': 'al-jazeera-english', 'name': 'Al Jazeera English', 'description': 'News, analysis from the Middle East and worldwide, multime

In [47]:
# get list news source ids, which can be used for the article get requests
news_sources_ids = list(get_news_source_name_id_map(api_key).values())
print(news_sources_ids)

['abc-news', 'abc-news-au', 'aftenposten', 'al-jazeera-english', 'ansa', 'argaam', 'ars-technica', 'ary-news', 'associated-press', 'australian-financial-review', 'axios', 'bbc-news', 'bbc-sport', 'bild', 'blasting-news-br', 'bleacher-report', 'bloomberg', 'breitbart-news', 'business-insider', 'business-insider-uk', 'buzzfeed', 'cbc-news', 'cbs-news', 'cnbc', 'cnn', 'cnn-es', 'crypto-coins-news', 'daily-mail', 'der-tagesspiegel', 'die-zeit', 'el-mundo', 'engadget', 'entertainment-weekly', 'espn', 'espn-cric-info', 'financial-post', 'financial-times', 'focus', 'football-italia', 'fortune', 'four-four-two', 'fox-news', 'fox-sports', 'globo', 'google-news', 'google-news-ar', 'google-news-au', 'google-news-br', 'google-news-ca', 'google-news-fr', 'google-news-in', 'google-news-is', 'google-news-it', 'google-news-ru', 'google-news-sa', 'google-news-uk', 'goteborgs-posten', 'gruenderszene', 'hacker-news', 'handelsblatt', 'ign', 'il-sole-24-ore', 'independent', 'infobae', 'info-money', 'la-gac

In [114]:
"""
Article Scraper from a given url

requirements: 
	-Newspaper3k, https://github.com/codelucas/newspaper, http://newspaper.readthedocs.io/en/latest/, https://newsapi.org/docs/client-libraries/python
    --> pip install newspaper3k
Notes: 	
	newspaper offers nlp summary
	article.nlp()
	print(article.summary)

"""
# external
from newspaper import Article

def get_full_article(url):
    # does not work for video news sources etc
    article = Article(url)
    article.download()
    article.parse()
    text = article.text
    if len(text) < 10:
        #print("warning with article url when extracting full text. Function: get_full_article")
        return None
    return text

# testing
test_article_url = "http://www.foxnews.com/world/2018/08/28/likely-lula-replacement-denies-corruption-charges-in-brazil.html"
print(get_full_article(test_article_url))

next

prev

The man in line to replace jailed presidential candidate Luiz Inacio Lula da Silva as the Brazilian Workers' Party standard-bearer on Tuesday denied accusations of corruption.

Prosecutors accuse Fernando Haddad of receiving indirect payments to his 2012 campaign for Sao Paulo mayor.

Construction company UTC Engenharia got preferential treatment on bids after covering about US$1.6 million of debt associated with Haddad's campaign, according to a former accusation filed Monday. Although Haddad did not request payment directly, he had full control over the scheme, according to prosecutors.

On Tuesday, Haddad responded while campaigning in Rio de Janeiro. He said that he cancelled a multimillion dollar project with a company belonging to the UTC group after an employee alerted him that the company was overcharging the government.

"How is it that a mayor who cancels a corrupt construction project gets put through this instead of being thanked for saving the city tens of mill

In [115]:
# example selected news sources ids
selected_news_sources_ids = ["fox-news", "msnbc.com", "vice-news", "cnn", "bbc-news", "breitbart-news", "the-huffington-post", "the-new-york-times"]

# generate a get request url for given news source for newsapi.org
def generate_articles_get_request_url(news_source_id, api_key):
    """ 
        ex: https://newsapi.org/v2/everything?sources=fox-news,msnbc.com&apiKey=f86cb840aef7406cbf34bc6d78d31297
        :param news_sources: list of strings to include into formatted get request
        :param api_key: string api key for newsapi.org
        :returns: string, get request url
    """
    # first %s : comma seperated news sources
    # second %s : api key string
    url_template = "https://newsapi.org/v2/everything?sources=%s&apiKey=%s"
    
    return url_template % (news_source_id, api_key)

# make request for articles from selected news source
def news_api_response_articles(news_source_id, api_key):
    # generate request url using news source id and api key
    rurl = generate_articles_get_request_url(news_source_id, api_key)

    # make request
    with closing(requests.get(rurl)) as r:
        response = r.json()

    # make sure results were returned
    assert response['status'] == 'ok'
    
    return response
    

# get list of article objects from response from newsapi.org
def get_list_of_article_objects(response, include_full_articles=True):
    
    # get array of article json objects/list
    articles = response['articles']
    
    # for article in articles look at 'source', 'title', 'description', 'url'
    # extract article sources titles descriptions and links to urls of actual articles
    article_objects = []
    for a in articles:
        #news_source_name = a['source']['name']
        
        # attempt to get full article text using t using Newspaper3k
        #text = get_full_article(a['url'])
        
        article_object = {
            'news_source_id' : a['source']['id'], # article news source id
            'title' : a['title'], # article title 
            'description' : a['description'], # breif article description
            'url' : a['url'], # full article url
        }
        
        if include_full_articles:
            article_object['text'] = get_full_article(a['url']) # get full article text using Newspaper3k -- often None
        
        article_objects.append(article_object)
        
    return article_objects



# get article description objects from source test
selected_source_id = selected_news_sources_ids[0]
print(selected_source_id)
response = news_api_response_articles(selected_source_id, api_key)
source_article_objects = get_list_of_article_objects(response)
print(source_article_objects)



fox-news
[{'news_source_id': 'fox-news', 'title': 'Trump takes on Google, Twitter and Facebook', 'description': "President accuses social media giants of unfair treatment of conservatives; the 'Special Report' All-Star panel reacts.", 'url': 'http://video.foxnews.com/v/5828021406001/', 'text': None}, {'news_source_id': 'fox-news', 'title': "Iran's parliament rebukes Rouhani over economic woes", 'description': "Iranian lawmakers vote to reject President Rouhani's response to the faltering economy; correspondent Benjamin Hall reports on the deep political divisions forming as U.S. sanctions take hold.", 'url': 'http://video.foxnews.com/v/5828019565001/', 'text': None}, {'news_source_id': 'fox-news', 'title': 'US has no plans to suspend more South Korea military drills', 'description': 'Defense Secretary James Mattis says exercises are ongoing as Pyongyang threatens to resume missile tests; national security correspondent Jennifer Griffin reports on the fallout from the canceled U.S. visi

In [127]:
# or by using python package
# -- https://github.com/mattlisiv/newsapi-python
#--> pip install newsapi-python
#from newsapi import NewsApiClient

def get_all_articles(source_id, api_key, max_articles=500, include_full_article_text=True):
    
    # init
    newsapi = NewsApiClient(api_key=api_key)
    
    # get all articles incrementing page number
    #total_results = response['totalResults']
    
    all_article_objects = []
    page_number = 1
    
    while len(all_article_objects) < max_articles:
        page_number = page_number + 1
        response = newsapi.get_everything(sources=selected_source_id,
                                      language='en',
                                      page=page_number, # can also use dates
                                        page_size=25) # 100 is maximum page size
    
        assert response['status'] == 'ok'
        all_article_objects += get_list_of_article_objects(response, include_full_articles=include_full_article_text)
        print("number of articles collected: %s" % len(all_article_objects))
        
    return all_article_objects
        

# example -- set include full article text to True for scrapping the actual site
all_article_objects = get_all_articles(selected_source_id, api_key, max_articles=50, include_full_article_text=False)

print(len(all_article_objects))
print(all_article_objects)

number of articles collected: 25
number of articles collected: 50
50
[{'news_source_id': 'fox-news', 'title': 'Student arrested after alleged freakout over pro-Trump cap', 'description': "A high school senior in California was arrested after she went on a profanity-laced tirade against a classmate for wearing a 'Make America Great Again' hat.", 'url': 'http://video.foxnews.com/v/5827963169001/'}, {'news_source_id': 'fox-news', 'title': 'Technical issues may extend polling hours in Arizona', 'description': 'Over 200 polling locations in the Phoenix area were down due to voting machine issues; Peter Doocy has the latest from Tempe.', 'url': 'http://video.foxnews.com/v/5827966961001/'}, {'news_source_id': 'fox-news', 'title': "Incredible Maya discovery: Ancient king's mask uncovered in Mexico", 'description': 'An ancient mask depicting a 7th-century Maya king has been discovered in southern Mexico.', 'url': 'http://www.foxnews.com/science/2018/08/28/incredible-maya-discovery-ancient-kings