# Today's News Trends

### Program gets the current top headlines from 
*All News Media*
### and performs sentiment analysis on the news to identify the current sentiment.

In [1]:
import tweepy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import time

import requests

from pprint import pprint

%matplotlib inline

# News API
import newsapi
from newsapi.newsapi_client import NewsApiClient
from newsapi.articles import Articles
from newsapi.sources import Sources

# NLTK
import nltk
from nltk import sent_tokenize
from string import punctuation

In [2]:
# Twitter API Keys
from config import (consumer_key, consumer_secret, access_token, access_token_secret)
#News API Keys
from config import newapi_key

In [3]:
# Import and initialize the VADER sentiment analyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Set up Tweepy API authentication (credentials are imported from config)
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())

# Initialize the News API clients.
# The key is read from config (newapi_key) instead of being hardcoded here, so
# the secret is not shared or committed together with the notebook.
# NOTE: binding the client to the name `newsapi` rebinds the imported `newsapi`
# module; getCurrentHeadlines() looks the client up under this name, so the
# name is kept.
newsapi = NewsApiClient(api_key=newapi_key)
newsapi_Articles = Articles(API_KEY=newapi_key)
newsapi_Sources = Sources(API_KEY=newapi_key)

### Method: getCurrentHeadlines() 
### Get the top headlines from newsapi. 
Default values: country='us', language='en', page_size=100 (max allowed) <br/>
Other parameters such as q (query), sources, category and page can be passed in.

##### Note: country and category params cannot be used with sources param
*Category possible values:* 'business','entertainment','general','health','science','sports','technology'

**Returns a list** <br/>
Each element is an article dictionary as returned by newsapi; the total number of matching articles reported by newsapi (`totalResults`) is printed by the method. <br/>
The method returns at most 500 articles <br/>

In [4]:
def getCurrentHeadlines(q=None,
                        sources=None,
                        language='en',
                        country='us',
                        category=None,
                        page_size=100,
                        page=None):
    """Fetch top headlines from the News API, following pagination.

    Parameters
    ----------
    q : str or None
        Free-text query forwarded to the API.
    sources : str or None
        Comma-separated source ids. When given, `country` and `category`
        are reset to None because they are not sent together with `sources`.
    language : str
        Language code forwarded to the API.
    country : str or None
        Country code forwarded to the API (ignored when `sources` is set).
    category : str or None
        Category forwarded to the API (ignored when `sources` is set).
    page_size : int or None
        Number of articles requested per page.
    page : int or None
        First page to request. None lets the API pick its default first
        page; following pages are then requested starting at page 2.
        When an explicit page is given, pagination continues at `page + 1`.

    Returns
    -------
    list
        The article dictionaries collected from all requested pages,
        capped at 500 entries.
    """
    max_articles = 500

    if sources is not None:
        country = None
        category = None

    print(f"Calling => getCurrentHeadlines(q={q}, \n\
                                sources={sources},\n\
                                language={language},\n\
                                country={country},\n\
                                category={category},\n\
                                page_size={page_size},\n\
                                page={page})")

    def fetch_page(page_number):
        # Keyword arguments keep the call correct even if the client's
        # positional parameter order differs between library versions.
        return newsapi.get_top_headlines(q=q,
                                         sources=sources,
                                         language=language,
                                         country=country,
                                         category=category,
                                         page_size=page_size,
                                         page=page_number)

    results = fetch_page(page)
    results_count = results['totalResults']
    # Copy so that extending the list never mutates the response object.
    top_headlines = list(results['articles'])

    # Keep requesting pages until everything the API reported (or the cap)
    # has been collected. Counting collected articles instead of dividing by
    # a fixed 100 makes this correct for any page_size and avoids an extra
    # request when the total is an exact multiple of the page size.
    wanted = min(results_count, max_articles)
    next_page = 2 if page is None else page + 1
    while len(top_headlines) < wanted:
        batch = fetch_page(next_page)['articles']
        if not batch:
            # The API has nothing more to give; stop instead of looping forever.
            break
        top_headlines.extend(batch)
        next_page += 1

    # A page size that does not divide the cap evenly may overshoot it slightly.
    del top_headlines[max_articles:]

    print(f"totalResults from newsapi.get_top_headlines() = {results_count} \n \
          top_headlines returned by the method = {len(top_headlines)}")
    return top_headlines

**Call Details**

*Default values: country='us', language='en', page_size=100.
Other parameters such as q (query), sources, category and page can be passed in.*

##### Note: country and category params cannot be used with sources param
Category possible values: 'business','entertainment','general','health','science','sports','technology'

*Sample:* 

**Correct Call:**

getCurrentHeadlines() *returns top headlines for US, English* <br/>
getCurrentHeadlines(sources='bbc-news') *returns top headlines from bbc-news, English*<br/>
getCurrentHeadlines(category='sports') *returns top sports headlines for the US, in English (country defaults to 'us')* <br/>
getCurrentHeadlines(q='FIFA World Cup', country=None) *returns top headlines matching 'FIFA World Cup', in English (not restricted to US)* <br/>

**Incorrect Call: Category will be made None because sources is set.**

getCurrentHeadlines(sources='bbc-news', *category='sports'*)

**Returns a list** <br/>
Each element is an article dictionary as returned by newsapi; the total number of matching articles reported by newsapi (`totalResults`) is printed by the method. <br/>
The method returns at most 500 articles <br/>

In [5]:
# Comma-separated News API source ids.
# Built with join() because a backslash continuation inside a string literal
# embeds the next line's indentation, which would put stray spaces in front of
# 'nbc-news' in the resulting string.
news_sources = ','.join([
    'google-news', 'abc-news', 'cbs-news', 'cnbc', 'bbc-news', 'cnn',
    'fox-news', 'nbc-news', 'the-washington-post', 'the-washington-times',
])
# Alternative calls:
#headlines = getCurrentHeadlines(sources=news_sources)
#headlines = getCurrentHeadlines(country=None)
headlines = getCurrentHeadlines()

Calling => getCurrentHeadlines(q=None, 
                                sources=None,
                                language=en,
                                country=us,
                                category=None,
                                page_size=100,
                                page=None)
totalResults from newsapi.get_top_headlines() = 20 
           top_headlines returned by the method = 20


In [7]:
# Show only the first couple of articles; the full list can hold up to 500
# dictionaries and would flood the notebook output.
pprint(headlines[:2])

[{'author': 'Daniella Silva',
  'description': 'Federal officers in riot gear moved to reopen an Immigration '
                 'and Customs Enforcement headquarters in Oregon Thursday '
                 'morning following a blockade by protesters.',
  'publishedAt': '2018-06-28T20:12:25Z',
  'source': {'id': 'nbc-news', 'name': 'NBC News'},
  'title': 'Federal officers move to reopen Portland ICE building closed over '
           'protests',
  'url': 'https://www.nbcnews.com/news/us-news/federal-officers-move-reopen-portland-ice-building-closed-over-protests-n887326',
  'urlToImage': 'https://media4.s-nbcnews.com/j/newscms/2018_26/2480976/180628-ice-portland-protest-ew-403p_ed6bd04a5f03346c2cb0667366d3da71.1200;630;7;70;5.jpg'},
 {'author': 'ABC News',
  'description': 'President Donald Trump highlighted his economic policies '
                 'Thursday at the groundbreaking for a massive $10 billion '
                 'FoxConn factory complex that may bring thousands of jobs to '
  

In [8]:
# Check the container type returned by getCurrentHeadlines()
type(headlines)

list

In [9]:
# Build a DataFrame from the list of article dictionaries (one row per article)
news_top_headlines_df = pd.DataFrame(headlines)

In [10]:
# Preview the first rows of the headlines frame
news_top_headlines_df.head()

Unnamed: 0,author,description,publishedAt,source,title,url,urlToImage
0,Daniella Silva,Federal officers in riot gear moved to reopen ...,2018-06-28T20:12:25Z,"{'id': 'nbc-news', 'name': 'NBC News'}",Federal officers move to reopen Portland ICE b...,https://www.nbcnews.com/news/us-news/federal-o...,https://media4.s-nbcnews.com/j/newscms/2018_26...
1,ABC News,President Donald Trump highlighted his economi...,2018-06-28T20:03:00Z,"{'id': 'abc-news', 'name': 'ABC News'}","Amid Harley feud, Trump hails economy at futur...",https://abcnews.go.com/Politics/wireStory/trum...,https://s.abcnews.com/images/Politics/WireAP_2...
2,NBC10 Staff,A suspect has been found in the mystery explos...,2018-06-28T19:53:49Z,"{'id': None, 'name': 'Nbcphiladelphia.com'}",Suspect Arrested in Mystery Bucks County Bombi...,https://www.nbcphiladelphia.com/news/local/ATF...,https://media.nbcphiladelphia.com/images/1200*...
3,Erin Jensen,"""I’m a business owner and people can run their...",2018-06-28T19:39:00Z,"{'id': 'usa-today', 'name': 'USA Today'}",Spike Mendelsohn condemns Red Hen's ouster of ...,https://www.usatoday.com/story/life/entertaint...,https://www.gannett-cdn.com/-mm-/1d9e9070b233b...
4,,There are reports that multiple people have be...,2018-06-28T19:29:14Z,"{'id': 'cnn', 'name': 'CNN'}",Shooting at Maryland newspaper,https://www.cnn.com/us/live-news/maryland-news...,https://cdn.cnn.com/cnnnext/dam/assets/1806281...


In [11]:
# Pull the display name out of each article's nested 'source' dictionary,
# in the same order as `headlines`
news_channel_name_list = [article['source']['name'] for article in headlines]

**News Channel Name List**

In [12]:
# Attach the channel names as a new 'News Channel' column; the list was built
# from `headlines` in order, so it lines up with the rows of the frame
news_top_headlines_df['News Channel'] = news_channel_name_list

In [13]:
# Preview the frame again to confirm the new 'News Channel' column
news_top_headlines_df.head()

Unnamed: 0,author,description,publishedAt,source,title,url,urlToImage,News Channel
0,Daniella Silva,Federal officers in riot gear moved to reopen ...,2018-06-28T20:12:25Z,"{'id': 'nbc-news', 'name': 'NBC News'}",Federal officers move to reopen Portland ICE b...,https://www.nbcnews.com/news/us-news/federal-o...,https://media4.s-nbcnews.com/j/newscms/2018_26...,NBC News
1,ABC News,President Donald Trump highlighted his economi...,2018-06-28T20:03:00Z,"{'id': 'abc-news', 'name': 'ABC News'}","Amid Harley feud, Trump hails economy at futur...",https://abcnews.go.com/Politics/wireStory/trum...,https://s.abcnews.com/images/Politics/WireAP_2...,ABC News
2,NBC10 Staff,A suspect has been found in the mystery explos...,2018-06-28T19:53:49Z,"{'id': None, 'name': 'Nbcphiladelphia.com'}",Suspect Arrested in Mystery Bucks County Bombi...,https://www.nbcphiladelphia.com/news/local/ATF...,https://media.nbcphiladelphia.com/images/1200*...,Nbcphiladelphia.com
3,Erin Jensen,"""I’m a business owner and people can run their...",2018-06-28T19:39:00Z,"{'id': 'usa-today', 'name': 'USA Today'}",Spike Mendelsohn condemns Red Hen's ouster of ...,https://www.usatoday.com/story/life/entertaint...,https://www.gannett-cdn.com/-mm-/1d9e9070b233b...,USA Today
4,,There are reports that multiple people have be...,2018-06-28T19:29:14Z,"{'id': 'cnn', 'name': 'CNN'}",Shooting at Maryland newspaper,https://www.cnn.com/us/live-news/maryland-news...,https://cdn.cnn.com/cnnnext/dam/assets/1806281...,CNN


In [None]:
# Filler words that are dropped from the headlines before word counting
remove_words = [
    'the', 'an', 'a', 'is', 'i',
    'and', 'then', 'like', 'there', 'their',
]

In [None]:
def remove_punctuation(chars):
    """Join the items of `chars` into one string, leaving out punctuation.

    `chars` is iterated item by item (character by character for a str);
    any item found in `string.punctuation` is skipped.
    """
    kept = []
    for ch in chars:
        if ch in punctuation:
            continue
        kept.append(ch)
    return ''.join(kept)

# `df` is an alias (not a copy) of 'news_top_headlines_df': both names refer to
# the same DataFrame object, so columns added through `df` show up on both
df = news_top_headlines_df

In [None]:
# Take the headlines frame as the working DataFrame.
# The original cell was an incomplete assignment (`df =`), which is a
# SyntaxError. A copy is used so the text-processing columns added later do
# not alter the frame that was displayed earlier in the notebook.
df = news_top_headlines_df.copy()

In [None]:
# Tokenize each headline into sentences.
# The headlines frame has no 'text' column; the headline text lives in 'title'
# (see the columns in the .head() previews). Missing titles are replaced by ''
# so sent_tokenize always receives a string.
df['text_split'] = df['title'].fillna('').map(sent_tokenize)

In [None]:
# Get the text ready to analyze: one record per sentence, plus a flat list of
# every kept word (lower-cased) for the frequency count.
sentences = []
wordfrq = []
for row_id, row in df.iterrows():
    # `row` is a Series: columns are read with [] indexing, not by calling it
    for sentence in row['text_split']:
        # Strip punctuation first so tokens such as "The," still match the
        # stopword list, and drop tokens that were punctuation only.
        cleaned = (remove_punctuation(w) for w in sentence.split())
        filtered_words = [w for w in cleaned if w and w.lower() not in remove_words]
        # The frame has no 'sent_id' column; the row index identifies the headline
        sentences.append({'sent_id': row_id,
                          'text': sentence.strip('.'),
                          'words': filtered_words})
        # Accumulate across all rows instead of overwriting on every iteration
        wordfrq.extend(w.lower() for w in filtered_words)

In [None]:
# masterlist: one entry per news channel holding the words of its headlines.
# Iterating a DataFrame directly yields its column labels, so the rows are
# grouped by the 'News Channel' column instead.
# NOTE(review): the per-channel {'Channel', 'Words'} layout is the presumed
# intent of the original draft - confirm before building on it.
masterlist = []
for channel, titles in df.groupby('News Channel')['title']:
    channel_words = [
        word.lower()
        for title in titles.dropna()
        for word in (remove_punctuation(token) for token in title.split())
        if word and word.lower() not in remove_words
    ]
    masterlist.append({'Channel': channel, 'Words': channel_words})

In [None]:
# Build a new DataFrame from the per-sentence records collected in `sentences`
# (this creates a separate frame; it does not rename `df`)
df_words = pd.DataFrame(sentences)

In [None]:
# TODO: word cloud
# Word frequency distribution computed from `wordfrq`
freq = nltk.FreqDist(wordfrq)
# TODO: tf-idf
# TODO: stemming
# TODO: lemmatization
# TODO: thematic analysis

In [None]:
# Start the VADER analysis; it is meant to run on the original headline text.
# SentimentIntensityAnalyzer is already imported and instantiated as `analyzer`
# in the setup cell, so that instance is reused instead of importing the class
# and building a second analyzer. `analyser` is kept as an alias for code that
# uses this spelling.
analyser = analyzer