In [3]:
from pytextrank import *
from eventregistry import *
from datetime import datetime
from nltk import download
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
download('stopwords')
download('punkt')
import json

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hardik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/hardik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
source_uri = [
    'bbc.com',                          # news
    'blogs.wsj.com', 
    'npr.org',
    'pbs.org',
    'abcnews.go.com',
    'cbsnews.com',
    'nbcnews.com',
    'apnews.com',
    'edition.cnn.com',
    'nytimes.com',
    'nypost.com',
    'washingtonpost.com',
    'msnbc.com',
    'theguardian.com',
    'bloomberg.com',
    'newyorker.com',
    'politico.com',
    'foxnews.com',
    'huffingtonpost.com',
    'economist.com',
    'buzzfeednews.com',
    'vox.com',
    'reuters.com',
    'in.reuters.com',
    'forbes.com',
    'ndtv.com',
    'timesofindia.indiatimes.com',
    'economictimes.indiatimes.com',
    'ibtimes.co.in',
    'huffingtonpost.in',
    'indiatoday.in'
    'foxsports.com',                    # sports
    'espn.com',
    'nfl.com',
    'cbssports.com',
    'fifa.com',
    'techcrunch.com',                   # technology
    'wired.com',
    'lifehacker.com',
    'macworld.com',
    'pcworld.com',
    'engadget.com',
    'readwrite.com',
    'mashable.com',
    'gizmodo.com',
    'venturebeat.com',
    'recode.net',
    'cnet.com',
    'howtogeek.com',
    'entrepreneur.com',                 # business
    'hbr.org',
    'freakonomics.com',
    'ritholtz.com',
    'fortune.com',
    'business-standard.com',
    'businessinsider.com',
    'foxbusiness.com',
    'businesstimes.com.sg',
    'factly.in',                        # fact-checking
    'factcheck.org',
    'snopes.com',
    'checkyourfact.com',
    'politifact.com',
    'thequint.com'
    ]

In [5]:
def find_phrases():
    # file paths
    ip = "./data/input.json"
    op1 = "./data/op1.json"
    op2 = "./data/op2.json"
    op3 = "./data/op3.json"

    # Perform statistical parsing/tagging on a document in JSON format
    with open(op1, 'w') as f:
        for graf in parse_doc(json_iter(ip)):
            f.write("%s\n" % pretty_print(graf._asdict()))

    # Collect and normalize the key phrases from a parsed document      
    graph, ranks = text_rank(op1)
    render_ranks(graph, ranks)

    with open(op2, 'w') as f:
        for rl in normalize_key_phrases(op1, ranks):
            f.write("%s\n" % pretty_print(rl._asdict()))

    # Summarize a document based on most significant sentences and key phrases
    phrases = ", ".join(set([p for p in limit_keyphrases(op2, phrase_limit=12)]))
    phrases = [phrase.strip() for phrase in phrases.split(',')]
    phrases.sort(key=lambda x: len(x.split()), reverse=True)

    # remove stop words from each phrase
    stop_words = set(stopwords.words('english'))
    for index, phrase in enumerate(phrases):
        word_tokens = word_tokenize(phrase) 
        phrase = " ".join([w for w in word_tokens if not w in stop_words])
        phrases[index] = phrase

    # select longest phrases while maximizing keyword limit for API
    phrases_list = [phrase.split() for phrase in phrases]
    phrases_final = []
    counter = 0
    for phrase in phrases_list:
        if (counter+len(phrase)) <= 15:
            phrases_final.append(" ".join(phrase))
            counter += len(phrase)
        else:
            continue

    return phrases_final

In [6]:
phrases = find_phrases()
print(phrases)

['anomalous atmospheric wave train', 'pacific extreme pattern', 'eastern us', 'extreme heat waves', 'new study', 'event']


In [7]:
API_KEY = "c40d087b-97be-4617-9008-19d209368072"
er = EventRegistry(apiKey = API_KEY)

using user provided API key for making requests
Event Registry host: http://eventregistry.org
Text analytics host: http://analytics.eventregistry.org


In [8]:
it = QueryArticlesIter(
        keywords = QueryItems.OR(phrases),
        dataType = ["news", "pr", "blog"],
        keywordsLoc = "body,title",
        sourceUri = QueryItems.OR(source_uri),
        lang="eng",
        dateStart = datetime(2019, 1, 1)
    )

In [16]:
# print("Total articles retrieved: ", it.count(er))

Total articles retrieved:  18203


In [9]:
res = it.execQuery(er, 
                    sortBy = "rel", # sourceAlexaGlobalRank, socialScore, sourceImportance
                    # maxItems = 10,
                    returnInfo = ReturnInfo(
                        articleInfo = ArticleInfoFlags(
                            links = True,
                            image = True,
                            socialScore = True,
                            sentiment = True
                        ),
                        sourceInfo = SourceInfoFlags(
                            ranking = True
                        )
                    )
                )

In [15]:
res

TypeError: 'QueryArticlesIter' object does not support indexing

In [None]:
data = {}
data["articles"] = 

In [52]:
with open("./results/er_opt.json", "w") as fp:
    for art in res:
        json.dump(art, fp)

In [37]:
for art in res:
    print(art)

{'uri': '1082996795', 'lang': 'eng', 'isDuplicate': False, 'date': '2019-03-09', 'time': '02:28:00', 'dateTime': '2019-03-09T02:28:00Z', 'dataType': 'news', 'sim': 0, 'url': 'https://www.pbs.org/wgbh/nova/article/engineers-heatproof-new-material-using-3d-printing/', 'title': 'Engineers Could Heatproof Tech With New 3D-Printed Material', 'body': 'Receive emails about upcoming NOVA programs and related content, as well as featured reporting about current events through a science lens.\n\nEmail AddressZip CodeSubscribe\n\nA new kind of 3D printed material could make technology more heat-resistant.\n\nUnlike most naturally occurring solids, these manmade, mesh-like materials shrink when they\'re heated up. By tweaking the materials\' chemical composition, the researchers could control how much they shrank at different temperatures. This work could apply to everything from circuitry to dentistry.\n\nSupport Provided ByLearn More\n\nMany devices warp or break at high temperatures because the

In [None]:
res_fp = open("./results/er_opt.txt", "r")
