In [1]:
from pytextrank import *
from eventregistry import *
from datetime import datetime
from nltk import download
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
download('stopwords')
download('punkt')
import json

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hardik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/hardik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
source_uri = [
    'bbc.com',                          # news
    'blogs.wsj.com', 
    'npr.org',
    'pbs.org',
    'abcnews.go.com',
    'cbsnews.com',
    'nbcnews.com',
    'apnews.com',
    'edition.cnn.com',
    'nytimes.com',
    'nypost.com',
    'washingtonpost.com',
    'msnbc.com',
    'theguardian.com',
    'bloomberg.com',
    'newyorker.com',
    'politico.com',
    'foxnews.com',
    'huffingtonpost.com',
    'economist.com',
    'buzzfeednews.com',
    'vox.com',
    'reuters.com',
    'in.reuters.com',
    'forbes.com',
    'ndtv.com',
    'timesofindia.indiatimes.com',
    'economictimes.indiatimes.com',
    'ibtimes.co.in',
    'huffingtonpost.in',
    'indiatoday.in'
    'foxsports.com',                    # sports
    'espn.com',
    'nfl.com',
    'cbssports.com',
    'fifa.com',
    'techcrunch.com',                   # technology
    'wired.com',
    'lifehacker.com',
    'macworld.com',
    'pcworld.com',
    'engadget.com',
    'readwrite.com',
    'mashable.com',
    'gizmodo.com',
    'venturebeat.com',
    'recode.net',
    'cnet.com',
    'howtogeek.com',
    'entrepreneur.com',                 # business
    'hbr.org',
    'freakonomics.com',
    'ritholtz.com',
    'fortune.com',
    'business-standard.com',
    'businessinsider.com',
    'foxbusiness.com',
    'businesstimes.com.sg',
    'factly.in',                        # fact-checking
    'factcheck.org',
    'snopes.com',
    'checkyourfact.com',
    'politifact.com',
    'thequint.com'
    ]

In [3]:
def find_phrases():
    # file paths
    ip = "./data/input.json"
    op1 = "./data/op1.json"
    op2 = "./data/op2.json"
    op3 = "./data/op3.json"

    # Perform statistical parsing/tagging on a document in JSON format
    with open(op1, 'w') as f:
        for graf in parse_doc(json_iter(ip)):
            f.write("%s\n" % pretty_print(graf._asdict()))

    # Collect and normalize the key phrases from a parsed document      
    graph, ranks = text_rank(op1)
    render_ranks(graph, ranks)

    with open(op2, 'w') as f:
        for rl in normalize_key_phrases(op1, ranks):
            f.write("%s\n" % pretty_print(rl._asdict()))

    # Summarize a document based on most significant sentences and key phrases
    phrases = ", ".join(set([p for p in limit_keyphrases(op2, phrase_limit=12)]))
    phrases = [phrase.strip() for phrase in phrases.split(',')]
    phrases.sort(key=lambda x: len(x.split()), reverse=True)

    # remove stop words from each phrase
    stop_words = set(stopwords.words('english'))
    for index, phrase in enumerate(phrases):
        word_tokens = word_tokenize(phrase) 
        phrase = " ".join([w for w in word_tokens if not w in stop_words])
        phrases[index] = phrase

    # select longest phrases while maximizing keyword limit for API
    phrases_list = [phrase.split() for phrase in phrases]
    phrases_list = [phrase for phrase in phrases_list if len(phrase) > 1]
    phrases_final = []
    counter = 0
    for phrase in phrases_list:
        if (counter+len(phrase)) <= 15:
            phrases_final.append(" ".join(phrase))
            counter += len(phrase)
        else:
            continue

    return phrases_final

In [18]:
phrases = find_phrases()
print(phrases)

['pacific extreme pattern', 'anomalous atmospheric wave train', 'eastern us', 'extreme heat waves', 'little rain']


In [35]:
API_KEY = "c40d087b-97be-4617-9008-19d209368072"
er = EventRegistry(apiKey = API_KEY)

it = QueryArticlesIter(
        keywords = QueryItems.OR(phrases),
        dataType = ["news"],
        keywordsLoc = "body",
        sourceUri = QueryItems.OR(source_uri),
        lang="eng",
        dateStart = datetime(2019, 1, 1)
    )

res = it.execQuery(er, 
                    sortBy = "rel", # sourceAlexaGlobalRank, socialScore, sourceImportance
                    maxItems = 10,
                    returnInfo = ReturnInfo(
                        articleInfo = ArticleInfoFlags(
                            links = True,
                            image = True,
                            socialScore = True,
                            sentiment = True
                        ),
                        sourceInfo = SourceInfoFlags(
                            ranking = True
                        )
                    )
                )

using user provided API key for making requests
Event Registry host: http://eventregistry.org
Text analytics host: http://analytics.eventregistry.org


In [6]:
data = {}
data["articles"] = []
for art in res:
    data["articles"].append(json.dumps(art))

# data = json.dumps(data)

with open("./results/er_opt.json", "w") as fp:
    json.dump(data, fp)

In [24]:
with open("./results/er_opt.json", "r") as fp:
    df = json.load(fp)
    for i in range(len(df["articles"])):
        df["articles"][i] = json.loads(df["articles"][i])

In [24]:
df = json.loads(data["articles"][0])

IndexError: list index out of range

In [36]:
from uuid import uuid4
# create json structure for storage
data = {}
data["uid"] = str(uuid4())
data["articles"] = []
for art in res:
    data["articles"].append(art)

In [37]:
data

{'uid': '6c5786c6-e7d4-4a62-bc43-76226a208e32',
 'articles': [{'uri': '1100231420',
   'lang': 'eng',
   'isDuplicate': False,
   'date': '2019-03-31',
   'time': '05:28:00',
   'dateTime': '2019-03-31T05:28:00Z',
   'dataType': 'news',
   'sim': 0.8078431487083435,
   'url': 'https://www.washingtonpost.com/sports/wizards/harden-scores-50-has-triple-double-as-rockets-beat-kings/2019/03/30/6566a0f6-5368-11e9-bdb7-44f948cc0605_story.html',
   'title': 'Harden scores 50, has triple-double as Rockets beat Kings',
   'body': "By Associated Press March 31 at 12:58 AM\n\nHOUSTON -- James Harden had 50 points, 11 rebounds and 10 assists, scoring 10 straight Houston points down the stretch to help the Rockets beat the Sacramento Kings 119-108 on Saturday.\n\nHarden had his 42nd career triple-double and third 50-point game in the last six. He made seven 3-pointers to become the ninth player in NBA history to reach 2,000 career 3s.\n\nClint Capela added 24 points and 15 rebounds, and Chris Paul h