In [1]:
from stanfordcorenlp import StanfordCoreNLP
from datetime import datetime
import json
import re, string, os, itertools
import nltk
from nltk.corpus import stopwords
from collections import defaultdict
import os
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import subprocess
import shlex
from pymongo import MongoClient
import pathlib

In [47]:
# Run configuration: which entity to analyse and where inputs / outputs live.
resolved_entity_table = 'entities_resolved_overall'  # Mongo collection holding the resolved entities
entity_name = 'Reetika Khera'  # entity whose coverage is analysed
article_table = 'articles'  # Mongo collection holding the articles
res_folder = './RESULTS/'  # folder that receives the generated text files
directory3 = os.path.dirname(res_folder)
# exist_ok avoids the check-then-create race and keeps the cell safe to re-run.
os.makedirs(directory3, exist_ok=True)

In [37]:
def get_all_entities(collection, types):
    '''
    Parse all the entities of the requested types, grouped by type.

    :param collection: object exposing ``aggregate(pipeline)`` (used here with the
                       resolved-entity collection); each yielded document is read
                       for its stdName, type, aliases, articleIds and num fields
    :param types: iterable of entity type names to keep
    :return: dict mapping every requested type to a list of
             {"name", "coverage", "aliases", "articleIds"} dicts, sorted by
             coverage (number of article ids) in descending order
    '''
    pipeline = [{"$project": {"stdName": 1, "type": 1, "aliases": 1, "articleIds": 1,
                              "num": {"$size": "$articleIds"}}}]
    grouped = {entity_type: [] for entity_type in types}

    # Iterate the cursor directly instead of copying every document into a list first.
    for ent in collection.aggregate(pipeline):
        bucket = grouped.get(ent['type'])
        if bucket is not None:
            bucket.append(ent)

    top_n_entities = {}
    for entity_type, ents in grouped.items():
        ents.sort(key=lambda x: x['num'], reverse=True)
        top_n_entities[entity_type] = [
            {"name": obj['stdName'], "coverage": obj['num'],
             "aliases": obj['aliases'], "articleIds": obj['articleIds']}
            for obj in ents
        ]
    return top_n_entities

In [38]:
# Configurations required for By-Statement Extraction

# do not change the order
requiredCategs = [
    'FRONT_PAGE',
    'REGIONAL_NEWS',
    'NATIONAL_NEWS',
    'INTERNATIONAL_NEWS',
    'SPORTS',
    'BUSINESS',
    'OPINION',
]

# Connection settings for the MongoDB instance.
mongoConfigs = dict(host='10.237.26.159', port=27017, db='media-db')

# Connection settings kept under the "es" name (not used in the cells visible here).
esConfigs = dict(host='10.237.26.25', port=9200, db='media-db')

proxy_server = 'https://act4d.iitd.ernet.in:3128'
# proxy_server='https://10.10.78.62:3128'

entity_types = ['Person']

# short_sources_list[i] is the short label used in file names for sources_list[i];
# the two lists are indexed in parallel, so keep them aligned.
short_sources_list = [
    "Hindu",
    "TOI",
    "HT",
    "IE",
    "DecH",
    "Telegraph",
    "NIE",
]
sources_list = [
    "The Hindu",
    "The Times Of India",
    "Hindustan Times",
    "Indian Express",
    "Deccan Herald",
    "Telegraph",
    "The New Indian Express",
]


In [39]:
class ExtractSentences:
    '''
    Rule-based sentence splitter.

    Periods that do not end a sentence (titles, single initials, acronyms,
    company suffixes, web domains) are temporarily rewritten to "<prd>",
    sentence ends are tagged with "<stop>", and the text is split on that tag.
    '''

    # Raw strings: "\s" is meant as a regex escape, not a Python string escape.
    caps = r"([A-Z])"
    prefixes = r"(Mr|St|Mrs|Ms|Dr|Rs)[.]"
    suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
    starters = r"(Rs|Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
    acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
    websites = r"[.](com|net|org|io|gov)"

    def split_into_sentences(self, text):
        '''
        Split text into a list of stripped sentences.

        :param text: the text to split (a str)
        :return: list of sentences; a trailing fragment without terminal
                 punctuation is kept as the last sentence, and an empty or
                 whitespace-only text yields an empty list
        '''
        text = " " + text + "  "
        text = text.replace("\n", " ")

        # Protect periods that are not sentence ends.
        text = re.sub(self.prefixes, r"\1<prd>", text)
        text = re.sub(self.websites, r"<prd>\1", text)
        text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        text = re.sub(r"\s" + self.caps + "[.] ", r" \1<prd> ", text)
        # An acronym followed by a typical sentence starter does end a sentence.
        text = re.sub(self.acronyms + " " + self.starters, r"\1<stop> \2", text)
        text = re.sub(self.caps + "[.]" + self.caps + "[.]" + self.caps + "[.]",
                      r"\1<prd>\2<prd>\3<prd>", text)
        text = re.sub(self.caps + "[.]" + self.caps + "[.]", r"\1<prd>\2<prd>", text)
        text = re.sub(" " + self.suffixes + "[.] " + self.starters, r" \1<stop> \2", text)
        text = re.sub(" " + self.suffixes + "[.]", r" \1<prd>", text)
        text = re.sub(" " + self.caps + "[.]", r" \1<prd>", text)

        # Move terminal punctuation outside a closing quote so the quote stays
        # attached to the sentence it closes.
        text = text.replace(".\"", "\".")
        text = text.replace("!\"", "\"!")
        text = text.replace("?\"", "\"?")

        # Tag the sentence ends, then restore the protected periods.
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")

        sentences = [s.strip() for s in text.split("<stop>")]
        # Only drop the last piece when it is the empty remainder after the
        # final terminator; otherwise it is real text and must be kept.
        if sentences and not sentences[-1]:
            sentences.pop()
        return sentences

In [40]:
# Open the MongoDB connection and pick the two collections used below.
client = MongoClient(mongoConfigs['host'], mongoConfigs['port'])
db = client[mongoConfigs['db']]
collection = db[resolved_entity_table]  # collection having resolved entities
art_collection = db[article_table]  # collection having articles

# Reporting verbs: an nsubj relation that links an entity keyword with one of
# these words marks a sentence as a statement made by the entity.
fixed_keywords = ['says', 'said', 'asks', 'asked', 'told', 'announced', 'announce', 'claimed', 'claim']

extractor = ExtractSentences()  # object for extracting sentences from text

In [41]:
class StanfordNLP:
    '''
    Thin wrapper around a StanfordCoreNLP client: every method forwards the
    sentence to the client method of the matching purpose.
    '''

    def __init__(self, host='http://localhost', port=9000):
        '''
        :param host: host passed to StanfordCoreNLP
        :param port: port passed to StanfordCoreNLP
        '''
        # Properties sent along with every annotate() call.
        self.props = dict(
            annotators='tokenize,ssplit,pos,lemma,ner,parse,depparse,dcoref,relation',
            pipelineLanguage='en',
            outputFormat='json',
        )
        self.nlp = StanfordCoreNLP(host, port=port, timeout=30000)

    def word_tokenize(self, sentence):
        '''Return the client's word tokenization of sentence.'''
        return self.nlp.word_tokenize(sentence)

    def pos(self, sentence):
        '''Return the client's part-of-speech tagging of sentence.'''
        return self.nlp.pos_tag(sentence)

    def ner(self, sentence):
        '''Return the client's named-entity tagging of sentence.'''
        return self.nlp.ner(sentence)

    def parse(self, sentence):
        '''Return the client's parse of sentence.'''
        return self.nlp.parse(sentence)

    def dependency_parse(self, sentence):
        '''Return the client's dependency parse of sentence.'''
        return self.nlp.dependency_parse(sentence)

    def annotate(self, sentence):
        '''Annotate sentence with self.props and return the decoded JSON reply.'''
        raw_reply = self.nlp.annotate(sentence, properties=self.props)
        return json.loads(raw_reply)

    @staticmethod
    def tokens_to_dict(_tokens):
        '''
        Index token records by their integer 'index' field.

        :param _tokens: iterable of dicts with 'index', 'word', 'lemma', 'pos', 'ner'
        :return: defaultdict(dict) mapping int(index) to the word/lemma/pos/ner fields
        '''
        indexed = defaultdict(dict)
        indexed.update(
            (
                int(tok['index']),
                {
                    'word': tok['word'],
                    'lemma': tok['lemma'],
                    'pos': tok['pos'],
                    'ner': tok['ner'],
                },
            )
            for tok in _tokens
        )
        return indexed

In [42]:
def findSentiment(sentiString):
    '''
    Return the sentiment of a string as scored by Vader.

    :param sentiString: text to score
    :return: the "compound" entry of the analyzer's polarity scores
    '''
    # Build the analyzer once and reuse it: this function is called for every
    # output line, and constructing a new analyzer each time is wasted work.
    analyzer = getattr(findSentiment, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        findSentiment._analyzer = analyzer
    return analyzer.polarity_scores(sentiString)["compound"]

In [43]:
'''
entitySpecificCoverageAnalysis:
    takes four arguments
    1. doc_set : list of sentences
    2. entity_keywords : keywords associated with the target
    3. entity_name : name of the target
    4. e_aliases : aliases of the target

    and outputs three lists
    1. Sentences on (about) the target
    2. Sentences by the target
    3. Removed sentences
'''


def preprocesstext(doc_set):
    '''
    Normalise a piece of text before it is sent to the parser.

    Lower-cases the text, deletes carriage returns, newlines and double
    quotes, and replaces every '%' with a space.

    :param doc_set: the text to clean (a str)
    :return: the cleaned text
    '''
    # One translate pass covers every substitution. A separate '\r\n' rule is
    # unnecessary because '\r' and '\n' are each removed on their own.
    return doc_set.lower().translate({
        ord('\r'): None,
        ord('\n'): None,
        ord('"'): None,
        ord('%'): ' ',
    })


def entitySpecificCoverageAnalysis(doc_set, entity_keywords, entity_name, e_aliases):
    '''
    Finds the sentences that are about or by the entity.

    Each sentence is cleaned with preprocesstext, every alias that appears as a
    space-delimited phrase (optionally followed by '.' or ',') is replaced by
    the entity name with whitespace removed and lower-cased, and the sentence
    is POS-tagged and dependency-parsed. A sentence is
      * "by" the entity when an nsubj relation links an entity keyword with
        one of the words in fixed_keywords;
      * "on" the entity when any other nsubj/nmod/amod/dobj relation involves
        an entity keyword;
      * "removed" when it is neither.

    :param doc_set: list of sentences
    :param entity_keywords: tokens that identify the entity in a sentence (not modified)
    :param entity_name: name of the entity
    :param e_aliases: aliases to rewrite to the collapsed entity name
    :return: (onTargetArticles, byTargetArticles, removedArticles), three lists
             holding the processed sentence texts
    '''
    sNLP = StanfordNLP()
    onTargetArticles = []
    byTargetArticles = []
    removedArticles = []
    short_entity_name = ''.join(entity_name.split()).lower()

    # Work on a private copy: the caller's list must not grow on every call.
    # NOTE(review): sentences are lower-cased but the keywords are matched as
    # given, so keywords containing capitals never match a token; confirm.
    keywords = set(entity_keywords)
    keywords.add(short_entity_name)

    target_relations = ('nsubj', 'nmod', 'amod', 'dobj')
    lowered_aliases = [alias.lower() for alias in e_aliases]

    for i, doc in enumerate(doc_set):
        print('Document: {}'.format(i))
        text = preprocesstext(doc)
        for alias in lowered_aliases:
            text = text.replace(' ' + alias + ' ', ' ' + short_entity_name + ' ')
            text = text.replace(' ' + alias + '. ', ' ' + short_entity_name + ' . ')
            text = text.replace(' ' + alias + ', ', ' ' + short_entity_name + ' , ')
        try:
            # Either request can fail to decode, so both are guarded.
            pos_text = sNLP.pos(text)
            parse_text = sNLP.dependency_parse(text)
        except json.decoder.JSONDecodeError:
            print('JSON_Decode_Error: ', text)
            continue

        is_about = False
        is_by = False
        for pt in parse_text:
            relation = pt[0]
            if relation not in target_relations:
                continue
            # pt[1] and pt[2] are used as 1-based positions into pos_text.
            first_word = pos_text[pt[1] - 1][0]
            second_word = pos_text[pt[2] - 1][0]
            if first_word in keywords or second_word in keywords:
                if relation == 'nsubj' and (first_word in fixed_keywords or second_word in fixed_keywords):
                    is_by = True
                else:
                    is_about = True

        if is_about:
            onTargetArticles.append(text)
        if is_by:
            byTargetArticles.append(text)
        # Only sentences that are neither about nor by the entity are removed.
        if not (is_about or is_by):
            removedArticles.append(text)
    return (onTargetArticles, byTargetArticles, removedArticles)

In [78]:
def get_names_aliases_articles(entities):
    '''
    Flatten the per-type entity lists into three parallel lists.

    :param entities: dict mapping an entity type to a list of dicts that have
                     'name', 'aliases' and 'articleIds' keys
    :return: (e_names, e_aliases, e_articleIds); position i of each list
             describes the same entity, in the dict's iteration order
    '''
    e_names = []
    e_aliases = []
    e_articleIds = []
    for typed_entities in entities.values():
        for entity in typed_entities:
            e_names.append(entity['name'])
            e_aliases.append(entity['aliases'])
            e_articleIds.append(entity["articleIds"])
    return (e_names, e_aliases, e_articleIds)

In [44]:
# Load every resolved entity of the configured types, grouped by type.
# get_all_entities takes exactly (collection, types).
entities = get_all_entities(collection, entity_types)

In [79]:
# Flatten the per-type entity lists into three parallel lists:
# names, aliases and article ids (same position = same entity).
e_names, e_aliases, e_articleIds = get_names_aliases_articles(entities)

In [80]:
# Peek at the first five entity names.
e_names[:5]

['Narendra Modi.Besides',
 'Targeting Narendra Modi',
 'Manmohan Singh.Then',
 'Virat Kohli Dhoni',
 'Donald Trump']

In [71]:
def findPowerEliteIndex(entity_name, e_names, e_aliases):
    '''
    Find all the resolved entities which may contain the given entity.

    The match is a case-insensitive substring test against the entity's name
    (with '.' characters removed) and against its aliases joined by ','.

    :param entity_name: Given entity
    :param e_names: list of all entity names
    :param e_aliases: list of all entity aliases (one list of strings per entity)
    :return: list of indices into e_names / e_aliases, in ascending order
    '''
    print('Search ', entity_name, ' : ', len(e_names))
    # Lower-case the search term once instead of once per entity.
    needle = entity_name.lower()
    indices = []
    for i, raw_name in enumerate(e_names):
        name = raw_name.replace('.', '').lower()
        if needle in name or needle in ','.join(e_aliases[i]).lower():
            indices.append(i)
    return indices

In [72]:
# Indices of every resolved entity whose name or aliases contain the target
# entity name (case-insensitive substring match).
entity_ind = findPowerEliteIndex(entity_name, e_names, e_aliases)
print(entity_ind)

Search  Reetika Khera  :  2785864
[20002, 169699, 188364, 301878, 358022, 1542608, 1572147, 1665674, 1903404, 2498496]


In [73]:
# Collect, per news source, the text of every article linked to the matched
# entities, and gather all aliases of those entities.
articles = {s: [] for s in sources_list}
print(entity_name + ' : READ ARTICLE IDS ...')

print(entity_ind)
new_entity_alias = []
for matched_index in entity_ind:
    new_entity_alias.extend(e_aliases[matched_index])
    for article_id in e_articleIds[matched_index]:
        # find_one returns None for an id that is not in the collection, so a
        # missing article is skipped instead of aborting the whole run.
        article = art_collection.find_one({"_id": article_id})
        if article is None:
            continue
        text = article["text"]
        source = article["sourceName"]
        if source in sources_list:
            articles[source].append(text)

Reetika Khera : READ ARTICLE IDS ...
[20002, 169699, 188364, 301878, 358022, 1542608, 1572147, 1665674, 1903404, 2498496]


In [74]:
def _write_scored_sentences(fname, sentences):
    '''Write each sentence prefixed by its sentiment score to fname; return the summed score.'''
    total = 0
    # The with-block closes the file even if scoring or writing fails part-way.
    with open(fname, 'w', encoding='utf-8') as outfile:
        for line in sentences:
            line_sent = findSentiment(line)
            outfile.write(';;' + str(line_sent) + ';;' + line + '\n')
            total = total + line_sent
    return total


def printToFile(about_entity, by_entity, entity, source):
    '''
    Write the "about" and "by" sentences of an entity to files in res_folder.

    Every output line has the form ';;<sentiment>;;<sentence>'. A file is only
    written when its sentence list is non-empty.

    :param about_entity: set of sentences that are about the entity
    :param by_entity:  set of sentences that are statements made by the entity
    :param entity: Entity name
    :param source: News source Short URL to be used in file naming
    :return: (about_sent, by_sent), the summed sentiment of each list (0 when empty)
    '''
    entity_tag = '_'.join(entity.split())
    about_sent = 0
    by_sent = 0
    if len(about_entity):
        fname = os.path.join(res_folder, 'about_' + entity_tag + '_' + source + '.txt')
        about_sent = _write_scored_sentences(fname, about_entity)
    if len(by_entity):
        fname = os.path.join(res_folder, 'by_' + entity_tag + '_' + source + '.txt')
        by_sent = _write_scored_sentences(fname, by_entity)
    return (about_sent, by_sent)

In [75]:
def printRemovedToFile(entity, source, sentences):
    '''
    Write the removed sentences to a file in res_folder, one sentence per line.

    :param entity: Entity Name
    :param source:  News source Short URL to be used in file naming
    :param sentences: set of removed sentences
    :return: None
    '''
    fname = os.path.join(res_folder, 'removed_' + '_'.join(entity.split()) + '_' + source + '.txt')
    # The with-block closes the file even if a write fails part-way.
    with open(fname, 'w', encoding='utf-8') as outfile:
        for s in sentences:
            outfile.write(s + '\n')

In [76]:
# For every source: split its articles into sentences, classify the sentences
# as about / by / removed for the entity, and write the three groups to files.
# NOTE(review): the loop starts at index 1, so sources_list[0] is never
# processed; kept as-is, confirm this is intended.
for j in range(1, len(sources_list)):
    source = sources_list[j]
    short_source = short_sources_list[j]
    print(source, ' , #articles : ', len(articles[source]))

    print('Extract sentences...')
    sentences = []
    # dict.fromkeys drops duplicate article texts but keeps first-seen order,
    # so the sentence order (and the output files) are the same on every run.
    for text in dict.fromkeys(articles[source]):
        sentences.extend(extractor.split_into_sentences(str(text)))

    print('Find entity specific sentences...')
    # The keyword argument gets its own copy so the shared alias list cannot be
    # altered by the callee from one source to the next.
    onTargetArticles, byTargetArticles, removedArticles = entitySpecificCoverageAnalysis(
        sentences, list(new_entity_alias), entity_name, new_entity_alias)

    print('Print about and by entity sentences to file...')
    about_sent, by_sent = printToFile(onTargetArticles, byTargetArticles,
                                      entity_name, short_source)
    printRemovedToFile(entity_name, short_source, removedArticles)

    print(entity_name, source, "done...")
    print('About: ', len(onTargetArticles), ' By: ', len(byTargetArticles), 'Removed: ', len(removedArticles))
    print('About: ', about_sent, ' By: ', by_sent, '\n')

The Times Of India  , #articles :  0
Extract sentences...
Find entity specific sentences...
Print about and by entity sentences to file...
Reetika Khera The Times Of India done...
About:  0  By:  0 Removed:  0
About:  0  By:  0 

Hindustan Times  , #articles :  3
Extract sentences...
Find entity specific sentences...
Document: 0
Document: 1
Document: 2
Document: 3
Document: 4
Document: 5
Document: 6
Document: 7
Document: 8
Document: 9
Document: 10
Document: 11
Document: 12
Document: 13
Document: 14
Document: 15
Document: 16
Document: 17
Document: 18
Document: 19
Document: 20
Document: 21
Document: 22
Document: 23
Document: 24
Document: 25
Document: 26
Document: 27
Document: 28
Document: 29
Document: 30
Document: 31
Document: 32
Document: 33
Document: 34
Document: 35
Document: 36
Document: 37
Document: 38
Document: 39
Document: 40
Document: 41
Document: 42
Document: 43
Document: 44
Document: 45
Document: 46
Document: 47
Document: 48
Document: 49
Document: 50
Document: 51
Document: 52
Do