In [211]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_lg
import en_core_web_sm
nlp = en_core_web_lg.load()
import requests
import json
#nlp = spacy.load("/Users/praveenkumarrajendran/codebase/air/en_lg")

In [252]:
stopwords = spacy.lang.en.stop_words.STOP_WORDS

def get_full_text(token):
    
    full_text = ""
    for child_token in token.children:
        full_text = full_text + " "+child_token.text
    full_text = full_text + " "+ token.text
    text_tokens = full_text.split(" ")
    filterd_tokens = []
    for token in text_tokens:
        if(token not in stopwords):
            filterd_tokens.append(token)
    full_text = " ".join(filterd_tokens)
    return full_text


def extract_event_attribute(event, token, side):

    full_text = get_full_text(token);

    isLeft = (side == "left")
    ent_type = token.ent_type_;
    if(ent_type == ""):
        extract_event_attribute_from_list(event, token.children, side)
    if(ent_type == 'DATE'):
        event['date'].add(full_text)
    elif(ent_type == 'LOC'):
#         print("Going to add token::"+token.text)
        event['location'].add(token.text)
    elif(ent_type == 'GPE'):
#         event['country'] = full_text
        event['location'].add(token.text)
    elif(ent_type == 'CARDINAL'):
        event['fatalities'] = full_text
    elif(ent_type == 'PERSON'):
        attribute_name = "actor2" if "actor1" in event else "actor1"
        event[attribute_name] = full_text
        for child in token.children:
            if(child.dep_ == 'nummod'):
                extract_event_attribute(event, child, side)
    elif(ent_type == 'ORG'):
#         attribute_name = "org2" if "org1" in event else "org1"
        event['org'].add(full_text)
#         event['org'].add(full_text)
        for child in token.children:
            if(child.dep_ == 'nummod'):
                extract_event_attribute(event, child, side)
    
    

def extract_event_attribute_from_list(event, tokens, side):
    
    entity_type=""
    full_text=""
    for token in tokens:
        extract_event_attribute(event, token, side)

def extract_event(doc):
    
    event = {}
    event['location']=set()
    event['date'] = set()
    event['org'] = set()
    for sent in doc.sents:
        short_doc = nlp(sent.text)
#         for ent in short_doc.ents:
#             print(ent.text+"::"+ent.label_)
        for token in short_doc:
            dependency = token.dep_
            if(dependency == "ROOT"):
                for left_token in token.lefts:
                    child_dep = left_token.dep_
                    if(child_dep in('nsubjpass', 'nsubj')):
                        extract_event_attribute(event, left_token,'left')
                    elif(child_dep == 'prep'):
                        extract_event_attribute_from_list(event, left_token.rights, 'left')

                for right_token in token.rights:
                    right_child_dep = right_token.dep_
                    full_text = get_full_text(right_token)
                    if(right_child_dep in('attr','dobj')):
                        extract_event_attribute(event, right_token,'right')
                    elif(right_child_dep == 'prep'):
                        extract_event_attribute_from_list(event, right_token.rights, 'right')
                    elif(right_child_dep == 'agent'):
                        extract_event_attribute_from_list(event, right_token.children, 'right')

            elif(dependency == "pobj"):
                extract_event_attribute(event,token,'right')
#         displacy.render(nlp(str(short_doc)), style='dep', jupyter = True, options = {'distance': 120})
    return event


In [253]:
violence_tokens = set()
violence_doc = nlp("crime kill murder death died criminal convict attack assault assaulted harrasment offence illegal attacker attacked")
for token in violence_doc:
    violence_tokens.add(token.lemma_)

protest_tokens = set()
protest_doc = nlp("protest protester agitation perpetrators rioters riot discord rebellion activist activism demonstration demonstrating resentment grievances agitators ban fast march dharna mourn strike")
for token in protest_doc:
    protest_tokens.add(token.lemma_)


In [254]:
# Compare lemmatised tokens

def classify_doc_topic(doc):
    docTokens = set()
    topic = 'Unclassified'
    for token in doc:
        docTokens.add(token.lemma_)
    violence_match = len(docTokens.intersection(violence_tokens))
    protest_match = len(docTokens.intersection(protest_tokens))

    if(violence_match > 0 and protest_match > 0):
        if(violence_match > protest_match):
            topic = 'Violence against Civilians'
        else:
            topic = 'Riots/Protests'
    elif (violence_match > 0):
        topic = 'Violence against Civilians'
    elif (protest_match > 0):
        topic = 'Riots/Protests'    
    return topic

In [255]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
stopwords = list(STOP_WORDS)

def summarise_event(doc):
    mytokens = [token.text for token in doc]
    word_frequencies = {}
    for word in doc:
        if word.text not in stopwords:
                if word.text not in word_frequencies.keys():
                    word_frequencies[word.text] = 1
                else:
                    word_frequencies[word.text] += 1

    # print(word_frequencies)
    maximum_frequency = max(word_frequencies.values())
    for word in word_frequencies.keys():
            word_frequencies[word] = (word_frequencies[word]/maximum_frequency)
    # word_frequencies
    sentence_list = [ sentence for sentence in doc.sents ]
    sentence_scores = {}
    for sent in sentence_list:
            for word in sent:
                if word.text.lower() in word_frequencies.keys():
                    if len(sent.text.split(' ')) < 30:
                        if sent not in sentence_scores.keys():
                            sentence_scores[sent] = word_frequencies[word.text.lower()]
                        else:
                            sentence_scores[sent] += word_frequencies[word.text.lower()]
    # print(sentence_scores)
    from heapq import nlargest
    summarized_sentences = nlargest(3, sentence_scores, key=sentence_scores.get)
    # summarized_sentences
    # for w in summarized_sentences:
    #     print(w.text)
    final_sentences = [ w.text for w in summarized_sentences ]
    summary = ' '.join(final_sentences)
    return summary

In [256]:
def is_country_of_interest(addresses):
    for address in addresses:
        area_name = address['long_name']
        address_types = address['types']
        if ('country' in address_types):
            if(area_name in ['India','Indonesia','Thailand']):
                return True
            else:
                return False

In [257]:

def get_location_data(possible_locations):
    location_data = {}
    for location in possible_locations:
        response = requests.get('https://maps.googleapis.com/maps/api/geocode/json?address='+location+'&key=AIzaSyAQ4fdgmwd5tNsjI50ZqTIO0LURkXccxzc')
        responseJson = json.loads(response.text)
        results = responseJson['results']
        if(len(results) > 0):
            firstResult = results[0]
            addresses = firstResult['address_components']
#             if the name is the same for admin_level_1, admin_level_2, admin_level_3, then take only admin_level_1
            for address in addresses:
                if(not is_country_of_interest(addresses)):
                    continue
#                 print(address)
                area_name = address['long_name']
                address_types = address['types']
                if ('country' in address_types):
                    location_data['country'] = area_name
                elif ('administrative_area_level_1' in address_types):
                    location_data['administrative_area_level_1'] = area_name
                elif ('administrative_area_level_2' in address_types 
                      and 'administrative_area_level_2' not in location_data):
                    location_data['administrative_area_level_2'] = area_name
                elif('locality' in address_types
                    and 'administrative_area_level_3' not in location_data):
                    location_data['administrative_area_level_3'] = area_name
                elif('sublocality' in address_types
                    and 'administrative_area_level_4' not in location_data):
                    location_data['administrative_area_level_4'] = area_name
#             print(location_data)
                if(len(location_data) == 5):
                   break
    return location_data

In [258]:
def get_organization_data(possible_organizations):
    organization_data = {}
    for org in possible_organizations:
        org = org.strip()
        org = "+".join(org.split(" "))
        print('Org is::'+org)
        response = requests.get('https://kgsearch.googleapis.com/v1/entities:search?query='+org+'&key=AIzaSyAQ4fdgmwd5tNsjI50ZqTIO0LURkXccxzc&limit=5&indent=True')
        responseJson = json.loads(response.text)
        if('itemListElement' in responseJson):
            items = responseJson['itemListElement']
            if(len(items) > 0):
                firstResult = items[0]['result']
                org_name = firstResult['name']
                types = firstResult['@type']
                if('Organization' in types):
                    if(len(organization_data) == 0):
                        organization_data['org1'] = org_name
                    else: 
                        organization_data['org2'] = org_name
            if(len(organization_data) == 2):
                break
    return organization_data

In [259]:
import datefinder
import datetime

weekdays = ['monday','tuesday','wednesday','thursday','friday','saturday','sunday']

# from the list of date entities from the article, 
# - finds the date that is at least distance from the published date
# - converts 'day' to dates relative to the published date
# - if there are no date entities, assumes the published date to be event date

def get_event_date(dates, published_date_str):
    
    published_date = datetime.datetime.strptime(published_date_str, "%Y-%m-%d")
    published_day_of_week = published_date.weekday()
    recent_date = None
    current_min = None
    for date_str in dates:
        event_date = None
        date_str = date_str.lower()
        for day in weekdays:
            if(day in date_str):
                given_day_of_week = weekdays.index(day)
                day_diff = abs(published_day_of_week - given_day_of_week)
                if(day_diff > 0):
                    day_diff = 7-day_diff
                event_date = published_date - datetime.timedelta(days = day_diff)
        if(event_date is None):
            dates = list(datefinder.find_dates(date_str))       
            if(len(dates) > 0):
                event_date = dates[0]
        if(event_date is not None):
            relative_day_diff = abs((published_date - event_date).days)
            if(relative_day_diff == 0):
                recent_date = event_date
                break
            else:
                if(current_min is None or relative_day_diff < current_min):
                    current_min = relative_day_diff
                    recent_date = event_date
    if(recent_date is None):
        recent_date = published_date
    return str(recent_date).split(" ")[0]

In [261]:
final_event_map = {}
final_event_map['org']=[]
# final_event_map['org1']=[]
# final_event_map['org2']=[]
final_event_map['actor1']=[]
final_event_map['actor2']=[]
final_event_map['type']=[]
final_event_map['summary']=[]
final_event_map['date']=[]
final_event_map['location']=[]


# doc = nlp("Ghodse killed Mahatma Ghandi on Oct 2, 1947")
# doc = nlp("Ghandhi was born on October 2, 1869 to the couples Karamchand Ghandhi and Putlibai Ghandhi")
# doc = nlp("On 2nd October 1947, Godse killed Mahatma Ghandi")
# doc = nlp("On 2nd October 1947, Mahatma Ghandi was killed by Godse")
# doc = nlp("On 14 February 2019, a convoy of vehicles carrying security personnel on the Jammu Srinagar National Highway was attacked by a vehicle-borne suicide bomber at Lethpora (near Awantipora) in the Pulwama district, Jammu and Kashmir, India. The attack resulted in the deaths of 40 Central Reserve Police Force (CRPF) personnel and the attacker.The responsibility for the attack was claimed by the Pakistan-based Islamist militant group Jaish-e-Mohammed.The attacker was Adil Ahmad Dar, a local from Pulwama district, and a member of Jaish-e-Mohammed.")

url_list = ['https://www.indiatoday.in/elections/lok-sabha-2019/story/sabarimala-outfit-holds-namajapa-protest-in-kerala-1501155-2019-04-13',
           'https://www.time8.in/massive-protest-infront-of-agp-headquarters-in-guwahati-opposing-alliance/',
           'https://www.tribuneindia.com/news/chandigarh/farmers-protest-against-govt-demand-relief-for-crop-loss/746263.html',
           'https://www.time8.in/assam-nrc-illegal-bangladeshis-thrashed-by-miscreants-in-cachar/',
           'https://www.tribuneindia.com/news/chandigarh/teachers-protest-transfers-hold-deo-hostage/734787.html',
           'http://e-pao.net/GP.asp?src=17..020419.apr19']

for url in url_list:
    article = NewsPlease.from_url(url)
    doc = nlp(article.text)
    published_date_str = str(article.date_publish).split(" ")[0]
    doc_topic = classify_doc_topic(doc)
    if(doc_topic!="Unclassified"):

        event = extract_event(doc)
        event['type'] = doc_topic
        event_summary = summarise_event(doc)
        event['summary'] = event_summary
        if 'org' in event:
#             print(event['org'])
            org_data = get_organization_data(event['org'])
            print("Org Data")
            print(org_data)
            final_event_map['org'].append(org_data)
        else:
            final_event_map['org'].append('')
#         else:
#             final_event_map['org1'].append('')
#         if 'org2' in event: 
#             final_event_map['org2'].append(event['org2'])
#         else:
#             final_event_map['org2'].append('')
        if 'actor1' in event:
            final_event_map['actor1'].append(event['actor1'])  
        else:
            final_event_map['actor1'].append('')
        if 'actor2' in event:
            final_event_map['actor2'].append(event['actor2'])
        else:
            final_event_map['actor2'].append('')
        if 'date' in event:
            final_event_map['date'].append(get_event_date(event['date'], published_date_str))
        else:
            final_event_map['date'].append('')
        if 'type' in event:
            final_event_map['type'].append(event['type'])
        else:
            final_event_map['type'].append('')
        if 'summary' in event:
            final_event_map['summary'].append(event['summary'])
        else:
            final_event_map['summary'].append('')
        if 'location' in event:
#             print("Final Location::"+str(event['location']))
            location_data = get_location_data(event['location'])
            #do not process if country is not set, this will happen for events outside India, Thailand and Indonesia
            if('country' not in location_data):
                continue
            print("Location data")
            print(location_data)
            final_event_map['location'].append(str(location_data))
        else:
            final_event_map['location'].append('')

# import os
# path = '/Users/praveenkumarrajendran/codebase/air/articles' 
# for file in os.listdir( path ):
#     file = path+"/"+file
#     print(file)
#     if file.endswith( ".txt" ):
#         f=open(file, 'r')  
#         content = f.readlines()
#         f.close() 
#         doc = nlp(content[0])
#         # sentences = [x for x in doc.sents]
#         # for ent in doc.ents:
#         #     print(ent.text+"::"+ent.label_)
#         doc_topic = classify_doc_topic(doc)
#         # print(extract_event(doc))
#         if(doc_topic!="Unclassified"):

#             event = extract_event(doc)
#             print("Event is::"+str(event))
#             event['type'] = doc_topic
#             event_summary = summarise_event(doc)
#             event['summary'] = event_summary
#             if 'org' in event:
#                 print("Final org list")
#                 print(event['org'])
#                 final_event_map['org'].append(event['org'])
#             else:
#                 final_event_map['org'].append('')
# #             if 'org1' in event:
# #                 final_event_map['org1'].append(event['org1'])
# #             else:
# #                 final_event_map['org1'].append('')
# #             if 'org2' in event: 
# #                 final_event_map['org2'].append(event['org2'])
# #             else:
# #                 final_event_map['org2'].append('')
#             if 'actor1' in event:
#                 final_event_map['actor1'].append(event['actor1'])  
#             else:
#                 final_event_map['actor1'].append('')
#             if 'actor2' in event:
#                 final_event_map['actor2'].append(event['actor2'])
#             else:
#                 final_event_map['actor2'].append('')
#             if 'date' in event:
                
#                 final_event_map['date'].append(event['date'])
#             else:
#                 final_event_map['date'].append('')
#             if 'type' in event:
#                 final_event_map['type'].append(event['type'])
#             else:
#                 final_event_map['type'].append('')
#             if 'summary' in event:
#                 final_event_map['summary'].append(event['summary'])
#             else:
#                 final_event_map['summary'].append('')
#             if 'location' in event:
#                 print("Location data in event")
#                 print(event['location'])
#                 final_event_map['location'].append(event['location'])
#             else:
#                 final_event_map['location'].append('')
# add to table
# print events table

import pandas as pd
pd.DataFrame.from_dict(final_event_map)






Org is::Communist+Party
Org is::Ayyappa
Org is::Sabarimala+Action+Council
Org is::LDF
Org is::K
Org is::(+Photo+:+)+With+,+activists+PTI
Org is::Secretariat
Org is::India(Marxist)-led+Left+Democratic+Front
Org is::P
Org is::Kolathur+Adavaitha+Ashram
Org Data
{'org1': 'Communist Party of India (Marxist)'}
Location data
{'administrative_area_level_1': 'Kerala', 'country': 'India', 'administrative_area_level_3': 'Thiruvananthapuram', 'administrative_area_level_2': 'Thiruvananthapuram'}
Org is::
Org is::Guwahati
Org is::Asom
Org is::BJP
Org is::Gana
Org is::Ambari
Org is::AGP
Org is::incumbent+BJP
Org is::BJP+AGP
Org Data
{'org1': 'Bharatiya Janata Party'}
Location data
{'administrative_area_level_1': 'Assam', 'country': 'India', 'administrative_area_level_3': 'Guwahati', 'administrative_area_level_2': 'Kamrup'}
Org is::Bhartiya+Kisan+BKU+Union
Org is::Haryana+Government
Org is::A
Org Data
{'org1': 'Haryana', 'org2': 'Serie A'}
Location data
{'administrative_area_level_3': 'Ambala', 'admin

KeyError: 'name'

In [14]:
from datetime import datetime

#datetime_object = datetime.strptime('Jun 1 2005  1:33PM', '%b %d %Y %I:%M%p')
datetime_object = datetime.strptime('Jun 1 2005  1:33PM', '%b %d %Y %I:%M%p')
print(datetime_object.year)

2005
