In [465]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_lg
import en_core_web_sm
nlp = en_core_web_lg.load()
#nlp = spacy.load("/Users/praveenkumarrajendran/codebase/air/en_lg")

In [520]:

def get_full_text(token):
    
    full_text = ""
    for child_token in token.children:
        full_text = full_text + " "+child_token.text
    full_text = full_text + " "+ token.text
    return full_text


def extract_event_attribute(event, token, side):

    full_text = get_full_text(token);
    isLeft = (side == "left")
    ent_type = token.ent_type_;
    if(ent_type == ""):
        extract_event_attribute_from_list(event, token.children, side)
    if(ent_type == 'DATE'):
        if 'date' not in event:
            event['date'] = full_text
    elif(ent_type == 'LOC'):
#         event['location'] = full_text
        event['location'].add(full_text)
    elif(ent_type == 'GPE'):
#         event['country'] = full_text
        event['location'].add(full_text)
    elif(ent_type == 'CARDINAL'):
        event['fatalities'] = full_text
    elif(ent_type == 'PERSON'):
        attribute_name = "actor2" if "actor1" in event else "actor1"
        event[attribute_name] = full_text
        for child in token.children:
            if(child.dep_ == 'nummod'):
                extract_event_attribute(event, child, side)
    elif(ent_type == 'ORG'):
        attribute_name = "org2" if "org1" in event else "org1"
        event[attribute_name] = full_text
#         event['org'].add(full_text)
        for child in token.children:
            if(child.dep_ == 'nummod'):
                extract_event_attribute(event, child, side)
    
    

def extract_event_attribute_from_list(event, tokens, side):
    
    entity_type=""
    full_text=""
    for token in tokens:
        extract_event_attribute(event, token, side)

def extract_event(doc):
    
    event = {}
    event['location']=set()
    for sent in doc.sents:
        short_doc = nlp(sent.text)
#         for ent in short_doc.ents:
#             print(ent.text+"::"+ent.label_)
        for token in short_doc:
            dependency = token.dep_
            if(dependency == "ROOT"):
                for left_token in token.lefts:
                    child_dep = left_token.dep_
                    if(child_dep in('nsubjpass', 'nsubj')):
                        extract_event_attribute(event, left_token,'left')
                    elif(child_dep == 'prep'):
                        extract_event_attribute_from_list(event, left_token.rights, 'left')

                for right_token in token.rights:
                    right_child_dep = right_token.dep_
                    full_text = get_full_text(right_token)
                    if(right_child_dep in('attr','dobj')):
                        extract_event_attribute(event, right_token,'right')
                    elif(right_child_dep == 'prep'):
                        extract_event_attribute_from_list(event, right_token.rights, 'right')
                    elif(right_child_dep == 'agent'):
                        extract_event_attribute_from_list(event, right_token.children, 'right')

            elif(dependency == "pobj"):
                extract_event_attribute(event,token,'right')
#         displacy.render(nlp(str(short_doc)), style='dep', jupyter = True, options = {'distance': 120})
    return event


In [521]:
violence_tokens = set()
violence_doc = nlp("crime kill murder death died criminal convict attack assault assaulted harrasment offence illegal attacker attacked")
for token in violence_doc:
    violence_tokens.add(token.lemma_)

protest_tokens = set()
protest_doc = nlp("protest protester agitation perpetrators rioters riot discord rebellion activist activism demonstration demonstrating resentment grievances agitators ban fast march dharna mourn strike")
for token in protest_doc:
    protest_tokens.add(token.lemma_)


In [522]:
# Compare lemmatised tokens

def classify_doc_topic(doc):
    docTokens = set()
    topic = 'Unclassified'
    for token in doc:
        docTokens.add(token.lemma_)
    violence_match = len(docTokens.intersection(violence_tokens))
    protest_match = len(docTokens.intersection(protest_tokens))

    if(violence_match > 0 and protest_match > 0):
        if(violence_match > protest_match):
            topic = 'Violence against Civilians'
        else:
            topic = 'Riots/Protests'
    elif (violence_match > 0):
        topic = 'Violence against Civilians'
    elif (protest_match > 0):
        topic = 'Riots/Protests'    
    return topic

In [523]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
stopwords = list(STOP_WORDS)

def summarise_event(doc):
    mytokens = [token.text for token in doc]
    word_frequencies = {}
    for word in doc:
        if word.text not in stopwords:
                if word.text not in word_frequencies.keys():
                    word_frequencies[word.text] = 1
                else:
                    word_frequencies[word.text] += 1

    # print(word_frequencies)
    maximum_frequency = max(word_frequencies.values())
    for word in word_frequencies.keys():
            word_frequencies[word] = (word_frequencies[word]/maximum_frequency)
    # word_frequencies
    sentence_list = [ sentence for sentence in doc.sents ]
    sentence_scores = {}
    for sent in sentence_list:
            for word in sent:
                if word.text.lower() in word_frequencies.keys():
                    if len(sent.text.split(' ')) < 30:
                        if sent not in sentence_scores.keys():
                            sentence_scores[sent] = word_frequencies[word.text.lower()]
                        else:
                            sentence_scores[sent] += word_frequencies[word.text.lower()]
    # print(sentence_scores)
    from heapq import nlargest
    summarized_sentences = nlargest(3, sentence_scores, key=sentence_scores.get)
    # summarized_sentences
    # for w in summarized_sentences:
    #     print(w.text)
    final_sentences = [ w.text for w in summarized_sentences ]
    summary = ' '.join(final_sentences)
    return summary

In [524]:
# sentences = [sent.string.strip() for sent in doc.sents]
# for sent_text in sentences:
#     doc = nlp(sent_text);
#     for ent in doc.ents:
#         value = ent.text
#         if(ent.label_=='DATE'):
#             dates = list(datefinder.find_dates(ent.text))
#             if(len(dates) > 0):
#                 value = dates[0].strftime('%m/%d/%Y');
#         event[ent.label_].add(value)

In [548]:
final_event_map = {}
final_event_map['org1']=[]
final_event_map['org2']=[]
final_event_map['actor1']=[]
final_event_map['actor2']=[]
final_event_map['type']=[]
final_event_map['summary']=[]
final_event_map['date']=[]
final_event_map['location']=[]


# doc = nlp("Ghodse killed Mahatma Ghandi on Oct 2, 1947")
# doc = nlp("Ghandhi was born on October 2, 1869 to the couples Karamchand Ghandhi and Putlibai Ghandhi")
# doc = nlp("On 2nd October 1947, Godse killed Mahatma Ghandi")
# doc = nlp("On 2nd October 1947, Mahatma Ghandi was killed by Godse")
# doc = nlp("On 14 February 2019, a convoy of vehicles carrying security personnel on the Jammu Srinagar National Highway was attacked by a vehicle-borne suicide bomber at Lethpora (near Awantipora) in the Pulwama district, Jammu and Kashmir, India. The attack resulted in the deaths of 40 Central Reserve Police Force (CRPF) personnel and the attacker.The responsibility for the attack was claimed by the Pakistan-based Islamist militant group Jaish-e-Mohammed.The attacker was Adil Ahmad Dar, a local from Pulwama district, and a member of Jaish-e-Mohammed.")
import os
path = '/Users/praveenkumarrajendran/codebase/air/articles' 
for file in os.listdir( path ):
    file = path+"/"+file
    print(file)
    if file.endswith( ".txt" ):
        f=open(file, 'r')  
        content = f.readlines()
        f.close() 
        doc = nlp(content[0])
        # sentences = [x for x in doc.sents]
        # for ent in doc.ents:
        #     print(ent.text+"::"+ent.label_)
        doc_topic = classify_doc_topic(doc)
        # print(extract_event(doc))
        if(doc_topic!="Unclassified"):

            event = extract_event(doc)
            event['type'] = doc_topic
            event_summary = summarise_event(doc)
            event['summary'] = event_summary
            if 'org1' in event:
                final_event_map['org1'].append(event['org1'])
            else:
                final_event_map['org1'].append('')
            if 'org2' in event: 
                final_event_map['org2'].append(event['org2'])
            else:
                final_event_map['org2'].append('')
            if 'actor1' in event:
                final_event_map['actor1'].append(event['actor1'])  
            else:
                final_event_map['actor1'].append('')
            if 'actor2' in event:
                final_event_map['actor2'].append(event['actor2'])
            else:
                final_event_map['actor2'].append('')
            if 'date' in event:
                final_event_map['date'].append(event['date'])
            else:
                final_event_map['date'].append('')
            if 'type' in event:
                final_event_map['type'].append(event['type'])
            else:
                final_event_map['type'].append('')
            if 'summary' in event:
                final_event_map['summary'].append(event['summary'])
            else:
                final_event_map['summary'].append('')
            if 'location' in event:
                final_event_map['location'].append(event['location'])
            else:
                final_event_map['location'].append('')
# add to table
# print events table

import pandas as pd
pd.DataFrame.from_dict(final_event_map)






/Users/praveenkumarrajendran/codebase/air/articles/tribune_india_feb24.txt
/Users/praveenkumarrajendran/codebase/air/articles/tribune_india_feb26.txt
/Users/praveenkumarrajendran/codebase/air/articles/toi_mar3.txt
/Users/praveenkumarrajendran/codebase/air/articles/time8_mar16.txt
/Users/praveenkumarrajendran/codebase/air/articles/toi_mar13.txt
/Users/praveenkumarrajendran/codebase/air/articles/tribune_india_mar21.txt
/Users/praveenkumarrajendran/codebase/air/articles/time8_mar11.txt
/Users/praveenkumarrajendran/codebase/air/articles/toi_mar14.txt


Unnamed: 0,org1,org2,actor1,actor2,type,summary,date,location
0,Farmers,farmers’,Farmer,leader Jhanda Singh Jethuke,Riots/Protests,"If banks do not respond, we will have no optio...",Saturday,"{ Ludhiana, Ayali on Chowk, Patiala}"
1,hostage Lock,The Education Departmnet,Secretary Krishan Kumar,Minister Capt Amarinder Singh,Riots/Protests,"The six teachers, who have been transferred, a...","26 , 2019 , Feb","{ MD Tribune News Service , February Patiala, ..."
2,,,Imran,the accused Sumit,Violence against Civilians,"On Friday, at 6.08pm, officials were informed ...",Friday,"{ Bindapur, Dwarka district}"
3,AGP In Headquarters,the and BJP AGP,,,Riots/Protests,Massive Protest Infront Of AGP Headquarters In...,March 2019 16th,"{ Guwahati, bound Assam}"
4,a near here Athikkadu,Athikkadu,,,Riots/Protests,BPCL is about to lay the pipeline through the ...,Tuesday,"{ Bengaluru, the western TN, Karnataka, nea..."
5,Farmers,the BKU,Manav in Chowk,,Riots/Protests,"As per the information, out of the 4,950 insur...",Wednesday,"{ Ambala City, , suffered , Ambala}"
6,,,,,Riots/Protests,,"March , 2019 11th","{ India - Bangladesh, Cachar , located , Jala..."
7,Suleeswaranpatti,", was Nagaraj","A , arrested Nagaraj",Nagaraj,Violence against Civilians,Nagaraj was arrested on the charges of attacki...,Wednesday,"{ Pollachi, near Suleeswaranpatti}"


/Users/praveenkumarrajendran/codebase/air/articles/tribune_india_feb24.txt
/Users/praveenkumarrajendran/codebase/air/articles/tribune_india_feb26.txt
['Chandigarh Posted at: Feb 26, 2019,  7:58 AM; last updated: Feb 26, 2019, 12:12 PM (IST) Teachers protest transfers, hold DEO hostage Lock school gate, boycott state govt’s ‘Parho Punjab, Parhao Punjab’ project  Also in this section Pay 20% more for booze from April 1 Rs 75 lakh seized from vehicle in Lalru ASI shoots self in suicide bid Rs 94L gold seized from passenger at airport Bailable warrants against Lifestyle MD Tribune News Service Patiala, February 25 The ongoing boycott against the Punjab Government’s ‘Parho Punjab, Parhao Punjab’ project escalated when hundreds of teachers held the District Education Officer (Primary) and other officials of the Education Department hostage at Government Multipurpose Elementary School here. In the evening, teachers in large numbers gathered at the school and locked the main gate of the school