### A class for StanfordNER Tagger.

In [1]:
# -*- coding: utf-8 -*-
import nltk
from nltk.tag.stanford import StanfordNERTagger

#refer links - https://pythonprogramming.net/named-entity-recognition-stanford-ner-tagger/
#refer links - https://blog.sicara.com/train-ner-model-with-nltk-stanford-tagger-english-french-german-6d90573a9486

class MyStanfordNERTagger:
    """
    Extracts entities using StanfordNERTagger
    """
    english_model = './stanford-ner/english.muc.7class.distsim.crf.ser.gz'
    jar = './stanford-ner/stanford-ner.jar'

    st = StanfordNERTagger(english_model, jar, encoding='utf-8')

    def extract_entity_stanford(self, text, entity):
        classified_text = self.st.tag(text.split())
        #print(' Classified text', classified_text)
        extracted_value = []
        for item in classified_text:
            if item[1] == entity:
                extracted_value.append(item[0])
        return extracted_value


### Extract the sentence containing the date.

In [2]:
# -*- coding: utf-8 -*-

import pandas as pd
import os
import json
import re
import spacy
from spacy.symbols import DATE


class DateFinder:
    """
    Find the sentence with the event date.
    """
    l = []
    nlp = spacy.load("en_core_web_sm")

    def stanford_ner_tagger(self, text, entity):
        """
        Extracts entities using Stanford NER Tagger
        :return:
        """
        s = MyStanfordNERTagger()
        tagged_values = s.extract_entity_stanford(text, entity)
        return tagged_values

    def splitParagraphIntoSentences(self, paragraph):
        ''' break a paragraph into sentences
            and return a list '''
        # to split by multile characters

        #   regular expressions are easiest (and fastest)
        sentenceEnders = re.compile('[.]')
        sentenceList = sentenceEnders.split(paragraph)
        return sentenceList

    def combine(self, value1, value2):
        l = set()
        l.add(value1)
        l.add(value2)
        return l

    def get_event_date(self):
        """
        Gets the sentence containing "a date" using spaCY NER Tagging.
        :return:
        """
        count = 0
        f = open('sample_input.csv', 'r')
        fieldnames = ['text', 'date_publish', 'event_type_635', 'textrank', 'actor1_635']
        dataframe = pd.read_csv(f, names=fieldnames, header=None, skiprows=1)


        for index, row in dataframe.iterrows():
            try:
                # f = open(filename, "r").read()
                text = row['text']
                date_publish = row['date_publish']
                label = row['event_type_635']
                ordered_entities = row['textrank']
                actor1_635 = row['actor1_635']
                count = count + 1
                # d = json.loads(f)
                dict = {}
                # for key, value in d.items():
                dict["date_publish"] = date_publish
                dict["label"] = label
                dict["actor1_635"] = actor1_635
                dict["ordered_entities_from_textrank"] = ordered_entities
                dict["full_text"] = text
                sentence_list = self.splitParagraphIntoSentences(text)
                event_day = False
                dict["first_sentence"] = sentence_list[0]
                for i in range(0, len(sentence_list)):
                    if event_day is True:
                        break
                    if event_day is False:
                        doc = self.nlp(sentence_list[i])
                        # For the same sentence, you check if there is an entity tagged as "time". Giving it higher priority.
                        # ALSO YOU CAN'T DO THAT, CUZ IT IS NOT ALWAYS CORRECT => "11-hour long" gets higher priority than "Saturday"
                        spacy_time = False
                        spacy_date = False
                        for ent in doc.ents:
                            if spacy_time is False and spacy_date is False:
                                if ent.label_ == u'TIME':
                                    spacy_time = True
                                    spacy_time_value = "True"
                                    TIME_text = ent.text
                                    event_day = True
                                    if i != 0:
                                        sentence_list[i] = sentence_list[i].replace('\n', '')
                                        sentence_with_date_value = sentence_list[i]
                                    else:
                                        sentence_with_date_value = u"Same as first sentence"
                            # doing this because "time" tags better entities than date => one-day international today.
                                elif ent.label_ == u'DATE':
                                    spacy_date = True
                                    spacy_date_value = "True"
                                    DATE_text = ent.text
                                    event_day = True
                                    if i != 0:
                                        sentence_list[i] = sentence_list[i].replace('\n', '')
                                        sentence_with_date_value = sentence_list[i]
                                    else:
                                        sentence_with_date_value = u"Same as first sentence"
                            else:
                                break
                        if event_day is True:
                            if spacy_time is True and spacy_date is False:
                                dict["spaCY_date"] = spacy_time_value
                                dict["DATE"] = TIME_text
                                dict["sentence_with_date"] = sentence_with_date_value
                            elif spacy_time is False and spacy_date is True:
                                dict["spaCY_date"] = spacy_date_value
                                dict["DATE"] = DATE_text
                                dict["sentence_with_date"] = sentence_with_date_value
                            else:
                                #giving priority to date here.
                                dict["spaCY_date"] = spacy_date_value
                                dict["DATE"] = DATE_text
                                dict["sentence_with_date"] = sentence_with_date_value
                            break
                entity = u'DATE'
                value = text
                stanford_value = self.stanford_ner_tagger(value, entity)
                if len(stanford_value) > 0:
                    combined_date_values = self.combine(stanford_value[0], DATE_text)
                else:
                    combined_date_values = "Null"
                dict["stanfordNER_date"] = stanford_value
                dict["combined_date"] = combined_date_values
                if event_day is False:
                    dict["spaCY_date"] = "False"
                    dict["DATE"] = "Null"
                    dict["sentence_with_date"] = "Null"
                self.l.append(dict)
            except Exception as e:
                print('Text', text)
                print(e)
        print(self.l)
        with open('./output/list_stored_in_csv.txt', 'w+') as file1:
            file1.write(str(self.l))
        s = pd.DataFrame(self.l, columns=['date_publish', 'label', 'actor1_635', 'textrank', 'stanfordNER_date', 'combined_date', 'first_sentence', 'spaCY_date', 'DATE', 'sentence_with_date', 'full_text'])
        s.to_csv("./output/date_and_first_sentence.csv", encoding="utf-8")
        list_to_be_returned = self.l
        return list_to_be_returned

In [3]:
# -*- coding: utf-8 -*-
import dateparser
import datetime
from datetime import datetime

class ModifyDate():
    """
    Modify date string to a standard format.
    """
    dates_to_be_returned = []
    def standardize_date(self, date_publish, date_in_article):
        for i, j in zip(date_publish, date_in_article):
            try:
                i = i.encode("ascii", "ignore")
                j = j.encode("ascii", "ignore")
                try:
                    datetime_object = datetime.strptime(i, '%Y-%m-%d %H:%M:%S')
                    date = dateparser.parse(j, settings={'RELATIVE_BASE': datetime_object})
                    if date is not None:
                        date = date.strftime('%m/%d/%Y %H:%M:%S')
                    else:
                        date = None
                except Exception as e:
                    date = None
                if date is None:
                    #If there was no proper date mentioned in the article, then copy the published date.
                    date = i
            except Exception as e:
                #if date_publish is None
                date = "2017-01-31 11:59:00"
            self.dates_to_be_returned.append(date)
        return self.dates_to_be_returned


In [4]:
# -*- coding: utf-8 -*-
import gensim
from gensim.summarization.summarizer import summarize

class Summarizer:
    """
    Summarizes the given text.
    """

    def get_summary(self, text):
        try:
            summary = summarize(text, ratio=0.4)
            if len(summary.split('.')) <= 2:
                print(summary)
                print('-'*32)
                summary = summarize(text, ratio=0.75)
                print(summary)
                print('-' * 32)
        except Exception as e:
            print(e)
            summary = text
        return summary

### Extract other entities and summarize.

In [5]:
# -*- coding: utf-8 -*-

import spacy
import pandas as pd
import numpy as np


class Extractor:
    """
    Defines methods to extract entities needed.
    """
    nlp = spacy.load("en_core_web_sm")

    def split_paragraph_into_sentences(self, paragraph):
        """ break a paragraph into sentences
            and return a list """
        import re
        # to split by multiple characters

        # regular expressions are easiest (and fastest)
        sentence_enders = re.compile('[.]')
        sentence_list = sentence_enders.split(paragraph)
        return sentence_list

    def get_actor(self):
        st = MyStanfordNERTagger()
        df = DateFinder()
        date_and_main_sentence_list = df.get_event_date()

        for dictionary in date_and_main_sentence_list:
            # Having each entity as a list just to extract every single mention of its kind from the text.
            # Ideally, the first value from this list would be the match. (Inferring from the "first mention" rule)
            person = []
            ORG = []
            location = []
            region = []

            sentence_with_person = []
            sentence_with_loc = []
            sentence_with_ORG = []
            sentence_with_region = []

            stanford_person = []
            stanford_ORG = []
            stanford_location = []

            for key, value in dictionary.items():
                full_text = dictionary['full_text']
                full_text_split = self.split_paragraph_into_sentences(full_text)
                if key == 'DATE' and value != 'Null':
                    sentence_with_date = dictionary['sentence_with_date']
                    first_sentence = False
                    if sentence_with_date == u'Same as first sentence':
                        sentence_with_date = dictionary['first_sentence']
                        first_sentence = True

                    window_size = 1
                    i = 1
                    doc = self.nlp(sentence_with_date)
                    current_sentence = sentence_with_date
                    before_flag = False
                    after_flag = False

                    stanford_location = st.extract_entity_stanford(full_text, u'LOCATION')
                    stanford_ORG = st.extract_entity_stanford(full_text, u'ORGANIZATION')
                    stanford_person = st.extract_entity_stanford(full_text, u'PERSON')

                    while i <= window_size + 1:
                        # Searching for other entities in the same sentence. (Could be expanded to a window size of 2)
                        for ent in doc.ents:
                            if ent.label_ == u'PERSON':
                                person.append(ent.text)
                                sentence_with_person.append(current_sentence)
                            elif ent.label_ == u'ORG':
                                ORG.append(ent.text)
                                sentence_with_ORG.append(current_sentence)
                            elif ent.label_ == u'LOC':
                                location.append(ent.text)
                                sentence_with_loc.append(current_sentence)
                            elif ent.label_ == u'GPE':
                                region.append(ent.text)
                                sentence_with_region.append(current_sentence)

                        if first_sentence is True:
                            # Search for the entities in the next sentence.
                            try:
                                print(full_text_split[0])
                                next_sentence = full_text_split[i]
                                doc = self.nlp(next_sentence)
                                i = i + 1
                            except Exception as e:
                                break
                        else:
                            if after_flag is False or before_flag is False:
                                # Otherwise, search in one sentence previous to the current one. And one in the sentence after the current one.
                                try:
                                    j = full_text_split.index(sentence_with_date)
                                    if before_flag is False:
                                        next_sentence = full_text_split[j-1]
                                        before_flag = True
                                        doc = self.nlp(next_sentence)
                                    elif after_flag is False:
                                        next_sentence = full_text_split[j+1]
                                        after_flag = True
                                        doc = self.nlp(next_sentence)
                                        # increment only at this block, because i should be unchanged for the first block. i <= window_size + 1
                                        i = i + 1
                                    current_sentence = next_sentence
                                except Exception as e:
                                    break
                            else:
                                break
            dictionary.update({'person': person, 'stanford_person': stanford_person, 'ORG': ORG, 'stanford_ORG': stanford_ORG, 'location': location, 'stanford_location': stanford_location, 'region': region, 'sentence_with_person': sentence_with_person,
                               'sentence_with_ORG': sentence_with_ORG, 'sentence_with_loc': sentence_with_loc, 'sentence_with_region': sentence_with_region})
            summarizer = Summarizer()
            summarized_text = summarizer.get_summary(full_text)
            dictionary.update({'summary': summarized_text})

        for d in date_and_main_sentence_list:
            summary = d.get("summary")
            sdate = d.get("sentence_with_date")
            sregion = d.get("sentence_with_region")
            sperson = d.get("sentence_with_person")
            sORG = d.get("sentence_with_ORG")
            sloc = d.get("sentence_with_loc")
            summary_split = self.split_paragraph_into_sentences(summary)

            try:
                # First mentioned rule. So we are checking if the sentence of the first entity has been mentioned.
                if sdate == u'Same as first sentence':
                    text = d.get("full_text")
                    text_list = self.split_paragraph_into_sentences(text)
                    sdate = text_list[0]
                a = summary_split.index(sdate)
                d.update({'date_check': 'True'})
            except ValueError as exp:
                d.update({'date_check': 'False'})
            except:
                pass

            try:
                b = summary_split.index(sregion[0])
                d.update({'region_check': 'True'})
            except ValueError as exp:
                d.update({'region_check': 'False'})
            except:
                pass

            try:
                c = summary_split.index(sperson[0])
                d.update({'person_check': 'True'})
            except ValueError as exp:
                d.update({'person_check': 'False'})
            except:
                pass

            try:
                d2 = summary_split.index(sORG[0])
                d.update({'ORG_check': 'True'})
            except ValueError as exp:
                d.update({'ORG_check': 'False'})
            except:
                pass

            try:
                e = summary_split.index(sloc[0])
                d.update({'Loc_check': 'True'})
            except ValueError as exp:
                d.update({'Loc_check': 'False'})
            except:
                pass

        s = pd.DataFrame(date_and_main_sentence_list, columns=['date_publish', 'label', 'actor1_635', 'textrank', 'stanfordNER_date', 'combined_date', 'first_sentence', 'spaCY_date', 'DATE', 'sentence_with_date', 'full_text', 'person', 'stanford_person', 'ORG', 'stanford_ORG', 'location', 'stanford_location', 'region',
                                                               'sentence_with_person', 'sentence_with_ORG', 'sentence_with_loc', 'sentence_with_region', 'summary',
                                                               'date_check', 'region_check', 'person_check', 'ORG_check', 'Loc_check'])
        m = ModifyDate()
        date_publish_column = s['date_publish'].tolist()
        date_in_article = s['DATE'].tolist()
        standardized_dates = m.standardize_date(date_publish_column, date_in_article)

        s['standardized_dates'] = standardized_dates

        s.to_csv("./output/extracted_data.csv", encoding="utf-8")
        summary_list = []
        with open('./summary.txt', 'w+') as summary_file:
            for d in date_and_main_sentence_list:
                compare_summary = dict()
                compare_summary["text"] = d.get("full_text")
                compare_summary["summary"] = d.get("summary")
                summary_list.append(compare_summary)
            summary_file.write(str(summary_list))


e = Extractor()
e.get_actor()

[{'date_publish': '3/23/15 20:26', 'label': 'Protests', 'actor1_635': 'Protesters (India)', 'ordered_entities_from_textrank': "['incident', 'Jarada', 'village', 'persons', 'police', 'group', 'groups', 'Odisha']", 'full_text': 'Odisha Sun Times Bureau Brahmapur, Mar 23 Police today arrested 11 persons in connection with yesterday s group clash in Ambagaon under Jarada police limits in Odisha s Ganjam district. While three persons have sustained injuries in yesterday s group clash, over 40 houses were set ablaze as rival groups set afire thatched houses belonging to the other group and hurled bombs at each other. One seriously injured person has been referred to a hospital at Visakhapatnam while other two are undergoing treatment at Jarada hospital, police said. All male members of the village, which has around 90 households, have fled after the incident fearing arrest. DIG southern range Amitav Thakur who was on a visit to the strife torn village said The situation is now under control.

Odisha Sun Times Bureau Brahmapur, Mar 23 Police today arrested 11 persons in connection with yesterday s group clash in Ambagaon under Jarada police limits in Odisha s Ganjam district
Odisha Sun Times Bureau Brahmapur, Mar 23 Police today arrested 11 persons in connection with yesterday s group clash in Ambagaon under Jarada police limits in Odisha s Ganjam district
ABHIJIT TALUKDAR Tamulpur, March 23, 2019 Tension gripped Kamrup district after a man was stabbed to death on March 23rd, 2019
ABHIJIT TALUKDAR Tamulpur, March 23, 2019 Tension gripped Kamrup district after a man was stabbed to death on March 23rd, 2019
Tripura News 10 CPI M supporters injured after MP Jiten successfully opened BJP occupied Kamalpur CPI M Party office TIWN March 23, 2019 AGARTALA, March 23 TIWN Tension hits Kamalpur after MP Jitendra Chaudhury successfully opened Kamalpur party office of CPI M after 11 months
Tripura News 10 CPI M supporters injured after MP Jiten successfully opened BJP occupied Kamalpur 

### Fill the slots by scoring each tagged value.

In [7]:
import csv
import ast
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import difflib
from builtins import any as b_any

def get_filled_slots():
    csvfile = open('./output/extracted_data.csv', 'r')
   
    csv_reader1 = csv.reader(csvfile)
    
    df1 = pd.read_csv(csvfile)
    
    person = []
    ORG = []
    text_rank = []
    region = []
    event_type_635 = []
    full_text = []

    for index, row in df1.iterrows():
        person_combined = []
        ORG_combined = []
        location_combined = []

        textrank = ast.literal_eval(row['textrank'])
        spacy_person = ast.literal_eval(row['person'])
        stanford_person = ast.literal_eval(row['stanford_person'])

        spacy_ORG = ast.literal_eval(row['ORG'])
        stanford_ORG = ast.literal_eval(row['stanford_ORG'])

        spacy_location = ast.literal_eval(row['location'])
        spacy_region = ast.literal_eval(row['region'])
        stanford_location = ast.literal_eval(row['stanford_location'])

        for item in spacy_person:
            person_combined.append(item)
        for item in stanford_person:
            person_combined.append(item)

        for item in spacy_ORG:
            ORG_combined.append(item)
        for item in stanford_ORG:
            ORG_combined.append(item)
        for item in spacy_location:
            location_combined.append(item)
        for item in spacy_region:
            location_combined.append(item)
        for item in stanford_location:
            location_combined.append(item)

        event_type_635.append(str(row['actor1_635']))
        full_text.append(str(row['full_text']))
        person.append(person_combined)
        ORG.append(ORG_combined)
        text_rank.append(textrank)
        region.append(location_combined)

    intersect_person = []
    intersect_ORG = []

    count = 0


    return_value = True

    for item in text_rank:
        possible_person = []
        possible_ORG = []
        choices_person = person[count]
        choices_ORG = ORG[count]
        for v in item:
            score = difflib.get_close_matches(v, choices_person, n=1, cutoff=0.6)
            #sort
            if score:
                d = {}
                ratio = difflib.SequenceMatcher(None, v, score[0]).ratio()
                d[score[0]] = ratio
                possible_person.append(d)
            score2 = difflib.get_close_matches(v, choices_ORG, n=1, cutoff=0.6)
            if score2:
                d = {}
                ratio = difflib.SequenceMatcher(None, v, score2[0]).ratio()
                d[score2[0]] = ratio
                possible_ORG.append(d)
        intersect_person.append(possible_person)
        intersect_ORG.append(possible_ORG)
        count = count + 1

    # Create a new csv with the additional columns containing the scored values for each slot.
    if return_value is True:
        df1['intersect-person'] = intersect_person
        df1['intersect-ORG'] = intersect_ORG
        df1['intersect-location'] = region
        df1.to_csv('./output/scored_entities.csv', encoding='utf-8')
        print(df1)
        

get_filled_slots()

   Unnamed: 0   date_publish     label          actor1_635  \
0           0  3/23/15 20:26  Protests  Protesters (India)   
1           1  3/23/19 13:42     Riots     Rioters (India)   
2           2            NaN  Protests  Protesters (India)   
3           3  3/23/19 21:35  Protests  Protesters (India)   
4           4  3/23/19 11:30  Protests  Protesters (India)   
5           5  3/26/19 17:46  Protests  Protesters (India)   

                                            textrank  \
0  ['incident', 'Jarada', 'village', 'persons', '...   
1  ['Afiz', 'Ali', 'Police', 'Rizul', 'March', 'd...   
2  ['M', 'CPI', 'Kamalpur', 'BJP', 'MP', 'Jitendr...   
3  ['students', 'Gandoh', 'Government', 'school',...   
4  ['police', 'clash', 'groups', 'district', 'Dhe...   
5  ['Congress', 'Bhubaneswar', 'police', 'Bhawan'...   

                                    stanfordNER_date           combined_date  \
0                                                 []                    Null   
1  ['March',

### Get the latitude and the longitude for the locations - For UI purposes.

In [8]:
from geopy.geocoders import Nominatim
import pandas as pd
import ast
import time
import geocoder
import json


def find_geo_code(intersect_location):
    # geolocator = Nominatim(user_agent="NER_tagger")

    location_for_all_rows_with_geo_code = []
    f1 = open('./output/dummy_file1.json', 'a+')
    geolocator = Nominatim(user_agent="NER_Tagger")
    for location in intersect_location:
        geo_codes_per_row = []
        for each_location in location:
            try:
                loc = geolocator.geocode(each_location)
                if loc is not None:
                    lat_long = (loc.latitude, loc.longitude)
                    dic = {}
                    dic[each_location] = lat_long
                    geo_codes_per_row.append(dic)
                    time.sleep(2)
            except Exception as e:
                time.sleep(2)
                print(e)
                pass
        print(geo_codes_per_row)
        location_for_all_rows_with_geo_code.append(geo_codes_per_row)
        f1.write(json.dumps(location_for_all_rows_with_geo_code))
    print(location_for_all_rows_with_geo_code)
    return location_for_all_rows_with_geo_code


def find_country(intersect_location):
    intersect_location_updated = []
    f2 = open('./output/dummy_file2.json', 'a+')
    for location in intersect_location:
        list_of_updated_locations = []
        for each_location in location:
            try:
                g = geocoder.google(each_location, key='YOUR_API_KEY')
                country = g.country_long
                state = g.state_long
                updated_location = each_location + ',' + state + ',' + country
                print(updated_location)
                list_of_updated_locations.append(updated_location)
                time.sleep(2)
            except Exception as e:
                print(e)
                time.sleep(2)
                list_of_updated_locations.append(each_location)
                pass
        f2.write(json.dumps(intersect_location_updated))
        intersect_location_updated.append(list_of_updated_locations)
    print(intersect_location_updated)
    return intersect_location_updated


csvfile = open('./output/scored_entities.csv', 'r')
df1 = pd.read_csv(csvfile)

intersect_location = []
for index, row in df1.iterrows():
    loc = ast.literal_eval(row['intersect-location'])
    intersect_location.append(loc)

intersect_location1 = intersect_location
location_set = []

for v in intersect_location1:
    l = list(set(v))
    location_set.append(l)

intersect_location_updated = find_country(intersect_location=location_set)

location_in_csv = find_geo_code(intersect_location_updated)

lat_long_column = []
for each_location in location_in_csv:
    lat_long_per_row = []
    for each_dict in each_location:
        for key, value in each_dict.items():
            lat_long_per_row.append(value)
    lat_long_column.append(lat_long_per_row)

df1['lat-long-with-cities'] = location_in_csv
df1['lat-long'] = lat_long_column

df1.to_csv('./output/final_data_with_lat_and_long.csv', encoding='utf-8')

Visakhapatnam,Andhra Pradesh,India
Odisha,Odisha,India
Jarada,Florida,United States
Sabha,Sabha District,Libya
Lok,Oklahoma,United States
Kamalpur,Tripura,India
can only concatenate str (not "NoneType") to str
Doda,Minnesota,United States
can only concatenate str (not "NoneType") to str
Sources,British Columbia,Canada
Dhepaguda,Odisha,India
Koraput,Odisha,India
Sadar,Pennsylvania,United States
district.,Kansas,United States
Odisha,Odisha,India
Bhubaneswar,Odisha,India
[['Visakhapatnam,Andhra Pradesh,India', 'Odisha,Odisha,India', 'Jarada,Florida,United States'], [], ['Sabha,Sabha District,Libya', 'Lok,Oklahoma,United States', 'Kamalpur,Tripura,India'], ['Gandoh', 'Doda,Minnesota,United States', 'Thathri'], ['Sources,British Columbia,Canada', 'Dhepaguda,Odisha,India', 'Koraput,Odisha,India', 'Sadar,Pennsylvania,United States', 'district.,Kansas,United States'], ['Odisha,Odisha,India', 'Bhubaneswar,Odisha,India']]
[{'Visakhapatnam,Andhra Pradesh,India': (17.7231276, 83.3012842)}, {'Odish