In [1]:
import spacy
import os
import nltk
from current_project_programs.preprocess import Preprocess
from nltk.parse.stanford import StanfordParser
from nltk.corpus import stopwords
from spacy import displacy
from spacy.matcher import Matcher
import json
from spacy.tokens import Doc, Span, Token
#import neuralcoref
import time
from nltk.corpus import wordnet
import numpy as np

def load_text_files(filename, dir='WikipediaArticles/'):
    """Read text files from a directory into lists of cleaned lines.

    Args:
        filename: Suffix a file name must end with to be loaded (normally a
            complete file name such as ``'test.txt'``).  When falsy
            (``None`` or ``''``) every ``.txt`` file in ``dir`` is loaded.
        dir: Directory to scan; a trailing path separator is optional.

    Returns:
        dict mapping each loaded file name to the list of its lines, each
        line stripped of surrounding whitespace and with two mis-decoded
        character sequences replaced.  The dict always ends with an empty
        ``'Praveen.txt'`` entry, as callers of the original code received.
    """
    # The original evaluated ``file.endswith(filename)`` even when filename
    # was None, which raises TypeError for the first non-.txt file found.
    suffix = filename if filename else '.txt'
    dataset = {}
    for file in os.listdir(dir):
        if not file.endswith(suffix):
            continue
        # The context manager closes the handle the original code leaked.
        with open(os.path.join(dir, file), "r", encoding='ISO-8859-1') as current:
            dataset[file] = [
                line.strip().replace('ï»¿','').replace('â€“','-')
                for line in current
            ]
    # Placeholder entry kept for backward compatibility with existing callers.
    dataset['Praveen.txt'] = []
    return dataset

def feature_extraction(word):
    """Gather WordNet features across every synset of ``word``.

    Returns a 7-tuple of lists, in this order: definitions, synonym lemma
    names, antonym names (first antonym of each lemma that has one), and the
    lemma-name lists of at most two hypernyms, hyponyms, part-meronyms and
    part-holonyms per synset.
    """
    glosses, lemma_names, opposites = [], [], []
    broader, narrower, parts, wholes = [], [], [], []

    for synset in wordnet.synsets(word):
        glosses.append(synset.definition())

        for lem in synset.lemmas():
            lemma_names.append(lem.name())
            opposing = lem.antonyms()
            if opposing:
                opposites.append(opposing[0].name())

        # Only the first two related synsets of each kind are kept.
        broader.extend(rel.lemma_names() for rel in synset.hypernyms()[:2])
        narrower.extend(rel.lemma_names() for rel in synset.hyponyms()[:2])
        parts.extend(rel.lemma_names() for rel in synset.part_meronyms()[:2])
        wholes.extend(rel.lemma_names() for rel in synset.part_holonyms()[:2])

    return glosses, lemma_names, opposites, broader, narrower, parts, wholes

def collect_sents(matcher, doc, i, matches):
    """Record a "place, place" match as a displaCy entry and a PART extraction.

    The signature matches spaCy's ``Matcher`` ``on_match`` callback.  The
    match is ignored unless its first token begins an entity and the token
    right after the match (if any) is not a GPE.  Results are appended to the
    module-level ``matched_sents`` list and ``result["extraction"]``.
    """
    _, first, last = matches[i]
    matched = doc[first:last]  # the matched span

    if doc[first].ent_iob_ != 'B' or (len(doc) != last and doc[last].ent_type_ == 'GPE'):
        return

    sentence = matched.sent  # sentence containing the matched span
    offset = sentence.start_char

    # displaCy "manual" entry: character offsets are relative to the sentence,
    # so shift the span's document offsets by the sentence start.
    matched_sents.append({
        "text": sentence.text,
        "ents": [{
            "start": matched.start_char - offset,
            "end": matched.end_char - offset,
            "label": "PLACE_IN_PLACE",
        }],
    })

    # Exactly one comma is expected in the span text; anything else raises
    # ValueError on unpacking.
    first_place, second_place = str(matched).split(',')
    result["extraction"].append({
        "template": "PART",
        "sentences": [sentence.text],
        "arguments": {"1": first_place, "2": second_place},
    })
       
def show_sents(matcher, doc, i, matches):
    """Append the matched span to ``matched_sents`` as a displaCy "PLACE" entity.

    The signature matches spaCy's ``Matcher`` ``on_match`` callback.
    """
    _, first, last = matches[i]
    matched = doc[first:last]      # the matched span
    sentence = matched.sent        # sentence containing the matched span
    offset = sentence.start_char   # displaCy offsets are sentence-relative

    place_entity = {
        "start": matched.start_char - offset,
        "end": matched.end_char - offset,
        "label": "PLACE",
    }
    matched_sents.append({"text": sentence.text, "ents": [place_entity]})
   
def merge_ents(doc):
    """Collapse entities and multi-token place names into single tokens.

    Works in three passes over ``doc`` and returns the same (modified) doc:

    1. every span in ``doc.ents`` is merged into one token;
    2. a proper noun with dependency ``compound`` is merged with the run of
       directly following tokens that are also ``compound``;
    3. a proper noun that is an ``nmod`` of a place word (head lemma in the
       module-level ``place_lemmas``) is merged with everything up to that
       head and tagged as a GPE, unless the head is followed by "of".

    Pass 3 relies on the ``grouping_type`` and ``list`` token extensions
    being registered before the function runs.
    """
    #print_token_attr(doc)
    # Pass 1: one token per named entity.
    with doc.retokenize() as retokenizer:
        for ent in doc.ents:
            retokenizer.merge(ent)
           
    # Pass 2: merge chains of compound proper nouns.
    i = 0
    while i<len(doc):
        token = doc[i]
        if token.pos_=='PROPN':
            if token.dep_=='compound':
                tail = token
                # Walk right while the next token is also a compound.
                while tail.i+1<len(doc) and tail.nbor().dep_=='compound':
                    tail = tail.nbor()
                if tail.i>token.i:
                    with doc.retokenize() as retokenizer:
                        retokenizer.merge(doc[token.i:tail.i+1])
        i += 1
       
    # Pass 3: "<name(s)> <place word>" -> a single GPE token.
    i = 0    
    while i<len(doc):
        token = doc[i]
        if token.pos_=='PROPN':
            if token.dep_ == 'nmod':
                if token.head.lemma_.lower() in place_lemmas:
                    place_type = token.head
                    # "<proper noun> <place lemma>" for each proper noun between
                    # the token and its head; collected before merging because
                    # the individual tokens disappear afterwards.
                    # NOTE(review): the slice assumes the head follows the
                    # token; if it precedes it the slice is empty - confirm.
                    arr = [x.text.strip()+' '+place_type.lemma_ for x in doc[token.i:place_type.i+1] if x.pos_=='PROPN' and x.lemma_.lower() not in place_lemmas]
                    # Skip "<place word> of ..." constructions.
                    if not (place_type.i+1<len(doc) and place_type.nbor().lemma_.lower()=='of'):
                        with doc.retokenize() as retokenizer:
                            retokenizer.merge(doc[token.i:place_type.i+1], attrs={"ENT_TYPE":'GPE'})
                        # Several names sharing one place word are siblings.
                        if len(arr)>1:
                            doc[token.i]._.grouping_type = "SIBLING"
                            doc[token.i]._.list = arr
        i += 1
    return doc

def merge_siblings(doc):
    """Merge conjoined place names ("A, B and C") into one SIBLING GPE token.

    For every coordinating conjunction the function determines the span that
    runs from the top-most proper-noun ancestor of the conjunction's head to
    the head's last child, decides an entity type for it, and - only when
    that type is GPE - merges the span, marks it as ``SIBLING`` and stores
    the individual names in the token's ``list`` extension.  Returns ``doc``.
    """
    #print_token_attr(doc)
    i = 0
    while i<len(doc):
        token = doc[i]
        if token.pos_=='CCONJ':
            conj = token.text
            a = token.head.i  # provisional start of the span
            b = token.i
            # Advance b to the first token at/after the conjunction whose text
            # equals that of the head's last child.
            while b<len(doc) and doc[b].text != [x for x in token.head.children][-1].text:
                b += 1                
            cur = token.head
            # Tally the entity types seen at the span end, the head and the
            # proper-noun ancestors of the head.
            ent_type = {}
            ent_type[doc[b].ent_type_] = 1
            if cur.ent_type_ not in ent_type:
                ent_type[cur.ent_type_] = 1
            else:
                ent_type[cur.ent_type_] += 1                
            # Climb the dependency tree while the parent is a proper noun; the
            # span start follows the climb.
            while cur.head.pos_=='PROPN':
                cur = cur.head
                if cur.ent_type_ not in ent_type:
                    ent_type[cur.ent_type_] = 1
                else:
                    ent_type[cur.ent_type_] += 1
                a = cur.i
                # A root token is its own head; stop to avoid looping forever.
                if cur.head == cur:
                    break            
            mod = ''
            attr = {'ENT_TYPE': None}
            if doc[a].dep_=='nmod':
                # The span modifies a noun; it is a GPE only if that noun is a
                # place word, otherwise ENT_TYPE stays None and nothing merges.
                mod = doc[a].head.lemma_
                if mod.lower() in place_lemmas:
                    attr = {"ENT_TYPE": 'GPE'}
            else:
                # Otherwise take the entity type with the highest count
                # (the sort is ascending by count, so the last key wins).
                attr = {"ENT_TYPE": [k for k, v in sorted(ent_type.items(), key=lambda item: item[1])][-1]}
            if attr["ENT_TYPE"] == 'GPE':
                with doc.retokenize() as retokenizer:
                    retokenizer.merge(doc[a:b+1], attrs=attr)
                doc[a]._.grouping_type = "SIBLING"
                # Split the merged text on the conjunction and on commas; each
                # piece gets the place word (possibly empty) appended.
                doc[a]._.list = [y.strip()+' '+mod for x in doc[a].text.split(conj) for y in x.split(',')]
                # Resume right after the merged token.
                i = a
                #print(doc[a:b+1], attrs)
        i += 1
    return doc

def merge_ancestors(doc):
    """Merge child/parent place expressions into one CHILD_PARENT GPE token.

    Handles, in this order for each token:

    * a possessive GPE whose head is a GPE (stored as ``[head, possessor]``);
    * a GPE followed by a comma: either a comma-separated chain of GPEs
      (when the token's head is a preposition) or the token together with
      its GPE descendants;
    * a place noun (lemma in ``place_lemmas``) with an "of <proper noun>"
      phrase and a proper-noun or NORP modifier on its left.

    The merged token gets ``grouping_type = "CHILD_PARENT"`` and its ``list``
    extension holds the place names.  Returns the modified ``doc``.
    """
    #print_token_attr(doc)
    i = 0
    while i<len(doc):
        token  = doc[i]
        if token.ent_type_=='GPE':
            if token.dep_ == 'poss':
                if token.head.ent_type_=='GPE':
                    # Possessor is recorded as the parent, its head as the child.
                    parent = token.text.strip()
                    child = token.head.text.strip()
                    with doc.retokenize() as retokenizer:
                        retokenizer.merge(doc[token.i:token.head.i+1], attrs={"ENT_TYPE":'GPE'})
                    doc[i]._.grouping_type = "CHILD_PARENT"
                    doc[i]._.list = [child, parent]
            if i+1<len(doc) and doc[i+1].text==',':
                if token.head.dep_=='prep':
                    # Follow the "GPE , GPE , GPE" chain as far as it goes.
                    tail = token
                    while tail.i+2<len(doc) and doc[tail.i+1].text==',' and doc[tail.i+2].ent_type_=='GPE':
                        tail = doc[tail.i+2]
                    if tail!=token:
                        with doc.retokenize() as retokenizer:
                            retokenizer.merge(doc[i:tail.i+1], attrs={"ENT_TYPE":'GPE'})
                        doc[i]._.grouping_type = "CHILD_PARENT"
                        # Names are recovered from the merged text.
                        doc[i]._.list = [y.strip() for y in doc[i].text.split(',')]
                else:
                    # Breadth-first collection of GPE descendants of the token.
                    # ``result`` is a local list here, unrelated to the
                    # module-level ``result`` dict used elsewhere in the file.
                    result = [token]
                    j = 0
                    while j<len(result):
                        result.extend([x for x in result[j].children if x.ent_type_=='GPE'])
                        j+=1
                    if len(result)>1:
                        # Merge up to the last descendant that was collected.
                        with doc.retokenize() as retokenizer:
                            retokenizer.merge(doc[i:result[-1].i+1], attrs={"ENT_TYPE":'GPE'})
                        doc[i]._.grouping_type = "CHILD_PARENT"
                        doc[i]._.list = [y.strip() for y in doc[i].text.split(',')]
        elif token.pos_=='NOUN' and token.lemma_.lower() in place_lemmas:
            # "<modifiers> <place noun> of <proper noun>"
            preps = [child for child in token.children if child.dep_ == 'prep' and child.lemma_.lower() in ['of']]
            for prep in preps:
                children = [x for x in prep.rights if x.pos_=='PROPN']
                if children:
                    # Left modifiers of the place noun; the merge starts at the
                    # first of them.
                    parents = [x for x in token.lefts if x.pos_ in ['DET','PROPN','ADJ']]
                    parent = [x for x in parents if x.pos_=='PROPN']
                    if parent:
                        # Texts are captured before merging destroys the tokens.
                        ls = [children[-1].text, parent[0].text]
                        with doc.retokenize() as retokenizer:
                            retokenizer.merge(doc[parents[0].i:children[-1].i+1], attrs={"ENT_TYPE":'GPE','POS':'PROPN'})
                        doc[parents[0].i]._.grouping_type = "CHILD_PARENT"
                        doc[parents[0].i]._.list = ls
                        # Continue after the merged token.
                        i = parents[0].i
                    else:
                        # No proper-noun modifier: try a nationality (NORP)
                        # modifier and map it to a place name.
                        parent = [x for x in parents if x.ent_type_=='NORP' and x.head==token]
                        gpe_text = get_gpe_from_norp(parent[0].lemma_.lower()) if parent else None
                        if gpe_text:
                            ls = [children[-1].text, gpe_text]
                            with doc.retokenize() as retokenizer:
                                retokenizer.merge(doc[parents[0].i:children[-1].i+1], attrs={"ENT_TYPE":'GPE','POS':'PROPN'})
                            doc[parents[0].i]._.grouping_type = "CHILD_PARENT"
                            doc[parents[0].i]._.list = ls
                            i = parents[0].i
        i += 1
    return doc

def print_token_attr(doc):
    """Debug helper: print one line of attributes for every token in ``doc``.

    Each line shows text, entity type, the grouped place list (or an empty
    string when the token is not a CHILD_PARENT/SIBLING group), POS tag,
    dependency label, head, left children and right children.
    """
    grouped_kinds = ('CHILD_PARENT', 'SIBLING')
    for tok in doc:
        if tok._.grouping_type in grouped_kinds:
            members = tok._.list
        else:
            members = ''
        print(tok.text, tok.ent_type_, members,
              tok.pos_, tok.dep_, tok.head, list(tok.lefts), list(tok.rights))

def insert_part_instance(doc, place, in_place):
    """Log and record a PART extraction: ``place`` is located in ``in_place``.

    The extraction is appended to the module-level ``result["extraction"]``
    together with the text of ``doc[:].sent``.
    """
    print(place, 'in', in_place)
    result["extraction"].append({
        "template": "PART",
        "sentences": [doc[:].sent.text],
        "arguments": {"1": place, "2": in_place},
    })
   
def insert_buy_instance(doc, buyer, item, price, quantity, source):
    """Log and record a BUY extraction in the module-level ``result``.

    Arguments "1" to "5" of the stored extraction are, in order: buyer, item,
    price, quantity and source.
    """
    print(buyer, 'bought', quantity, item, 'for', price, 'from', source)
    result["extraction"].append({
        "template": "BUY",
        "sentences": [doc[:].sent.text],
        "arguments": {
            "1": buyer,
            "2": item,
            "3": price,
            "4": quantity,
            "5": source,
        },
    })
   
def insert_ancestor_list(doc, arr):
    """Record a PART relation for every ordered pair of places in ``arr``.

    Each entry is treated as being inside every entry that comes after it.
    """
    if len(arr) <= 1:
        return
    for position, inner in enumerate(arr[:-1]):
        for outer in arr[position + 1:]:
            insert_part_instance(doc, inner, outer)
               
def insert_child_of_list(doc, child, arr):
    """Record ``child`` as a part of every place named in ``arr``."""
    for containing_place in arr:
        insert_part_instance(doc, child, containing_place)
       
def insert_on_type(doc, child, parent):
    """Record PART relations between two tokens, expanding grouped tokens.

    A token whose ``grouping_type`` is CHILD_PARENT or SIBLING contributes
    every name in its ``list`` extension; any other token contributes its
    text.  One PART instance is recorded per (child name, parent name) pair.
    """
    grouped_kinds = ['CHILD_PARENT', 'SIBLING']

    if child._.grouping_type in grouped_kinds:
        child_names = child._.list
    else:
        child_names = [child.text]

    for child_name in child_names:
        if parent._.grouping_type in grouped_kinds:
            parent_names = parent._.list
        else:
            parent_names = [parent.text]
        for parent_name in parent_names:
            insert_part_instance(doc, child_name, parent_name)
           
def get_gpe_from_norp(word):
    """Map a nationality-like word to a place name via its WordNet glosses.

    Looks at definitions of ``word`` that start with "of or relating to",
    runs each through ``nlp_wo_pipe`` and returns the text of the first GPE
    entity found, or ``None`` when there is none.
    """
    definitions = feature_extraction(word)[0]
    relational = [text for text in definitions if text.startswith('of or relating to')]
    for gloss in relational:
        gpe_names = (entity.text for entity in nlp_wo_pipe(gloss).ents if entity.label_ == 'GPE')
        found = next(gpe_names, None)
        if found is not None:
            return found
    return None

def _parts_from_proper_noun(doc, token):
    """Record the PART relations that a proper-noun token gives rise to."""
    # A merged child/parent chain carries its own hierarchy.
    if token.ent_type_ == 'GPE' and token._.grouping_type == 'CHILD_PARENT':
        insert_ancestor_list(doc, token._.list)

    if token.dep_ == 'nsubj':
        # "<token> is a <place noun> of/in <place>"
        if token.head.lemma_.lower() != 'be':
            return
        place_nouns = [kid for kid in token.head.children
                       if kid.pos_ == 'NOUN' and kid.lemma_.lower() in place_lemmas]
        links = [kid for noun in place_nouns for kid in noun.children
                 if kid.dep_ == 'prep' and kid.lemma_.lower() in ['of', 'in']]
        for link in links:
            for target in link.children:
                if target.ent_type_ == 'GPE':
                    insert_on_type(doc, token, target)
                elif target.pos_ == 'PROPN':
                    insert_part_instance(doc, token.text, target.text)
                elif target.pos_ == 'NOUN' and target.lemma_.lower() in place_lemmas:
                    # "... of the <place noun> <proper noun>"
                    for nested in target.children:
                        if nested.pos_ == 'PROPN':
                            insert_on_type(doc, token, nested)
    elif token.dep_ == 'poss':
        # "<GPE>'s <proper noun>": the head lies inside the possessor.
        if token.ent_type_ == 'GPE' and token.head.pos_ == 'PROPN':
            insert_on_type(doc, token.head, token)


def _parts_from_in_preposition(doc, token):
    """Record the PART relations expressed by "<place> in <GPE>"."""
    if token.dep_ != 'prep' or token.lemma_.lower() != 'in':
        return
    place = token.head
    if place.ent_type_ not in ['GPE', 'LOC', 'LOC DET']:
        return
    containers = [right for right in token.rights if right.ent_type_ == 'GPE']
    if containers:
        for container in containers:
            insert_on_type(doc, place, container)
    elif token.i + 1 < len(doc) and token.nbor().ent_type_ == 'GPE':
        # No GPE among the right children: fall back to the next token.
        insert_on_type(doc, place, token.nbor())


def extract_place_in_place(doc):
    """Pipeline component that records PART (place-in-place) extractions.

    Scans every token: proper nouns and the preposition "in" can each
    trigger extractions, which are stored through the ``insert_*`` helpers.
    The doc itself is not modified and is returned unchanged.
    """
    #print_token_attr(doc)
    for token in doc:
        if token.pos_ == 'PROPN':
            _parts_from_proper_noun(doc, token)
        elif token.pos_ == 'ADP':
            _parts_from_in_preposition(doc, token)
    return doc

#Praveen
def extract_all_named_entities(doc):
    """Extract BUY templates (buyer, item, price, quantity, source) from ``doc``.

    Three strategies are combined:

    * ``constraint_check`` - uses only the order of the relevant entities
      around a buy keyword (needs at least three entities);
    * first rule - dependency patterns for active sentences
      ("X acquired/purchased Y for dollars");
    * second rule - dependency patterns for passive sentences
      ("Y was acquired/purchased by X for dollars").

    Every distinct template found is stored through ``insert_buy_instance``.

    Args:
        doc: Parsed document whose ``ents`` include the custom ``BUY`` label.

    Returns:
        List of template dicts with string values, without duplicates, in the
        order in which they were found.
    """
    active_keywords = ["acquire","purchase","buy","brought","purchased","acquired"]
    passive_keywords = ["acquired by","purchased by"]
    party_labels = ["ORG","PERSON","PRODUCT"]

    def new_template():
        return {"buyer":"","item":"","price":"","quantity":"","source":""}

    def constraint_check(entities):
        """Build templates from the entities directly around a buy keyword."""
        list_t = []
        if len(entities)==3:
            template = new_template()
            keyword = entities[1].text
            if keyword in active_keywords:
                template["buyer"] = entities[0].text
                template["item"] = entities[2].text
            elif keyword in passive_keywords:
                template["buyer"] = entities[2].text
                template["item"] = entities[0].text
            if template["buyer"] and template["item"]:
                list_t.append(template)
            return list_t

        last = len(entities)-1
        for index, entity in enumerate(entities):
            # A keyword needs a real neighbour on both sides.  Without this
            # guard index 0 silently read entities[-1] and a keyword in last
            # position raised IndexError.
            if index<1 or index>=last:
                continue
            keyword = entity.text.strip()
            is_active = keyword in active_keywords or keyword=="acquisition"
            is_passive = keyword in passive_keywords
            if not (is_active or is_passive):
                continue
            before, after = entities[index-1], entities[index+1]
            if before.label_ not in party_labels or after.label_ not in party_labels:
                continue
            # Sentences that continue with a price are left to the
            # dependency-based rules below.
            if index+2<=last and entities[index+2].label_=="MONEY":
                continue
            template = new_template()
            if is_active:
                template["buyer"], template["item"] = before.text, after.text
            else:
                # Passive voice: the buyer follows the keyword, exactly as in
                # the three-entity case above.
                template["buyer"], template["item"] = after.text, before.text
            if template["buyer"] and template["item"]:
                list_t.append(template)
        return list_t

    entities = [ent for ent in doc.ents if ent.label_ in ["BUY","MONEY","ORG","PERSON","PRODUCT","PERCENT","CARDINAL"]]
    print("Entities",entities)
    list_of_templates = []
    if len(entities)>=3:
        list_of_templates = constraint_check(entities)

    # First rule: X acquired/purchased Y for dollars.
    # Later matches overwrite earlier ones, so the last matching token wins.
    template = new_template()
    for ent in entities:
        head = ent.root.head
        for token in head.children:
            if (token.pos_ in ["PROPN","NOUN"] and token.dep_=="nsubj" and token.ent_type_ in ["ORG","PERSON","PRODUCT"] and token.head.pos_=="VERB"):
                template["buyer"] = token.text
            if (token.pos_=="PROPN" and token.dep_=="nsubj" and token.head.pos_=="VERB" and token.head.dep_=="ROOT"):
                template["buyer"] = token.text
            if (token.pos_=="PROPN" and token.dep_=="appos" and token.head.pos_=="NOUN" and token.head.dep_!="dobj"):
                template["buyer"] = token.text
            if (token.pos_=="VERB" and token.dep_=="relcl" and token.head.pos_=="PROPN"):
                template["buyer"] = token.head.text
            if (token.pos_ in ["PROPN","NOUN"] and  token.dep_ =="dobj" and token.head.pos_=="VERB" and token.head.ent_type_=="BUY"):
                template["item"] = token.text
            if (len(template["item"])!=0 and token.pos_ in ["PROPN","NOUN","ADV"] and token.dep_ in ["dobj","pobj","nsubjpass"] and token.head.pos_=="VERB" and token.ent_type_=="BUY" and token.head.dep_!="advcl"):
                template["item"] = token.text
            if (len(template["item"])!=0 and token.pos_=="PROPN" and token.dep_=="dobj" and token.head.ent_type_=="BUY" and token.ent_type_ in ["ORG","PERSON","PRODUCT"]):
                template["item"] = token.text
            if (len(template["item"])!=0 and token.pos_=="PROPN" and token.dep_=="appos" and token.head.pos_=="NOUN" and token.head.head.ent_type_=="BUY"):
                template["item"] = token.text
            if (token.pos_=="PROPN" and token.dep_=="compound" and token.head.pos_=="PROPN"):
                template["item"] = token.text
            if (token.pos_=="NUM" and token.dep_ in ["npadvmod","pobj","nummod"] and token.ent_type_ in ["MONEY","CARDINAL"]):
                if token.head.pos_=="NOUN":
                    template["price"] = str(token.text)+" "+str(token.head.text)
                else:
                    template["price"] = token.text
            if (token.pos_ in ["PROPN","ADP"] and token.dep_=="pobj" and  token.head.pos_ in ["ADP","SCONJ"] and token.head.dep_=="prep" and token.head.head.pos_ in ["VERB"] and token.ent_type_ in ["ORG","PERSON","PRODUCT","GPE"]):
                template["source"] = token.text
            if (token.ent_type_=="PERCENT" and token.pos_=="NOUN" and token.dep_ in ["compound","dobj","npadvmod","nmod"]):
                template["quantity"] = token.text
    if template["buyer"] and template["item"]:
        list_of_templates.append(template)

    # Second rule: Y was acquired/purchased by X for dollars.
    # Values are stored as text (the original stored Token objects, which
    # cannot be serialised when the results are dumped to JSON).
    template = new_template()
    for ent in entities:
        head = ent.root.head
        for token in head.children:
            if (token.pos_=="VERB" and token.dep_=="acl" and token.ent_type_ in ["BUY"]):
                template["item"] = head.text
            if (token.pos_=="PROPN" and token.dep_=="nsubjpass" and  token.ent_type_ in ["ORG","PERSON","PRODUCT"]):
                template["item"] = token.text
            if (token.pos_=="PROPN" and token.dep_=="pobj") and (token.ent_type_ in ["ORG","PERSON","PRODUCT"]):
                template["buyer"] = token.text
            if (token.ent_type_=="MONEY" and token.pos_=="NUM" and token.dep_ in ["npadvmod","pobj","nummod"]):
                template["price"] = token.text
            if (token.ent_type_=="PERCENT" and token.pos_=="NOUN" and token.dep_ in ["compound","dobj","npadvmod"]):
                template["quantity"] = token.text
    if template["buyer"] and template["item"]:
        list_of_templates.append(template)

    # Drop duplicates while keeping first-seen order (a plain set would make
    # the output order vary between runs).
    unique_items = dict.fromkeys(tuple(x.items()) for x in list_of_templates)
    list_of_templates = [dict(items) for items in unique_items]

    print("Final Template:",list_of_templates)
    for template in list_of_templates:
        insert_buy_instance(doc, template['buyer'], template['item'], template['price'],
                            template['quantity'], template['source'])
    return list_of_templates
#Praveen

# Script entry point: load the articles, build the spaCy pipeline, run the
# BUY extraction sentence by sentence (PART extractions are recorded by the
# pipeline components themselves) and write one JSON file per input document.
if __name__=='__main__':
    s = time.time()  # start time, used for the elapsed-time print at the end
    nltk.download('punkt')
    nltk.download('wordnet')
    nltk.download('stopwords')
    nltk.download('averaged_perceptron_tagger')
    nlp = spacy.load('en_core_web_sm')
    # Second, unmodified copy of the model; get_gpe_from_norp uses it so that
    # the custom components added to ``nlp`` below do not run on glosses.
    nlp_wo_pipe = spacy.load('en_core_web_sm')
    matched_sents = []  # Collect data of matched sentences to be visualized
    #filename = 'test.txt'
    filename = None
    dataset = load_text_files(filename)
    prep = Preprocess()
    # Custom token attributes written by the merge_* components.
    Token.set_extension("grouping_type", default=None, force=True)
    Token.set_extension("list", default=None, force=True)
    # Nouns that denote a kind of place; read as a global by the components.
    place_lemmas = ['place', 'village', 'county', 'city', 'district', 'region', 'state', 'capital', 'country', 'continent', 'world']
    #print(feature_extraction('in'))
    """
    new_dataset = {}
    coref = neuralcoref.NeuralCoref(nlp.vocab)
    nlp.add_pipe(coref, name='neuralcoref', after='ner')
    for filename in dataset:
        new_dataset[filename] = []
        for paragraph in dataset[filename]:
            paragraph =  prep.remove_unwanted_character(paragraph).lstrip()
            doc = nlp(paragraph)
            new_dataset[filename].append(doc._.coref_resolved)
    nlp.remove_pipe("neuralcoref")
    """
    # Custom components run after NER, in this order.
    # NOTE(review): passing callables straight to add_pipe looks like the
    # spaCy v2 API - confirm the pinned spaCy version.
    nlp.add_pipe(merge_ents, name='merge_ents', after='ner')
    nlp.add_pipe(merge_siblings, name='merge_siblings', after='merge_ents')
    nlp.add_pipe(merge_ancestors, name='merge_ancestors', after='merge_siblings')
    nlp.add_pipe(extract_place_in_place, name='extract_place_in_place', after='merge_ancestors')
   
    #Praveen
    from spacy.pipeline import EntityRuler
    # NOTE(review): these WordNet features are computed but not used below.
    definition,synonyms,antonyms,hyper,hypo,mero,holo = feature_extraction('acquired')            
    # Rule-based entities that label purchase keywords as BUY.
    ruler = EntityRuler(nlp)
    buy_tags = ["acquire","purchase","buy","brought"]
    patterns = [{"label": "BUY", "pattern": "purchase"},{"label": "BUY", "pattern": "acquisition"},{"label": "BUY", "pattern": "buy"},{"label": "BUY", "pattern": "acquire"},{"label": "BUY", "pattern": "purchased"},{"label": "BUY", "pattern": "purchased by"},{"label": "BUY", "pattern": "acquired by"},{"label": "BUY", "pattern": "brought"},{"label": "BUY", "pattern": "acquired"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler, name='ruler', after='extract_place_in_place')
    #Praveen
   
    for filename in dataset:
        # ``result`` is the global that the insert_* helpers append to; it is
        # reset for every document.
        result  = {"document":filename, "extraction":[]}
        for paragraph in dataset[filename]:            
            paragraph = prep.sentence_tokenize(paragraph)
            for each_sent in paragraph:
                # NOTE(review): the preprocessing results below are not used
                # afterwards - confirm whether they are still needed.
                lem_sent = prep.lemmatize(each_sent)
                cleaned_sentence = prep.remove_stopwords(lem_sent)
                cleaned_sentence = prep.word_tokenize(cleaned_sentence)
                cleaned_sentence = prep.pos_tag(cleaned_sentence)
                # Running the pipeline records PART extractions as a side effect.
                sentence = nlp(each_sent)
                #Praveen
                # Substring test, so "acquire" also matches "acquired".
                # NOTE(review): a sentence containing several tags is processed
                # once per matching tag, which would record its BUY
                # extractions more than once - confirm this is intended.
                for tag in buy_tags:
                    if tag in each_sent:
                        sentence = merge_ents(sentence)
                        print(extract_all_named_entities(sentence))
                        #for ent in sentence.ents:
                            #print(ent.text,"-->", ent.label_)
                        displacy.render(sentence, style="ent",jupyter=True)
                        displacy.render(sentence, style="dep",jupyter=True)
                #Praveen
               
                #arr = [token for token in doc if token.ent_type_=='GPE']
                #if (len(arr)>1):
                #    print(doc)
                #print_token_attr(doc)
        #displacy.render(matched_sents, style="ent", manual=True)
        json_object = json.dumps(result, indent = 4)
        print(json_object)
        # Output name: input name with its last four characters (".txt")
        # replaced by ".json", written to the current working directory.
        with open(filename[:-4]+".json", "w") as outfile:
            outfile.write(json_object)
        #exit()
    # Restore the pipeline to its stock components.
    nlp.remove_pipe("merge_ents")
    nlp.remove_pipe("merge_siblings")
    nlp.remove_pipe("merge_ancestors")
    nlp.remove_pipe("extract_place_in_place")
    nlp.remove_pipe("ruler")
    print(time.time()-s)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shari\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shari\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shari\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\shari\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Entities [Amazon, acquired, Whole Foods Market, US$13.4 billion]
Final Template: [{'buyer': 'Amazon', 'item': 'Whole Foods Market', 'price': '', 'quantity': '', 'source': ''}]
Amazon bought  Whole Foods Market for  from 
[{'buyer': 'Amazon', 'item': 'Whole Foods Market', 'price': '', 'quantity': '', 'source': ''}]


Bellevue in Washington
{
    "document": "1.txt",
    "extraction": [
        {
            "template": "BUY",
            "sentences": [
                "In 2017, Amazon acquired Whole Foods Market for US$13.4 billion."
            ],
            "arguments": {
                "1": "Amazon",
                "2": "Whole Foods Market",
                "3": "",
                "4": "",
                "5": ""
            }
        },
        {
            "template": "PART",
            "sentences": [
                "Amazon was founded by Jeff Bezos in Bellevue, Washington, in July 1994."
            ],
            "arguments": {
                "1": "Bellevue",
                "2": "Washington"
            }
        }
    ]
}
Entities [Apple Inc., The Walt Disney Company's, acquisition, NeXT.In 2017, Amazon, acquired, Whole Foods Market, US$13.4 billion, Amazon]
Final Template: [{'buyer': "The Walt Disney Company's", 'item': 'NeXT.In 2017', 'price': '', 'quantity': '', 'source': ''}, {

{
    "document": "2.txt",
    "extraction": [
        {
            "template": "BUY",
            "sentences": [
                "He was the chairman, chief executive officer (CEO), and co-founder of Apple Inc.; chairman and majority shareholder of Pixar; a member of The Walt Disney Company's board of directors following its acquisition of Pixar; and the founder, chairman, and CEO of NeXT.In 2017, Amazon acquired Whole Foods Market for US$13.4 billion, which vastly increased Amazon's presence as a brick-and-mortar retailer."
            ],
            "arguments": {
                "1": "The Walt Disney Company's",
                "2": "NeXT.In 2017",
                "3": "",
                "4": "",
                "5": ""
            }
        },
        {
            "template": "BUY",
            "sentences": [
                "He was the chairman, chief executive officer (CEO), and co-founder of Apple Inc.; chairman and majority shareholder of Pixar; a member of The Walt Disney 

{
    "document": "4.txt",
    "extraction": [
        {
            "template": "BUY",
            "sentences": [
                "In 2001, he led Amazon to acquire NeXT, solving the desperately failed operating system strategy and bringing Bezos back."
            ],
            "arguments": {
                "1": "Amazon",
                "2": "NeXT",
                "3": "",
                "4": "",
                "5": ""
            }
        }
    ]
}
Entities [Lakeside School's, buy, General Electric, GE]
Final Template: [{'buyer': "Lakeside School's", 'item': 'General Electric', 'price': '', 'quantity': '', 'source': ''}]
Lakeside School's bought  General Electric for  from 
[{'buyer': "Lakeside School's", 'item': 'General Electric', 'price': '', 'quantity': '', 'source': ''}]


Entities [purchased, three, Cities Service Preferred]
Final Template: []
[]


Entities [Microsoft, acquire, the New York Times]
Final Template: [{'buyer': 'Microsoft', 'item': 'the New York Times', 'price': '', 'quantity': '', 'source': ''}]
Microsoft bought  the New York Times for  from 
[{'buyer': 'Microsoft', 'item': 'the New York Times', 'price': '', 'quantity': '', 'source': ''}]


Entities [SEC, Gates, Microsoft, purchase, Alamo Financial]
Final Template: [{'buyer': 'Microsoft', 'item': 'Alamo Financial', 'price': '', 'quantity': '', 'source': ''}]
Microsoft bought  Alamo Financial for  from 
[{'buyer': 'Microsoft', 'item': 'Alamo Financial', 'price': '', 'quantity': '', 'source': ''}]


Entities [brought]
Final Template: []
[]


Entities [Microsoft, acquired, the New York Evening News, $50.2 million]
Final Template: []
[]


{
    "document": "5.txt",
    "extraction": [
        {
            "template": "BUY",
            "sentences": [
                "When he was in the eighth grade, he used the proceeds from Lakeside School's rummage sale to buy a telescope and a General Electric (GE) computer for the students."
            ],
            "arguments": {
                "1": "Lakeside School's",
                "2": "General Electric",
                "3": "",
                "4": "",
                "5": ""
            }
        },
        {
            "template": "BUY",
            "sentences": [
                "In 1973, Microsoft began to acquire stock in the New York Times."
            ],
            "arguments": {
                "1": "Microsoft",
                "2": "the New York Times",
                "3": "",
                "4": "",
                "5": ""
            }
        },
        {
            "template": "BUY",
            "sentences": [
                "In 1974, the SEC opened a

In [None]:
import glob
import errno
import nltk
import spacy
from spacy import displacy
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import Tree

# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
import copy
import os
import string
from itertools import chain
from nltk.stem import PorterStemmer
# import en_core_web_sm
from spacy.pipeline import EntityRuler


def add_custom_named_entity(nlp):
    """Attach an EntityRuler carrying JOBTITLE phrase patterns to ``nlp``.

    Every string in ``job_titles`` becomes one ``{'label': 'JOBTITLE',
    'pattern': <title>}`` entry. The ruler is added with ``nlp.add_pipe`` and
    the same ``nlp`` object is returned.
    """
    # Order and duplicates (e.g. 'ceo' / 'CEO') are kept exactly as registered.
    job_titles = (
        'executive', 'actress', 'host', 'producer', 'philanthropist', 'queen',
        'barber', 'president', 'miner', 'city councilman', 'farmer', 'preacher',
        'maid', 'student', 'news anchor', 'critic', 'columnist', 'candidate',
        'author', 'housewife', 'judge', 'princess', 'personal trainer', 'reader',
        'model student', 'journalist', 'biographer', 'reporter', 'king', 'filmmaker',
        'editor', 'therapist', 'entertainer', 'ceo', 'senator', 'chairman',
        'politician', 'leader', 'pope', 'springer', 'professor', 'attorney',
        'governor', 'crown prince', 'teacher', 'premier', 'mayor', 'magician',
        'executive producer', 'magnate', 'vice president', 'oracle', 'founder',
        'congressman', 'stockbroker', 'bachelor', 'salesman', 'analyst', 'general',
        'janitor', 'boss', 'doctor', 'activist', 'owner', 'director', 'trader',
        'chief financial officer', 'publisher', 'companion', 'assistant coach',
        'mobile', 'manager', 'mediator', 'secretary of the treasury', 'actuary',
        'manufacturer', 'river', 'surveyor', 'lieutenant governor', 'commander',
        'envoy', 'lieutenant colonel', 'translator', 'captain', 'colonel',
        'commander colonel', 'brigadier general', 'major general', 'guard',
        'soldier', 'secretary', 'baron', 'chief of staff', 'major', 'admiral',
        'president general', 'coach', 'chancellor', 'administrator', 'merchant',
        'attorney general', 'secretary of state', 'secretary of war', 'diplomat',
        'chief justice', 'negotiator', 'minister', 'principal',
        'secretary of treasury', 'scholar', 'historian', 'lieutenant general',
        'speaker', 'reverend', 'architect', 'dentist', 'dancer', 'pastor',
        'creator', 'charter', 'entrepreneur', 'engineer', 'designer', 'co-founder',
        'co-chairman', 'model', 'pilot', 'sailor', 'commodore', 'guide',
        'chief executive officer', 'chief technology officer', 'astronaut',
        'scientist', 'gen.', 'geographer', 'emperor', 'theologian', 'marine',
        'count', 'teller', 'printer manufacturer', 'recorder', 'general manager',
        'salesmen', 'vendor', 'graphic designer', 'inventor',
        'secretary of housing and urban development',
        'secretary of transportation', 'referee', 'dealer', 'driver', 'collector',
        'vice-president', 'demonstrator', 'cell maker', 'private', 'spokesman',
        'buyer', 'cfo', 'managing director', 'chief executive', 'retailer',
        'printer', 'developer', 'processor', 'grip', 'chief operating officer',
        'assistant', 'layer', 'operator', 'header', 'writer', 'singer',
        'evangelist', 'executive director', 'general counsel', 'city manager',
        'physician', 'importer', 'explorer', 'empress', 'boxer',
        'general secretary', 'party leader', 'rubber', 'representative',
        'secretary of defense', 'buddha', 'prince', 'first mate',
        'director-general', 'fund manager', 'surgeon', 'cook', 'comptroller',
        'second', 'refiner', 'tanker', 'vice-chairman', 'president-elect',
        'executive chairman', 'constable', 'interim president', 'nobel laureate',
        'dean', 'tier', 'artist', 'landscape architect', 'consultant', 'chef',
        'vice chairman', 'superior', 'jeweler', 'specialist', 'broker',
        'strategist', 'treasury secretary', 'underwriter',
        'quality control supervisor', 'auditor', 'spokeswoman',
        'district attorney', 'principal author', 'treasurer', 'lobbyist',
        'deputy mayor', 'guru', 'communications director',
        'assistant attorney general', 'executive vice president',
        'chief compliance officer', 'lawyer', 'spokesperson', 'technician',
        'intelligence director', 'hacker', 'batman', 'astronomer', 'composer',
        'aerospace engineer', 'homemaker', 'marketing manager', 'businesswoman',
        'monk', 'drier', 'explorer captain', 'builder', 'sounder',
        'state treasurer', 'superintendent', 'governor general', 'prime minister',
        'chief minister', 'poet', 'novelist', 'indian activist', 'clerk',
        'sheikh', 'barrister', 'priest', 'landlady', 'magistrate',
        'police officer', 'saint', 'messiah', 'dictator', 'representative leader',
        'governor-general', 'marshal', 'philosopher', 'butcher', 'missionary',
        'sultan', 'interpreter', 'economist', 'physicist', 'musician',
        'custodian', 'investment banker', 'financier', 'patriarch',
        'secretary of commerce', 'secretary of labor', 'performer', 'legislator',
        'actor', 'cabinetmaker', 'carpenter', 'servant', 'ambassador',
        'chief of staff general', 'rep.', 'campaign manager', 'jurist',
        'whig activist', 'orderly', 'sociologist', 'marker', 'bishop', 'botanist',
        'sheriff', 'chief of police', 'firefighter', 'reliever', 'cartographer',
        'lt. col.', 'anthropologist', 'earl', 'minority leader', 'food critic',
        'playwright', 'cowboy', 'first lady', 'agriculture commissioner',
        'corporal', 'flyer', 'software engineer', 'navigator', 'businessman',
        'steward', 'comedian', 'grocer', 'crater', 'student activist',
        'machinist', 'hatter', 'babysitter', 'waitress', 'computer scientist',
        'tipper', 'hockey player', 'researcher', 'broadcaster', 'thinner',
        'CEO',
    )
    patterns = [{'label': 'JOBTITLE', 'pattern': title} for title in job_titles]
    ruler = EntityRuler(nlp)
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    return nlp


def read_all_files(directory=r'D:\IE_NLP_Project\dataset\Test'):
    """Read every ``*.txt`` file in ``directory`` and sentence-split it.

    As before, the process working directory is changed to ``directory``
    (callers may rely on that side effect). Files are decoded as ASCII with
    undecodable bytes dropped.

    Args:
        directory: Folder holding the text files; defaults to the path that
            used to be hard-coded here.

    Returns:
        A pair ``(docs, sent_tokens)``: the raw text of each file, and one flat
        list with the sentences of all files in glob order.
    """
    os.chdir(directory)
    docs = []
    sent_tokens = []
    for path in glob.glob('*.txt'):
        # The context manager closes the handle even if read() raises.
        with open(path, encoding="ascii", errors="ignore") as f:
            text = f.read()
        docs.append(text)
        sent_tokens.extend(sent_tokenize(text))
    return docs, sent_tokens


def read_single_file(filename, directory=r'C:\Users\shari\Documents\Projects\NLP\WikipediaArticles'):
    """Return the sentences of one text file as a list of strings.

    As before, the process working directory is changed to ``directory``
    before ``filename`` is opened. The file is decoded as ASCII with
    undecodable bytes dropped.

    Args:
        filename: File name, resolved relative to ``directory``.
        directory: Folder holding the articles; defaults to the path that used
            to be hard-coded here.
    """
    os.chdir(directory)
    # The context manager closes the handle even if read() raises.
    with open(filename, encoding="ascii", errors="ignore") as f:
        text = f.read()
    return list(sent_tokenize(text))


def read_single_file_table(file_name, directory=r'C:\Users\shari\Documents\Projects\NLP\WikipediaArticles'):
    """Sentence-split only the table-like lines of one text file.

    A line is kept when it starts with four spaces or contains a tab. Each
    kept line has its newline replaced by ``'.'`` so that it ends a sentence,
    and the concatenation is passed to ``sent_tokenize``.

    As before, the process working directory is changed to ``directory``.

    Args:
        file_name: File name, resolved relative to ``directory``.
        directory: Folder holding the articles; defaults to the path that used
            to be hard-coded here.
    """
    os.chdir(directory)
    # The context manager closes the handle even if reading raises.
    with open(file_name, encoding="ascii", errors="ignore") as f:
        table_lines = [
            line.replace('\n', '.')
            for line in f
            if line.startswith('    ') or '\t' in line
        ]
    return sent_tokenize(''.join(table_lines))


def read_all_files_table(directory=r'C:\Users\shari\Documents\Projects\NLP\WikipediaArticles'):
    """Sentence-split the table-like lines of every ``*.txt`` file.

    A line is kept when it starts with four spaces or contains a tab. Each
    kept line has its newline replaced by ``'.'``; the kept lines of one file
    are concatenated and passed to ``sent_tokenize``.

    As before, the process working directory is changed to ``directory``.

    Args:
        directory: Folder holding the articles; defaults to the path that used
            to be hard-coded here.

    Returns:
        One flat list with the sentences of all files in glob order.
    """
    os.chdir(directory)
    sent_tokens = []
    for path in glob.glob('*.txt'):
        with open(path, encoding="ascii", errors="ignore") as f:
            # Accumulate text as a string: the previous list accumulator made
            # `sentences + line...` raise TypeError on the first table line.
            sentences = ''.join(
                line.replace('\n', '.')
                for line in f
                if line.startswith('    ') or '\t' in line
            )
        sent_tokens.extend(sent_tokenize(sentences))
    return sent_tokens


def word_tokenization(sentences):
    """Tokenize each sentence into words with NLTK's ``word_tokenize``.

    No stop-word or punctuation filtering is applied.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A pair ``(word_tokens, word_tokens_list)``: all tokens in one flat
        list, and a list holding one token list per sentence.
    """
    word_tokens = []
    word_tokens_list = []
    for sentence in sentences:
        # Tokenize once and reuse the result for both outputs.
        tokens = word_tokenize(sentence)
        word_tokens.extend(tokens)
        word_tokens_list.append(tokens)
    return word_tokens, word_tokens_list


def word_lemmatization(words):
    """Return the WordNet lemma of every word, in input order."""
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(word) for word in words]


def word_stemmatization(words):
    """Return the Porter stem of every word, in input order."""
    stem = PorterStemmer().stem
    return [stem(word) for word in words]


def POS_tagging(words):
    """Return the result of ``nltk.pos_tag`` for the given token list."""
    return nltk.pos_tag(words)


def dependency_parsing(sentence):
    """Parse ``sentence`` with spaCy's ``en_core_web_sm`` model.

    Note that the model is loaded on every call.

    Returns:
        A triple ``(dependency_parsed_tree, rootOfSentence, doc)``:
        * one row per token, ``[dep_, head text, token text, pos_, tag_,
          ent_type_, ent_iob_]``;
        * the root text of the last sentence in the doc, or ``''`` when the
          doc has no sentences (this used to raise UnboundLocalError);
        * the parsed doc itself.
    """
    en_nlp = spacy.load('en_core_web_sm')
    doc = en_nlp(sentence)
    # Default for empty input; otherwise overwritten until the last sentence.
    rootOfSentence = ''
    for sent in doc.sents:
        rootOfSentence = sent.root.text
    dependency_parsed_tree = [
        [token.dep_, token.head.text, token.text, token.pos_, token.tag_,
         token.ent_type_, token.ent_iob_]
        for token in doc
    ]
    return dependency_parsed_tree, rootOfSentence, doc


def display_dependency_parsing(sentence):
    """Print an NLTK tree per sentence, then render the parse with displaCy.

    The sentence is parsed with spaCy's ``en_core_web_sm`` model, which is
    loaded on every call.
    """
    en_nlp = spacy.load('en_core_web_sm')
    doc = en_nlp(sentence)

    def to_nltk_tree(node):
        # Tokens with children become Tree nodes; leaves are plain strings.
        if node.n_lefts + node.n_rights > 0:
            return Tree(node.orth_, [to_nltk_tree(child) for child in node.children])
        return node.orth_

    for sent in doc.sents:
        tree = to_nltk_tree(sent.root)
        if isinstance(tree, Tree):
            tree.pretty_print()
        else:
            # A childless root comes back as a str, which has no
            # pretty_print(); print it directly instead of crashing.
            print(tree)
    displacy.render(doc, style='dep')


def wordnet_features(words):
    """Collect WordNet relations for every synset of every word.

    Returns:
        Five flat lists of lemma names, in this order: synonyms, hypernyms,
        hyponyms, part-meronyms and part-holonyms. Duplicates are not removed.
    """
    synonymns_list = []
    hypernyms_list = []
    hyponyms_list = []
    meronyms_list = []
    holonyms_list = []

    def lemma_names_of(related_synsets):
        # Flatten the lemma names of a list of synsets into one iterable.
        return chain.from_iterable(s.lemma_names() for s in related_synsets)

    for word in words:
        for synset in wn.synsets(word):
            # Use the synset directly rather than looking it up again by name.
            synonymns_list.extend(synset.lemma_names())
            hypernyms_list.extend(lemma_names_of(synset.hypernyms()))
            hyponyms_list.extend(lemma_names_of(synset.hyponyms()))
            meronyms_list.extend(lemma_names_of(synset.part_meronyms()))
            holonyms_list.extend(lemma_names_of(synset.part_holonyms()))
    return synonymns_list, hypernyms_list, hyponyms_list, meronyms_list, holonyms_list


def named_entity_recognition(sentence):
    """Run spaCy's ``en_core_web_sm`` model on ``sentence`` and list its entities.

    Returns:
        A triple ``(entities, entity_labels, entities_sent)``: entity texts,
        entity labels, and ``[label, text]`` pairs, all in document order.
    """
    en_nlp = spacy.load('en_core_web_sm')
    found = [(ent.text, ent.label_) for ent in en_nlp(sentence).ents]
    entities = [text for text, _ in found]
    entity_labels = [label for _, label in found]
    entities_sent = [[label, text] for text, label in found]
    return entities, entity_labels, entities_sent


def filter_spans(spans):
    """Filter a sequence of spans so that none of them overlap.

    Longer spans win over shorter ones; among equally long spans the one that
    starts first wins. For spaCy 2.1.4+ this function is available as
    ``spacy.util.filter_spans()``.

    Args:
        spans: Iterable of objects with integer ``start`` and ``end``
            attributes (``end`` exclusive).

    Returns:
        The accepted spans, sorted by ``start``.
    """
    ordered = sorted(
        spans,
        key=lambda span: (span.end - span.start, -span.start),
        reverse=True,
    )
    result = []
    seen_tokens = set()
    for span in ordered:
        # Check for end - 1 here because boundaries are inclusive
        if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
            result.append(span)
            # Only accepted spans claim their tokens. Marking rejected spans
            # too would discard later spans that overlap nothing kept.
            seen_tokens.update(range(span.start, span.end))
    return sorted(result, key=lambda span: span.start)


def extract_person_pos_relations(doc):
    """Collect (person, positions, orgs, location) tuples from a parsed doc.

    Returns ``[]`` unless the doc has at least one PERSON and one JOBTITLE
    entity. Otherwise entities and noun chunks are merged into single tokens
    (this mutates ``doc``) and the result list is filled in three passes:
    merged title chunks whose head is a PERSON, JOBTITLE tokens with a PERSON
    among their right children, and finally ``rules_for_extraction``.
    """
    labels = [ent.label_ for ent in doc.ents]
    if 'PERSON' not in labels or 'JOBTITLE' not in labels:
        return []

    relations = []

    # Texts of noun chunks containing a JOBTITLE, taken before merging.
    title_chunk_texts = []
    get_per_pos_chunks(doc, title_chunk_texts)

    # Merge entities and noun chunks into one token each.
    merged_spans = filter_spans(list(doc.ents) + list(doc.noun_chunks))
    with doc.retokenize() as retokenizer:
        for span in merged_spans:
            retokenizer.merge(span)

    # Pass 1: a merged title chunk attached to a PERSON head.
    chunk_pairs = [
        (token.head, token)
        for token in list(doc)
        for chunk_text in title_chunk_texts
        if token.text == chunk_text and token.head.ent_type_ == 'PERSON'
    ]
    add_work_to_final_list(chunk_pairs, relations)

    # Pass 2: a JOBTITLE token with a PERSON among its right children.
    title_first_pairs = []
    for title in [w for w in doc if w.ent_type_ == 'JOBTITLE']:
        people_to_right = [w for w in title.rights if w.ent_type_ == 'PERSON']
        if people_to_right:
            title_first_pairs.append((people_to_right[0], title))
    add_work_to_final_list(title_first_pairs, relations)

    # Pass 3: dependency-pattern rules.
    rules_for_extraction(doc, relations)
    return relations


def get_per_pos_chunks(doc, pos_final_texts):
    """Record the noun chunks that contain a JOBTITLE entity.

    The chunk text is appended to ``pos_final_texts`` once per JOBTITLE entity
    in that chunk. When a chunk also contains a PERSON entity, one line naming
    the last PERSON and the last JOBTITLE of the chunk is printed.
    """
    row_format = "{}\t{}\t\t\t{}\t{}\t\t\t{}\t{}\t\t\t{}\t{}\n"
    for chunk in list(doc.noun_chunks):
        chunk_ents = list(chunk.ents)
        titles = [ent for ent in chunk_ents if ent.label_ == 'JOBTITLE']
        people = [ent for ent in chunk_ents if ent.label_ == 'PERSON']
        pos_final_texts.extend(chunk.text for _ in titles)
        if people and titles:
            print(row_format.format('PERSON: ', people[-1].text, 'POSITION: ',
                                    titles[-1].text, 'ORG: ', '', 'LOCATION: ', ''))


def rules_for_extraction(doc, work_list):
    """Append one ``(person, positions, orgs, location)`` tuple per PERSON token.

    Rules applied to each PERSON token:
    * if the person is an ``nsubj``, the first ``attr`` child to the right of
      its head, plus that token's conjuncts, are positions;
    * if the person is a ``ROOT`` and the doc has more than one sentence, the
      same lookup is done on the root of the second sentence;
    * for every ``who`` token that is an ``nsubj``, the first ``pobj`` under
      an ``as`` preposition to the right of its head is a position;
    * for each position, the first ``prep`` child to the right contributes its
      first ORG child, or else its first GPE child.

    ``location`` is a single token when exactly one distinct GPE was found,
    otherwise the (possibly empty) list of GPE tokens. A tuple is appended
    even when no position was found.

    Args:
        doc: Parsed document whose tokens expose ``dep_``, ``ent_type_``,
            ``head``, ``rights``, ``conjuncts`` and ``text``.
        work_list: List that receives the result tuples (mutated in place).
    """
    for person in filter(lambda w: w.ent_type_ == 'PERSON', doc):
        pos_final = []
        org_final = []
        gpe_final = []

        position = []
        # Exact comparison: `dep_ in ('nsubj')` was a substring test against
        # the string 'nsubj', so an empty dep_ matched as well.
        if person.dep_ == 'nsubj':
            position = [w for w in person.head.rights if w.dep_ == 'attr']
        elif person.dep_ == 'ROOT':
            sentences = list(doc.sents)
            if len(sentences) > 1:
                position = [w for w in sentences[1].root.rights if w.dep_ == 'attr']
        if position:
            first_position = position[0]
            pos_final.append(first_position)
            pos_final.extend(first_position.conjuncts)

        for who in filter(lambda w: w.text.lower() == 'who', doc):
            if who.dep_ != 'nsubj':
                continue
            who_prep = [w for w in who.head.rights if w.dep_ == 'prep' and w.text == 'as']
            if who_prep:
                as_objects = [w for w in who_prep[0].rights if w.dep_ == 'pobj']
                if as_objects:
                    pos_final.append(as_objects[0])

        for pos in pos_final:
            pos_prep = [w for w in pos.rights if w.dep_ == 'prep']
            if not pos_prep:
                continue
            first_prep = pos_prep[0]
            # `.rights` is read afresh for each lookup; it may be a one-shot
            # iterator.
            orgs = [w for w in first_prep.rights if w.ent_type_ == 'ORG']
            if orgs:
                org_final.append(orgs[0])
            else:
                gpes = [w for w in first_prep.rights if w.ent_type_ == 'GPE']
                if gpes:
                    gpe_final.append(gpes[0])

        # Collapse to a single token only when exactly one distinct GPE exists.
        if len(set(gpe_final)) == 1:
            gpe_final = gpe_final[0]
        work_list.append((person, list(set(pos_final)), list(set(org_final)), gpe_final))


def add_work_to_final_list(pos_partial, work_list):
    """Turn ``(person, position)`` pairs into work tuples on ``work_list``.

    For each pair, the first ``prep`` child to the right of the position is
    inspected: its first ORG child is recorded, or else its first GPE child.
    The appended tuple is ``(person, [position], orgs, location)``, where
    ``location`` is the GPE token when one was found and ``[]`` otherwise.
    """
    for person, title in pos_partial:
        orgs = []
        gpes = []
        preps = [child for child in title.rights if child.dep_ == 'prep']
        if preps:
            first_prep = preps[0]
            first_org = next(
                (child for child in first_prep.rights if child.ent_type_ == 'ORG'),
                None,
            )
            if first_org is not None:
                orgs.append(first_org)
            # Same truthiness test as the original `if not org:` check.
            if not first_org:
                first_gpe = next(
                    (child for child in first_prep.rights if child.ent_type_ == 'GPE'),
                    None,
                )
                if first_gpe is not None:
                    gpes.append(first_gpe)
        location = gpes[0] if len(list(set(gpes))) == 1 else gpes
        work_list.append((person, [title], list(set(orgs)), location))


def extract_person_pos_relations_table(doc):
    """Extract work tuples from table-style text.

    Entities and noun chunks are first merged into single tokens (this mutates
    ``doc``). For every PERSON token that is a ``ROOT``, its first ``attr`` or
    ``appos`` child to the right and that child's conjuncts are taken as
    positions, keeping only tokens without an entity type. Every GPE token in
    the doc is reported in the organisation slot of each tuple.

    Returns:
        A list of ``(person, positions, orgs, location)`` tuples, one per
        PERSON token. ``location`` is always ``[]`` here.
    """
    # Merge entities and noun chunks into one token each.
    spans = filter_spans(list(doc.ents) + list(doc.noun_chunks))
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)

    # GPE tokens fill the organisation slot, as the original loop did.
    gpe_tokens = [w for w in doc if w.ent_type_ == 'GPE']

    work_list = []
    for person in filter(lambda w: w.ent_type_ == 'PERSON', doc):
        pos_final = []
        # Previously never initialised, so the function raised
        # UnboundLocalError for the first PERSON. Nothing is collected into
        # it, so the location slot stays empty.
        gpe_final = []
        if person.dep_ == 'ROOT':
            position = [w for w in person.rights if w.dep_ in ('attr', 'appos')]
            if position:
                first_position = position[0]
                if first_position.ent_type_ == '':
                    pos_final.append(first_position)
                pos_final.extend(
                    conj for conj in first_position.conjuncts if conj.ent_type_ == ''
                )
        work_list.append((person, list(set(pos_final)), list(set(gpe_tokens)), gpe_final))
    return work_list


def display_work_list(work_list):
    """Print each extracted relation that has at least one position.

    For every qualifying entry, the full text of the person's document is
    printed, followed by a tab-separated PERSON / POSITION / ORG / LOCATION
    line.

    Args:
        work_list: iterable of ``(person, pos_list, org_list, gpe)`` tuples.
            ``person`` must expose ``.text`` and ``.doc.text``; items of
            ``pos_list`` and ``org_list`` must expose ``.text``. ``gpe`` may
            be a single object with ``.text``, a list/tuple of such objects
            (joined with ``'; '``), or any falsy value for "no location".
    """
    for person, pos_list, org_list, gpe in work_list:
        # Entries without a position are not reported at all.
        if not pos_list:
            continue

        pos_str = ''.join(pos.text + '; ' for pos in pos_list)
        org_str = ''.join(org.text + '; ' for org in org_list)

        # The extractor only collapses the location to a single token when
        # exactly one distinct GPE was found; otherwise it stays a list,
        # which has no `.text` attribute and used to crash here.
        if isinstance(gpe, (list, tuple)):
            gpe_str = '; '.join(loc.text for loc in gpe)
        elif gpe:
            gpe_str = gpe.text
        else:
            gpe_str = ''

        print("{}".format(person.doc.text))
        print("{}\t{}\t\t\t{}\t{}\t\t\t{}\t{}\t\t\t{}\t{}\n".format('PERSON: ',person.text,'POSITION: ', pos_str,'ORG: ', org_str, 'LOCATION: ', gpe_str))


def display_per_org_sen_list(doc):
    """Print and return the doc's text if it mentions a person and an org.

    Args:
        doc: an iterable of tokens exposing ``ent_type_``, with a ``text``
            attribute holding the full sentence.

    Returns:
        ``[doc.text]`` when at least one token is tagged ``PERSON`` and at
        least one is tagged ``ORG``; otherwise an empty list.
    """
    # Gather every entity type that occurs anywhere in the doc.
    seen_labels = {token.ent_type_ for token in doc}

    matched = []
    if 'PERSON' in seen_labels and 'ORG' in seen_labels:
        matched.append(doc.text)

    for sentence in matched:
        print(sentence)
    return matched


def display_per_pos_sent(doc):
    """Print and return the doc's text if it has a PERSON and a JOBTITLE entity.

    When several entities share a label, the last one in ``doc.ents`` is the
    one reported.

    Args:
        doc: object exposing ``ents`` (entities with ``label_`` and ``text``)
            and ``text``.

    Returns:
        ``doc.text`` when both entity kinds are present, otherwise ``None``.
    """
    # Remember the most recent entity seen for each label.
    last_by_label = {}
    for entity in doc.ents:
        last_by_label[entity.label_] = entity

    if 'PERSON' not in last_by_label or 'JOBTITLE' not in last_by_label:
        return None

    person_ent = last_by_label['PERSON']
    job_ent = last_by_label['JOBTITLE']
    print(doc.text)
    print('{}: {}\t\t\t{}: {}\n'.format(
        person_ent.label_, person_ent.text, job_ent.label_, job_ent.text))
    return doc.text


# ---------------------------------------------------------------------------
# Driver script: runs the person/position extraction over one input file,
# first on its regular sentences and then on its table-style sentences.
# The read_*, add_custom_named_entity, extract_person_pos_relations and
# display_dependency_parsing helpers are defined elsewhere in this file.
# ---------------------------------------------------------------------------

# Name of the input file handed to both reader helpers below.
TEXT_FILE_NAME = '1.txt'
# presumably returns an iterable of sentence strings - verify against
# read_single_file; each item is fed to nlp() further down.
sent_tokens = read_single_file(TEXT_FILE_NAME)
print("sent_tokens",sent_tokens)

# Disabled earlier pipeline (tokenization, POS tagging, WordNet features,
# dependency parsing, NER), kept commented out for reference.
# docs, sent_tokens = read_all_files()
# word_tokens, work_tokens_list = word_tokenization(sent_tokens)
# pos_tagged_sent_tokens = []
# for words in work_tokens_list:
#     pos_tagged_words = POS_tagging(words)
#     pos_tagged_sent_tokens.append(pos_tagged_words)
#
# synonymns_list, hypernyms_list, hyponyms_list, meronyms_list, holonyms_list = wordnet_features(word_tokens)
#
# dependency_parsed_tree_list = []
# entities_sent_list = []
# for sent_token in sent_tokens:
#     dependency_parsed_tree = dependency_parsing(sent_token)
#     dependency_parsed_tree_list.append(dependency_parsed_tree)
#     entities, entity_labels, entities_sent = named_entity_recognition(sent_token)
#     entities_sent_list.append(entities_sent)
# print('None')

# Load the spaCy "en_core_web_sm" pipeline and pass it through
# add_custom_named_entity, whose return value replaces `nlp`.
nlp = spacy.load("en_core_web_sm")
nlp = add_custom_named_entity(nlp)
# NOTE(review): per_pos_sent_list is never used in the visible part of the
# script - confirm whether it can be removed.
per_pos_sent_list = []

# Pass 1: parse each sentence and print the extracted relations.
for sent_token in sent_tokens:
    doc = nlp(sent_token)
    work_list = extract_person_pos_relations(doc)
    display_work_list(work_list)

# Pass 2: re-read the same file through the table reader (this rebinds
# sent_tokens), echo every item, then run the table-specific extraction.
sent_tokens = read_single_file_table(TEXT_FILE_NAME)
# sent_tokens = read_all_files_table()
for sent_token in sent_tokens:
    print(sent_token)
for sent_token in sent_tokens:
    doc = nlp(sent_token)
    # Note: receives the raw string, not the parsed doc.
    display_dependency_parsing(sent_token)
    work_list = extract_person_pos_relations_table(doc)
    display_work_list(work_list)
    # Disabled alternative reports:
    # display_per_pos_sent(doc)
    # per_org = display_per_org_sen_list(doc)
    # per_org_list.extend(per_org)