In [1]:
import os
import pickle
import re
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
def getTime():
    """Return the current UTC date as a compact ``YYYYMMDD`` string."""
    utc_now = time.gmtime()
    return time.strftime("%Y%m%d", utc_now)
# env: show the run date and the current working directory, one per line
print(getTime(), os.getcwd(), sep='\n')

20190109
C:\Github\training_2\NLP\03_NER


# NER and Information Extraction (IE) using SpaCy
[Here is a reading list of subjects](https://pubweb.eng.utah.edu/~cs6390/schedule.html)

In [3]:
import spacy
from spacy import displacy
import en_core_web_sm
nlp = en_core_web_sm.load()

## Noun or Entity triplets?

 - 1st order: Departures > at > Heathrow.
 - 2nd order: Drones > stopped > departures.
 - 3rd order: Drones > at > Heathrow.

In [4]:
# Sample text (three short sentences), parsed into a spaCy Doc named `doc`.
airport_text = (
    "Flights at Heathrow were stopped by Drones. "
    "Reported runners in the big park. "
    "It is London busiest airport."
)  # alternative last sentence tried earlier: " Heathrow is London busiest airport"
doc = nlp(airport_text)
# Render the entity annotations inline in the notebook.
displacy.render(doc, style='ent', jupyter=True)

In [5]:
displacy.render(doc, jupyter=True, style='dep')

In [6]:
# # Word, POS and type
# for w in doc:
#     print("'{}'".format(w), w.pos_ , w.dep_,)

In [7]:
def tokenDeps(token):
    """Return the token's direct dependents: its lefts first, then its rights."""
    return [*token.lefts, *token.rights]

In [8]:
# noun to noun phrases
NOUNS = ['NOUN','PROPN']
for w in doc:
    subjects = [w for w in doc if w.pos_ in NOUNS]
subjects


for subj in subjects:
    deps = tokenDeps(subj)
    if len(deps) > 0: # if subject has dependancies
        for dep in deps:
            moreSubjects = [x for x in tokenDeps(dep) if x.pos_ in NOUNS]
            if len(moreSubjects)>0:
                print((subj,dep,moreSubjects[0]))

(Flights, at, Heathrow)
(runners, in, park)


In [9]:
# `token.subtree` yields tokens rather than a `Span`. Slicing the doc from the
# token's `.left_edge` to its `.right_edge` gives the subtree as a `Span`,
# which is handier: you can easily get a vector, merge it, etc.
CLAUSE_DEPS = ('xcomp', 'ccomp', 'pobj')
for token in doc:
    if token.dep_ not in CLAUSE_DEPS:
        continue
    first, last = token.left_edge.i, token.right_edge.i
    subtree_span = doc[first : last + 1]
    print(subtree_span.text, '|', subtree_span.root.text, '|', token.dep_, list(token.children))

# Another option: pick a head, choose start and end tokens by walking along
# its children, then use the `.left_edge` / `.right_edge` of those tokens to
# calculate the span.

Heathrow | Heathrow | pobj []
Drones | Drones | pobj []
the big park | park | pobj [the, big]
London busiest airport | busiest | ccomp [London, airport]


In [10]:
# For each noun chunk print: the head of its root token, the root token
# itself, and the root token's dependency label.
for chunk in doc.noun_chunks:
    root = chunk.root
    print(root.head.text, root.text, root.dep_)

stopped Flights nsubjpass
at Heathrow pobj
by Drones pobj
runners runners ROOT
in park pobj
is It nsubj
busiest airport dobj


# [Subject, Verb, Objects](https://github.com/NSchrading/intro-spacy-nlp/blob/master/subject_object_extraction.py)

In [11]:
# find subjects or entities in text
entities = ['ORG','FAC','PERSON']
SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"]
NOUNS = ['NOUN','PROPN']

In [12]:
# 1. find verbs
verbs = [tok for tok in doc if tok.pos_ == "VERB" and tok.dep_ != "aux"]
verbs

[were, stopped, Reported, is]

In [None]:
# 2. for each verb, print its noun / subject dependents ('1' lines), then
#    follow each of those dependents one level further to reach a related
#    noun ('3' lines).
for verb in verbs:
    for dep in tokenDeps(verb):
        if dep.pos_ not in NOUNS and dep.dep_ not in SUBJECTS:
            continue
        print('1', dep.text, verb)
        for d in tokenDeps(dep):
            nouns = [x for x in tokenDeps(d) if x.pos_ in NOUNS]
            # Guard before indexing: a dependent with no noun children gives
            # an empty list, and the previous unconditional `[0]` raised
            # IndexError and aborted the cell.
            if nouns:
                print('3', dep, verb, d, nouns[0])
                                

In [15]:
# Second sample text (money amounts); note this replaces the previous `doc`.
finance_text = (
    'Net income was $9.4 million compared to the prior year of $2.7 million. '
    'Revenue exceeded twelve billion dollars, with a loss of $1b.'
)
doc = nlp(finance_text)
displacy.render(doc, style='dep', jupyter=True)

In [16]:
def extract_currency_relations(doc):
    """Pair every MONEY token in ``doc`` with the token it relates to.

    Entities and noun chunks are first merged in place, so ``doc`` is
    modified by this call. Two patterns are then recognised:

    * the money token is an ``attr`` / ``dobj`` of its head: it is paired
      with the first ``nsubj`` found among the head's left children (skipped
      when there is none);
    * the money token is the ``pobj`` of a ``prep``: it is paired with the
      preposition's own head.

    Returns a list of ``(related_token, money_token)`` tuples in document
    order.
    """
    # Merge entities and noun chunks into one token each.
    # NOTE(review): `Span.merge` was deprecated in later spaCy releases in
    # favour of `Doc.retokenize` -- confirm against the installed version.
    for span in [*doc.ents, *doc.noun_chunks]:
        span.merge()

    relations = []
    for money in doc:
        if money.ent_type_ != 'MONEY':
            continue
        if money.dep_ in ('attr', 'dobj'):
            nsubjs = [w for w in money.head.lefts if w.dep_ == 'nsubj']
            if nsubjs:
                relations.append((nsubjs[0], money))
        elif money.dep_ == 'pobj' and money.head.dep_ == 'prep':
            relations.append((money.head.head, money))
    return relations

In [31]:
# Same two rules as extract_currency_relations, printed inline and prefixed
# with the number of the rule that fired.
# NOTE(review): the recorded output shows multi-word tokens such as
# "Net income", which suggests `doc` had already been merged (e.g. by calling
# extract_currency_relations(doc)) before this cell ran -- confirm.
for money in doc:
    if money.ent_type_ != 'MONEY':
        continue
    if money.dep_ in ('attr', 'dobj'):
        # rule 1: money is attr/dobj -> list the nsubj tokens left of its head
        subject = [w for w in money.head.lefts if w.dep_ == 'nsubj']
        print('1', money, subject)
    elif money.dep_ == 'pobj' and money.head.dep_ == 'prep':
        # rule 2: money is the object of a preposition -> show the preposition's head
        print('2', money.head.head, money)

1 $9.4 million [Net income]
2 the prior year $2.7 million
1 twelve billion dollars [Revenue]
2 a loss 1b



## [SVO package](https://nicschrading.com/project/Intro-to-NLP-with-spaCy/)
[github](https://github.com/NSchrading/intro-spacy-nlp/blob/master/subject_object_extraction.py)

In [14]:
#from nltk.stem.wordnet import WordNetLemmatizer
#from spacy.en import English

# dependency labels treated as subjects
SUBJECTS = [
    "nsubj",
    "nsubjpass",
    "csubj",
    "csubjpass",
    "agent",
    "expl",
]
# dependency labels treated as objects
OBJECTS = [
    "dobj",
    "dative",
    "attr",
    "oprd",
]

def getSubsFromConjunctions(subs):
    """Collect additional subjects joined to ``subs`` by "and".

    For each token in ``subs`` whose right children include the word "and",
    the right children that carry a SUBJECTS dependency label or are tagged
    NOUN are added; the accumulated list is then expanded recursively.

    Returns the list of extra tokens (the input tokens are not included).
    """
    moreSubs = []
    for sub in subs:
        # rights is a generator, so materialise it before scanning it twice
        rights = list(sub.rights)
        if not any(tok.lower_ == "and" for tok in rights):
            continue
        for tok in rights:
            if tok.dep_ in SUBJECTS or tok.pos_ == "NOUN":
                moreSubs.append(tok)
        if moreSubs:
            moreSubs.extend(getSubsFromConjunctions(moreSubs))
    return moreSubs

def getObjsFromConjunctions(objs):
    """Collect additional objects joined to ``objs`` by "and".

    For each token in ``objs`` whose right children include the word "and",
    the right children that carry an OBJECTS dependency label or are tagged
    NOUN are added; the accumulated list is then expanded recursively.

    Returns the list of extra tokens (the input tokens are not included).
    """
    moreObjs = []
    for obj in objs:
        # rights is a generator, so materialise it before scanning it twice
        rights = list(obj.rights)
        if not any(tok.lower_ == "and" for tok in rights):
            continue
        for tok in rights:
            if tok.dep_ in OBJECTS or tok.pos_ == "NOUN":
                moreObjs.append(tok)
        if moreObjs:
            moreObjs.extend(getObjsFromConjunctions(moreObjs))
    return moreObjs

def getVerbsFromConjunctions(verbs):
    """Collect additional verbs joined to ``verbs`` by "and".

    For each token in ``verbs`` whose right children include the word "and",
    the right children tagged VERB are added; the accumulated list is then
    expanded recursively.

    Returns the list of extra tokens (the input tokens are not included).
    """
    moreVerbs = []
    for verb in verbs:
        rights = list(verb.rights)
        if not any(tok.lower_ == "and" for tok in rights):
            continue
        for tok in rights:
            if tok.pos_ == "VERB":
                moreVerbs.append(tok)
        if moreVerbs:
            moreVerbs.extend(getVerbsFromConjunctions(moreVerbs))
    return moreVerbs

def findSubs(tok):
    """Find subjects for ``tok`` by climbing the dependency tree.

    Walks up from ``tok.head`` until a VERB, a NOUN, or the root (a token
    that is its own head) is reached.

    * VERB: its left children with a SUBJECTS dependency label are the
      subjects (plus any joined to them by "and"); if it has none and is not
      the root, the search continues from that verb.
    * NOUN: the noun itself is returned as the subject.

    Returns ``(subjects, negated)``: a list of tokens and a bool. ``negated``
    comes from ``isNegated`` on the verb that supplied the subjects, or on
    ``tok`` in the NOUN case. ``([], False)`` means nothing was found.
    """
    head = tok.head
    # Climb until we hit a verb, a noun, or the root (whose head is itself).
    while head.pos_ != "VERB" and head.pos_ != "NOUN" and head.head != head:
        head = head.head
    if head.pos_ == "VERB":
        # Fix: this used to test `dep_ == "SUB"`. That literal is not one of
        # the SUBJECTS labels used everywhere else in this file and is not a
        # label spaCy's English parser emits, so the branch could never
        # match and a governing verb's subject was never inherited.
        subs = [left for left in head.lefts if left.dep_ in SUBJECTS]
        if subs:
            verbNegated = isNegated(head)
            subs.extend(getSubsFromConjunctions(subs))
            return subs, verbNegated
        if head.head != head:
            # No subject on this verb: keep climbing from it.
            return findSubs(head)
    elif head.pos_ == "NOUN":
        return [head], isNegated(tok)
    return [], False

def isNegated(tok):
    """Return True when a direct left or right child of ``tok`` is a negation word."""
    negations = {"no", "not", "n't", "never", "none"}
    children = [*tok.lefts, *tok.rights]
    return any(child.lower_ in negations for child in children)

def findSVs(tokens):
    """Return ``(subject_text, verb_text)`` pairs for the VERB tokens in ``tokens``.

    Subjects come from ``getAllSubs``; the verb text is prefixed with "!"
    when ``getAllSubs`` reports the verb as negated. Verbs without any
    subject contribute nothing.
    """
    svs = []
    for v in tokens:
        if v.pos_ != "VERB":
            continue
        subs, verbNegated = getAllSubs(v)
        verbText = "!" + v.orth_ if verbNegated else v.orth_
        svs.extend((sub.orth_, verbText) for sub in subs)
    return svs

def getObjsFromPrepositions(deps):
    """Collect objects hanging off the prepositions found in ``deps``.

    A token in ``deps`` counts as a preposition when it is tagged ADP with
    dependency label "prep". Its right children are kept when they carry an
    OBJECTS label or are the pronoun "me".
    """
    objs = []
    for dep in deps:
        if not (dep.pos_ == "ADP" and dep.dep_ == "prep"):
            continue
        for tok in dep.rights:
            isPronounMe = tok.pos_ == "PRON" and tok.lower_ == "me"
            if tok.dep_ in OBJECTS or isPronounMe:
                objs.append(tok)
    return objs

def getObjsFromAttrs(deps):
    """Look for a verb under a noun attribute and return it with its objects.

    Scans ``deps`` for NOUN tokens with dependency label "attr"; for each
    VERB among such a noun's right children, gathers the verb's direct and
    prepositional objects. The first verb with at least one object wins.

    Returns ``(verb, objects)``, or ``(None, None)`` when nothing qualifies.
    """
    for dep in deps:
        if dep.pos_ != "NOUN" or dep.dep_ != "attr":
            continue
        for v in dep.rights:
            if v.pos_ != "VERB":
                continue
            rights = list(v.rights)
            objs = [tok for tok in rights if tok.dep_ in OBJECTS]
            objs.extend(getObjsFromPrepositions(rights))
            if objs:
                return v, objs
    return None, None

def getObjFromXComp(deps):
    """Return the first "xcomp" verb in ``deps`` that has objects, with those objects.

    For each VERB token with dependency label "xcomp", gathers its direct
    and prepositional objects from its right children.

    Returns ``(verb, objects)``, or ``(None, None)`` when nothing qualifies.
    """
    for v in deps:
        if v.pos_ != "VERB" or v.dep_ != "xcomp":
            continue
        rights = list(v.rights)
        objs = [tok for tok in rights if tok.dep_ in OBJECTS]
        objs.extend(getObjsFromPrepositions(rights))
        if objs:
            return v, objs
    return None, None

def getAllSubs(v):
    """Return ``(subjects, negated)`` for verb ``v``.

    Direct subjects are the verb's left children with a SUBJECTS label that
    are not determiners, plus any subjects joined to them by "and"; in that
    case ``negated`` is ``isNegated(v)``. When the verb has no direct
    subject, both values come from ``findSubs(v)`` instead.
    """
    subs = [tok for tok in v.lefts if tok.dep_ in SUBJECTS and tok.pos_ != "DET"]
    if subs:
        subs.extend(getSubsFromConjunctions(subs))
        return subs, isNegated(v)
    # No direct subject: fall back to searching up the tree.
    foundSubs, verbNegated = findSubs(v)
    return list(foundSubs), verbNegated

def getAllObjs(v):
    """Return ``(verb, objects)`` for verb ``v``.

    Objects are the verb's right children with an OBJECTS label plus objects
    of its prepositions. If an "xcomp" verb with objects is found among the
    right children, its objects are added and that verb is returned in place
    of ``v``. Objects joined by "and" are appended last.
    """
    # rights is a generator, so materialise it once for the scans below
    rights = list(v.rights)
    objs = [tok for tok in rights if tok.dep_ in OBJECTS] + getObjsFromPrepositions(rights)

    # Disabled alternative kept for reference:
    #potentialNewVerb, potentialNewObjs = getObjsFromAttrs(rights)
    #if potentialNewVerb is not None and potentialNewObjs is not None and len(potentialNewObjs) > 0:
    #    objs.extend(potentialNewObjs)
    #    v = potentialNewVerb

    xcompVerb, xcompObjs = getObjFromXComp(rights)
    if xcompVerb is not None and xcompObjs:
        objs.extend(xcompObjs)
        v = xcompVerb
    if objs:
        objs.extend(getObjsFromConjunctions(objs))
    return v, objs

def findSVOs(tokens):
    """Return lower-cased ``(subject, verb, object)`` triples found in ``tokens``.

    Only VERB tokens whose dependency label is not "aux" are considered. The
    verb text is prefixed with "!" when either the verb (as reported by
    ``getAllSubs``) or the object is negated.
    """
    svos = []
    for verb in tokens:
        if verb.pos_ != "VERB" or verb.dep_ == "aux":
            continue
        subs, verbNegated = getAllSubs(verb)
        if not subs:
            # no subject, so nothing to report for this verb
            continue
        # getAllObjs may substitute an xcomp verb for the original one
        verb, objs = getAllObjs(verb)
        for sub in subs:
            for obj in objs:
                negated = verbNegated or isNegated(obj)
                verbText = "!" + verb.lower_ if negated else verb.lower_
                svos.append((sub.lower_, verbText, obj.lower_))
    return svos

def getAbuserOntoVictimSVOs(tokens):
    """Return lemmatised SVO triples whose subject is an abuser word and object a victim word.

    Triples come from ``findSVOs``. Subject, verb and object are lemmatised
    (the verb with part of speech 'v'), keeping any leading "!" marker, and a
    triple is kept when the subject lemma is in one of the abuser sets and
    the object lemma is in the victim set.

    NOTE(review): ``WordNetLemmatizer`` has no active import in the visible
    source (the nltk import line is commented out), so calling this function
    would raise NameError unless it is imported elsewhere -- confirm.
    """
    maleAbuser = {'he', 'boyfriend', 'bf', 'father', 'dad', 'husband', 'brother', 'man'}
    femaleAbuser = {'she', 'girlfriend', 'gf', 'mother', 'mom', 'wife', 'sister', 'woman'}
    neutralAbuser = {'pastor', 'abuser', 'offender', 'ex', 'x', 'lover', 'church', 'they'}
    victim = {'me', 'sister', 'brother', 'child', 'kid', 'baby', 'friend', 'her', 'him', 'man', 'woman'}
    abusers = maleAbuser | femaleAbuser | neutralAbuser

    svos = findSVOs(tokens)
    wnl = WordNetLemmatizer()

    def lemmatizeKeepingMarker(word, *pos):
        # Lemmatise the word itself, re-attaching a leading "!" if present.
        if word[0] == "!":
            return "!" + wnl.lemmatize(word[1:], *pos)
        return wnl.lemmatize(word, *pos)

    passed = []
    for s, v, o in svos:
        s = wnl.lemmatize(s)
        v = lemmatizeKeepingMarker(v, 'v')
        o = lemmatizeKeepingMarker(o)
        if s in abusers and o in victim:
            passed.append((s, v, o))
    return passed

def printDeps(toks):
    """Print one line per token: text, dependency label, POS tag, head text, left children, right children."""
    for tok in toks:
        leftWords = [t.orth_ for t in tok.lefts]
        rightWords = [t.orth_ for t in tok.rights]
        print(tok.orth_, tok.dep_, tok.pos_, tok.head.orth_, leftWords, rightWords)


# Entity 2 Entity Relations
# [Train your own entity on NER](https://spacy.io/usage/training#ner)
 - https://github.com/explosion/spaCy/issues/2183
 - [train your own recipe](https://support.prodi.gy/t/is-there-any-recipes-to-train-a-relation-extraction-model/182/2)
 - https://github.com/explosion/spaCy/blob/master/examples/training/train_intent_parser.py