## Information Extraction Practice Notebook


#### Aims of the notebook

> - Create a function that can accurately extract the subject of a sentence
> - Consider problems with pronoun references such as 'He'

------

### Imports

In [1]:
import spacy
import pandas as pd

#### Load spaCy Model

In [131]:
# Load spaCy's small English pipeline (the `en_core_web_sm` package must be installed).
# `nlp` is reused by every later cell that parses text.
nlp = spacy.load("en_core_web_sm")

#### Declare Text Example

In [184]:
# Example sentence used by the spaCy parse, the entity listing and the regex chunker below.
text = "An eighth straight Bundesliga title has now loomed sharply into view for Bayern Munich thanks to Kimmich's chip evading Burki's desperate but doomed grasp."



### Separate into Clauses

##### Define a function to split a sentence into clauses based on an NLTK tree

In [185]:
from nltk import Tree

In [243]:
# Function to convert string into tree
# taken from Stack overflow post by Christos Baziotis Aug 23 '16
# https://stackoverflow.com/questions/36610179/how-to-get-the-dependency-tree-with-spacy

def to_nltk_tree(node):
    """Recursively convert a spaCy dependency subtree into an NLTK tree.

    Parameters
    ----------
    node : spaCy token (or any object with the same attributes)
        Must expose ``n_lefts``, ``n_rights``, ``orth_`` and ``children``.

    Returns
    -------
    nltk.Tree or str
        A ``Tree`` labelled with the token text whose children are the
        converted dependents, or the bare token text when the token has no
        dependents.
    """
    # Tokens without dependents stay plain strings (leaves of the tree).
    if node.n_lefts + node.n_rights == 0:
        return node.orth_
    # `Tree` is the class brought in by the notebook's `from nltk import Tree` cell.
    return Tree(node.orth_, [to_nltk_tree(child) for child in node.children])

In [244]:
def find_root(docu):
    """Return the first token of ``docu`` whose ``head`` is the token itself.

    Parameters
    ----------
    docu : iterable of tokens
        Each item must expose a ``head`` attribute.

    Returns
    -------
    token or None
        The first self-headed token in iteration order, or ``None`` when no
        token satisfies the condition (including an empty input).
    """
    self_headed = (candidate for candidate in docu if candidate.head is candidate)
    return next(self_headed, None)

#### Create a spaCy Doc object

In [245]:
# Run the spaCy pipeline over the example sentence; `doc` is reused by the cells below.
doc = nlp(text)

In [306]:
# `noun_chunks` is a generator (see the output below), so this cell does not show the
# chunks themselves; wrap it in list(...) to inspect them.
doc.noun_chunks

generator

In [246]:
# One converted structure per sentence, starting from each sentence's root token.
trees = [to_nltk_tree(sent.root) for sent in doc.sents]

### Method with regex parser

In [250]:
from nltk import pos_tag,RegexpParser

In [270]:
# NOTE(review): `sent` is assigned but not used in the visible cells; the tokens below
# are taken from `text` (the Bundesliga example), which matches the recorded parse output.
sent = 'Many tourists visit the Cardiff Castle when there are tours and historical visits'
# Whitespace splitting keeps punctuation attached to words (e.g. "grasp." in the output).
toks = text.split()

In [271]:
# Part-of-speech tag the whitespace tokens; the parse output below shows the resulting
# word/tag pairs (e.g. An/DT).
tagged = pos_tag(toks)

In [366]:
# Single chunk rule: a lazy run of one or more tags of any kind, then an NN or NNP tag,
# then another lazy run of any tags, then an optional trailing NN or NNP tag.
grammar = "CHUNK: {<.*>+?<NN|NNP><.*>+?<NN|NNP>?}" 
# Chunk parser built from the rule above.
cp = RegexpParser(grammar)

In [367]:
# Apply the chunk grammar to the tagged tokens; the result is the tree printed below.
parsed = cp.parse(tagged)

In [368]:
# Print every subtree labelled "S" or "SBAR".
# NOTE(review): the grammar above only defines CHUNK, and the recorded output shows just
# the root "S" tree - confirm whether CHUNK subtrees were the intended target.
for subtree in parsed.subtrees():
    if subtree.label() in ("S", "SBAR"):
        print(subtree)



(S
  (CHUNK An/DT eighth/JJ straight/NN Bundesliga/NNP title/NN)
  (CHUNK
    has/VBZ
    now/RB
    loomed/VBN
    sharply/RB
    into/IN
    view/NN
    for/IN
    Bayern/NNP)
  (CHUNK Munich/NNP thanks/NNS to/TO Kimmich's/NNP chip/NN)
  (CHUNK evading/VBG Burki's/NNP desperate/NN)
  but/CC
  doomed/VBD
  grasp./NNS)


#### Loop to evaluate sentence

In [376]:
# Second example sentence, used to inspect spaCy's dependency and POS labels token by token.
test = "Robert Lewandowski's first-half penalty and Benjamin Pavard's second-half header pushed Die Roten back four points clear at the top of the Bundesliga table"

In [377]:
# Parse the second example sentence with the spaCy pipeline.
test_doc = nlp(test)

In [379]:
# Show each token with its dependency label and coarse part-of-speech tag.
for tok in test_doc:
    print(tok.text, tok.dep_, tok.pos_, sep=" --> ")

Robert --> compound --> PROPN
Lewandowski --> poss --> PROPN
's --> case --> PART
first --> amod --> ADJ
- --> punct --> PUNCT
half --> compound --> NOUN
penalty --> nsubj --> NOUN
and --> cc --> CCONJ
Benjamin --> compound --> PROPN
Pavard --> poss --> PROPN
's --> case --> PART
second --> amod --> ADJ
- --> punct --> PUNCT
half --> compound --> NOUN
header --> conj --> NOUN
pushed --> ROOT --> VERB
Die --> compound --> PROPN
Roten --> dobj --> PROPN
back --> advmod --> ADV
four --> nummod --> NUM
points --> npadvmod --> NOUN
clear --> advmod --> ADJ
at --> prep --> ADP
the --> det --> DET
top --> pobj --> NOUN
of --> prep --> ADP
the --> det --> DET
Bundesliga --> compound --> PROPN
table --> pobj --> NOUN


In [249]:
# List the named entities spaCy found in the first example sentence with their labels.
# NOTE(review): the recorded output tags "Kimmich" as ORG and "Burki" as GPE, so the
# entity labels look unreliable for player names in this text.
for entity in doc.ents:
    print(entity.text, entity.label_)

eighth ORDINAL
Bundesliga GPE
Bayern Munich ORG
Kimmich ORG
Burki GPE


#### Function to identify targets in text

In [109]:
def match_target(string):
    """Collect proper nouns and pronouns from a parsed text, grouped by dependency role.

    Parameters
    ----------
    string : iterable of spaCy tokens
        Despite the name, this is a parsed document (e.g. ``nlp(text)``), not a
        raw ``str``. Every item must expose ``pos_``, ``dep_``, ``text`` and
        ``children``.

    Returns
    -------
    dict
        Keys ``'nsubj'``, ``'subjpass'``, ``'pobj'``, ``'dobj'``, ``'poss'`` and
        ``'children'``. Each role key holds the texts of the matching tokens
        concatenated without a separator (empty string when nothing matched).
        ``'children'`` holds, for every matched token in order, the list repr
        of its children's texts with the surrounding brackets stripped.
    """
    # Dependency label -> result key. spaCy's English models label passive
    # subjects "nsubjpass"; the result key stays 'subjpass' so the returned
    # dictionary keeps its shape. The old "subjpass" spelling is still accepted.
    role_keys = {
        "nsubj": "nsubj",
        "nsubjpass": "subjpass",
        "subjpass": "subjpass",
        "pobj": "pobj",
        "dobj": "dobj",
        "poss": "poss",
    }

    # Declare dictionary of matches
    sentence_dictionary = {
        'nsubj': '',
        'subjpass': '',
        'pobj': '',
        'dobj': '',
        "poss": '',
        "children": ''
    }

    for tok in string:
        # Only consider proper nouns or pronouns
        if tok.pos_ not in ("PROPN", "PRON"):
            continue

        key = role_keys.get(tok.dep_)
        if key is None:
            # Dependency role we are not tracking.
            continue

        sentence_dictionary[key] += str(tok)
        # str(list)[1:-1] drops the enclosing "[" and "]" of the list repr.
        sentence_dictionary['children'] += str([child.text for child in tok.children])[1:-1]

    # TODO: include logic here to return a target from the dictionary
    # TODO: include logic to match children with a predefined tag list

    return sentence_dictionary

#### Use function

In [110]:
# Apply the extractor to the parsed first example sentence (`doc`).
match_target(doc)

{'nsubj': '',
 'subjpass': '',
 'pobj': 'Munich',
 'dobj': '',
 'poss': 'KimmichBurki',
 'children': '\'Bayern\'"\'s""\'s"'}