## Feature Extraction for SRL task

The selected sentences are from SEM-2012-SharedTask-CD-SCO-dev-simple.v2.txt (ID=40 & 120, chapter='baskervilles08')

In [1]:
# baskervilles01, 40th sentence
sent_1 = '''When i said that you stimulated me I meant, to be frank, 
that in noting your fallacies I was occasionally guided towards the truth.'''
# baskervilles08, 120th sentence
sent_2 = '''When I came round the balcony he had reached the end of the farther corridor, 
and I could see from the glimmer of light through an open door that he had entered one of the rooms.'''
sent_3 = ''' Obama was recognized as one of the greatest presidents in the USA.'''

In [47]:
import spacy
import pandas as pd
from spacy import displacy
import networkx as nx
import io
import stanza
import benepar

In [3]:
nlp= spacy.load('en_core_web_sm')
doc_1,doc_2, doc_3 = nlp(sent_1), nlp(sent_2), nlp(sent_3)
df_1, df_2, df_3  = pd.DataFrame([token for token in list(doc_1) if token],columns=['Surface_form']), pd.DataFrame([token for token in list(doc_2) if token],columns=['Surface_form']), pd.DataFrame([token for token in list(doc_3) if token],columns=['Surface_form'])

In [6]:
def get_graph(doc):
    '''Function that calculate the path from current tokens to the root.'''
    edges = []
    paths = []
    for token in doc:
        if token.dep_ == 'ROOT':
            entity1 = token.text.lower()
        for child in token.children:
            edges.append(('{0}'.format(token.lower_),'{0}'.format(child.lower_)))
    graph = nx.Graph(edges)
   
    for token in doc:
        entity2 = token.text.lower()
        path_len = nx.shortest_path_length(graph, source=entity1, target=entity2)
        paths.append(path_len)
    return paths

def create_dataframe(df, doc):
    '''Function that display tokens and other dependency information from the original sentence.'''
    Dependency = []
    Head = []
    Token_spcy = []
    POS = []
    NER = []

    named_entities = doc.ents
    for token in doc:
        ne = 'NAN'
        dependency = format(token.dep_)
        head = token.head.text
        token_spcy = format(token.text)
        pos_tag = token.pos_
        for ent in named_entities:
            if ent.text == token.text:
                ne = ent.label_
            else:
                ne = 'NAN'
        Dependency.append(dependency)
        Head.append(head)
        Token_spcy.append(token_spcy)
        POS.append(pos_tag)
        NER.append(ne)
   
    df['Token_spcy'] = Token_spcy
    df['Head'] = Head
    df['Relation2_head'] = Dependency
    df['Path'] = get_graph(doc)
    df['POS'] = POS
    df['NER'] = NER
    return df

In [7]:
create_dataframe(df_1, doc_1)

Unnamed: 0,Surface_form,Token_spcy,Head,Relation2_head,Path,POS,NER
0,When,When,said,advmod,1,SCONJ,NAN
1,i,i,said,nsubj,1,PRON,NAN
2,said,said,said,ROOT,0,VERB,NAN
3,that,that,stimulated,mark,1,SCONJ,NAN
4,you,you,stimulated,nsubj,2,PRON,NAN
5,stimulated,stimulated,said,ccomp,1,VERB,NAN
6,me,me,stimulated,dobj,2,PRON,NAN
7,I,I,meant,nsubj,1,PRON,NAN
8,meant,meant,stimulated,parataxis,2,VERB,NAN
9,",",",",meant,punct,3,PUNCT,NAN


In [8]:
create_dataframe(df_2, doc_2)

Unnamed: 0,Surface_form,Token_spcy,Head,Relation2_head,Path,POS,NER
0,When,When,came,advmod,1,SCONJ,NAN
1,I,I,came,nsubj,1,PRON,NAN
2,came,came,came,ROOT,0,VERB,NAN
3,round,round,came,prep,1,ADV,NAN
4,the,the,balcony,det,3,DET,NAN
5,balcony,balcony,round,pobj,2,NOUN,NAN
6,he,he,reached,nsubj,4,PRON,NAN
7,had,had,reached,aux,4,AUX,NAN
8,reached,reached,balcony,relcl,3,VERB,NAN
9,the,the,end,det,3,DET,NAN


In [9]:
create_dataframe(df_3, doc_3)

Unnamed: 0,Surface_form,Token_spcy,Head,Relation2_head,Path,POS,NER
0,,,recognized,dep,1,SPACE,NAN
1,Obama,Obama,recognized,nsubjpass,1,PROPN,NAN
2,was,was,recognized,auxpass,1,AUX,NAN
3,recognized,recognized,recognized,ROOT,0,VERB,NAN
4,as,as,recognized,prep,1,ADP,NAN
5,one,one,as,pobj,2,NUM,NAN
6,of,of,one,prep,3,ADP,NAN
7,the,the,presidents,det,5,DET,NAN
8,greatest,greatest,presidents,amod,5,ADJ,NAN
9,presidents,presidents,of,pobj,4,NOUN,NAN


In [19]:
# ! pip install benepar



In [13]:
# ! pip install constituent-treelib

Collecting constituent-treelib

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.11.0 requires protobuf<3.20,>=3.9.2, but you have protobuf 3.20.3 which is incompatible.



  Using cached constituent_treelib-0.0.5-py3-none-any.whl (17 kB)
Collecting huspacy==0.6.0
  Using cached huspacy-0.6.0-py3-none-any.whl (91 kB)
Collecting protobuf==3.20.3
  Using cached protobuf-3.20.3-cp39-cp39-win_amd64.whl (904 kB)
Collecting pdfkit==1.0.0
  Using cached pdfkit-1.0.0-py3-none-any.whl (12 kB)
Collecting wand==0.6.10
  Using cached Wand-0.6.10-py2.py3-none-any.whl (142 kB)
Installing collected packages: protobuf, wand, pdfkit, huspacy, constituent-treelib
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.19.1
    Uninstalling protobuf-3.19.1:
      Successfully uninstalled protobuf-3.19.1
Successfully installed constituent-treelib-0.0.5 huspacy-0.6.0 pdfkit-1.0.0 protobuf-3.20.3 wand-0.6.10


In [15]:
from constituent_treelib import ConstituentTree
# nlp= spacy.load('en_core_web_lg')
# Define the language that should be considered with respect to the underlying benepar and spaCy models 
language = ConstituentTree.Language.English

# You can also specify the desired spaCy model for the language ("Small" is selected by default)
spacy_model_size = ConstituentTree.SpacyModelSize.Large

# Create the neccesary NLP pipeline that is required to instantiate a ConstituentTree object
# nlp_constituent = ConstituentTree.create_pipeline(language, spacy_model_size) 

# If you wish, you can instruct the library to download and install the models automatically
nlp_constituent = ConstituentTree.create_pipeline(language, spacy_model_size, download_models=True) 

# Now we can instantiate a ConstituentTree object and pass it the parsed sentence as well as the NLP pipeline


'en_core_web_lg' not found. Downloading...
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


In [48]:
# import nltk
benepar.download('benepar_en3')
nlp_stanza = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency')

[nltk_data] Downloading package benepar_en3 to
[nltk_data]     C:\Users\anaverageone\AppData\Roaming\nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!
2023-02-16 22:29:42 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-02-16 22:29:43 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2023-02-16 22:29:43 INFO: Use device: cpu
2023-02-16 22:29:43 INFO: Loading: tokenize
2023-02-16 22:29:43 INFO: Loading: pos
2023-02-16 22:29:43 INFO: Loading: constituency
2023-02-16 22:29:44 INFO: Done loading processors!


In [83]:
# SPacy for constituency parsing
nlp_benepar = spacy.load('en_core_web_md')
nlp_benepar.add_pipe('benepar', config={'model': 'benepar_en3'})
doc_1,doc_2 = nlp_benepar(sent_1), nlp_benepar(sent_2)
sent_1,sent_2 = list(doc_1.sents)[0], list(doc_2.sents)[0]
print(sent_1._.parse_string)
print(sent_2._.parse_string)

(S (SBAR (WHADVP (WRB When)) (S (NP (PRP i)) (VP (VBD said) (SBAR (IN that) (S (NP (PRP you)) (VP (VBD stimulated) (NP (PRP me)))))))) (NP (PRP I)) (VP (VBD meant) (, ,) (S (VP (TO to) (VP (VB be) (ADJP (JJ frank))))) (, ,) (SBAR (IN 
) (S (IN that) (S (PP (IN in) (S (VP (VBG noting) (NP (PRP$ your) (NNS fallacies))))) (NP (PRP I)) (VP (VBD was) (ADVP (RB occasionally)) (VP (VBN guided) (PP (IN towards) (NP (DT the) (NN truth))))))))) (. .))
(S (S (SBAR (WHADVP (WRB When)) (S (NP (PRP I)) (VP (VBD came) (PP (IN round) (NP (DT the) (NN balcony)))))) (NP (PRP he)) (VP (VBD had) (VP (VBN reached) (NP (NP (DT the) (NN end)) (PP (IN of) (NP (DT the) (JJR farther) (NN corridor))))))) (, ,) (CC 
) (CC and) (S (NP (PRP I)) (VP (MD could) (VP (VB see) (PP (IN from) (NP (NP (DT the) (NN glimmer)) (PP (IN of) (NP (NN light))))) (PP (IN through) (NP (DT an) (JJ open) (NN door))) (SBAR (IN that) (S (NP (PRP he)) (VP (VBD had) (VP (VBN entered) (NP (NP (CD one)) (PP (IN of) (NP (DT the) (NNS rooms))

In [82]:
# Stanza for constituency parsing
doc_1 = nlp_stanza(sent_1)
for sentence in doc_1.sentences:
    print(sentence.constituency)

(ROOT (SBAR (WHADVP (WRB When)) (S (NP (PRP i)) (VP (VBD said) (SBAR (IN that) (S (NP (PRP you)) (VP (VBD stimulated) (NP (NP (PRP me)) (SBAR (S (NP (PRP I)) (VP (VBD meant) (, ,) (S (VP (TO to) (VP (VB be) (ADJP (JJ frank))))) (, ,) (SBAR (IN that) (S (PP (IN in) (S (VP (VBG noting) (NP (PRP$ your) (NNS fallacies))))) (NP (PRP I)) (VP (VBD was) (ADVP (RB occasionally)) (VP (VBN guided) (PP (IN towards) (NP (DT the) (NN truth)))))))))))))))) (. .)))


In [79]:
doc_2 = nlp_stanza(sent_2)
for sentence in doc_2.sentences:
    print(sentence.constituency)

(ROOT (S (SBAR (WHADVP (WRB When)) (S (NP (PRP I)) (VP (VBD came) (PP (IN round) (NP (DT the) (NN balcony)))))) (S (NP (PRP he)) (VP (VBD had) (VP (VBN reached) (NP (NP (DT the) (NN end)) (PP (IN of) (NP (DT the) (JJR farther) (NN corridor))))))) (, ,) (CC and) (S (NP (PRP I)) (VP (MD could) (VP (VB see) (PP (IN from) (NP (NP (DT the) (NN glimmer)) (PP (IN of) (NP (NN light))))) (PP (IN through) (NP (NP (DT an) (JJ open) (NN door)) (SBAR (WHNP (WDT that)) (S (NP (PRP he)) (VP (VBD had) (VP (VBN entered) (NP (NP (CD one)) (PP (IN of) (NP (DT the) (NNS rooms))))))))))))) (. .)))


In [80]:
# from stanza.models.constituency.parse_tree import Tree

# doc_1 = nlp_stanza(sent_1)
# for sentence in doc_1.sentences:
#     sent = sentence.constituency
#     unary_depth = sent.get_constituent_counts
#     print(type(str(sentence)))
#     # Create a Tree object from a parse string
#     parse_string = sent
#     tree = Tree(parse_string)
#     print(sentence)
#     print()

#     # Extract all the constituents
#     constituents = [subtree for subtree in tree.subtree]

#     # Print the constituents
#     for c in constituents:
#         print(c)