In [None]:
pip install benepar

In [None]:
# Download the "benepar_en3" model; the benepar pipeline component added to
# the spaCy pipeline later in this notebook is configured with this model name.
import benepar
benepar.download('benepar_en3')

In [None]:
!python -m spacy download en
!python -m spacy download en_core_web_md

Features to be extracted using spaCy:
- Dependency relation of the target word (with its head and its dependents)
- Head + full constituent from the head word

Dataset: SEM-2012-SharedTask-CD-SCO-training-simple.v2

In [16]:
# Load dependencies
import spacy
import pandas as pd
import benepar

In [None]:
# Extract dependent of target word
nlp = spacy.load('en_core_web_sm')
# Sentence is taken from baskervilles08, 120th sentence
text_1 = "When I came round the balcony he had reached the end of the farther corridor, and I could see from the glimmer of light through an open door that he had entered one of the rooms."
# Sentence is taken from baskervilles01, 40th sentence
# NOTE(review): the second word is a lowercase "i" — confirm this matches the dataset sentence.
text_2 = "When i said that you stimulated me I meant, to be frank, that in noting your fallacies I was occasionally guided towards the truth."


def print_token_dependencies(doc):
    """Print the dependency information of every token in ``doc``.

    For each token, three kinds of lines are printed:
      * ``Token:`` followed by the token text,
      * ``Dependent:`` followed by the token's dependency label and, in
        parentheses, the text of its head token,
      * one ``-->`` line per child of the token, giving the child's text
        and dependency label.
    """
    for token in doc:
        print("Token:", token.text)
        print("Dependent:", token.dep_, "(", token.head.text, ")")
        for child in token.children:
            print("-->", child.text, child.dep_)


doc_text1 = nlp(text_1)
doc_text2 = nlp(text_2)
# Print the same report for both sentences (text 1 first, then text 2)
# instead of repeating the loop body once per document.
for parsed_doc in (doc_text1, doc_text2):
    print_token_dependencies(parsed_doc)

In [18]:
# Store the dependency label of every token in a dataframe.
# One record per token, tagged with the sentence it belongs to
# ("Text 1" for doc_text1, "Text 2" for doc_text2).
results = [
    {'text': f'Text {doc_number}',
     'token': tok.text,
     'dependent': tok.dep_}
    for doc_number, parsed in enumerate([doc_text1, doc_text2], start=1)
    for tok in parsed
]

# Build the dataframe from the collected records in one step
df = pd.DataFrame(results)
print(df)

      text    token dependent
0   Text 1     When    advmod
1   Text 1        I     nsubj
2   Text 1     came      ROOT
3   Text 1    round      prep
4   Text 1      the       det
..     ...      ...       ...
60  Text 2   guided      ROOT
61  Text 2  towards      prep
62  Text 2      the       det
63  Text 2    truth      pobj
64  Text 2        .     punct

[65 rows x 3 columns]


In [25]:
# Extract Head + full constituent from the head word
nlp_head = spacy.load('en_core_web_md')

# The benepar component is added differently depending on the spaCy major
# version: as a component object on 2.x, by registered name otherwise.
if spacy.__version__.startswith('2'):
    nlp_head.add_pipe(benepar.BeneparComponent("benepar_en3"))
else:
    nlp_head.add_pipe("benepar", config={"model": "benepar_en3"})


def summarize_constituency(sent):
    """Collect the benepar extension attributes of a sentence span into a dict.

    Returns a dict with the keys ``parse_string``, ``labels`` and
    ``children``, read from ``sent._.parse_string``, ``sent._.labels`` and
    ``sent._.children``; the latter two are materialised as lists.
    """
    return {
        'parse_string': sent._.parse_string,
        'labels': list(sent._.labels),
        'children': list(sent._.children),
    }


doc_text_1 = nlp_head(text_1)
sent1 = list(doc_text_1.sents)[0]  # only the first sentence of the doc is used

doc_text_2 = nlp_head(text_2)
sent2 = list(doc_text_2.sents)[0]  # only the first sentence of the doc is used

# Save output to a dictionary
# Text 1
output_dict1 = summarize_constituency(sent1)
print(output_dict1)

# Text 2
output_dict = summarize_constituency(sent2)
print(output_dict)


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'parse_string': '(S (S (SBAR (WHADVP (WRB When)) (S (NP (PRP I)) (VP (VBD came) (PP (IN round) (NP (DT the) (NN balcony)))))) (NP (PRP he)) (VP (VBD had) (VP (VBN reached) (NP (NP (DT the) (NN end)) (PP (IN of) (NP (DT the) (JJR farther) (NN corridor))))))) (, ,) (CC and) (S (NP (PRP I)) (VP (MD could) (VP (VB see) (PP (IN from) (NP (NP (DT the) (NN glimmer)) (PP (IN of) (NP (NN light))))) (PP (IN through) (NP (DT an) (JJ open) (NN door))) (SBAR (IN that) (S (NP (PRP he)) (VP (VBD had) (VP (VBN entered) (NP (NP (CD one)) (PP (IN of) (NP (DT the) (NNS rooms))))))))))) (. .))', 'labels': ['S'], 'children': [When I came round the balcony he had reached the end of the farther corridor, ,, and, I could see from the glimmer of light through an open door that he had entered one of the rooms, .]}
{'parse_string': '(S (SBAR (WHADVP (WRB When)) (S (NP (PRP i)) (VP (VBD said) (SBAR (IN that) (S (NP (PRP you)) (VP (VBD stimulated) (NP (PRP me)))))))) (NP (PRP I)) (VP (VBD meant) (, ,) (S (VP (TO 