In [None]:
# Code is run on collab and several packages need to be installed to use benepar and spacy packages
!pip install benepar

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting benepar
  Downloading benepar-0.2.0.tar.gz (33 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torch-struct>=0.5
  Downloading torch_struct-0.5-py3-none-any.whl (34 kB)
Collecting tokenizers>=0.9.4
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[tokenizers,torch]>=4.2.2
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece>=0.1.91
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m66.

In [None]:
!python -m spacy download en
!python -m spacy download en_core_web_md

2023-02-16 10:15:53.034579: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-16 10:15:55.321963: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-02-16 10:15:55.322099: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-02-16 10:15:58.961182: E tensorfl

Features to be extracted using spaCy:
- Dependent of the target word
- Head + full constituent from the head word

Dataset: SEM-2012-SharedTask-CD-SCO-training-simple.v2

In [None]:
# Load dependencies: spaCy for the pipelines, pandas for the feature tables,
# and benepar for the constituency parser component
import spacy
import pandas as pd
import benepar
# Download the 'benepar_en3' model that is added to the spaCy pipeline further down
benepar.download('benepar_en3')

[nltk_data] Downloading package benepar_en3 to /root/nltk_data...
[nltk_data]   Unzipping models/benepar_en3.zip.


True

In [None]:
# English spaCy pipeline used for the dependency features
nlp = spacy.load('en_core_web_sm')

# Example sentences:
#   sent_1 - taken from baskervilles08, 120th sentence
#   sent_2 - taken from baskervilles01, 40th sentence
sent_1 = "When I came round the balcony he had reached the end of the farther corridor, and I could see from the glimmer of light through an open door that he had entered one of the rooms."
sent_2 = "When i said that you stimulated me I meant, to be frank, that in noting your fallacies I was occasionally guided towards the truth."

# Run both sentences through the nlp pipeline, first sent_1 and then sent_2
doc_sent1, doc_sent2 = (nlp(text) for text in (sent_1, sent_2))

# Define a function to extract the information for each token
def extract_features(doc):
    '''
    Build a per-token table of dependency features from a spaCy 'doc' object

    :param: doc: iterable of tokens to read the features from (spacy.tokens.doc.Doc);
                 every token is read through its text, dep_, head and children attributes
    :return: dataframe with one row per token holding 'Token', 'Dependent' and 'Head',
             plus 'Child n' / 'Child n Dependencies' for the n-th child of the token;
             cells for children a token does not have are left empty (pd.dataframe)

    '''
    records = []
    for tok in doc:
        # Fixed columns shared by every token
        record = {'Token': tok.text, 'Dependent': tok.dep_, 'Head': tok.head.text}
        # One numbered pair of columns per child, counted from 1
        for position, kid in enumerate(tok.children, start=1):
            record[f'Child {position}'] = kid.text
            record[f'Child {position} Dependencies'] = kid.dep_
        records.append(record)
    # A single DataFrame construction aligns the ragged child columns across rows
    return pd.DataFrame(records)



# Extract the features for sentence 1 and sentence 2 using the function above,
# adding a 'Sentence' column that tells the two sentences apart
df_sent1 = extract_features(doc_sent1).assign(Sentence='Sentence 1')
df_sent2 = extract_features(doc_sent2).assign(Sentence='Sentence 2')

# Stack both dataframes into one dataframe with a fresh running index
df = pd.concat([df_sent1, df_sent2], ignore_index=True)

# Print the dataframe
print(df)

# Save the DataFrame to a CSV file
# Adjust the path accordingly
df.to_csv('./data_result.csv', index=False)


      Token Dependent     Head  Child 1 Child 1 Dependencies Child 2  \
0      When    advmod     came      NaN                  NaN     NaN   
1         I     nsubj     came      NaN                  NaN     NaN   
2      came      ROOT     came     When               advmod       I   
3     round      prep     came  balcony                 pobj     NaN   
4       the       det  balcony      NaN                  NaN     NaN   
..      ...       ...      ...      ...                  ...     ...   
60   guided     ccomp   noting        I            nsubjpass     was   
61  towards      prep   guided    truth                 pobj     NaN   
62      the       det    truth      NaN                  NaN     NaN   
63    truth      pobj  towards      the                  det     NaN   
64        .     punct     said      NaN                  NaN     NaN   

   Child 2 Dependencies       Child 3 Child 3 Dependencies  Child 4  \
0                   NaN           NaN                  NaN      

In [None]:
# Extract Head + full constituent from the head word
# Code is adapted from (https://spacy.io/universe/project/self-attentive-parser)

# English language model that the benepar component is attached to
nlp_head = spacy.load('en_core_web_md')

# The Benepar parser is registered differently depending on the spaCy version:
# anything other than 2.x uses the string-based add_pipe call
if not spacy.__version__.startswith('2'):
    nlp_head.add_pipe("benepar", config={"model": "benepar_en3"})
else:
    nlp_head.add_pipe(benepar.BeneparComponent("benepar_en3"))

# Sentence 1: run the pipeline and keep the first sentence span of the result
doc_sent_1 = nlp_head(sent_1)
sent1 = list(doc_sent_1.sents)[0]

# Sentence 2: run the pipeline and keep the first sentence span of the result
doc_sent_2 = nlp_head(sent_2)
sent2 = list(doc_sent_2.sents)[0]

# Collect the benepar output of each sentence in a dictionary and print it
# Sentence 1
output_dict1 = {
    'parse_string': sent1._.parse_string,
    'labels': list(sent1._.labels),
    'children': list(sent1._.children),
}
print(output_dict1)

# Sentence 2
output_dict2 = {
    'parse_string': sent2._.parse_string,
    'labels': list(sent2._.labels),
    'children': list(sent2._.children),
}
print(output_dict2)


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'parse_string': '(S (S (SBAR (WHADVP (WRB When)) (S (NP (PRP I)) (VP (VBD came) (PP (IN round) (NP (DT the) (NN balcony)))))) (NP (PRP he)) (VP (VBD had) (VP (VBN reached) (NP (NP (DT the) (NN end)) (PP (IN of) (NP (DT the) (JJR farther) (NN corridor))))))) (, ,) (CC and) (S (NP (PRP I)) (VP (MD could) (VP (VB see) (PP (IN from) (NP (NP (DT the) (NN glimmer)) (PP (IN of) (NP (NN light))))) (PP (IN through) (NP (DT an) (JJ open) (NN door))) (SBAR (IN that) (S (NP (PRP he)) (VP (VBD had) (VP (VBN entered) (NP (NP (CD one)) (PP (IN of) (NP (DT the) (NNS rooms))))))))))) (. .))', 'labels': ['S'], 'children': [When I came round the balcony he had reached the end of the farther corridor, ,, and, I could see from the glimmer of light through an open door that he had entered one of the rooms, .]}
{'parse_string': '(S (SBAR (WHADVP (WRB When)) (S (NP (PRP i)) (VP (VBD said) (SBAR (IN that) (S (NP (PRP you)) (VP (VBD stimulated) (NP (PRP me)))))))) (NP (PRP I)) (VP (VBD meant) (, ,) (S (VP (TO 