## Downloads

In [None]:
!python -m spacy download en_core_web_sm -q
!pip install dframcy benepar -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m234.0/234.0 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 kB[0m [31m10.9 MB/s[0

## Imports

In [None]:
import spacy
import nltk
from dframcy import DframCy
from __future__ import unicode_literals
from spacy.matcher import Matcher
import networkx as nx
import benepar

## Initialize model and data

In [None]:
nlp = spacy.load('en_core_web_sm')

In [None]:
#https://medium.com/the-quintessential-q/three-sentence-stories-c8dca4bbe22f
paragraph = '''The dog hadn’t been fed for days. He howled and barked and whined,
 until a steak suddenly appeared in the backyard. It was charred beyond belief, but to the dog,
  unlike the human next door, it was perfect.'''.replace('\n', '')
first_sentence = 'The dog hadn’t been fed for days.'

In [None]:
spacy_par = nlp(paragraph)
dframcy = DframCy(nlp)
doc = dframcy.nlp(paragraph)
df = dframcy.to_dataframe(doc)
df

Unnamed: 0,token_text,token_start,token_end,token_pos_,token_tag_,token_dep_,token_head,token_ent_type_
0,The,0,3,DET,DT,det,dog,
1,dog,4,7,NOUN,NN,nsubjpass,fed,
2,had,8,11,AUX,VBD,aux,fed,
3,n’t,11,14,PART,RB,neg,fed,
4,been,15,19,AUX,VBN,auxpass,fed,
5,fed,20,23,VERB,VBN,ROOT,fed,
6,for,24,27,ADP,IN,prep,fed,
7,days,28,32,NOUN,NNS,pobj,for,DATE
8,.,32,33,PUNCT,.,punct,fed,
9,He,34,36,PRON,PRP,nsubj,howled,


## Extract features

### POS tag, head word, dependency relation tag, head word POS, token bigrams:

In [None]:
#Features: Part_Of_Speech, Head Word, Dependency Relation, Head Word POS, bigrams
for token in spacy_par:
  print(
      f"""
      TOKEN: {str(token)}
      TAG: {str(token.tag_):10}
      POS: {token.pos_}
      EXPLANATION_POS: {spacy.explain(token.tag_)}
      HEAD: {str(token.head.text)}
      HEAD_POS: {str(token.head.pos_)}
      DEPENDENCY: {str(token.dep_)}
      EXPLANATION_DEP: {spacy.explain(token.dep_)}
      bi-grams: {', '.join([''.join(x) for x in nltk.ngrams(token.text, 2)])}
      """)


      TOKEN: The
      TAG: DT        
      POS: DET
      EXPLANATION_POS: determiner
      HEAD: dog
      HEAD_POS: NOUN
      DEPENDENCY: det
      EXPLANATION_DEP: determiner
      bi-grams: Th, he
      

      TOKEN: dog
      TAG: NN        
      POS: NOUN
      EXPLANATION_POS: noun, singular or mass
      HEAD: fed
      HEAD_POS: VERB
      DEPENDENCY: nsubjpass
      EXPLANATION_DEP: nominal subject (passive)
      bi-grams: do, og
      

      TOKEN: had
      TAG: VBD       
      POS: AUX
      EXPLANATION_POS: verb, past tense
      HEAD: fed
      HEAD_POS: VERB
      DEPENDENCY: aux
      EXPLANATION_DEP: auxiliary
      bi-grams: ha, ad
      

      TOKEN: n’t
      TAG: RB        
      POS: PART
      EXPLANATION_POS: adverb
      HEAD: fed
      HEAD_POS: VERB
      DEPENDENCY: neg
      EXPLANATION_DEP: negation modifier
      bi-grams: n’, ’t
      

      TOKEN: been
      TAG: VBN       
      POS: AUX
      EXPLANATION_POS: verb, past participle
      HE

In [None]:
bigrams = [', '.join([''.join(x) for x in nltk.ngrams(token.text, 2)]) for token in spacy_par]
df = df.assign(bigrams=bigrams)
df.head()

Unnamed: 0,token_text,token_start,token_end,token_pos_,token_tag_,token_dep_,token_head,token_ent_type_,bigrams
0,The,0,3,DET,DT,det,dog,,"Th, he"
1,dog,4,7,NOUN,NN,nsubjpass,fed,,"do, og"
2,had,8,11,AUX,VBD,aux,fed,,"ha, ad"
3,n’t,11,14,PART,RB,neg,fed,,"n’, ’t"
4,been,15,19,AUX,VBN,auxpass,fed,,"be, ee, en"


### Ancestors, children

In [None]:
#Features: ANCESTORS, CHILDREN
children_list = []

for token in spacy_par:
  whole = [None,None]
  for a_token in token.ancestors:
    a_tuple = (a_token)
    whole[0] = (a_token)
  for b_token in token.children:
    children_list.append(b_token)
    whole[1] = (b_token)
  #print('whole', whole)
  # print(
  #       f"""
  #     TOKEN: {str(token)}
  #     ANCESTORS: {whole[0]}
  #     CHILDREN: {whole[1]}
  #     """
  #   )


#Find Headword first
headwords = []
# spacy_par = nlp(first_sentence)
for token in spacy_par:
    headwords.append(token.head)
df = df.assign(headwords=headwords)

#Give the parent of target word:
parent_of_each_token = []
for token in spacy_par:
  small_pair = []
  small_pair.append(token.text)
  small_pair.append(token.head)
  parent_of_each_token.append(small_pair)
#print(parent_of_each_token)
df = df.assign(parent=[p[1] for p in parent_of_each_token])

# Give POS of the target word and its parent:
parent_of_each_token_POS = []
for token in spacy_par:
  small_pair = []
  small_pair.append(str(token.pos_))
  small_pair.append(str(token.head.pos_))
  parent_of_each_token_POS.append(small_pair)
#print(parent_of_each_token_POS)
df = df.assign(parent_pos=[p[1] for p in parent_of_each_token_POS])


#Feature: Distance to head
distance_from_token_head = []
tokens_list = [t for t in spacy_par]
tokens_heads_big_list = list(zip(tokens_list, headwords))
G = nx.Graph()
for token, head in tokens_heads_big_list:
  G.add_edge(token,head)
  path = nx.shortest_path_length(G, source=token, target=head)
  distance_from_token_head.append(path)
# print(distance_from_token_head)
# df = df.assign(distance_to_head=distance_from_token_head) - yields 1 always, fixed later

#Feature: Path to head word
def find_target_to_head():
#   spacy_par = nlp(first_sentence)
  big_pos_until_target = []
  for token in spacy_par:
    tokens_until_target = []
    headword = token.head
    while token.text != headword.text:
      tokens_until_target.append(token.text)
      token = token.head
      headword = token.head
    tokens_until_target.append(headword.text)
    big_pos_until_target.append(tokens_until_target)
  return big_pos_until_target
df = df.assign(path_to_head=find_target_to_head())

#Feature: Path to head word (POS)
def find_pos_to_head():
#   spacy_par = nlp(first_sentence)
  big_pos_until_target = []
  for token in spacy_par:
    tokens_until_target = []
    headword = token.head
    while token.text != headword.text:
      tokens_until_target.append(token.pos_)
      token = token.head
      headword = token.head
    tokens_until_target.append(headword.pos_)
    big_pos_until_target.append(tokens_until_target)
  return big_pos_until_target

df = df.assign(pos_to_head=find_pos_to_head())

### NPs, VPs, PPs in sentence

In [None]:
# Feauture: Phrase Type
## VP
phrase_type = []
patterns = [{'POS': 'VBP', 'OP': '?'},
            {'LEMMA': 'have', 'TAG': 'VBP', 'OP': '?'},
            {'TEXT': 'not', 'OP': '?'},
            {'TAG': 'VBP', 'OP': '?'},
            {'POS': 'VERB', 'OP': '?'},
            {'POS': 'ADV', 'OP': '*'},
            {'POS': 'AUX', 'OP': '*'},
            {'POS': 'VERB', 'OP': '+'}]

matcher = spacy.matcher.Matcher(nlp.vocab)
matcher.add("Verb phrase", [patterns])

doc = nlp(paragraph)
matches = matcher(doc)
spans = [doc[start:end] for _, start, end in matches]
print('VP', spans)

## PP
pps = []
for element in spacy_par:
    if element.pos_ == 'ADP':
        pps.append(element)

print('PPs:', pps)

## NP
print('NPs:', [np for np in spacy_par.noun_chunks])

## S
print('S:', spacy_par)

VP [been fed, fed, howled, barked, whined, suddenly appeared, appeared, was charred, charred]
PPs: [for, in, beyond, to, unlike]
NPs: [The dog, days, He, a steak, the backyard, It, belief, the dog, the human next door, it]
S: The dog hadn’t been fed for days. He howled and barked and whined, until a steak suddenly appeared in the backyard. It was charred beyond belief, but to the dog,  unlike the human next door, it was perfect.




### Constituent label, depth, and length:

In [None]:
first_use = 0  # ensures benepar is only added to the pipeline once, do not re-run this cell within the same runtime

In [None]:
# Consituency parser
import benepar
benepar.download('benepar_en3')

if first_use == 0:
    nlp.add_pipe("benepar", config={"model": "benepar_en3"})
    first_use += 1

[nltk_data] Downloading package benepar_en3 to /root/nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!


ValueError: [E007] 'benepar' already exists in pipeline. Existing names: ['tok2vec', 'tagger', 'parser', 'senter', 'attribute_ruler', 'lemmatizer', 'ner', 'benepar']

In [None]:
# Constituents for each word
paragraph = '''The dog hadn’t been fed for days. He howled and barked and whined,
 until a steak suddenly appeared in the backyard. It was charred beyond belief, but to the dog,
  unlike the human next door, it was perfect.'''.replace('\n', '')
doc = nlp(paragraph)
sent = list(doc.sents)[1]
print(sent._.parse_string)
constituents = []
for sent in list(doc.sents):
    constituents += list(sent._.constituents)
# constituents = list(sent._.constituents)
print(constituents, sep='\n')



(S (NP (PRP He)) (VP (VBD howled) (CC and) (VBD barked) (CC and) (VBD whined) (, ,) (SBAR (IN until) (S (NP (DT a) (NN steak)) (ADVP (RB suddenly)) (VP (VBD appeared) (PP (IN in) (NP (DT the) (NN backyard))))))) (. .))
[The dog hadn’t been fed for days., The dog, The, dog, hadn’t been fed for days, had, n’t, been fed for days, been, fed for days, fed, for days, for, days, ., He howled and barked and whined, until a steak suddenly appeared in the backyard., He, howled and barked and whined, until a steak suddenly appeared in the backyard, howled, and, barked, and, whined, ,, until a steak suddenly appeared in the backyard, until, a steak suddenly appeared in the backyard, a steak, a, steak, suddenly, appeared in the backyard, appeared, in the backyard, in, the backyard, the, backyard, ., It was charred beyond belief, but to the dog,  unlike the human next door, it was perfect., It was charred beyond belief, It, was charred beyond belief, was, charred beyond belief, charred, beyond belie

In [None]:
# Features: label, depth, and length of highest constituent with target as the left-most token

def depth_in_tree(token):
    depth = 0
    while token.head != token:  # Move up the tree towards the root to count the depth
        token = token.head
        depth += 1
    return depth


def obtain_label(token, constituent):
    if len(constituent._.labels):  # Use constituent label if node is non-terminal
        return constituent._.labels[0]
    else:  # Use POS tag (as shown in tree) for terminal nodes
        return token.tag_

depths = []
labels = []
const_len = []
consts = []
for sent in list(doc.sents):
    depths += [depth_in_tree(t) for t in sent]
    for t in sent:
        constituents = sent._.constituents
        for c in constituents:
            if t == c[0]:
                const_len.append(len(c))
                labels.append(obtain_label(t, c))
                consts.append(c.text)
                break
# depths = [depth_in_tree(t) for t in paragraph]

df = df.assign(depth=depths)

# labels = [obtain_label(t, c) for t, c in zip(paragraph, constituents) if t==c[0]]
df = df.assign(label=labels)

# const_len = [len(c) for t, c in zip(paragraph, constituents) if t==c[0]]
df = df.assign(constituent_length=const_len)

df = df.assign(constituent=consts)

In [None]:
df.distance_to_head = [len(p) for p in df.path_to_head.values]

  df.distance_to_head = [len(p) for p in df.path_to_head.values]


In [None]:
df.head()

Unnamed: 0,token_text,token_start,token_end,token_pos_,token_tag_,token_dep_,token_head,token_ent_type_,bigrams,headwords,parent,parent_pos,path_to_head,depth,label,constituent_length,constituent,pos_to_head
0,The,0,3,DET,DT,det,dog,,"Th, he",dog,dog,NOUN,"[The, dog, fed]",2,S,9,The dog hadn’t been fed for days.,"[DET, NOUN, VERB]"
1,dog,4,7,NOUN,NN,nsubjpass,fed,,"do, og",fed,fed,VERB,"[dog, fed]",1,NN,1,dog,"[NOUN, VERB]"
2,had,8,11,AUX,VBD,aux,fed,,"ha, ad",fed,fed,VERB,"[had, fed]",1,VP,6,hadn’t been fed for days,"[AUX, VERB]"
3,n’t,11,14,PART,RB,neg,fed,,"n’, ’t",fed,fed,VERB,"[n’t, fed]",1,RB,1,n’t,"[PART, VERB]"
4,been,15,19,AUX,VBN,auxpass,fed,,"be, ee, en",fed,fed,VERB,"[been, fed]",1,VP,4,been fed for days,"[AUX, VERB]"


In [None]:
df.shape

(47, 18)