# Install the dependencies

In [2]:
!pip install -U pip setuptools wheel -q
!pip install -U spacy -q
!python -m spacy download en_core_web_lg -q
!pip install -U treelib -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m777.4/777.4 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for treelib (setup.py) ... [?25l[?25hdone
[0m

In [3]:
# Load spaCy's large English pipeline (en_core_web_lg)

import spacy

nlp = spacy.load('en_core_web_lg')

In [4]:
# List the names of the pipeline components recorded in the loaded model's metadata

nlp.meta['pipeline']

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

# Run the named entity module on a sample sentence

In [11]:
import pandas as pd

sent = 'Creative ARR for this year was $3.2 billion and was $8.78 billion for last year.'
docs = [sent]

# Run the pipeline; the recognised entities are exposed on doc.ents
doc = nlp(sent)

# One column per entity attribute: surface text, character span and entity label
data = {
    'text': [ent.text for ent in doc.ents],
    'start': [ent.start_char for ent in doc.ents],
    'end': [ent.end_char for ent in doc.ents],
    'label': [ent.label_ for ent in doc.ents],
}

df = pd.DataFrame(data)
df

Unnamed: 0,text,start,end,label
0,this year,17,26,DATE
1,$3.2 billion,31,43,MONEY
2,$8.78 billion,52,65,MONEY
3,last year,70,79,DATE


## Replace the Named Entities in the sentence with the tags

In [12]:
# Swap every entity span for a "<LABEL>-<row index>" placeholder.
# Substitution is done on the recorded character offsets rather than with str.replace,
# which would rewrite *every* occurrence of the entity text (wrong when the same amount
# or date appears twice, or when one entity's text is contained in another's).
# Spans are processed right-to-left so the offsets of earlier spans stay valid.
for i, row in df.sort_values('start', ascending=False).iterrows():
  start, end = row['start'], row['end']
  # Only substitute while the span still holds the entity text; this keeps the cell
  # harmless if it is executed again on an already tagged sentence.
  if sent[start:end] == row['text']:
    sent = sent[:start] + f"{row['label']}-{i}" + sent[end:]

In [18]:
# Run the pipeline again on the tagged sentence; `doc` now refers to the placeholder version
doc = nlp(sent)
# Display the sentence after entity replacement
sent

'Creative ARR for DATE-0 was MONEY-1 and was MONEY-2 for DATE-3.'

Create the tree using the dependencies extracted by the dependency parser

In [19]:
from treelib import Tree, Node

class Node:
  """Lightweight tree node carrying the text of one token.

  Note: this definition shadows the `Node` name imported from treelib above.
  """

  def __init__(self, text):
    # Child nodes, in the order they are attached
    self.nxt = []
    # Parent node; None until one is assigned
    self.par = None
    # Initialised to None and not assigned anywhere else in this cell
    self.prv = None
    # Text carried by this node
    self.txt = text

# Empty treelib tree; dfs() adds one node per visited token to it
tree = Tree()

# Locate the token whose dependency label is 'ROOT'
# (the last such token wins; root stays None if there is none)
root = None
for token in doc:
  root = token if token.dep_ == 'ROOT' else root

# Custom nodes whose text contains "MONEY", collected while the tree is built
money = []

def dfs(root, depth, parent, tid):
  """Mirror the dependency subtree under `root` with a depth-first traversal.

  Every visited token becomes a node of the global treelib `tree` and a custom
  `Node`; custom nodes are linked to each other through `par` and `nxt`.

  Args:
    root: token at the top of the subtree being copied.
    depth: 0 for the top-level call, larger for descendants.
    parent: treelib identifier of the parent node; ignored when depth is 0.
    tid: treelib identifier to give to the node created for `root`.

  Returns:
    A `(node, size)` tuple: the custom node created for `root` and the number of
    tokens in its subtree, `root` included.
  """
  cur = Node(root.text)
  if "MONEY" in root.text:
    money.append(cur)

  # Only the top-level call (depth 0) creates a parentless treelib node
  tree.create_node(tag=root.text, identifier=tid, parent=parent if depth != 0 else None)

  # Identifiers are handed out in pre-order: each child's subtree starts right after
  # the identifiers consumed by the previous siblings' subtrees
  subtree_size = 1
  for child in root.children:
    child_node, child_size = dfs(child, depth + 1, tid, tid + subtree_size)
    child_node.par = cur
    cur.nxt.append(child_node)
    subtree_size += child_size

  return cur, subtree_size

head = dfs(root, 0, None, 0)

In [20]:
# Print the tree filled in by dfs() as an indented text outline
tree.show()

was
├── .
├── ARR
│   ├── Creative
│   └── for
│       └── DATE-0
├── MONEY-1
├── and
└── was
    ├── MONEY-2
    └── for
        └── DATE-3



Define the logic that matches each money amount with its date by walking up the tree to the lowest ancestor whose subtree contains a date (their lowest common ancestor)

In [21]:
def getDate(node):
  """Return the text of the DATE node nearest to `node` in the tree.

  The subtree under `node` is searched first. If it holds no date, the search moves
  to the parent and covers the parent's subtree minus the branch already searched,
  and so on up to the root. The first node whose text contains 'DATE-' is returned.

  Args:
    node: tree node exposing `txt` (str), `nxt` (list of children) and `par`
      (parent node or None).

  Returns:
    The `txt` of the matching date node, or None when no date is found anywhere
    on the way up to the root.
  """

  def getDateST(node, child):
    """Search the subtree under `node`, skipping the direct child `child`.

    Children are examined before `node` itself. Returns `(date_text, True)` on
    success and `(None, False)` otherwise.
    """
    for nxt in node.nxt:
      if nxt != child:
        date, inChild = getDateST(nxt, node)
        if inChild:
          return date, True

    # The node text is returned untouched. The original code also called
    # str.replace on it here and discarded the result, which had no effect.
    if 'DATE-' in node.txt:
      return node.txt, True

    return None, False

  # Branch to skip at the current level; at the start nothing is actually excluded
  # because a node is never its own child.
  ichild = node

  while True:
    trueDate, found = getDateST(node, ichild)
    if found or node.par is None:
      break
    # Climb one level and remember the branch that has already been searched
    ichild = node
    node = node.par

  return trueDate

In [22]:
# Resolve and print the date for the first two MONEY nodes collected by dfs()
for money_idx in (0, 1):
  print(getDate(money[money_idx]))

DATE-0
DATE-3
