### Explore Rule Based Model

In [1]:
import json
import re
from collections import Counter, defaultdict
from pathlib import Path
from typing import Dict, List, Tuple, Any

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set style
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (12, 6)

#### Exploration over single example

In [2]:
import conllu
import re

def parse_conllu_file(filepath: str) -> List[Dict[str, Any]]:
    """
    Parse CoNLL-U file into structured examples using conllu library.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        data = conllu.parse(f.read())
    
    examples = []
    for sent in data:
        example = sent.metadata.copy()
        
        tokens = []
        for t in sent:
            token = {
                "id": str(t["id"]),
                "form": t["form"],
                "lemma": t["lemma"],
                "upos": t["upos"],
                "xpos": t["xpos"],
                "feats": t["feats"] if t["feats"] else "_",
                "head": str(t["head"]),
                "deprel": t["deprel"],
                "entity": None
            }
            
            if t["feats"]:
                feats_str = "|".join([f"{k}={v}" for k, v in t["feats"].items()])
                token["feats"] = feats_str
            
            if t["misc"] and "Entity" in t["misc"]:
                token["entity"] = f"Entity={t['misc']['Entity']}"
            
            tokens.append(token)
        example["tokens"] = tokens
        
        # Post-process e1 and e2 positions from metadata
        for key in ["e1", "e2"]:
            if key in example:
                val = example[key]
                match = re.match(r"(.+?)\s+\[(\d+):(\d+)\]", val)
                if match:
                    word, start, end = match.groups()
                    example[key] = {
                        "text": word,
                        "start": int(start),
                        "end": int(end)
                    }
        examples.append(example)
    return examples

In [6]:
# Load training data
train_data_path = "../../../data/processed/train/train.conllu"
train_examples = parse_conllu_file(train_data_path)

In [4]:
ex = train_examples[0]

print("Sentence:")
print(ex["text"])
print("\nRelation:", ex["relation"])

print("\nEntity 1:")
print(ex["e1"])

print("\nEntity 2:")
print(ex["e2"])

Sentence:
The system as described above has its greatest application in an arrayed configuration of antenna elements.

Relation: Component-Whole(e2,e1)

Entity 1:
{'text': 'configuration', 'start': 12, 'end': 13}

Entity 2:
{'text': 'elements', 'start': 15, 'end': 16}


In [7]:
import pandas as pd

df = pd.DataFrame(ex["tokens"])
df

Unnamed: 0,id,form,lemma,upos,xpos,feats,head,deprel,entity
0,1,The,the,DET,DT,Definite=Def|PronType=Art,2,det,
1,2,system,system,NOUN,NN,Number=Sing,6,nsubj,
2,3,as,as,SCONJ,IN,_,4,mark,
3,4,described,describe,VERB,VBN,Aspect=Perf|Tense=Past|VerbForm=Part,2,advcl,
4,5,above,above,ADV,RB,_,4,advmod,
5,6,has,have,VERB,VBZ,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,0,ROOT,
6,7,its,its,PRON,PRP$,Gender=Neut|Number=Sing|Person=3|Poss=Yes|Pron...,9,poss,
7,8,greatest,great,ADJ,JJS,Degree=Sup,9,amod,
8,9,application,application,NOUN,NN,Number=Sing,6,dobj,
9,10,in,in,ADP,IN,_,9,prep,


In [12]:
from spacy import displacy
from IPython.display import HTML

def visualize_conllu_dependency(example):
    """
    Create a displaCy visualization from a CoNLL-U parsed example
    without re-parsing with spaCy.
    """
    tokens = example["tokens"]

    # Build nodes
    words = []
    arcs = []

    for t in tokens:
        tid = int(t["id"])
        head = int(t["head"])
        dep = t["deprel"]

        words.append({"text": t["form"], "tag": t["upos"]})

        # Skip ROOT (head = 0)
        if head != 0:
            start = min(tid - 1, head - 1)
            end = max(tid - 1, head - 1)

            arcs.append({
                "start": start,
                "end": end,
                "label": dep,
                "dir": "left" if head < tid else "right"
            })

    # displacy expects this dict
    dep_doc = {
        "words": words,
        "arcs": arcs
    }

    # Render inline HTML
    html = displacy.render(dep_doc, style="dep", manual=True, jupyter=False)
    return HTML(html)

In [13]:
visualize_conllu_dependency(train_examples[0])

In [15]:
def find_entities(example):
    e1_pos = None
    e2_pos = None
    
    for tok in example["tokens"]:
        if tok["entity"] == "Entity=e1":
            e1_pos = int(tok["id"])
        if tok["entity"] == "Entity=e2":
            e2_pos = int(tok["id"])
    
    return e1_pos, e2_pos

e1_id, e2_id = find_entities(ex)
print("e1 id:", e1_id, "→", ex["tokens"][e1_id-1]["form"])
print("e2 id:", e2_id, "→", ex["tokens"][e2_id-1]["form"])

e1 id: 13 → configuration
e2 id: 16 → elements


In [16]:
from collections import deque

def shortest_dep_path(example, start, end):
    # Build undirected adjacency from dependency edges
    graph = defaultdict(list)
    for tok in example["tokens"]:
        tid = int(tok["id"])
        head = int(tok["head"])
        if head != 0:  # skip ROOT
            graph[tid].append(head)
            graph[head].append(tid)

    # BFS shortest path
    queue = deque([(start, [start])])
    visited = set([start])
    while queue:
        node, path = queue.popleft()
        if node == end:
            return path
        for neigh in graph[node]:
            if neigh not in visited:
                visited.add(neigh)
                queue.append((neigh, path + [neigh]))
    return None

path_ids = shortest_dep_path(ex, e1_id, e2_id)
print("SDP (token ids):", path_ids)

print("\nSDP (tokens):", [ex["tokens"][i-1]["form"] for i in path_ids])

SDP (token ids): [13, 14, 16]

SDP (tokens): ['configuration', 'of', 'elements']


In [17]:
def get_deprels(example, path_ids):
    tlist = example["tokens"]
    labels = []
    for i in range(len(path_ids)-1):
        a, b = path_ids[i], path_ids[i+1]

        # check both directions (because we built undirected graph)
        tok_a = tlist[a-1]
        tok_b = tlist[b-1]

        if int(tok_a["head"]) == b:
            labels.append((tok_a["deprel"], "up (child→head)"))
        elif int(tok_b["head"]) == a:
            labels.append((tok_b["deprel"], "down (head→child)"))
        else:
            labels.append(("unknown", "???"))
    return labels

labels = get_deprels(ex, path_ids)
labels

[('prep', 'down (head→child)'), ('pobj', 'down (head→child)')]

In [18]:
print("\nFull SDP explanation:\n")
tokens = ex["tokens"]
for i in range(len(path_ids)):
    tid = path_ids[i]
    t = tokens[tid-1]
    print(f"{tid}:{t['form']} ({t['deprel']})")
    if i < len(path_ids)-1:
        print("   │")
        print("   ↓")


Full SDP explanation:

13:configuration (pobj)
   │
   ↓
14:of (prep)
   │
   ↓
16:elements (pobj)


#### Exploration over single category of relations

In [20]:
comp_whole_examples = [
    ex for ex in train_examples 
    if ex["relation"].startswith("Component-Whole")
]

print("Number of Component-Whole examples:", len(comp_whole_examples))

Number of Component-Whole examples: 941


In [21]:
from collections import defaultdict, deque

def get_sdp(example):
    tokens = example["tokens"]

    # map id → head, id → token
    graph = defaultdict(list)
    for tok in tokens:
        tid = int(tok["id"])
        head = int(tok["head"])
        if head != 0:
            graph[tid].append(head)
            graph[head].append(tid)

    # find e1 and e2 token ids
    e1_id, e2_id = None, None
    for tok in tokens:
        if tok["entity"] == "Entity=e1":
            e1_id = int(tok["id"])
        if tok["entity"] == "Entity=e2":
            e2_id = int(tok["id"])

    # BFS shortest path
    queue = deque([(e1_id, [e1_id])])
    visited = {e1_id}

    while queue:
        node, path = queue.popleft()
        if node == e2_id:
            return path
        for nei in graph[node]:
            if nei not in visited:
                visited.add(nei)
                queue.append((nei, path + [nei]))

    return None

In [22]:
def sdp_pattern(example):
    path = get_sdp(example)
    tokens = example["tokens"]

    words = []
    lemmas = []
    pos = []
    deps = []

    for i in range(len(path)):
        tid = path[i]
        tok = tokens[tid - 1]

        words.append(tok["form"])
        lemmas.append(tok["lemma"])
        pos.append(tok["upos"])

        # dependency relation between this node and next
        if i < len(path) - 1:
            t1 = tokens[path[i] - 1]
            t2 = tokens[path[i+1] - 1]

            # determine which token is the dependent
            if int(t1["head"]) == int(t2["id"]):
                deps.append(t1["deprel"])  # t1 → head = t2
            elif int(t2["head"]) == int(t1["id"]):
                deps.append(t2["deprel"])  # t2 → head = t1
            else:
                deps.append("unknown")

    return {
        "words": tuple(words),
        "lemmas": tuple(lemmas),
        "pos": tuple(pos),
        "deps": tuple(deps)
    }

In [24]:
from collections import Counter

lemma_paths = Counter([p["lemmas"] for p in patterns])
pos_paths    = Counter([p["pos"] for p in patterns])
dep_paths    = Counter([p["deps"] for p in patterns])

print("Top lemma paths:")
for path, count in lemma_paths.most_common(15):
    print(count, "→", " -- ".join(path))

print("\nTop POS paths:")
for path, count in pos_paths.most_common(10):
    print(count, "→", " -- ".join(path))

print("\nTop dependency-label paths:")
for path, count in dep_paths.most_common(10):
    print(count, "→", " -- ".join(path))

Top lemma paths:
4 → stem -- of -- tree
3 → cover -- of -- magazine
3 → knife -- blade
3 → shelf -- of -- refrigerator
3 → television -- screen
2 → ear -- lobe
2 → umbrella -- frame
2 → rope -- of -- bell
2 → flower -- bud
2 → pin -- of -- connector
2 → floor -- of -- cottage
2 → mouse -- button
2 → bristle -- of -- brush
2 → fish -- with -- lung
2 → jaw -- bone

Top POS paths:
345 → NOUN -- ADP -- NOUN
161 → NOUN -- VERB -- NOUN
120 → NOUN -- VERB -- ADP -- NOUN
105 → NOUN -- NOUN
32 → NOUN -- AUX -- NOUN -- ADP -- NOUN
13 → NOUN -- VERB -- NOUN -- ADP -- NOUN
11 → NOUN -- VERB -- VERB -- NOUN
8 → NOUN -- ADP -- NOUN -- VERB -- NOUN
8 → NOUN -- VERB -- NOUN -- NOUN
7 → NOUN -- ADP -- PROPN

Top dependency-label paths:
354 → prep -- pobj
147 → nsubj -- dobj
88 → compound
43 → nsubjpass -- prep -- pobj
34 → nsubj -- prep -- pobj
31 → nsubj -- attr -- prep -- pobj
28 → dobj -- prep -- pobj
16 → poss
14 → acl -- prep -- pobj
12 → nsubj -- dobj -- prep -- pobj


#### Usage of DepencenyMatcher on single category through single pattern

In [33]:
import spacy
from spacy.matcher import DependencyMatcher
from spacy.tokens import Doc

nlp = spacy.load("en_core_web_lg")
matcher = DependencyMatcher(nlp.vocab)

In [34]:
pattern_of_phrase = [
    # The WHOLE token (head of "of")
    {
        "RIGHT_ID": "whole",
        "RIGHT_ATTRS": {"POS": "NOUN"}
    },
    # The preposition "of"
    {
        "LEFT_ID": "whole",
        "REL_OP": ">",
        "RIGHT_ID": "prep_of",
        "RIGHT_ATTRS": {"LOWER": "of"}
    },
    # The COMPONENT token
    {
        "LEFT_ID": "prep_of",
        "REL_OP": ">",
        "RIGHT_ID": "component",
        "RIGHT_ATTRS": {"POS": "NOUN"}
    }
]
matcher.add("COMP_WHOLE_OF_PHRASE", [pattern_of_phrase])

In [35]:
def doc_from_conllu(example):
    words = [t["form"] for t in example["tokens"]]
    spaces = [True] * len(words)
    if len(spaces) > 0:
        spaces[-1] = False # Last token usually doesn't have space if it's punctuation, but simple approx
    
    doc = Doc(nlp.vocab, words=words, spaces=spaces)
    
    for i, t in enumerate(example["tokens"]):
        # Set POS and dependency labels if needed, but DependencyMatcher works on Doc attributes
        # We need to set them manually if we want to match on them
        # Note: Spacy's DependencyMatcher requires a parsed Doc. 
        # Since we have CoNLL-U data, we can either:
        # 1. Use the CoNLL-U tags/deps to set attributes on the Doc (requires custom extension or setting protected attrs)
        # 2. Run nlp() on the text (might differ from CoNLL-U)
        pass
        
    # For simplicity and to use the matcher effectively with the *exact* CoNLL-U structure,
    # we should ideally construct the Doc with the CoNLL-U dependencies.
    # However, setting heads/deps manually on a Doc is tricky.
    # Let's try running nlp() on the text first as a quick start, 
    # or better, let's use the provided example text.
    
    text = example.get("text", " ".join(words))
    return nlp(text)

In [40]:
# Filter examples for Component-Whole relations
comp_whole_examples = [ex for ex in train_examples if "Component-Whole" in ex["relation"]]

# Apply matcher to a subset of examples
print(f"Scanning {len(comp_whole_examples)} Component-Whole examples...")
for ex in comp_whole_examples[:5]: # Test on first 5 examples
    doc = doc_from_conllu(ex)
    matches = matcher(doc)
    if matches:
        print(f"\nSentence: {ex['text']}")
        for match_id, token_ids in matches:
            w_id, p_id, c_id = token_ids
            print(f"Match: {doc[w_id]} -> {doc[p_id]} -> {doc[c_id]}")

Scanning 941 Component-Whole examples...

Sentence: The system as described above has its greatest application in an arrayed configuration of antenna elements.
Match: configuration -> of -> elements

Sentence: The girl showed a photo of apple tree blossom on a fruit tree in the Central Valley.
Match: photo -> of -> tree

Sentence: The timer of the device automatically eliminates wasted "standby power" consumption by automatically turn off electronics plugged into the "auto off" outlets.
Match: timer -> of -> device


In [45]:
def explore_matches(examples):
    print(f"\nAnalyzing {len(examples)} Component-Whole examples...")

    for ex in examples:
        doc = doc_from_conllu(ex)
        matches = matcher(doc)

        if not matches:
            continue

        print("\n" + "="*80)
        print("Sentence:", ex["text"])
        print(f"Gold e1: {ex['e1']['text']}   Gold e2: {ex['e2']['text']}")
        print("-"*80)

        for match_id, token_ids in matches:
            pattern = nlp.vocab.strings[match_id]

            # dynamic length: compound patterns may only have 2 tokens
            tokens = [doc[t] for t in token_ids]

            # extract for readable formatting
            words = [f"{tok.text}/{tok.dep_}" for tok in tokens]

            # try to guess component vs whole
            if len(tokens) == 3:
                whole = tokens[0]
                prep = tokens[1]
                comp = tokens[2]
            else:
                # compound or poss patterns
                whole = tokens[0]
                comp = tokens[-1]

            # Compare with gold labels
            gold_e1 = ex["e1"]["text"].lower()
            gold_e2 = ex["e2"]["text"].lower()

            print(f"Pattern: {pattern}")
            print("Tokens:", " -> ".join(words))
            print(f"Extracted component: {comp.text}, whole: {whole.text}")
            print("-"*80)

In [46]:
explore_matches(comp_whole_examples[:20])   # inspect first 20


Analyzing 20 Component-Whole examples...

Sentence: The system as described above has its greatest application in an arrayed configuration of antenna elements.
Gold e1: configuration   Gold e2: elements
--------------------------------------------------------------------------------
Pattern: COMP_WHOLE_OF_PHRASE
Tokens: configuration/pobj -> of/prep -> elements/pobj
Extracted component: elements, whole: configuration
--------------------------------------------------------------------------------

Sentence: The girl showed a photo of apple tree blossom on a fruit tree in the Central Valley.
Gold e1: tree   Gold e2: blossom
--------------------------------------------------------------------------------
Pattern: COMP_WHOLE_OF_PHRASE
Tokens: photo/dobj -> of/prep -> tree/pobj
Extracted component: tree, whole: photo
--------------------------------------------------------------------------------

Sentence: The timer of the device automatically eliminates wasted "standby power" consumptio

In [47]:
correct = 0
wrong = 0
total = len(comp_whole_examples)

for ex in comp_whole_examples:
    doc = doc_from_conllu(ex)
    matches = matcher(doc)

    gold_e1 = ex["e1"]["text"].lower()
    gold_e2 = ex["e2"]["text"].lower()

    found = False

    for _, (w_id, p_id, c_id) in matches:
        whole = doc[w_id].text.lower()
        comp  = doc[c_id].text.lower()

        if whole == gold_e1 and comp == gold_e2:
            found = True
            break

    if found:
        correct += 1
    else:
        wrong += 1

print("Total Component-Whole examples:", total)
print("Correct matches (via `of` phrase):", correct)
print("Not captured by this pattern:", wrong)
print("Coverage %:", correct / total * 100)

Total Component-Whole examples: 941
Correct matches (via `of` phrase): 279
Not captured by this pattern: 662
Coverage %: 29.64930924548353


#### Adding other patterns

In [48]:
# whole --has/contains/includes/consists--> component
pattern_verb_has = [
    # whole = subject of verb
    {"RIGHT_ID": "whole", "RIGHT_ATTRS": {"POS": "NOUN"}},
    {"LEFT_ID": "whole", "REL_OP": ">", "RIGHT_ID": "verb",
     "RIGHT_ATTRS": {"LEMMA": {"IN": ["have", "contain", "include", "consist"]}}},
    # component = object of verb or object of prep after verb
    {"LEFT_ID": "verb", "REL_OP": ">", "RIGHT_ID": "component",
     "RIGHT_ATTRS": {"POS": "NOUN", "DEP": {"IN": ["dobj", "pobj"]}}}
]

matcher.add("COMP_WHOLE_VERB_HAS", [pattern_verb_has])

In [49]:
pattern_compound = [
    # component is modifier (left noun)
    {"RIGHT_ID": "component", "RIGHT_ATTRS": {"POS": "NOUN", "DEP": "compound"}},
    # whole is its head noun
    {"LEFT_ID": "component", "REL_OP": ">", "RIGHT_ID": "whole",
     "RIGHT_ATTRS": {"POS": "NOUN"}}
]

matcher.add("COMP_WHOLE_COMPOUND", [pattern_compound])

In [50]:
pattern_with = [
    # whole noun
    {"RIGHT_ID": "whole", "RIGHT_ATTRS": {"POS": "NOUN"}},
    # prep = with
    {"LEFT_ID": "whole", "REL_OP": ">", "RIGHT_ID": "prep_with",
     "RIGHT_ATTRS": {"LOWER": "with"}},
    # component noun
    {"LEFT_ID": "prep_with", "REL_OP": ">", "RIGHT_ID": "component",
     "RIGHT_ATTRS": {"POS": "NOUN"}}
]

matcher.add("COMP_WHOLE_WITH_PHRASE", [pattern_with])

In [55]:
pattern_possessive = [
    # whole is possessor
    {"RIGHT_ID": "whole", "RIGHT_ATTRS": {"POS": "NOUN"}},
    # possessive edge
    {"LEFT_ID": "whole", "REL_OP": ">", "RIGHT_ID": "poss_marker",
     "RIGHT_ATTRS": {"DEP": "poss"}},
    # component is head noun
    {"LEFT_ID": "poss_marker", "REL_OP": ">", "RIGHT_ID": "component",
     "RIGHT_ATTRS": {"POS": "NOUN"}}
]

matcher.add("COMP_WHOLE_POSSESSIVE", [pattern_possessive])

In [56]:
len(matcher)

5

In [None]:
# Track per-pattern statistics
pattern_correct = defaultdict(int)
pattern_wrong = defaultdict(int)

overall_correct = 0
overall_wrong = 0
total = len(comp_whole_examples)


def extract_whole_component(doc, token_ids):
    """
    General extractor for any pattern:
    - Find 2 nouns in the match
    - The earlier one in the dependency direction is WHOLE
    - The later one is COMPONENT

    Works for:
    - NOUN -> of -> NOUN
    - NOUN -> with -> NOUN
    - compound
    - possessive
    - verb-mediated (whole--verb-->component)
    """
    nouns = [i for i in token_ids if doc[i].pos_ == "NOUN"]
    if len(nouns) < 2:
        return None, None
    
    # Sort tokens by index to get deterministic ordering
    nouns = sorted(nouns)

    whole = doc[nouns[0]].text.lower()
    component = doc[nouns[-1]].text.lower()
    
    return whole, component


print(f"Evaluating {total} Component-Whole examples...\n")

for ex in comp_whole_examples:
    doc = doc_from_conllu(ex)
    gold_e1 = ex["e1"]["text"].lower()
    gold_e2 = ex["e2"]["text"].lower()

    matches = matcher(doc)
    found_match_for_sentence = False

    for match_id, token_ids in matches:
        rule_name = nlp.vocab.strings[match_id]

        whole, comp = extract_whole_component(doc, token_ids)
        if whole is None:
            continue

        if whole == gold_e1 and comp == gold_e2:
            pattern_correct[rule_name] += 1
            found_match_for_sentence = True
        else:
            pattern_wrong[rule_name] += 1

    if found_match_for_sentence:
        overall_correct += 1
    else:
        overall_wrong += 1


# ---------------------------
# PRINT RESULTS
# ---------------------------
print("========== OVERALL COVERAGE ==========")
print("Total examples:", total)
print("Correctly captured:", overall_correct)
print("Missed:", overall_wrong)
print("Coverage %:", round(overall_correct / total * 100, 2))
print()

print("========== PER-PATTERN COVERAGE ==========")
for pattern_name in pattern_correct:
    total_matches = pattern_correct[pattern_name] + pattern_wrong[pattern_name]
    accuracy = pattern_correct[pattern_name] / total_matches * 100 if total_matches > 0 else 0

    print(f"{pattern_name}:")
    print(f"  correct: {pattern_correct[pattern_name]}")
    print(f"  wrong:   {pattern_wrong[pattern_name]}")
    print(f"  accuracy: {accuracy:.2f}%")
    print()

Evaluating 941 Component-Whole examples...

Total examples: 941
Correctly captured: 302
Missed: 639
Coverage %: 32.09

COMP_WHOLE_OF_PHRASE:
  correct: 280
  wrong:   418
  accuracy: 40.11%

COMP_WHOLE_WITH_PHRASE:
  correct: 16
  wrong:   42
  accuracy: 27.59%

COMP_WHOLE_COMPOUND:
  correct: 4
  wrong:   57
  accuracy: 6.56%

COMP_WHOLE_VERB_HAS:
  correct: 3
  wrong:   17
  accuracy: 15.00%

