In [1]:
# Colab: cài spaCy và mô hình tiếng Anh
# Ghi chú: Colab thường chạy được ngay sau pip install; nếu báo lỗi import spacy, khởi động lại runtime.
!pip install -U spacy
!python -m spacy download en_core_web_md


Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
import spacy
from spacy import displacy
from IPython.display import HTML, display

# Load model (en_core_web_md đã được cài ở ô trước)
nlp = spacy.load("en_core_web_md")

def render_dep(doc_or_text, style="dep", options=None):
    """
    Render dependency parse inline in Colab.
    Accepts either a Doc (spaCy) or raw text.
    """
    if isinstance(doc_or_text, str):
        doc = nlp(doc_or_text)
    else:
        doc = doc_or_text

    # FIX: spaCy đòi options phải là dict
    if options is None:
        options = {}

    html = displacy.render(doc, style=style, options=options)

    # Colab hiển thị đúng khi bọc bằng display(HTML)
    display(HTML(html))
    return doc


doc = render_dep("The quick brown fox jumps over the lazy dog.")


<IPython.core.display.HTML object>

In [3]:
def print_token_table(doc):
    print(f"{'TEXT':<12} | {'DEP':<12} | {'HEAD TEXT':<12} | {'HEAD POS':<8} | CHILDREN")
    print("-" * 80)
    for token in doc:
        children = [child.text for child in token.children]
        print(f"{token.text:<12} | {token.dep_:<12} | {token.head.text:<12} | {token.head.pos_:<8} | {children}")

# Test
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
print_token_table(doc)


TEXT         | DEP          | HEAD TEXT    | HEAD POS | CHILDREN
--------------------------------------------------------------------------------
Apple        | nsubj        | looking      | VERB     | []
is           | aux          | looking      | VERB     | []
looking      | ROOT         | looking      | VERB     | ['Apple', 'is', 'at']
at           | prep         | looking      | VERB     | ['buying']
buying       | pcomp        | at           | ADP      | ['startup']
U.K.         | compound     | startup      | NOUN     | []
startup      | dobj         | buying       | VERB     | ['U.K.', 'for']
for          | prep         | startup      | NOUN     | ['billion']
$            | quantmod     | billion      | NUM      | []
1            | compound     | billion      | NUM      | []
billion      | pobj         | for          | ADP      | ['$', '1']


In [4]:
def extract_svo_triples(doc):
    results = []
    for token in doc:
        if token.pos_ == "VERB":
            subj = None
            obj = None
            for child in token.children:
                if child.dep_ in ("nsubj", "nsubjpass"):
                    subj = child.text
                if child.dep_ in ("dobj", "pobj", "obj"):
                    obj = child.text
            if subj and obj:
                results.append((subj, token.text, obj))
    return results

# Test
text = "The cat chased the mouse and the dog watched them."
doc = nlp(text)
extract_svo_triples(doc)


[('cat', 'chased', 'mouse'), ('dog', 'watched', 'them')]

In [5]:
def adjectives_for_nouns(doc):
    out = {}
    for token in doc:
        if token.pos_ == "NOUN":
            adjs = [child.text for child in token.children if child.dep_ == "amod"]
            if adjs:
                out[token.text] = adjs
    return out

# Test
doc = nlp("The big fluffy white cat is sleeping on the warm mat.")
adjectives_for_nouns(doc)


{'cat': ['big', 'fluffy', 'white'], 'mat': ['warm']}

In [6]:
def find_main_verb(doc):
    """
    Return the ROOT of the dependency tree.
    """
    for token in doc:
        if token.dep_ == "ROOT":
            return token
    return None


In [7]:
def extract_noun_chunk_custom(doc):
    chunks = []
    visited = set()

    for token in doc:
        if token.pos_ in ("NOUN", "PROPN", "PRON") and token.i not in visited:
            left_mods = [
                t for t in token.lefts
                if t.dep_ in ("det", "amod", "compound", "nummod", "poss")
            ]

            parts = [t.text for t in left_mods] + [token.text]

            for t in left_mods:
                visited.add(t.i)
            visited.add(token.i)

            chunks.append(" ".join(parts))

    return chunks

# Test
doc = nlp("The big fluffy white cat and the small dog with a bone are outside.")
extract_noun_chunk_custom(doc)


['The big fluffy white cat', 'the small dog', 'a bone']

In [8]:
def get_path_to_root(token):
    path = [token]
    cur = token
    while cur.head != cur:
        cur = cur.head
        path.append(cur)
        if len(path) > 100:
            break
    return path

# Example
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for tok in doc:
    if tok.text == "startup":
        print([t.text for t in get_path_to_root(tok)])


['startup', 'buying', 'at', 'looking']


In [9]:
examples = [
    "The quick brown fox jumps over the lazy dog.",
    "Apple is looking at buying U.K. startup for $1 billion.",
    "The cat chased the mouse and the dog watched them.",
    "The big fluffy white cat is sleeping on the warm mat."
]

for s in examples:
    print("\n" + "="*80)
    print("Sentence:", s)
    doc = render_dep(s)  # updated version (no error)
    print_token_table(doc)

    print("SVO triples:", extract_svo_triples(doc))
    print("Main verb:", find_main_verb(doc).text)
    print("Custom noun chunks:", extract_noun_chunk_custom(doc))

    for token in doc:
        if token.pos_ in ("NOUN","PROPN"):
            path = get_path_to_root(token)
            print(f"Path to ROOT for '{token.text}':", " -> ".join([t.text for t in path]))



Sentence: The quick brown fox jumps over the lazy dog.


<IPython.core.display.HTML object>

TEXT         | DEP          | HEAD TEXT    | HEAD POS | CHILDREN
--------------------------------------------------------------------------------
The          | det          | fox          | NOUN     | []
quick        | amod         | fox          | NOUN     | []
brown        | amod         | fox          | NOUN     | []
fox          | nsubj        | jumps        | VERB     | ['The', 'quick', 'brown']
jumps        | ROOT         | jumps        | VERB     | ['fox', 'over', '.']
over         | prep         | jumps        | VERB     | ['dog']
the          | det          | dog          | NOUN     | []
lazy         | amod         | dog          | NOUN     | []
dog          | pobj         | over         | ADP      | ['the', 'lazy']
.            | punct        | jumps        | VERB     | []
SVO triples: []
Main verb: jumps
Custom noun chunks: ['The quick brown fox', 'the lazy dog']
Path to ROOT for 'fox': fox -> jumps
Path to ROOT for 'dog': dog -> over -> jumps

Sentence: Apple is looking at

<IPython.core.display.HTML object>

TEXT         | DEP          | HEAD TEXT    | HEAD POS | CHILDREN
--------------------------------------------------------------------------------
Apple        | nsubj        | looking      | VERB     | []
is           | aux          | looking      | VERB     | []
looking      | ROOT         | looking      | VERB     | ['Apple', 'is', 'at', '.']
at           | prep         | looking      | VERB     | ['buying']
buying       | pcomp        | at           | ADP      | ['startup']
U.K.         | compound     | startup      | NOUN     | []
startup      | dobj         | buying       | VERB     | ['U.K.', 'for']
for          | prep         | startup      | NOUN     | ['billion']
$            | quantmod     | billion      | NUM      | []
1            | compound     | billion      | NUM      | []
billion      | pobj         | for          | ADP      | ['$', '1']
.            | punct        | looking      | VERB     | []
SVO triples: []
Main verb: looking
Custom noun chunks: ['Apple', 'U.K.', 'U

<IPython.core.display.HTML object>

TEXT         | DEP          | HEAD TEXT    | HEAD POS | CHILDREN
--------------------------------------------------------------------------------
The          | det          | cat          | NOUN     | []
cat          | nsubj        | chased       | VERB     | ['The']
chased       | ROOT         | chased       | VERB     | ['cat', 'mouse', 'and', 'watched']
the          | det          | mouse        | NOUN     | []
mouse        | dobj         | chased       | VERB     | ['the']
and          | cc           | chased       | VERB     | []
the          | det          | dog          | NOUN     | []
dog          | nsubj        | watched      | VERB     | ['the']
watched      | conj         | chased       | VERB     | ['dog', 'them', '.']
them         | dobj         | watched      | VERB     | []
.            | punct        | watched      | VERB     | []
SVO triples: [('cat', 'chased', 'mouse'), ('dog', 'watched', 'them')]
Main verb: chased
Custom noun chunks: ['The cat', 'the mouse', 'the do

<IPython.core.display.HTML object>

TEXT         | DEP          | HEAD TEXT    | HEAD POS | CHILDREN
--------------------------------------------------------------------------------
The          | det          | cat          | NOUN     | []
big          | amod         | cat          | NOUN     | []
fluffy       | amod         | cat          | NOUN     | []
white        | amod         | cat          | NOUN     | []
cat          | nsubj        | sleeping     | VERB     | ['The', 'big', 'fluffy', 'white']
is           | aux          | sleeping     | VERB     | []
sleeping     | ROOT         | sleeping     | VERB     | ['cat', 'is', 'on', '.']
on           | prep         | sleeping     | VERB     | ['mat']
the          | det          | mat          | NOUN     | []
warm         | amod         | mat          | NOUN     | []
mat          | pobj         | on           | ADP      | ['the', 'warm']
.            | punct        | sleeping     | VERB     | []
SVO triples: []
Main verb: sleeping
Custom noun chunks: ['The big fluffy wh

In [10]:
s = "The quick brown fox jumps over the lazy dog."
doc = nlp(s)

out = []
out.append(f"Sentence: {s}")
out.append("Tokens:")
for token in doc:
    out.append(f"{token.text}\tDEP:{token.dep_}\tHEAD:{token.head.text}\tPOS:{token.pos_}\tCHILDREN:{[c.text for c in token.children]}")

open("/content/dependency_parse_example.txt","w",encoding="utf-8").write("\n".join(out))
print("Saved to /content/dependency_parse_example.txt")


Saved to /content/dependency_parse_example.txt
