<a href="https://colab.research.google.com/github/bogdanbabych/experiments_NLTK/blob/main/Sentence_Phrase_Segmentation_and_Dependency_Annotation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import spacy
from tabulate import tabulate # Used for generating nice table output

# Build the shared English pipeline (tokenizer, tagger, parser, NER) once at
# import time; both segmentation functions below read it through `nlp`.
def _load_english_pipeline():
    """
    Return the loaded 'en_core_web_sm' spaCy pipeline.

    If loading raises OSError, the model package is downloaded (this needs
    internet access) and the load is attempted a second time.
    """
    try:
        return spacy.load("en_core_web_sm")
    except OSError:
        print("Downloading spaCy model 'en_core_web_sm'...")
        spacy.cli.download("en_core_web_sm")
        return spacy.load("en_core_web_sm")


nlp = _load_english_pipeline()

def segment_sentence_to_phrases_and_dependencies(sentence: str) -> str:
    """
    Segment a sentence into noun phrases and single-word segments and
    render them, with POS tags and dependency annotations, as a grid table.

    Noun phrases come from spaCy's built-in noun chunker (`doc.noun_chunks`).
    Every token that is not inside a noun chunk becomes a segment of its own.
    Each table row holds:
    - the segment text
    - its type ('Noun Phrase', or the POS tag for a single word)
    - the words inside the segment (comma-separated)
    - their POS tags (comma-separated)
    - one "word -> head (dependency)" annotation per word (semicolon-separated)

    A token that is its own head is annotated with the head "ROOT",
    regardless of whether it sits inside a noun chunk or stands alone, so
    the parse root is marked the same way in every row.

    Args:
        sentence (str): The input sentence as a string to be analyzed.

    Returns:
        str: The formatted table, with rows in the order the segments
             appear in the input.
    """
    doc = nlp(sentence)

    def describe_dependency(token) -> str:
        # Only the parse root is its own head; print a fixed marker for it
        # instead of repeating the token's text on both sides of the arrow.
        head_text = token.head.text if token.head != token else "ROOT"
        return f"{token.text} -> {head_text} ({token.dep_})"

    # (start character offset, table row) pairs; the offset is kept only so
    # the rows can be put back into sentence order at the end.
    segments = []

    # Indices of tokens already emitted as part of a noun chunk.
    covered_token_indices = set()

    # --- Step 1: one row per noun chunk ---
    for chunk in doc.noun_chunks:
        covered_token_indices.update(token.i for token in chunk)
        segments.append((
            chunk.start_char,
            [
                chunk.text,
                "Noun Phrase",
                ", ".join(token.text for token in chunk),
                ", ".join(token.pos_ for token in chunk),
                "; ".join(describe_dependency(token) for token in chunk),
            ],
        ))

    # --- Step 2: one row per token outside every noun chunk ---
    for token in doc:
        if token.i in covered_token_indices:
            continue
        segments.append((
            token.idx,
            [
                token.text,
                token.pos_,  # a single word uses its POS tag as the segment type
                token.text,
                token.pos_,
                describe_dependency(token),
            ],
        ))

    # --- Step 3: restore sentence order and format ---
    segments.sort(key=lambda segment: segment[0])
    final_table_data = [row for _, row in segments]
    headers = ["Segment", "Type", "Words (Original)", "POS Tags", "Dependency Annotation"]

    return tabulate(final_table_data, headers=headers, tablefmt="grid")


def segment_sentence_to_clauses_and_dependencies(sentence: str) -> str:
    """
    Segment a sentence into clauses using its dependency parse and render
    them, with POS tags and dependency annotations, as a grid table.

    The segmentation works as follows:
    1. Clause heads are tokens tagged VERB or AUX. A head whose dependency
       is ROOT yields a "Main Clause" row; a head whose dependency is one of
       advcl, ccomp, xcomp, acl, relcl, csubj or conj yields a
       "Subordinate Clause (<dep>)" row. Every verbal ROOT in the input is
       processed, so text containing several sentences yields one main
       clause per sentence.
    2. For each head, its 'core' tokens are collected: the head itself, its
       direct subjects (nsubj, nsubjpass, csubj, csubjpass), its direct
       objects/complements (dobj, pobj, attr, acomp, oprd), the direct
       modifiers of those, its own adverb/negation/particle modifiers and
       a coordinating conjunction attached to it.
    3. A child that is itself a clause head (for example a coordinated verb
       or a clausal subject) is NOT pulled into its parent's clause. It is
       left to form its own row; otherwise it would be marked as assigned
       and its own subject and objects would never be reported.
    4. Tokens are marked as assigned once they are placed in a clause, and
       later clauses drop tokens that are already assigned, so a token
       appears in at most one row.

    Args:
        sentence (str): The input sentence as a string to be analyzed.

    Returns:
        str: The formatted table with one row per clause, in the order the
             clauses start in the input.
    """
    doc = nlp(sentence)

    verbal_pos = ("VERB", "AUX")
    clause_head_deps = ("advcl", "ccomp", "xcomp", "acl", "relcl", "csubj", "conj")
    subject_deps = ("nsubj", "nsubjpass", "csubj", "csubjpass")
    # "oprd" (object predicate) replaces the former "opbj", which is not a
    # label the parser produces and therefore never matched anything.
    object_deps = ("dobj", "pobj", "attr", "acomp", "oprd")
    verb_modifier_deps = ("advmod", "neg", "prt")
    subject_modifier_deps = ("amod", "det", "compound", "nummod", "quantmod", "poss")
    object_modifier_deps = subject_modifier_deps + ("prep",)

    clauses_data = []
    # Indices of tokens already placed in a clause, to avoid duplication.
    assigned_token_indices = set()

    def heads_own_clause(token) -> bool:
        """Return True if the token gets a non-main clause row of its own."""
        return token.pos_ in verbal_pos and token.dep_ in clause_head_deps

    def collect_core_clause_tokens(head_token):
        """
        Collect the core tokens of the clause headed by `head_token`,
        sorted by their position in the document.
        """
        clause_tokens = {head_token}

        for child in head_token.children:
            # Leave embedded/coordinated clause heads for their own row.
            if heads_own_clause(child):
                continue

            if child.dep_ in subject_deps:
                clause_tokens.add(child)
                clause_tokens.update(
                    mod for mod in child.children if mod.dep_ in subject_modifier_deps
                )
            elif child.dep_ in object_deps:
                clause_tokens.add(child)
                clause_tokens.update(
                    mod for mod in child.children if mod.dep_ in object_modifier_deps
                )
            elif child.dep_ in verb_modifier_deps:
                clause_tokens.add(child)
            # Keep the coordinator (e.g. 'and' in "He ran and she jumped.")
            # with the clause it is attached to.
            elif child.dep_ == "cc" and child.pos_ == "CCONJ":
                clause_tokens.add(child)

        return sorted(clause_tokens, key=lambda t: t.i)

    def record_clause(head_token, clause_type: str) -> None:
        """Append one table row for the clause headed by `head_token`."""
        tokens = [
            t for t in collect_core_clause_tokens(head_token)
            if t.i not in assigned_token_indices
        ]
        if not tokens:
            return

        assigned_token_indices.update(t.i for t in tokens)

        annotations = []
        for t in tokens:
            # Only the parse root is its own head.
            head_text = t.head.text if t.head != t else "ROOT"
            annotations.append(f"{t.text} -> {head_text} ({t.dep_})")

        clauses_data.append({
            'start_index': tokens[0].idx,  # character offset, used for sorting
            'data_row': [
                " ".join(t.text for t in tokens),
                clause_type,
                ", ".join(t.text for t in tokens),
                ", ".join(t.pos_ for t in tokens),
                "; ".join(annotations),
            ]
        })

    # Main clauses go first so that they get first claim on their tokens.
    for token in doc:
        if token.dep_ == "ROOT" and token.pos_ in verbal_pos:
            record_clause(token, "Main Clause")

    # Then every remaining clause head, in document order.
    for token in doc:
        if heads_own_clause(token) and token.i not in assigned_token_indices:
            record_clause(token, f"Subordinate Clause ({token.dep_})")

    # Sort clauses by their starting index to maintain the original sentence order
    clauses_data.sort(key=lambda x: x['start_index'])

    final_table_data = [item['data_row'] for item in clauses_data]
    headers = ["Clause Text", "Type", "Words (Original)", "POS Tags", "Dependency Annotation"]

    return tabulate(final_table_data, headers=headers, tablefmt="grid")

# Demonstration: runs only when the file is executed as the main module.
if __name__ == "__main__":
    # Sample sentences covering simple, prepositional, coordinated and
    # multi-clause structures.
    sentence1 = "The quick brown fox jumps over the lazy dog."
    sentence2 = "I saw a girl with a telescope."
    sentence3 = "Artificial intelligence is transforming many industries around the globe."
    sentence4 = "She quickly ran to the store and bought some fresh apples."
    sentence5 = "He believed that she would come, but she never did."
    sentence6 = "Running quickly, the dog chased the ball that was thrown by the boy."

    # Each entry: (sentence, label shown in the heading, analysis function).
    demo_runs = [
        (sentence1, "Phrase Segmentation", segment_sentence_to_phrases_and_dependencies),
        (sentence2, "Phrase Segmentation", segment_sentence_to_phrases_and_dependencies),
        (sentence3, "Phrase Segmentation", segment_sentence_to_phrases_and_dependencies),
        (sentence4, "Phrase Segmentation", segment_sentence_to_phrases_and_dependencies),
        (sentence5, "Clause Segmentation", segment_sentence_to_clauses_and_dependencies),
        (sentence6, "Clause Segmentation", segment_sentence_to_clauses_and_dependencies),
    ]

    # Horizontal rule printed after every table for clarity.
    demo_separator = "\n" + "="*100 + "\n"

    for demo_sentence, demo_label, demo_analyzer in demo_runs:
        print("Analyzing sentence: \"" + demo_sentence + "\" (" + demo_label + ")")
        print(demo_analyzer(demo_sentence))
        print(demo_separator)