<a href="https://colab.research.google.com/github/bogdanbabych/experiments_NLTK/blob/main/phrase_segmentation_experiment_v01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import spacy
from tabulate import tabulate # Used for generating nice table output

# Load the small English spaCy pipeline once at module level and bind it to
# the global name `nlp`, which the segmentation function in this file reads.
# spacy.load() is attempted first; the model is downloaded only if that
# attempt raises OSError.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Downloading spaCy model 'en_core_web_sm'...")
    # Fetch the model package, then retry the load. The download step
    # presumably needs network access; if the second load fails as well,
    # the exception propagates to the caller.
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

def _format_dependency_arc(token) -> str:
    """Render one token's dependency arc as ``"<token> -> <head> (<relation>)"``.

    A token that is its own head is shown with the head label ``"ROOT"``,
    regardless of whether the token sits inside a noun chunk or stands alone.

    Args:
        token: A spaCy token; its ``text``, ``head`` and ``dep_`` are read.

    Returns:
        The formatted arc description for that token.
    """
    head_label = token.head.text if token.head != token else "ROOT"
    return f"{token.text} -> {head_label} ({token.dep_})"


def segment_sentence_to_phrases_and_dependencies(sentence: str) -> str:
    """Segment a sentence into noun phrases and single tokens, as a grid table.

    The sentence is run through the module-level spaCy pipeline ``nlp``.
    Every noun chunk reported by ``doc.noun_chunks`` becomes one segment of
    type ``"Noun Phrase"``; every token not covered by a noun chunk becomes
    its own segment, typed by its POS tag. Segments are listed in the order
    they occur in the sentence.

    Each table row holds:
    - the segment text,
    - the segment type (``"Noun Phrase"`` or the token's POS tag),
    - the words of the segment (comma-separated),
    - their POS tags (comma-separated),
    - one ``"word -> head (relation)"`` entry per word (semicolon-separated).
      A word that is its own head is shown as ``word -> ROOT (...)``.

    Args:
        sentence: The text to analyse.

    Returns:
        The table rendered by ``tabulate`` with ``tablefmt="grid"``.
    """
    doc = nlp(sentence)

    # (start_char, row) pairs; the character offset is kept only so the rows
    # can be put back into sentence order after both passes.
    positioned_rows = []

    # Indices of tokens already reported as part of a noun chunk, so the
    # second pass does not list them again as single-word segments.
    chunk_token_ids = set()

    # Pass 1: one row per noun chunk.
    for chunk in doc.noun_chunks:
        chunk_token_ids.update(token.i for token in chunk)
        positioned_rows.append((
            chunk.start_char,
            [
                chunk.text,
                "Noun Phrase",
                ", ".join(token.text for token in chunk),
                ", ".join(token.pos_ for token in chunk),
                "; ".join(_format_dependency_arc(token) for token in chunk),
            ],
        ))

    # Pass 2: one row per token that no noun chunk covered.
    for token in doc:
        if token.i in chunk_token_ids:
            continue
        positioned_rows.append((
            token.idx,
            [
                token.text,
                token.pos_,  # a single word is typed by its own POS tag
                token.text,
                token.pos_,
                _format_dependency_arc(token),
            ],
        ))

    # Restore sentence order (noun chunks were collected before loose tokens).
    positioned_rows.sort(key=lambda entry: entry[0])
    table_rows = [row for _, row in positioned_rows]

    headers = ["Segment", "Type", "Words (Original)", "POS Tags", "Dependency Annotation"]
    return tabulate(table_rows, headers=headers, tablefmt="grid")

# Demo: analyse a handful of sample sentences when executed directly.
if __name__ == "__main__":
    # Sample inputs covering a simple clause, a PP-attachment ambiguity,
    # a longer declarative sentence and a coordinated verb phrase.
    sentence1 = "The quick brown fox jumps over the lazy dog."
    sentence2 = "I saw a girl with a telescope."
    sentence3 = "Artificial intelligence is transforming many industries around the globe."
    sentence4 = "She quickly ran to the store and bought some fresh apples."

    # Visual divider printed after each analysis.
    divider = "\n" + "="*100 + "\n"

    for text in (sentence1, sentence2, sentence3, sentence4):
        print("Analyzing sentence: \"" + text + "\"")
        print(segment_sentence_to_phrases_and_dependencies(text))
        print(divider)


Analyzing sentence: "The quick brown fox jumps over the lazy dog."
+---------------------+-------------+------------------------+---------------------+----------------------------------------------------------------------------------+
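| Segment             | Type        | Words (Original)       | POS Tags            | Dependency Annotation                                                            |
+=====================+=============+========================+=====================+==================================================================================+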
| The quick brown fox | Noun Phrase | The, quick, brown, fox | DET, ADJ, ADJ, NOUN | The -> fox (det); quick -> fox (amod); brown -> fox (amod); fox -> jumps (nsubj) |
+---------------------+-------------+------------------------+---------------------+----------------------------------------------------------------------------------+
| jumps               | VERB        | jumps                  | VERB                | jumps -> ROOT (ROOT)                                                             |
+---------------------+-------------+------------------------+---------------------+---------