<a href="https://colab.research.google.com/github/bogdanbabych/experiments_NLTK/blob/main/Sentence_Clause_Segmenter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import spacy

def segment_clauses(sentence_text: str) -> str:
    """
    Segments a sentence into clauses based on dependency annotation using spaCy,
    and returns the result in a table format.

    This function identifies clause boundaries using heuristics based on
    dependency types and part-of-speech tags commonly associated with
    clause introduction (e.g., subordinating conjunctions, coordinating
    conjunctions, clausal complements).

    Args:
        sentence_text (str): The input sentence.

    Returns:
        str: A string representing the clauses in a structured table format.
             Returns an error message if the spaCy model is not found.
    """
    try:
        # Load the English language model from spaCy.
        # If this is your first time using spaCy, you might need to install
        # the model by running: python -m spacy download en_core_web_sm
        nlp = spacy.load("en_core_web_sm")
    except OSError:
        return (
            "Error: spaCy model 'en_core_web_sm' not found. "
            "Please run 'python -m spacy download en_core_web_sm' in your terminal "
            "to install it. This model is necessary for dependency parsing."
        )

    # Process the input sentence with the loaded spaCy model to get a Doc object.
    doc = nlp(sentence_text)

    clauses_data = []
    clause_num = 1

    # Use a set to store unique indices where new clauses are identified to start.
    # The sentence beginning (index 0) is always a clause start.
    clause_starts = {0}

    # Iterate through each token in the processed document to identify clause boundaries.
    for token in doc:
        # Rule 1: Subordinating conjunctions ('mark' dependency).
        # Examples: "when", "if", "because", "that".
        # These tokens typically introduce a new subordinate clause.
        if token.dep_ == "mark" and token.head.pos_ == "VERB":
            clause_starts.add(token.i)

        # Rule 2: Coordinating conjunctions ('cc' dependency).
        # Examples: "and", "but", "or".
        # These often connect independent clauses. The new clause typically starts
        # *after* the coordinating conjunction.
        elif token.dep_ == "cc":
            # Add the index of the token immediately following the 'cc' as a clause start.
            if token.i + 1 < len(doc):
                clause_starts.add(token.i + 1)

        # Rule 3: Verbs that initiate new clausal structures.
        # This includes clausal complements (ccomp), adverbial clauses (advcl),
        # relative clauses (relcl), and open clausal complements (xcomp).
        # The verb itself usually marks the start of this new clause.
        elif token.pos_ == "VERB" and token.dep_ in ["ccomp", "advcl", "relcl", "xcomp"]:
            clause_starts.add(token.i)

        # Rule 4: Verbs that are coordinated ('conj') with another verb.
        # Example: "He ran and jumped." "jumped" starts a new conceptual unit.
        elif token.pos_ == "VERB" and token.dep_ == "conj" and token.head.pos_ == "VERB":
             clause_starts.add(token.i)

    # Convert the set of clause start indices to a sorted list to process them sequentially.
    sorted_clause_starts = sorted(list(clause_starts))

    # Construct the clauses based on the identified start points.
    for i in range(len(sorted_clause_starts)):
        start_idx = sorted_clause_starts[i]
        # The end index is either the start of the next identified clause or the end of the document.
        end_idx = sorted_clause_starts[i+1] if i + 1 < len(sorted_clause_starts) else len(doc)

        # Create a spaCy span object for the current clause segment.
        clause_span = doc[start_idx:end_idx]

        # Skip empty spans, which can sometimes occur from parsing quirks.
        if not clause_span.text.strip():
            continue

        clause_text = clause_span.text.strip()

        main_verb = ""
        # Find the main verb within the current clause span.
        # Prioritize the token with 'ROOT' dependency within the span, or the first verb found.
        for t in clause_span:
            if t.dep_ == "ROOT" and t.pos_ == "VERB":
                main_verb = t.text
                break
        # If no ROOT verb is explicitly found (common in subordinate clauses or fragments),
        # take the first verb in the span as the main verb.
        if not main_verb:
            for t in clause_span:
                if t.pos_ == "VERB":
                    main_verb = t.text
                    break

        # Determine the type of the clause based on its introductory token/structure.
        clause_type = "Main" # Default type
        first_token_of_span = clause_span[0]

        # Check for subordinate clause indicators.
        # 'mark' dependency (subordinating conjunction).
        # Relative pronouns that are subjects of relative clauses (e.g., "who" in "who came").
        # Verbs initiating clausal complements (ccomp, advcl, xcomp, relcl).
        if first_token_of_span.dep_ == "mark" or \
           (first_token_of_span.pos_ == "PRON" and first_token_of_span.dep_ == "nsubj" and
            first_token_of_span.head and first_token_of_span.head.dep_ == "relcl") or \
           (first_token_of_span.pos_ == "VERB" and first_token_of_span.dep_ in ["ccomp", "advcl", "xcomp", "relcl"]):
            clause_type = "Subordinate"
        # Check for coordinated clause indicator.
        # 'cc' dependency (coordinating conjunction).
        elif first_token_of_span.dep_ == "cc":
            clause_type = "Coordinated"

        # Append the extracted clause data to the list.
        clauses_data.append({
            "Clause Number": clause_num,
            "Clause Text": clause_text,
            "Main Verb": main_verb,
            "Clause Type": clause_type
        })
        clause_num += 1

    # If no clauses were identified, return an informative message.
    if not clauses_data:
        return "No clauses found for the given sentence based on current heuristics."

    # Format the collected clause data into a clean, readable table string.
    headers = ["Clause Number", "Clause Text", "Main Verb", "Clause Type"]

    # Calculate the maximum width for each column to ensure proper alignment.
    col_widths = {header: len(header) for header in headers}
    for row in clauses_data:
        for header in headers:
            col_widths[header] = max(col_widths[header], len(str(row[header])))

    # Create the table header line.
    header_line = " | ".join([f"{h:<{col_widths[h]}}" for h in headers])
    # Create the separator line between header and data rows.
    separator_line = "-+-".join(["-" * col_widths[h] for h in headers])

    # Build all table rows.
    table_rows = [header_line, separator_line]
    for row in clauses_data:
        row_str = " | ".join([f"{str(row[h]):<{col_widths[h]}}" for h in headers])
        table_rows.append(row_str)

    # Join all lines to form the final table string.
    return "\n".join(table_rows)

# Example Usage (you can uncomment and run this to test):
if __name__ == "__main__":
     sentence1 = "I went to the store, and I bought some milk because I needed it for coffee."
     print(f"Sentence: '{sentence1}'\n")
     print(segment_clauses(sentence1))
     print("\n" + "="*80 + "\n")

     sentence2 = "She believed that he would succeed, even though it was a difficult task."
     print(f"Sentence: '{sentence2}'\n")
     print(segment_clauses(sentence2))
     print("\n" + "="*80 + "\n")

     sentence3 = "The dog barked loudly, startling the cat, which then ran under the bed."
     print(f"Sentence: '{sentence3}'\n")
     print(segment_clauses(sentence3))
     print("\n" + "="*80 + "\n")

     sentence4 = "Running quickly, he caught the ball and threw it back."
     print(f"Sentence: '{sentence4}'\n")
     print(segment_clauses(sentence4))
     print("\n" + "="*80 + "\n")

     sentence5 = "The sun rose, painting the sky with vibrant colors."
     print(f"Sentence: '{sentence5}'\n")
     print(segment_clauses(sentence5))
     print("\n" + "="*80 + "\n")

Sentence: 'I went to the store, and I bought some milk because I needed it for coffee.'

Clause Number | Clause Text              | Main Verb | Clause Type
--------------+--------------------------+-----------+------------
1             | I went to the store, and | went      | Main       
2             | I                        |           | Main       
3             | bought some milk         | bought    | Main       
4             | because I                |           | Subordinate
5             | needed it for coffee.    | needed    | Subordinate


Sentence: 'She believed that he would succeed, even though it was a difficult task.'

Clause Number | Clause Text                                   | Main Verb | Clause Type
--------------+-----------------------------------------------+-----------+------------
1             | She believed                                  | believed  | Main       
2             | that he would                                 |           | Subordinate
3 