In [34]:
!pip install stanza



In [35]:
import stanza
# Fetch the default Hindi model package into the local stanza resources directory.
stanza.download(lang='hi')

# Build the Hindi pipeline once; later cells call `nlp(text)` to obtain a parsed document.
nlp = stanza.Pipeline(lang='hi')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: hi (Hindi) ...
INFO:stanza:File exists: /root/stanza_resources/hi/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: hi (Hindi):
| Processor | Package       |
-----------------------------
| tokenize  | hdtb          |
| pos       | hdtb_charlm   |
| lemma     | hdtb_nocharlm |
| depparse  | hdtb_charlm   |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Done loading processors!


In [36]:
# Sample Hindi paragraph used by the segmentation experiments in the cells below.
text = "राम बहुत अच्छा लड़का है और वह रोज़ाना स्कूल जाता है। राम फुटबॉल भी अच्छा खेलता है। एक दिन वह फुटबॉल खेलने गया और उसके पैर में चोट लग गई।"
doc = nlp(text)

# Show each sentence as space-joined tokens, followed by one "token (UPOS)" line per word.
for sentence_number, sentence in enumerate(doc.sentences, start=1):
    joined_tokens = ' '.join(word.text for word in sentence.words)
    print(f"\nSentence {sentence_number}: {joined_tokens}")
    for word in sentence.words:
        print(f" - {word.text} ({word.upos})")


Sentence 1: राम बहुत अच्छा लड़का है और वह रोज़ाना स्कूल जाता है ।
 - राम (PROPN)
 - बहुत (ADV)
 - अच्छा (ADJ)
 - लड़का (NOUN)
 - है (AUX)
 - और (CCONJ)
 - वह (PRON)
 - रोज़ाना (ADV)
 - स्कूल (NOUN)
 - जाता (VERB)
 - है (AUX)
 - । (PUNCT)

Sentence 2: राम फुटबॉल भी अच्छा खेलता है ।
 - राम (PROPN)
 - फुटबॉल (PROPN)
 - भी (PART)
 - अच्छा (ADJ)
 - खेलता (VERB)
 - है (AUX)
 - । (PUNCT)

Sentence 3: एक दिन वह फुटबॉल खेलने गया और उसके पैर में चोट लग गई ।
 - एक (NUM)
 - दिन (NOUN)
 - वह (PRON)
 - फुटबॉल (NOUN)
 - खेलने (VERB)
 - गया (VERB)
 - और (CCONJ)
 - उसके (PRON)
 - पैर (NOUN)
 - में (ADP)
 - चोट (NOUN)
 - लग (VERB)
 - गई (AUX)
 - । (PUNCT)


In [37]:
def extract_conjs(doc):
    """Collect the distinct coordinating conjunctions found in a parsed document.

    Args:
        doc: Parsed document exposing ``sentences``; each sentence exposes
            ``words`` whose items carry ``text`` and ``upos`` attributes.

    Returns:
        list: Unique ``text`` values of all words tagged ``CCONJ``. The list is
        built from a set, so its ordering carries no meaning.
    """
    unique_forms = {
        token.text
        for sent in doc.sentences
        for token in sent.words
        if token.upos == "CCONJ"
    }
    return list(unique_forms)

In [38]:

# Notebook-level list of conjunction texts found in `doc`; segment_hindi_text reads this global by name.
conjs_list = extract_conjs(doc)

In [39]:
def segment_hindi_text(doc, conjunctions=None):
    """Group consecutive sentences of a parsed document into text segments.

    Each sentence is classified by its first word:

    * first word tagged ``PRON`` while a subject has already been recorded
      -> the sentence continues the current segment;
    * first word tagged ``CCONJ``, or its lemma is listed in ``conjunctions``
      -> the sentence continues the current segment;
    * anything else -> the current segment is closed and a new one starts.
      When that first word is tagged ``PROPN`` or ``NOUN`` its text is
      recorded as the subject.

    Sentences without any words are skipped instead of raising ``IndexError``.

    Args:
        doc: Parsed document exposing ``sentences``; each sentence exposes
            ``text`` and ``words`` (items with ``text``, ``upos``, ``lemma``).
        conjunctions: Optional collection of conjunction strings compared
            against the first word's lemma. When ``None`` the notebook-level
            ``conjs_list`` is used, which keeps existing one-argument calls
            working unchanged.

    Returns:
        list[str]: One string per segment, made of the segment's sentence
        texts joined with single spaces.
    """
    if conjunctions is None:
        # Backward-compatible default: fall back to the notebook-level global.
        conjunctions = conjs_list

    segments = []
    current_segment = []

    # Only ever tested for truthiness below; it is never reset once set.
    last_subject = None

    for sentence in doc.sentences:
        if not sentence.words:
            continue  # Nothing to classify; avoids IndexError on words[0]

        first_word = sentence.words[0]
        first_pos = first_word.upos
        first_lemma = first_word.lemma

        if first_pos == 'PRON' and last_subject:
            # Pronoun opener after a known subject -> continue the segment
            current_segment.append(sentence.text)
        elif first_pos == 'CCONJ' or first_lemma in conjunctions:
            # Conjunction opener -> continue the segment
            # NOTE(review): the lemma is compared against a list that
            # extract_conjs fills with surface texts - confirm this is intended.
            current_segment.append(sentence.text)
        else:
            # Close the running segment (if any) and start a new one
            if current_segment:
                segments.append(" ".join(current_segment))
            current_segment = [sentence.text]

            # Remember the subject when the sentence opens with a noun
            if first_pos in ('PROPN', 'NOUN'):
                last_subject = first_word.text

    # Flush the final segment
    if current_segment:
        segments.append(" ".join(current_segment))

    return segments

In [40]:
# Run the first-word rule segmenter on the parsed sample and list the result.
segments = segment_hindi_text(doc)

print("\nSegmented Output:\n")
for segment_number, segment in enumerate(segments, start=1):
    print(f"[Segment {segment_number}]: {segment}")


Segmented Output:

[Segment 1]: राम बहुत अच्छा लड़का है और वह रोज़ाना स्कूल जाता है।
[Segment 2]: राम फुटबॉल भी अच्छा खेलता है।
[Segment 3]: एक दिन वह फुटबॉल खेलने गया और उसके पैर में चोट लग गई।


In [41]:
def extract_pronouns(doc):
    """Collect the distinct pronouns found in a parsed document.

    Args:
        doc: Parsed document exposing ``sentences``; each sentence exposes
            ``words`` whose items carry ``text`` and ``upos`` attributes.

    Returns:
        list: Unique ``text`` values of all words tagged ``PRON``. The list is
        built from a set, so its ordering carries no meaning.
    """
    unique_forms = {
        token.text
        for sent in doc.sentences
        for token in sent.words
        if token.upos == "PRON"
    }
    return list(unique_forms)

In [42]:
# Notebook-level list of pronoun texts found in `doc`; segment_with_subject_tracking reads this global by name.
pron_list = extract_pronouns(doc)

In [43]:
def segment_with_subject_tracking(doc, pronouns=None,
                                  scene_starters=('एक', 'फिर', 'तब', 'उसके', 'बाद')):
    """Segment a parsed document while tracking the most recent named subject.

    Each sentence is classified by its first word:

    1. Its text is a known pronoun and a subject has been recorded
       -> the sentence continues the current segment.
    2. It is tagged ``NOUN`` or ``PROPN`` -> the sentence continues the
       current segment when the text equals the recorded subject; otherwise
       a new segment starts and the word becomes the recorded subject.
    3. Anything else -> a new segment starts when the text is one of
       ``scene_starters``; otherwise the sentence continues the segment.

    Sentences without any words are skipped instead of raising ``IndexError``.

    Args:
        doc: Parsed document exposing ``sentences``; each sentence exposes
            ``text`` and ``words`` (items with ``text`` and ``upos``).
        pronouns: Optional collection of pronoun strings. When ``None`` the
            notebook-level ``pron_list`` is used, which keeps existing
            one-argument calls working unchanged.
        scene_starters: First-word texts that open a new segment in case 3.
            Defaults to the previously hard-coded list.

    Returns:
        list[str]: One string per segment, made of the segment's sentence
        texts joined with single spaces.
    """
    if pronouns is None:
        # Backward-compatible default: fall back to the notebook-level global.
        pronouns = pron_list

    segments = []
    current_segment = []
    last_subject = None

    for sentence in doc.sentences:
        words = sentence.words
        if not words:
            continue  # Nothing to classify; avoids IndexError on words[0]

        first_word = words[0].text
        first_pos = words[0].upos

        # Case 1: pronoun opener referring back to a known subject
        if first_word in pronouns and last_subject:
            current_segment.append(sentence.text)

        # Case 2: sentence opens with a named subject (NOUN/PROPN)
        elif first_pos in ('NOUN', 'PROPN'):
            if first_word == last_subject:
                # Same subject as before -> same segment
                current_segment.append(sentence.text)
            else:
                if current_segment:
                    segments.append(" ".join(current_segment))
                current_segment = [sentence.text]
                last_subject = first_word

        # Case 3: any other opener
        else:
            if first_word in scene_starters:
                # Scene-opening word (e.g. "एक दिन") -> break here
                if current_segment:
                    segments.append(" ".join(current_segment))
                current_segment = [sentence.text]
            else:
                current_segment.append(sentence.text)

    if current_segment:
        segments.append(" ".join(current_segment))

    return segments


In [44]:

# Run the subject-tracking segmenter on the parsed sample and list the result.
segments = segment_with_subject_tracking(doc)

print("\nSegmented Output:\n")
for segment_number, segment in enumerate(segments, start=1):
    print(f"[Segment {segment_number}]: {segment}")


Segmented Output:

[Segment 1]: राम बहुत अच्छा लड़का है और वह रोज़ाना स्कूल जाता है। राम फुटबॉल भी अच्छा खेलता है।
[Segment 2]: एक दिन वह फुटबॉल खेलने गया और उसके पैर में चोट लग गई।


In [45]:
# UPOS tags that, on the first word of a sentence, are treated as opening a new scene.
scene_starting_pos = ['ADV', 'SCONJ', 'ADP', 'DET', 'NUM']


def is_scene_change(sentence):
    """Tell whether a sentence opens with a scene-starting part of speech.

    Args:
        sentence: Object exposing ``words``; the first item's ``upos`` is
            checked against ``scene_starting_pos``.

    Returns:
        bool: ``False`` when the sentence has no words; otherwise whether the
        first word's ``upos`` is listed in ``scene_starting_pos``.
    """
    tokens = sentence.words
    return bool(tokens) and tokens[0].upos in scene_starting_pos

In [47]:
def segment_hindi_text_rule_based(text, nlp):
    """Parse ``text`` with ``nlp`` and split it into segments using first-word rules.

    For every non-empty sentence the first word decides whether the sentence
    opens a new segment:

    * its text is one of the document's pronouns (per ``extract_pronouns``)
      and a subject has been recorded -> continue the current segment;
    * it is tagged ``NOUN``/``PROPN`` -> continue when the text equals the
      recorded subject, otherwise open a new segment and record the word as
      the subject;
    * otherwise -> open a new segment exactly when ``is_scene_change`` says so.

    Args:
        text: Raw text handed to ``nlp``.
        nlp: Callable returning a parsed document with ``sentences``; each
            sentence exposes ``text`` and ``words``.

    Returns:
        list[str]: One string per segment, made of the segment's sentence
        texts joined with single spaces.
    """
    parsed = nlp(text)
    known_pronouns = extract_pronouns(parsed)

    finished = []
    running = []
    subject = None

    for sent in parsed.sentences:
        if not sent.words:
            continue  # Sentences without words are ignored

        head = sent.words[0]

        # Decide whether this sentence opens a new segment.
        if head.text in known_pronouns and subject:
            opens_new = False
        elif head.upos in ['NOUN', 'PROPN']:
            opens_new = head.text != subject
            if opens_new:
                subject = head.text
        else:
            opens_new = is_scene_change(sent)

        if opens_new:
            # Close the running segment, if any, before starting afresh.
            if running:
                finished.append(" ".join(running))
            running = [sent.text]
        else:
            running.append(sent.text)

    if running:
        finished.append(" ".join(running))

    return finished

