In [54]:
import pandas as pd
import spacy as sp

# Load the input table; later cells read its "sentence_id" and "sentence_text" columns.
tokens = pd.read_csv("../data/tokenized_data.csv")
# Load the spaCy pipeline that is used to parse each sentence text.
nlp = sp.load("en_core_web_sm")

# Simple marker that both loads above completed.
print("read")

read


In [55]:


def clean_clause(tokens):
    """Join token texts into one tidily spaced string.

    Args:
        tokens: iterable of objects exposing a ``.text`` string attribute.

    Returns:
        str: the texts joined by single spaces, with every run of whitespace
        collapsed to one space, the space before a comma removed, and no
        leading or trailing whitespace.
    """
    joined = " ".join(t.text for t in tokens)
    # split()/join collapses whitespace runs of ANY length (including tabs and
    # newlines coming from whitespace tokens); a single replace("  ", " ")
    # pass would turn "a   b" into "a  b" and leave "a  ," as "a ,".
    normalized = " ".join(joined.split())
    return normalized.replace(" ,", ",")

#NP core without appositions/relcl
def _np_core(head):
    keep = {head}
    for c in head.children:
        if c.dep_ in ("det","amod","poss","compound","nummod","flat","fixed"):
            keep.add(c)
        #allow 'of' PP inside NP (e.g., "place of birth")
        if c.dep_ == "prep" and c.lemma_ == "of":
            keep.update(list(c.subtree))
    toks = sorted(keep, key=lambda x: x.i)
    return toks

def _subject_token(root):
    subs = [t for t in root.lefts if t.dep_ in ("nsubj","nsubjpass")]
    return subs[0] if subs else None

def _subject_text(subj):
    if subj is None: return "X"
    keep = {subj}
    for c in subj.children:
        if c.dep_ in ("det","amod","poss","compound","nummod","flat","fixed"):
            keep.add(c)
    toks = sorted(keep, key=lambda x: x.i)
    return clean_clause(toks)

#core verb phrase
def _verb_phrase_core(root):
    keep = {root}
    for c in root.children:
        if c.dep_ in ("aux","auxpass","neg","prt","mark"):
            keep.add(c)
        if c.dep_ in ("dobj","attr"):
            keep.update(list(c.subtree))
        #only include 'to' + NP core of its pobj (no appos/relcl)
        if c.dep_ == "prep" and c.lemma_ == "to":
            pobj = next((gc for gc in c.children if gc.dep_=="pobj"), None)
            if pobj is not None:
                keep.add(c)
                keep.update(_np_core(pobj))
        #xcomp chain (keep its objs and 'to'-PP NP core)
        if c.dep_ == "xcomp" and c.pos_ == "VERB":
            keep.add(c)
            for rc in c.children:
                if rc.dep_ in ("aux","auxpass","neg","prt","mark"):
                    keep.add(rc)
                if rc.dep_ in ("dobj","attr"):
                    keep.update(list(rc.subtree))
                if rc.dep_ == "prep" and rc.lemma_ == "to":
                    pobj2 = next((gc for gc in rc.children if gc.dep_=="pobj"), None)
                    if pobj2 is not None:
                        keep.add(rc)
                        keep.update(_np_core(pobj2))
    toks = sorted(keep, key=lambda x: x.i)
    return toks  #return tokens so we can clip later

def _capitalize(s):
    return s[0].upper() + s[1:] if s else s

def _endsent(s):
    s = s.rstrip()
    return s if s.endswith(('.', '!', '?')) else s + "."

#build a clause from a (finite or participial) head and borrow subject if needed
def _finite_clause_from(head, borrow_subj_txt):
    keep = {head}
    for c in head.children:
        if c.dep_ in ("aux","auxpass","neg","prt","mark"):
            keep.add(c)
        if c.dep_ in ("nsubj","nsubjpass","dobj","attr","ccomp","xcomp"):
            keep.update(list(c.subtree))
        if c.dep_ == "prep":
            keep.update(list(c.subtree))
        #include coordinated verb and its arguments
        if c.dep_ == "conj" and c.pos_ == "VERB":
            keep.add(c)
            for gc in c.children:
                if gc.dep_ in ("aux","auxpass","neg","prt","nsubj","nsubjpass","dobj","attr","ccomp","xcomp","prep"):
                    keep.update(list(gc.subtree))

    toks = sorted(keep, key=lambda x: x.i)

    has_subj = any(t.dep_ in ("nsubj","nsubjpass") for t in toks)
    sent_text = " ".join(t.text for t in toks)
    if not has_subj and borrow_subj_txt:
        sent_text = f"{borrow_subj_txt} " + sent_text

    #drop sentence-initial relativizers/markers
    for w in ("that","which","who","whom","where","when"):
        if sent_text.lower().startswith(w + " "):
            sent_text = sent_text[len(w)+1:]
            break

    sent_text = sent_text.replace(" ,", ",").replace("  ", " ").strip()
    if sent_text and not sent_text.endswith((".", "!", "?")):
        sent_text += "."
    if sent_text:
        sent_text = sent_text[0].upper() + sent_text[1:]
    return sent_text

def atomic_sentence_extract(doc):
    """Split each sentence of a parsed document into short "atomic" sentences.

    For every sentence in ``doc.sents`` the following are emitted, in order:
      1. a main clause built from the root's subject and core verb phrase;
      2. a copular "<anchor> are <appositives>" sentence when the pobj of a
         root-level "to" preposition has NOUN/PROPN appositives;
      3. one sentence per relative clause (``relcl``);
      4. clauses for VBG participles and for advcl/acl/conj tokens attached
         to the root whose lemma is produce/contribute/generate/account,
         kept only if they contain a digit or one of a few cue strings;
      5. clauses for non-infinitive verbs attached as conj/parataxis.

    Args:
        doc: parsed document exposing ``.sents``; its tokens must provide the
            spaCy-style attributes used below (``dep_``, ``pos_``, ``tag_``,
            ``lemma_``, ``morph``, ``head``, ``children``, ``i``, ``text``).

    Returns:
        list[str]: generated sentences in emission order, with duplicates
        removed (comparison ignores case and whitespace differences).
    """
    atomic_sentences = []

    for sentence in doc.sents:
        root = sentence.root

        #main
        subj_tok = _subject_token(root)
        subj_txt = _subject_text(subj_tok)

        vp_toks = _verb_phrase_core(root)

        #drop any material before the subject in the main clause
        if subj_tok is not None:
            vp_toks = [t for t in vp_toks if t.i >= subj_tok.i or t == root or t.head == root]

        vp_txt = clean_clause(vp_toks)

        #trim leading temporal openers if still present
        if vp_txt.lower().startswith(("after ", "before ", "because ", "while ", "when ")):
            pos = vp_txt.lower().find(subj_txt.lower())
            if pos > 0:
                vp_txt = vp_txt[pos:]
            elif "," in vp_txt:
                vp_txt = vp_txt.split(",", 1)[1].strip()

        # Only treat the verb phrase as already containing the subject when
        # the subject text is followed by a word boundary. A raw prefix test
        # would match "I" against "ignored ..." or "He" against "helped ..."
        # and silently drop the subject from the main clause.
        vp_lc, subj_lc = vp_txt.lower(), subj_txt.lower()
        after_subj = vp_lc[len(subj_lc):len(subj_lc) + 1]
        vp_has_subject = vp_lc.startswith(subj_lc) and not after_subj.isalnum()

        main_text = vp_txt if vp_has_subject else f"{subj_txt} {vp_txt}"
        main_text = _endsent(_capitalize(main_text))
        # very short results (fewer than 3 words) are discarded
        if len(main_text.split()) >= 3:
            atomic_sentences.append(main_text)

        #appositive fact (to)
        roots_pobj = None
        for c in root.children:
            if c.dep_ == "prep" and c.lemma_ == "to":
                roots_pobj = next((gc for gc in c.children if gc.dep_=="pobj"), None)
                if roots_pobj: break

        if roots_pobj is not None:
            appos_list = [ch for ch in roots_pobj.children if ch.dep_=="appos" and ch.pos_ in ("NOUN","PROPN")]
            if appos_list:
                anchor_txt = clean_clause(_np_core(roots_pobj))
                right = [clean_clause(_np_core(a)) for a in appos_list]
                # NOTE(review): the copula is always the plural "are",
                # whatever the number of the anchor — confirm this is intended.
                cop = _endsent(_capitalize(f"{anchor_txt} are " + ", ".join(right)))
                if len(cop.split()) >= 3:
                    atomic_sentences.append(cop)

        #looks at relative clauses (splits into types of relcl later on)
        for rel in [t for t in sentence if t.dep_=="relcl"]:
            rkeep = {rel}
            for rc in rel.children:
                if rc.dep_ in ("aux","auxpass","neg","prt","dobj","attr","ccomp","xcomp"):
                    rkeep.update(list(rc.subtree))
                if rc.dep_ == "prep":
                    rkeep.update(list(rc.subtree))
            r_toks = sorted(rkeep, key=lambda x: x.i)
            #strip relativizers
            rel_words = [t for t in r_toks if t.text.lower() not in {"where","that","who","which"} and t.dep_ != "mark"]
            if not any(t.pos_ == "VERB" for t in rel_words):
                continue
            rel_text = clean_clause(rel_words)
            #ensure subject if missing
            if not any(t.dep_ in ("nsubj","nsubjpass") for t in rel_words):
                # NOTE(review): falls back to the pronoun "He" when the main
                # subject is missing or a proper noun; otherwise reuses the
                # main subject text.
                rel_text = f"{('He' if subj_tok is None or subj_tok.pos_=='PROPN' else subj_txt)} {rel_text}"
            rel_text = _endsent(_capitalize(rel_text))
            if len(rel_text.split()) >= 3:
                atomic_sentences.append(rel_text)

        #participial/adverbial clause
        for tok in sentence:
            is_participle = (tok.pos_ == "VERB" and tok.tag_ == "VBG")
            looks_clausey = tok.dep_ in ("advcl","acl","conj") and tok.head == root
            if is_participle or looks_clausey:
                if tok.lemma_ in {"produce","contribute","generate","account"} or is_participle:
                    produced = _finite_clause_from(tok, borrow_subj_txt=subj_txt)
                    # require a number or an object-ish cue to avoid junk
                    if any(ch.isdigit() for ch in produced) or any(w in produced.lower() for w in ("percent","%","tonne","billion","million","all","of ")):
                        atomic_sentences.append(produced)

        #coordinated finite clauses
        for tok in sentence:
            is_finite_like = (tok.pos_ == "VERB" and tok.morph.get("VerbForm") != ["Inf"])
            if is_finite_like and tok != root and tok.dep_ in ("conj","parataxis"):
                extra = _finite_clause_from(tok, borrow_subj_txt=subj_txt)
                if extra and len(extra.split()) >= 4:
                    atomic_sentences.append(extra)

    #strong deduplication
    seen, final = set(), []
    for s in atomic_sentences:
        # key ignores case and collapses whitespace
        key = " ".join(s.lower().split())
        if key not in seen:
            seen.add(key)
            final.append(s)
    return final





In [56]:
# Parse every row's sentence text and collect its atomic sentences, each
# tagged with the originating sentence_id.
# nlp.pipe processes the texts as a batched stream instead of invoking the
# pipeline once per row; as_tuples=True carries each sentence_id through
# alongside the Doc built from its text.
all_sentences = []
text_id_pairs = zip(tokens["sentence_text"], tokens["sentence_id"])
for doc, sentence_id in nlp.pipe(text_id_pairs, as_tuples=True):
    for atomic in atomic_sentence_extract(doc):
        all_sentences.append({"sentence_id": sentence_id, "atomic_sentence": atomic})

# De-duplicated view of the collected rows. pandas is already imported as `pd`
# by the first cell (which also defines `tokens`), so it is not re-imported here.
out_df = pd.DataFrame(all_sentences).drop_duplicates()

In [57]:
# Build the output table with explicit columns so the CSV always gets a header
# row (even when nothing was extracted), and drop exact duplicate rows so the
# saved file does not repeat identical (sentence_id, atomic_sentence) pairs.
df_atomic = pd.DataFrame(
    all_sentences, columns=["sentence_id", "atomic_sentence"]
).drop_duplicates()

df_atomic.to_csv('../data/split_sentences_v2.csv', index=False)

print("Atomic sentences extraction complete. Saved to 'split_sentences_v2.csv'.")

Atomic sentences extraction complete. Saved to 'split_sentences_v2.csv'.
