In [5]:
import pandas as pd
import spacy as sp

# Input table; later cells read its "sentence_id" and "sentence_text" columns.
tokens = pd.read_csv(filepath_or_buffer="../data/tokenized_data.csv")

# spaCy pipeline used by the cells below to parse each sentence.
nlp = sp.load(name="en_core_web_sm")

# Progress marker so it is visible in the output that loading finished.
print("read")

read


In [9]:

###combined strategy from earlier preprocessing code
def clean_clause(tokens):
    """Join token texts into one tidy, single-spaced clause string.

    Args:
        tokens: iterable of objects exposing a ``.text`` attribute
            (spaCy tokens in this notebook).

    Returns:
        str: the texts joined by single spaces, with every run of whitespace
        collapsed to one space, no space left before a comma, and no leading
        or trailing whitespace. An empty iterable yields ``""``.
    """
    joined = " ".join(t.text for t in tokens)
    # split()/join collapses whitespace runs of any length (including tabs and
    # newlines coming from whitespace tokens); a single replace("  ", " ")
    # pass would only halve them, e.g. leave "a   b" as "a  b".
    collapsed = " ".join(joined.split())
    # Joining with spaces detaches commas from the preceding word; re-attach
    # them after collapsing so "a   ," also ends up as "a,".
    return collapsed.replace(" ,", ",")

def atomic_sentence_extract(doc):
    """Split each sentence of a parsed document into simpler "atomic" statements.

    For every sentence in ``doc.sents`` the following statements are emitted,
    in this order:

    1. the main clause: the root token together with its direct dependents;
    2. one statement per relative clause (``relcl``), prefixed with the main
       subject when the clause contains no subject of its own;
    3. one statement per prepositional object (``prep`` + ``pobj``), prefixed
       with the main subject;
    4. one ``"<subject> is <noun>"`` statement per ``attr``/``appos`` token
       tagged ``NOUN`` or ``PROPN``.

    The main subject is the first ``nsubj``/``nsubjpass`` token to the left of
    the root; when there is none, the placeholder ``"X"`` is used.

    Args:
        doc: a parsed spaCy ``Doc`` (sentence boundaries, dependency labels
            and POS tags must be available).

    Returns:
        list[str]: the atomic statements, without duplicates, in order of
        first appearance across the whole document.

    Note:
        Relies on the notebook-level ``nlp`` object (only to build the ``"X"``
        placeholder token) and on ``clean_clause``.
    """
    subject_deps = ("nsubj", "nsubjpass")
    atomic_sentences = []

    def add_unique(text):
        # Keep first-seen order while skipping statements already emitted.
        if text not in atomic_sentences:
            atomic_sentences.append(text)

    for sent in doc.sents:
        root = sent.root

        # Resolve the main subject once per sentence. The parsed token is
        # reused directly instead of re-running the pipeline on its text for
        # every clause, which was slow and could truncate a subject whose text
        # splits into several tokens when tokenized on its own.
        subject_tokens = [t for t in root.lefts if t.dep_ in subject_deps]
        if subject_tokens:
            subj_token = subject_tokens[0]
        else:
            # No explicit subject: use a placeholder token. make_doc only
            # tokenizes, so no tagger/parser work is spent on it.
            subj_token = nlp.make_doc("X")[0]
        subj = subj_token.text

        # Main clause: the root and the tokens attached directly to it.
        main_clause_tokens = [
            tok for tok in root.subtree if tok.head == root or tok == root
        ]
        add_unique(clean_clause(main_clause_tokens))

        # Relative clauses (relcl): each becomes its own statement.
        for tok in sent:
            if tok.dep_ != "relcl":
                continue
            clause_tokens = list(tok.subtree)
            # Every atomic sentence needs a subject; if the clause has none,
            # borrow the main subject.
            if not any(t.dep_ in subject_deps for t in clause_tokens):
                clause_tokens = [subj_token] + clause_tokens
            add_unique(clean_clause(clause_tokens))

        # Prepositional/location phrases (prep + pobj), attached to the
        # main subject.
        for prep in sent:
            if prep.dep_ != "prep":
                continue
            for pobj in prep.children:
                if pobj.dep_ == "pobj":
                    phrase_tokens = [subj_token, prep] + list(pobj.subtree)
                    add_unique(clean_clause(phrase_tokens))

        # Attributes/appositions (attr/appos) that are nouns: "<subject> is <noun>".
        for tok in sent:
            if tok.dep_ in ("attr", "appos") and tok.pos_ in ("NOUN", "PROPN"):
                add_unique(f"{subj} is {tok.text}")

    return atomic_sentences

In [10]:
# Build one record per (sentence_id, atomic_sentence) pair for every input row.
all_sentences = []

# nlp.pipe processes the texts as a batched stream and yields the Docs in
# input order, so zipping with the id column keeps each Doc paired with its
# row. This avoids both the per-row Series construction of iterrows() and a
# separate pipeline call per sentence.
sentence_docs = nlp.pipe(tokens["sentence_text"])

for sentence_id, doc in zip(tokens["sentence_id"], sentence_docs):
    for sentence in atomic_sentence_extract(doc):
        all_sentences.append({
            "sentence_id": sentence_id,
            "atomic_sentence": sentence
        })

In [11]:
# Collect the extracted records into a table. Naming the columns explicitly
# keeps the schema (and therefore the CSV header) intact even when no atomic
# sentences were produced and `all_sentences` is empty.
df_atomic = pd.DataFrame(all_sentences, columns=["sentence_id", "atomic_sentence"])

# index=False: the positional row index carries no information worth saving.
df_atomic.to_csv('../data/split_sentences_v1.csv', index=False)

print("Atomic sentences extraction complete. Saved to 'split_sentences_v1.csv'.")

Atomic sentences extraction complete. Saved to 'split_sentences_v1.csv'.


In [None]:
# Earlier draft of atomic_sentence_extract, superseded by the version defined
# earlier in this notebook and kept for reference only. It is commented out
# (rather than wrapped in a string literal) so that running this cell does
# not echo the whole text as cell output.
#
# def atomic_sentence_extract(doc):
#     atomic_sentences = []
#
#     for sentence in doc.sents:
#         # extract the subjects, the objects and the main verb (ROOT)
#         # first get the subjects
#
#         subjects = []
#         for tok in sentence.root.lefts:
#             if tok.dep_ in ("nsubj", "nsubjpass"):
#                 subjects.append(tok.text)
#         if not subjects:
#             subjects = ["X"]
#
#         # get the objects/complements
#         objects = []
#         for tok in sentence.root.rights:
#             if tok.dep_ in ("dobj", "attr", "prep", "xcomp"):
#                 objects.append(tok.text)
#         if not objects:
#             objects = ["X"]
#
#         # also want to separate relative clauses
#         for tok in sentence:
#             if tok.dep_ in ("relcl", "advcl", "ccomp"):
#                 relative_clause = " ".join([t.text for t in tok.subtree])
#                 atomic_sentences.append(relative_clause)
#
#         # now combine subjects/objects to create the other atomic sentences
#         atomic = f"{subjects[0]} {sentence.root.lemma_} {' '.join(objects)}".strip()
#         atomic_sentences.append(atomic)
#
#     return atomic_sentences