In [4]:
import pandas as pd

df = pd.read_csv("../data/tokenized_data.csv")

print(df.head(10))


   sentence_id                                      sentence_text  token_text  \
0            0  after the very heavy '' dragontown '' album al...       after   
1            0  after the very heavy '' dragontown '' album al...         the   
2            0  after the very heavy '' dragontown '' album al...        very   
3            0  after the very heavy '' dragontown '' album al...       heavy   
4            0  after the very heavy '' dragontown '' album al...          ''   
5            0  after the very heavy '' dragontown '' album al...  dragontown   
6            0  after the very heavy '' dragontown '' album al...          ''   
7            0  after the very heavy '' dragontown '' album al...       album   
8            0  after the very heavy '' dragontown '' album al...       alice   
9            0  after the very heavy '' dragontown '' album al...     decided   

  pos_tag dep_relation       lemma  is_stop  is_punct  
0   SCONJ         mark       after     True     Fals

In [5]:
#checking if tokens are printed in order

for sent_id, group in df.groupby("sentence_id"):
    print(f"Sent ID: {sent_id}")
    print("Tokens:", group["token_text"].tolist())
    print()

Sent ID: 0
Tokens: ['after', 'the', 'very', 'heavy', "''", 'dragontown', "''", 'album', 'alice', 'decided', 'to', 'return', 'to', 'his', 'roots', ',', 'his', 'place', 'of', 'birth', ',', 'detroit', ',', 'where', 'he', 'accidently', 'joined', 'in', 'with', 'a', 'festival', 'together', 'with', 'mc5', ' ', 'iggy', 'and', 'the', 'stooges', '.']

Sent ID: 1
Tokens: ['the', 'museum', 'runs', 'a', 'library', 'with', 'photographic', 'books', 'and', 'magazines', ',', 'and', 'a', 'small', 'museum', 'store', 'that', 'sells', 'postcards', ',', 'posters', 'and', 'more', '.']

Sent ID: 2
Tokens: ['jakobshavn', 'isbr', 'is', 'a', 'major', 'contributor', 'to', 'the', 'mass', 'balance', 'of', 'the', 'greenland', 'ice', 'sheet', ',', 'producing', 'some', '10', ' ', 'of', 'all', 'greenland', 'icebergs', 'some', '35', 'billion', 'tonnes', 'of', 'icebergs', 'calved', 'off', 'and', 'passing', 'out', 'of', 'the', 'fjord', 'every', 'year', '.']

Sent ID: 3
Tokens: ['wright', 'also', 'played', 'the', 'part', '

In [6]:
def find_clause_heads(tokens_df):
    clause_heads = []

    #loop through every token, which is row in the sentence
    for i, row in tokens_df.iterrows():
        #obtain part of speech and dependency relation
        pos, dep = row["pos_tag"], row["dep_relation"]

        #identifies verbs that function as clause heads based on their dependency relations
        if pos == "VERB" and dep in ("ROOT", "advcl", "xcomp", "ccomp", "relcl", "conj"):
            clause_heads.append(i)

        #Nouns or proper nouns that serve as appositive or attributive clause heads
        elif dep in ("appos", "attr") and pos in ("NOUN", "PROPN"):
            clause_heads.append(i)
    return sorted(clause_heads)

In [7]:
#testing the find_clause_heads function
for sent_id, group in df.groupby("sentence_id"):
    heads = find_clause_heads(group)
    print(f"\nSentence {sent_id}: {len(heads)} clause heads --> {heads}")
    for idx in heads:
        print("-", group.loc[idx, "token_text"], "(", group.loc[idx, "dep_relation"], ")")
    break



Sentence 0: 5 clause heads --> [9, 11, 17, 21, 26]
- decided ( ROOT )
- return ( xcomp )
- place ( appos )
- detroit ( appos )
- joined ( relcl )


In [11]:
def get_clause_subtree(tokens_df, head_idx, all_head_idx):
    #converting data fram to a list of dictionaries
    tokens = tokens_df.reset_index(drop=True).to_dict(orient="records")
    clause_tokens = []

    for i in range(head_idx, len(tokens)):
        tok = tokens[i]

        #stop if punctuation or another clause head is found
        if tok["is_punct"] or (i in all_head_idx and i != head_idx):
            break

        clause_tokens.append(tok["token_text"])
    return " ".join(clause_tokens)
        

In [12]:
for sent_id, group in df.groupby("sentence_id"):
    heads = find_clause_heads(group)
    print(f"\nSentence {sent_id}:")
    
    for idx in heads:

        clause = get_clause_subtree(group, idx, heads)
        token = group.loc[idx, "token_text"]
        dep = group.loc[idx, "dep_relation"]
        print(f"Head: {token} ({dep})")
        print(f" → Clause: {clause}\n")
    break



Sentence 0:
Head: decided (ROOT)
 → Clause: decided to

Head: return (xcomp)
 → Clause: return to his roots

Head: place (appos)
 → Clause: place of birth

Head: detroit (appos)
 → Clause: detroit

Head: joined (relcl)
 → Clause: joined in with a festival together with mc5   iggy and the stooges



In [13]:
#helper function for the normalizer
def cap_and_punctuate(text):

    text = text.strip()
    if not text:
        return ""
    if not text[0].isupper():
        text = text[0].upper() + text[1:]
    if not text.endswith("."):
        text += "."
    return text


In [17]:
def normalize_clause(head_row, clause, context_subject):
    dep = head_row["dep_relation"]
    head = head_row["token_text"]
    text = clause.strip()
    
    #skip empty clauses
    if not text:
        return None
    
    #main clause (ROOT)
    if dep == "ROOT":
        if context_subject and context_subject.lower() not in text.lower():
            text = f"{context_subject} {text}"
        return cap_and_punctuate(text)
    
    #open/complement clauses (xcomp, advcl, ccomp)
    elif dep in ("xcomp", "advcl", "ccomp"):
        if context_subject and context_subject.lower() not in text.lower():
            text = f"{context_subject} {text}"
        return cap_and_punctuate(text)
    
    #appositive or attribute
    elif dep in ("appos", "attr"):
        return cap_and_punctuate(f"{context_subject}'s {text} is Detroit")
    
    #relative clause
    elif dep == "relcl":
        if context_subject:
            text = f"{context_subject} {text}"
        return cap_and_punctuate(text)
    
    return None


In [18]:
def extract_atomic_sentences(group):
    atoms = []
    heads = find_clause_heads(group)
    
    #find main subject (first nsubj or nsubjpass)
    subject = None
    for _, row in group.iterrows():
        if row["dep_relation"] in ("nsubj", "nsubjpass"):
            subject = row["token_text"]
            break
    
    #process each clause
    for idx in heads:
        head_row = group.loc[idx]
        clause = get_clause_subtree(group, idx, heads)
        norm = normalize_clause(head_row, clause, subject)
        if norm:
            atoms.append(norm)
    return atoms


In [16]:
#testing the extract_atomic_sentences function
for sid, group in df.groupby("sentence_id"):
    atoms = extract_atomic_sentences(group)
    print(f"\nSentence {sid}:")
    for a in atoms:
        print("-", a)
    break


Sentence 0:
- Alice decided to.
- Alice return to his roots.
- Alice's place of birth is Detroit.
- Alice's detroit is Detroit.
- Alice joined in with a festival together with mc5   iggy and the stooges.
