In [1]:
import numpy as np
import scipy
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.style.use('ggplot')
%matplotlib inline
from tqdm import tqdm
from datasets import load_dataset

import logging
import spacy
import re


In [2]:
import pandas as pd
from src.PrepareSentenceContext import PrepareSentenceContext
from src.SentenceParser import SentenceParser
from spacy.lang.en import English

In [7]:
import spacy
from spacy.language import Language
from spacy.tokens import Doc

In [121]:
input_file = "Data/Bio/mixed/Toni Morrison_edited.txt"

with open(input_file, 'rt') as f:
    text = f.read()

#full_parser = PrepareSentenceContext(sentence_parser="spacy")
#pout = full_parser(text)

In [122]:
nlp = spacy.load("en_core_web_sm")
#nlp.add_pipe('sentencizer')
#text = re.sub("(</?[a-zA-Z0-9 ]+>\.?)\s+", r"\1.\n", text)  # to make sure that tags are in separate sentences
parsed = nlp(text)

In [123]:
def parse_sentences(text):
    pattern_close = r"(.*?)</edit>"
    pattern_open = r"<edit>(.*?)"
    MIN_TOKEN_LEN = 3

    texts = []
    tags = []
    lengths = []
    contexts = []
    
    def update_sent(sent_text, tag, sent_length):
        texts.append(sent_text)
        tags.append(tag)
        lengths.append(sent_length)
        context = None
        contexts.append(context)

    curr_tag = None
    parsed = nlp(text)
    for s in parsed.sents:
        prev_tag = curr_tag
        matches_close = re.findall(pattern_close, s.text)
        matches_open = re.findall(pattern_open, s.text)
        matches_between = re.findall(r"<edit>(.*?)</edit>", s.text)
        
        print(f"Current sentence: {s.text}")
        logging.debug(f"Matches open: {matches_open}")
        logging.debug(f"Matches close: {matches_close}")
        logging.debug(f"Matches between: {matches_between}")
        if len(matches_close)>0 and len(matches_open)>0: 
            logging.debug("Found an opening and a closing tag in the same sentence.")
            if prev_tag is None and len(matches_open[0]) >= MIN_TOKEN_LEN:
                logging.debug("Openning followed by closing with some text in between.")
                update_sent(matches_open[0], "<edit>", len(s)-2)
                curr_tag = None
            if prev_tag == "<edit>" and len(matches_close[0]) >= MIN_TOKEN_LEN:
                logging.warning(f"Wierd case: closing/openning followed by openning in sentence {len(texts)}")
                update_sent(matches_close[0], prev_tag, len(s)-1)
                curr_tag = None
            if prev_tag == "</edit>":
                logging.debug("Closing followed by openning.")
                curr_tag = "<edit>"
                if len(matches_between[0]) > MIN_TOKEN_LEN:
                    update_sent(matches_between[0], None, len(s)-2)
        elif len(matches_open) > 0:
            curr_tag = "<edit>"
            assert prev_tag is None, f"Found an opening tag without a closing tag in sentence num. {len(texts)}"
            if len(matches_open[0]) >= MIN_TOKEN_LEN:
                # text and tag are in the same sentence
                sent_text = matches_open[0]
                update_sent(sent_text, curr_tag, len(s)-1)      
        elif len(matches_close) > 0:
            curr_tag = "</edit>"
            assert prev_tag == "<edit>", f"Found a closing tag without an opening tag in sentence num. {len(texts)}"
            if len(matches_close[0]) >= MIN_TOKEN_LEN:
                # text and tag are in the same sentence
                update_sent(matches_close[0], prev_tag, len(s)-1)
            curr_tag = None
        else:
            #if len(matches_close)==0 and len(matches_open)==0: 
            # no tag
            update_sent(s.text, curr_tag, len(s))
    return {'text': texts, 'length': lengths, 'context': contexts, 'tag': tags,
                'number_in_par': np.arange(1,1+len(texts))}

In [125]:
logging.basicConfig(level=logging.DEBUG)
pd.DataFrame(parse_sentences(text))

AssertionError: Found a closing tag without an opening tag in sentence num. 37

In [83]:
pattern_close = r"(.*?)</edit>"
pattern_open = r"<edit>(.*?)"
MIN_TOKEN_LEN = 3

curr_tag = None
sentences = []
tags = []
for s in parsed.sents:
    prev_tag = curr_tag
    matches_close = re.findall(pattern_close, s.text)
    matched_open = re.findall(pattern_open, s.text)
    if len(matched_open) > 0:
        curr_tag = "<edit>"
        assert prev_tag is None
        if len(matched_open[0]) >= MIN_TOKEN_LEN:
            # text and tag are in the same sentence
            sent_text = matched_open[0]
            sentences.append(sent_text)
            tags.append(curr_tag)
    if len(matches_close) > 0:
        curr_tag = "</edit>"
        assert prev_tag == "<edit>"
        if len(matches_close[0]) >= MIN_TOKEN_LEN:
            # text and tag are in the same sentence
            sent_text = matches_close[0]
            sentences.append(sent_text)
            tags.append(prev_tag)
        curr_tag = None
        
    if len(matches_close)==0 and len(matched_open)==0: 
        # no tag
        sent_text = s.text
        sentences.append(sent_text)
        tags.append(curr_tag)

for (s,t) in zip(sentences, tags):
    print(f"({t}), {s}")

(None), Written by ChatGPT November 21, 2023.

(None), Anne Rice.
Section 1: Early Life and Childhood.

(None), Anne Rice, born Howard Allen Frances O'Brien on October 4, 1941, in New Orleans, Louisiana, was an American author best known for her gothic fiction and horror novels.

(None), Rice spent her formative years in the culturally rich and diverse environment of New Orleans, a city that would later serve as a vivid backdrop for many of her literary works.

(None), She was the second of four sisters and grew up in a Catholic household, deeply influenced by the city's unique blend of religious traditions and supernatural folklore.

(None), Tragedy struck the young Anne early on when her mother, Katherine Allen O'Brien, passed away of alcoholism when Anne was just 14 years old.

(None), This profound loss would shape much of Rice's later writing, infusing it with themes of grief, redemption, and the supernatural.

(None), Despite the challenges, she found solace in literature and dev

In [55]:
matched_open

['']

In [54]:
prev_tag

'</edit>'

In [44]:
html_tags = ['<edit>', '</edit>']  # Add your HTML tags here
sentences = []
start = 0
for token in parsed:
    if token.text in html_tags:
        continue
    if token.is_sent_start:
        print(token.text)
        sentences.append(parsed.char_span(start, token.i, alignment_mode="contract"))
        start = token.i
sentences.append(parsed.char_span(start,len(parsed)))
Doc.from_docs(sentences)

Written
Anne
Anne
Rice
She
Tragedy
This
Despite
Her
Rice
<
When
<
Following
<
<
After
<
The
While
Devastated
<
In
<
The
The
Over
"
Rice
<
Her
<
While
Despite
Section
Anne
<
While
<
The
One
"
The
Rice
In
Her
Beyond
<
Rice
<
This
Rice
Her
Section
<
Rice
<
Her
Rice
<
Her
<
Her
Beyond
Her
While
<
Rice
<
Anne
Her
As


AttributeError: 'NoneType' object has no attribute 'vocab'

In [43]:
sentences

[None,
 Written,
 ChatGPT,
 21, 2023.
 Anne Rice.
 Section,
 1: Early Life and Childhood.
 Anne,
 Rice, born Howard Allen Frances,
 O'Brien on October 4, 1941,,
 in New Orleans, Louisiana,
 , was an American,
 best known for her,
 fiction and horror,
 None,
 .
 Rice spent her,
 None,
 years in the culturally rich and,
 diverse,
 None,
 of New Orleans, a city,
 would later serve as a vivid,
 for many of,
 her literary works.
 She was the,
 of four sisters and,
 None,
 in a Catholic household, deeply,
 None,
 by the city's unique,
 of religious,
 and supernatural,
 .
 Tragedy struck,
 young Anne,
 None,
 when her mother, Katherine Allen O'Brien, passed,
 None,
 of alcoholism when Anne was just 14 years old.,
 profound loss would shape much,
 of Rice,
 's later writing,,
 None,
 with themes of,
 , redemption, and the,
 .
 Despite the,
 , she found,
 in literature and developed,
 a keen interest in,
 macabre,
 fantastical.
 Her,
 fascination with the,
 and,
 None,
 laid the foundation,
 he

In [11]:
pattern = r"<edit>?(.*?)</edit>"

for sent in sents:
    matches = re.findall(pattern, sent.text)
    print(matches)
    print(sent)
    print("----")

[]
Anne Rice.
Section 1: Early Life and Childhood.

----
[]
Anne Rice, born Howard Allen Frances O'Brien on October 4, 1941, in New Orleans, Louisiana, was an American author best known for her gothic fiction and horror novels.

----
[]
Rice spent her formative years in the culturally rich and diverse environment of New Orleans, a city that would later serve as a vivid backdrop for many of her literary works.

----
[]
She was the second of four sisters and grew up in a Catholic household, deeply influenced by the city's unique blend of religious traditions and supernatural folklore.

----
[]
Tragedy struck the young Anne early on when her mother, Katherine Allen O'Brien, passed away of alcoholism when Anne was just 14 years old.

----
[]
This profound loss would shape much of Rice's later writing, infusing it with themes of grief, redemption, and the supernatural.

----
[]
Despite the challenges, she found solace in literature and developed a keen interest in the macabre and fantastica

In [7]:
import re

pattern = r"<edit>(.*?)</edit>"
matches = re.findall(pattern, text)

for match in matches:
    print(match)


 When Rice was sixteen, her father moved the family to north Texas, purchasing their first home in Richardson. 
 Following her graduatation from Richardson High in 1959, she completed her first year at Texas Woman's University in Denton and transferred to North Texas State College for her second year. 
 After dropping out of college due to financial struggles, Rice moved to San Francisco and stayed with the family of a friend until she found work as an insurance claims processor. 
 In 1973, while still grieving the loss of her daughter, Rice took a previously written short story and turned it into her first novel, the bestselling Interview with the Vampire”  which was published in 1976. 
 Her books that solidified her status as a master storyteller in the realm of gothic fiction are:  "The Queen of the Damned" (1988), "The Tale of the Body Thief" (1992), and "Memnoch the Devil" (1995). 
 While reaction to her early works was initially mixed, she gained a better reception with critics i

In [78]:
def sentence_segmenter(doc):
    return doc.split('\n')

@English.component("sentence_segmenter")
def set_sent_starts(doc):
    for sent_start_char, sent_end_char in sentence_segmenter(doc.text):
        sent = doc.char_span(sent_start_char, sent_end_char)
        sent[0].sent_start = True
        for token in sent[1:]:
            token.sent_start = False
    return doc

nlp.add_pipe("sentence_segmenter", name='sentence_segmenter', before='parser')
# for d in nlp.pipe([text], disable=["tok2vec", "tagger", "attribute_ruler", "lemmatizer"]):
#     for s in d.sents:
#         print(s)
#         print("=============")


ValueError: [E007] 'sentence_segmenter' already exists in pipeline. Existing names: ['tok2vec', 'tagger', 'sentence_segmenter', 'parser', 'senter', 'attribute_ruler', 'lemmatizer', 'ner']

In [79]:
for s in nlp(text).sents:
    print("=============")
    print(s)


ValueError: too many values to unpack (expected 2)

In [4]:
parser = PrepareSentenceContext(engine='spacy')

In [5]:
df = pd.DataFrame(parser.parse_sentences(text))

In [8]:
df.head(10)

Unnamed: 0,text,length,context,tag,number_in_par
0,July 18 2023.,4,,,1
1,Welsh Corgi.\n\nIntroduction.\n,7,,,2
2,"The Welsh Corgi, often simply referred to as C...",33,,,3
3,"Renowned for their distinctive appearance, cha...",27,,,4
4,"Originally bred to herd cattle, sheep, and hor...",28,,<edit>,5
5,This article delves into the fascinating histo...,20,,,6
6,From their origin as indispensable herding aid...,41,,,7
7,"Moreover, we examine the different varieties o...",28,,,8
8,Join us on this journey to discover the captiv...,34,,,9
9,History.\n,3,,,10


In [6]:
df[(df['tag'] == '<edit>') & (df['length'] < 15) ]

Unnamed: 0,text,length,context,tag,number_in_par
36,"With proper care, Welsh Corgis live between 11...",13,,<edit>,37
71,She also bred several litters from her beloved...,12,,<edit>,72
75,Their enthusiastic nature and fox-like feature...,13,,<edit>,76


In [33]:
for token in parsed:
    if str(token) == "<EDIT>":
        token.is_sent_start = True
        token.is_sent_end = False

    if str(token) == "</EDIT>":
        token.is_sent_start = False
        token.is_sent_end = True

[(t, t.is_sent_start) for t in parsed[70:]]

[(1922, False),
 (., False),
 (, False),
 (<, True),
 (edit, False),
 (>, False),
 (He, True),
 (first, False),
 (began, False),
 (work, False),
 (in, False),
 (the, False),
 (Cubist, False),
 (style, False),
 (initially, False),
 (popularized, False),
 (by, False),
 (Picasso, False),
 (., False),
 (<, True),
 (/edit, False),
 (>, False),
 (, False),
 (Later, False),
 (,, False),
 (he, False),
 (developed, False),
 (a, False),
 (reputation, False),
 (for, False),
 (his, False),
 (avant, False),
 (-, False),
 (garde, False),
 (ideas, False),
 (and, False),
 (eccentric, False),
 (behavior, False),
 (., False),
 (, False),
 (He, True),
 (soon, False),
 (became, False),
 (associated, False),
 (with, False),
 (the, False),
 (Surrealist, False),
 (movement, False),
 (,, False),
 (which, False),
 (sought, False),
 (to, False),
 (explore, False),
 (the, False),
 (subconscious, False),
 (mind, False),
 (and, False),
 (its, False),
 (influence, False),
 (on, False),
 (creativity, False),
 (., Fa

NameError: name 'nlp' is not defined