# In [1]:
from .utils import strip_spaces, type_check
from copy import copy as cp
import re

class Document:
    """Split raw text into paragraphs and sentences and map positions back.

    Attributes set by ``__init__``:
        raw_text: the text exactly as it was passed in.
        paragraphs: list of ``(start, end, paragraph)`` tuples, one per run
            of characters that are neither ``\\r`` nor ``\\n``; the offsets
            index into ``raw_text``.
        sentences: list of ``(start, end, sentence, paragraph_idx)`` tuples.
        sentence_paragraph_mapping: dict of sentence index -> paragraph index.
        text: all sentences joined with single spaces.
        cleaned_sentences: every sentence passed through ``strip_spaces``.
        cleaned_text: the cleaned sentences joined with single spaces.
        cleaned_text_sentence_mapping: list of ``[start, end, sentence_idx]``
            entries giving each cleaned sentence's span in ``cleaned_text``.
    """

    def __init__(self, text):
        """Build the paragraph, sentence and cleaned-text indexes for *text*.

        Args:
            text: the document contents; validated with ``type_check``
                against ``[str]``.
        """
        # Any type of preprocessing could be done or stored here
        type_check('text', text, [str])
        self.raw_text = text

        # A paragraph is a maximal run of characters without a line break.
        self.paragraphs = [
            (match.start(), match.end(), match.group())
            for match in re.finditer(r'[^\r\n]+', text)
        ]

        # Sentences live inside a paragraph.  Their offsets are the
        # paragraph start plus a running position that assumes consecutive
        # sentences are separated by exactly one character, so they are
        # approximate when the paragraph has leading or repeated whitespace.
        self.sentences = []
        for paragraph_idx, (paragraph_start, _, paragraph) in enumerate(self.paragraphs):
            indexed = self._index_sentences(self.split_sentences(paragraph))
            for sentence_start, sentence_stop, sentence in indexed:
                self.sentences.append((
                    paragraph_start + sentence_start,
                    paragraph_start + sentence_stop,
                    sentence,
                    paragraph_idx,
                ))

        self.sentence_paragraph_mapping = {
            sentence_idx: entry[3]
            for sentence_idx, entry in enumerate(self.sentences)
        }

        # Built with a comprehension instead of ``zip(*...)`` so that a
        # document without any sentence (e.g. empty text) does not raise.
        sentence_strings = [entry[2] for entry in self.sentences]
        self.text = " ".join(sentence_strings)
        self.cleaned_sentences = [strip_spaces(sentence) for sentence in sentence_strings]
        self.cleaned_text = " ".join(self.cleaned_sentences)

        # Exact spans here: cleaned_text is the single-space join of the
        # cleaned sentences, which is what _index_sentences assumes.
        self.cleaned_text_sentence_mapping = [
            [start, stop, cleaned_sentence_idx]
            for cleaned_sentence_idx, (start, stop, _)
            in enumerate(self._index_sentences(self.cleaned_sentences))
        ]

    @staticmethod
    def _index_sentences(sentences):
        """Yield ``(start, stop, sentence)`` assuming one separator between items."""
        cursor = 0
        for sentence in sentences:
            stop = cursor + len(sentence)
            yield (cursor, stop, sentence)
            # Skip the single separator character before the next sentence.
            cursor = stop + 1

    def split_sentences(self, text):
        """Split *text* into a list of whitespace-stripped sentences.

        Periods that do not end a sentence (titles, domain names, acronyms
        including "i.e."/"e.g.", decimals, list markers, ellipses) are
        temporarily replaced by a ``<prd>`` marker; every remaining ``.``,
        ``!`` or ``?`` is then treated as a sentence end.

        Args:
            text: the string to split.

        Returns:
            The sentences in order, each keeping its terminating punctuation.
            Text after the last terminator is kept as a final sentence, and
            text without any terminator is returned as a single item.
        """
        alphabets = r"([A-Za-z])"
        prefixes = r"(Mr|St|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|Mt|Inc|Ltd|Jr|Sr|Co|Ann|Rev|Stat|Fin|Admin)[.]"
        starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
        acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
        top_level_domains = r"[.](com|net|org|io|gov|me|edu|us|fr|ca)"
        digits = r"([0-9]+)"

        # Padding lets the whitespace-anchored patterns match at both ends.
        text = " " + text + "  "
        text = text.replace("\n", " ")
        text = text.replace("\r", " ")

        # Protect periods that are part of a token rather than a sentence end.
        text = re.sub(prefixes, r"\1<prd>", text)
        text = re.sub(top_level_domains, r"<prd>\1", text)
        text = re.sub(r"Ph\.D\.", "Ph<prd>D<prd>", text)
        text = re.sub(r"\.{3}", "<prd><prd><prd>", text)

        # Single-letter initials, then acronyms that are followed by a
        # typical sentence starter (those do end the sentence).
        text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
        text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)

        # Three- and two-letter dotted abbreviations; the two-letter rule
        # also covers "i.e." and "e.g.".
        text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)

        # Handle digits
        text = re.sub(digits + "[.]" + digits, r"\1<prd>\2", text)

        # Handle lists
        text = re.sub(r"^(\s*[a-zA-Z0-9])\.", r"\1<prd>", text)

        # Handle punctuation in quotes: move the terminator after the
        # closing quote so the quote stays with its sentence.
        text = re.sub(r'([!?.])(”|")', r'\2\1', text)

        # Insert special marker on end of sentence.  The replacement must be
        # a raw string: a plain '\1' is the control character U+0001, not a
        # backreference, and would swallow the punctuation.
        text = re.sub(r'([!?.])', r'\1<stop>', text)

        # Replace escape markers for period
        text = text.replace("<prd>", ".")

        # Split using the marker
        sentences = [part.strip() for part in text.split("<stop>")]

        # The piece after the final marker is normally just the padding;
        # drop it only when it is empty so trailing text is not lost.
        if len(sentences) > 1 and not sentences[-1]:
            sentences.pop()
        return sentences

    def get_context(self, char_position):
        """Describe the sentence covering *char_position* in ``cleaned_text``.

        Args:
            char_position: character offset into ``cleaned_text``; both ends
                of a sentence's span are inclusive and the first matching
                sentence wins.

        Returns:
            ``"Paragraph # P, sentence # S: <sentence>"`` with 1-based
            numbers and the sentence text taken from ``self.sentences``.

        Raises:
            ValueError: if no sentence span contains *char_position*.
        """
        sentence_idx = next(
            (
                idx
                for start, stop, idx in self.cleaned_text_sentence_mapping
                if start <= char_position <= stop
            ),
            None,
        )
        if sentence_idx is None:
            raise ValueError(f"Unable to find char at position {char_position}")

        paragraph_idx = self.sentence_paragraph_mapping[sentence_idx]
        context = self.sentences[sentence_idx][2]
        return f"Paragraph # {paragraph_idx + 1}, sentence # {sentence_idx + 1}: {context}"

# Notebook output: ImportError: attempted relative import with no known parent package