# BEQ processor

Walkthrough of a proof of concept that takes a set of epub, doc or docx files as input and renders a corpus of XML-TEI files in the format expected by the Lexicoscope 2.0.

Step 0 involves actually getting the ePub, doc or docx files to process.
Step 1 involves a set of imports and declarations for the code to work.
Step 2 involves converting files from epub, docx and doc to XML.
Steps 3 to 9 involve tokenising and sentencising the XML, converting it to XML-conllu for parsing, reinserting the parser's annotations and adding headers.


# Step 1 : Preparations
Preparations involve importing the libraries needed, as well as any installations needed. Then we declare the functions we'll be using.

In [7]:
# 1.1 imports : we'll be using these libraries
import glob
import os
import re
import shutil
import subprocess
import sys
from collections import defaultdict

from bs4 import BeautifulSoup
from docx import Document
from ebooklib import epub
from lxml import etree
from stanza.utils.conll import CoNLL
from tqdm import tqdm



In [8]:
# 1.2 installs that may be needed 

##! pip install ebooklib
#! pip install docx
#! pip install python-docx
#! brew install --cask libreoffice
#! libreoffice --version


In [3]:
# 1.3 functions
## functions for step2::
def epub_to_xml(epub_file):
    '''
    Transform an epub into an xml file, iterating over the xhtml items of its spine.

    Every xhtml spine item opens a <div class="chapter"> under the <book> root.
    Paragraphs become <p> children of the current div; h1-h3 headings open a
    further <div class="chapter"> holding a <title>; b/strong and i/em tags
    become <b> and <i> children of the current div.

    Input:
        epub_file : string : absolute path to an epub file to process
    Returns :
        No return object. An xml file is written to the same directory as the
        source file, with '.epub' replaced by 'v2.xml'.
    '''
    # Use the input file as the basis of the name of the output file.
    # NOTE(review): the Step 3 cell globs '*.xml' and appends another 'v2';
    # confirm that the 'v2' suffix is really wanted at this stage.
    output_xml = epub_file.replace('.epub','v2.xml')

    # Read the epub file and create an XML root element
    book = epub.read_epub(epub_file)
    root = etree.Element("book")

    wanted_tags = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "b", "i", "strong", "em"]

    # iterate over the spine of the book, using the BS parser to parse each item
    for item_id, _ in book.spine:
        item = book.get_item_with_id(item_id)
        if not item or item.media_type != "application/xhtml+xml":
            continue

        soup = BeautifulSoup(item.get_content(), features="xml")

        # Remove links & references BEFORE the text is copied: doing it after
        # the extraction loop (as previously) had no effect on the output.
        # Only anchors carrying an href are dropped, so that plain named
        # anchors wrapping a title or a paragraph keep their text.
        for a_tag in soup.find_all("a", href=True):
            a_tag.extract()

        # Create a new div block for each spine item
        div = etree.SubElement(root, "div", {"class": "chapter"})

        # NOTE: find_all also returns b/i tags nested inside a p, so their text
        # appears both in the <p> and in a separate <b>/<i> element.
        for tag in soup.find_all(wanted_tags):
            if tag.name == "p":
                etree.SubElement(div, "p").text = tag.get_text()
            elif tag.name in ("h1", "h2", "h3"):
                # Treat headings as chapter div markers; h4-h6 are matched by
                # find_all but deliberately left without output.
                div = etree.SubElement(root, "div", {"class": "chapter"})
                etree.SubElement(div, "title").text = tag.get_text()
            elif tag.name in ("b", "strong"):
                # b and strong are both rendered as b
                etree.SubElement(div, "b").text = tag.get_text()
            elif tag.name in ("i", "em"):
                # i and em are both rendered as i
                etree.SubElement(div, "i").text = tag.get_text()

    # Convert to XML string
    xml_str = etree.tostring(root, pretty_print=True, encoding="utf-8").decode()

    # Save to file
    with open(output_xml, "w", encoding="utf-8") as f:
        f.write(xml_str)

    print(f"XML saved to {output_xml}")
    

## functions for steps 2.2 and 2.3 (docx and doc files)
def docx_to_xml(docx_path, xml_path):
    """
    Parse a .docx file and write an XML version of its paragraphs.

    Inputs:
        docx_path : str : path to the docx file to read
        xml_path : str : path of the xml file to write
    Returns :
        No return object. The xml file has a <document> root with one
        <paragraph index="n"> child per paragraph of the docx, in order.
    """
    root = etree.Element("document")
    paragraphs = Document(docx_path).paragraphs

    for position, paragraph in enumerate(paragraphs):
        # the index attribute records the paragraph's position in the source
        element = etree.SubElement(root, "paragraph", index=str(position))
        element.text = paragraph.text

    etree.ElementTree(root).write(
        xml_path,
        pretty_print=True,
        xml_declaration=True,
        encoding="utf-8",
    )

def doc_to_xml(doc_path, soffice_path=None):
    '''
    Convert a doc file to xml, using docx as an interim format.

    Inputs:
        doc_path : string : absolute path to a doc file to be converted
        soffice_path : string or None : path to the LibreOffice `soffice`
            executable. When None, `soffice` is looked up on the PATH, falling
            back to the default macOS install location.
    Returns :
        No return object. On success, a docx file and an xml file are created
        next to the source file, and the xml path is printed to the console.
    Raises :
        FileNotFoundError : if `doc_path` is not an existing file
        subprocess.CalledProcessError : if LibreOffice exits with an error status
    '''
    # Ensure file exists
    if not os.path.isfile(doc_path):
        raise FileNotFoundError(f"No such file: {doc_path}")

    # Locate LibreOffice: explicit argument, then PATH, then the macOS default
    if soffice_path is None:
        soffice_path = (
            shutil.which("soffice")
            or "/Applications/LibreOffice.app/Contents/MacOS/soffice"
        )

    # Always hand LibreOffice a real directory: dirname() of a bare file name
    # is an empty string, which is not a usable --outdir value.
    out_dir = os.path.dirname(os.path.abspath(doc_path))

    def convert_doc_to_docx(doc_path):
        """
        Convert a doc file to docx with LibreOffice in headless mode
        Inputs:
            doc_path : str : absolute path to a doc file to be read
        """
        subprocess.run(
            [
                soffice_path,
                "--headless",
                "--convert-to",
                "docx",
                doc_path,
                "--outdir",
                out_dir,
            ],
            check=True,
        )

    # Build paths for the interim docx file and the output xml file
    base, _ = os.path.splitext(doc_path)
    docx_path = base + ".docx"
    xml_path = base + ".xml"

    # Convert .doc to .docx
    convert_doc_to_docx(doc_path)

    # Convert .docx to .xml
    docx_to_xml(docx_path, xml_path)

    print(f"XML saved to: {xml_path}")


def docx_source_to_xml(docx_path):
    '''
    Convert a docx file to an xml file
    Inputs:
        docx_path : str: absolute path to a docx file to convert
    Returns :
        No return object : a file will be exported to `xml_path` : this path will be printed in console on success.
    '''
    # Build the output path by swapping the final extension only. A plain
    # str.replace('.docx', '.xml') would also rewrite '.docx' inside directory
    # names, and would leave the path unchanged (so the source would be
    # overwritten) for an extension such as '.DOCX'.
    base, _ = os.path.splitext(docx_path)
    xml_path = base + ".xml"

    # Convert .docx to .xml
    docx_to_xml(docx_path, xml_path)

    print(f"XML saved to: {xml_path}")

    
# functions for steps 3 to 9:
def p_to_s_as_w(p_blocks, tok_count):
    '''
    Tokenise the text of p blocks, attaching each token as a w child of its p block.

    Every w element gets an `id` attribute holding the running token count.
    Tokens found in the list of canonical end-of-sentence punctuation,
    `eos_els`, additionally get the attribute `EOS` set to "True".

    Inputs :
        p_blocks: list : a list of etree elements.
        tok_count : int : token counter to continue from (0 at the start of each text)
    Returns :
        tok_count : int : the id given to the last token created. The etree
        elements are modified in situ.
    ## TO DO : update this tokeniser with the tokeniser function, pool processor…
    '''
    prev_tok = "_"
    # define a list of elements that mark the end of sentences
    eos_els = ('…',r'?','!',r'.',r'\n')
    # tokens after which an end-of-sentence mark is ignored (currently none)
    excl_list = ()#('1','2','3','4','5','6','7','8','9','0', 'L', "M", "R")

    ## iterate over the p blocks, getting the text and tidying it
    for p_block in tqdm(p_blocks):
        line_raw = ''.join(p_block.itertext()).strip()
        # normalise both spellings of the ellipsis to one isolated '…' token
        line_raw = re.sub(r'\.\.\.|…', ' … ', line_raw)
        # isolate punctuation marks
        line_tok = re.sub(r'([\?\!\(\)\.,:;\"])', r' \1 ', line_raw)
        # split after apostrophes, then repair the words that must stay whole
        line_tok = re.sub(r'[\'’]', r"' ", line_tok)
        line_tok = re.sub(r"aujourd' hui", r"aujourd'hui", line_tok)
        line_tok = re.sub(r"rud' homm", r"rud'homm", line_tok)
        # detach hyphenated clitics from the preceding verb
        line_tok = re.sub(r'(-je|-tu|-il|-elle|-on|-ça|-cela|-nous|-vous|-ils|-elles|-moi|-toi|-lui|-leur|-en|-y|-ilz)',r' \1',line_tok)

        # Split on any run of whitespace. This never yields empty tokens (so an
        # empty paragraph gets no <w> at all) and no token can contain a tab or
        # a newline, which would corrupt the tab-separated CoNLL lines that are
        # built from these tokens later in the pipeline.
        all_toks = line_tok.split()

        ## for each token, make a w element as a child of the p element, and add the token count as an attribute.
        for tok in all_toks:
            tok_count += 1
            w_el = etree.Element('w')
            w_el.set('id', str(tok_count))
            w_el.text = tok
            ## flag end-of-sentence punctuation, unless preceded by an excluded token
            if tok in eos_els and prev_tok not in excl_list:
                w_el.set('EOS','True')
            prev_tok = tok
            p_block.append(w_el)
        # the raw text now lives in the w children
        p_block.text = ""
    return tok_count

def run_sentencisation_modifier(input_file):
    '''
    Run the sentencisation processor over a tokenised file to attribute tokens to sentences.

    Every w element receives a `sentnum` attribute. A sentence ends after a
    token whose `EOS` attribute is "True", after ':' or ';', and at the end of
    the element that holds the tokens. The `EOS` attribute is removed from '…'
    tokens, so an ellipsis does not end a sentence on its own.

    Inputs:
        input_file : string : absolute path to an xml file to be sentencised
    Returns :
        No return object. A file is exported in the same directory as the input file, with an incremented suffix : v2 in, v3 out.
    '''
    # parse the file as a tree and get the list of w elements
    input_tree = etree.parse(input_file)
    w_elements = list(input_tree.iter("w"))

    s_count = 1
    previous_parent = None
    sentence_open = False  # True once the current sentence number has been used

    # attach each word to a sentence by giving it a sentnum attribute and value
    for w in tqdm(w_elements):
        parent = w.getparent()
        # A sentence may not run across two parents: the next step wraps the
        # tokens of one sentence in a single <s> under a single parent.
        if sentence_open and parent is not previous_parent:
            s_count += 1
        previous_parent = parent

        w.set("sentnum", str(s_count))
        sentence_open = True

        # an ellipsis is not treated as a sentence boundary
        if w.text == "…":
            w.attrib.pop('EOS', None)

        # If this <w> element closes a sentence, the next token starts a new one
        if w.get("EOS") == "True" or w.text in (':', ';'):
            s_count += 1
            sentence_open = False

    input_tree.write(input_file.replace('v2.xml','v3.xml'), encoding='UTF-8', pretty_print=True)


    
def group_w_by_num_and_wrap_with_s(input_tree, output_file=None):
    '''
    Group w elements by their sentnum attribute and wrap each group in an s element.

    Inputs :
        input_tree : etree : an lxml ElementTree, normally parsed from a v3 file
        output_file : string or None : path of the file to write. When None, the
            path the tree was parsed from is used, with the suffix incremented
            from v3 to v4.
    Returns :
        No return object. `input_tree` is modified in place and then exported.
    Raises :
        ValueError : if `output_file` is None and the tree has no source path
    '''
    # The output path used to be read from an `input_file` variable that is not
    # defined in this function; derive it from the tree itself instead.
    if output_file is None:
        source_path = input_tree.docinfo.URL
        if source_path is None:
            raise ValueError("output_file is required for a tree that was not parsed from a file")
        output_file = source_path.replace('v3.xml','v4.xml')

    root = input_tree.getroot()

    # Step 1: collect all <w> elements grouped by sentnum, in document order.
    # The grouping is finished before the tree is modified.
    sentnum_to_w = defaultdict(list)
    for w in root.iter('w'):
        sentnum = w.get('sentnum')
        if sentnum is not None and w.getparent() is not None:
            sentnum_to_w[sentnum].append(w)

    # Step 2: wrap every group in an <s>. One-token sentences are wrapped too:
    # later steps only keep what is inside an <s> and delete the remaining <w>.
    for sentnum, w_elements in sentnum_to_w.items():
        first_w = w_elements[0]
        # The <s> is created where the first token of the sentence stands
        parent = first_w.getparent()

        s_elem = etree.Element('s')
        s_elem.set('sentnum', sentnum)
        parent.insert(parent.index(first_w), s_elem)

        # lxml moves an element when it is appended to a new parent, so no
        # explicit removal from the old parent is needed
        for w in w_elements:
            s_elem.append(w)

    input_tree.write(output_file, encoding='UTF-8', pretty_print=True)

def make_xmlconllu(input_file):
    '''
    Transform an xml document into an xml-conllu document by making a raw conllu string for each sentence
    Inputs:
        input_file : string : absolute path to an xml file
    Returns:
        No return object. An xml file is written with the conllu strings as the text of the sentence elements. The file is written to the same directory as the input, with the suffix incremented from v4 to v5
    '''
    # parse the xml
    input_tree = etree.parse(input_file)

    # Read every sentence number before any attribute is deleted, so that an
    # <s> nested inside a <w> cannot lose its number.
    s_blocks = input_tree.findall(".//s")
    sent_ids = [s_block.get("sentnum") for s_block in s_blocks]

    # the sentnum attribute of the w elements is now redundant; pop() tolerates
    # w elements that never received one
    for w_el in input_tree.findall(".//w"):
        w_el.attrib.pop('sentnum', None)

    # iterate over the s elements, which contain the sentences
    for s_block, sent_id in zip(s_blocks, sent_ids):
        # the sentence number becomes the sent_id of a conll meta line
        conll_lines = [f'\n\n# sent_id = {sent_id}\n']
        ## convert each token to a conll line: index, form, seven empty columns,
        ## and the w id in the last column
        for n, w_block in enumerate(s_block.findall(".//w"), start=1):
            # '_' is the conll placeholder: avoid writing the string "None"
            form = w_block.text or '_'
            conll_lines.append(f"{n}\t{form}\t_\t_\t_\t_\t_\t_\t_\tw_{w_block.get('id')}\n")
        ## the joined lines form the whole sentence, stored as the text of the s element
        s_block.text = "".join(conll_lines)

    # 'UTF-8-' was a misspelt encoding name; the suffix swap is restricted to
    # the file name ending so a 'v4' elsewhere in the path is left alone
    input_tree.write(input_file.replace('v4.xml','v5.xml'), encoding='UTF-8', pretty_print=True)


def make_conll_docs(input_file):
    '''
    Take an xml-conllu file and extract the conll strings to a validated conll document
    Inputs :
        input_file : string : absolute path to the same input file as `make_xmlconllu` - the path is automatically modified to point to the xml-conllu file created by that function.
    Returns :
        No return object. A file is written to the same directory as the input, with the extension changed to .conllu
    '''
    # Point at the v5 file. The previous pattern ('.v4xml') never matched a
    # '...v4.xml' name, so the v4 file, whose sentences have no text, was parsed.
    input_file = input_file.replace('v4.xml','v5.xml')
    input_tree = etree.parse(input_file)
    base, _ = os.path.splitext(input_file)
    outputfile = base + '.conllu'

    # collect the conll string of every sentence, skipping sentences without text
    output = [s_chunk.text for s_chunk in input_tree.findall(".//s") if s_chunk.text]
    conll_str = "".join(output)

    # validation only: parsing raises if the conll string is malformed
    CoNLL.conll2doc(input_str=conll_str)

    with open(outputfile, 'w', encoding='UTF-8') as c:
        c.write(conll_str)

def reinsert_conll_annots(conll_annotation_file):
    '''
    Reinsert conll annotations into an existing xml-conll document
    Inputs:
        conll_annotation_file : string : absolute path to the file of conllu annotations from the parser
    Returns :
        No return object. An XML-conllu file is written to the '08_xmlconllu' folder, with the xml extension and the suffix incremented from v5 to v6.
    '''
    # build the path of the source xml file from the conll file, and the path
    # of the output file by incrementing the suffix. The parameter is used
    # here: the body previously read an undefined name, `input_conllfile`.
    target_xmlfile = conll_annotation_file.replace('_out.conllu','.xml').replace('07_conlluout','05_v5xml')
    output_xmlconllu = target_xmlfile.replace('05_v5xml','08_xmlconllu').replace('v5.xml','v6.xml')

    # load the xml tree and build a dictionary of sentence numbers and elements
    input_tree = etree.parse(target_xmlfile)
    target_dict = {element.get("sentnum"): element for element in input_tree.findall(".//s")}

    # load the annotated conll doc and iterate over sentences
    conll_doc = CoNLL.conll2doc(conll_annotation_file)
    id_prefix = '# sent_id = '
    for sent in conll_doc.sentences:
        # the sentence number is the key in `target_dict`; look for the sent_id
        # comment wherever it stands, rather than assuming it comes first
        sent_id = next(
            (comment[len(id_prefix):].strip() for comment in sent.comments if comment.startswith(id_prefix)),
            None,
        )
        target_el = target_dict.get(sent_id)
        if target_el is None:
            # report and carry on, so one unmatched sentence does not lose the file
            print(f"No <s> element found for sent_id {sent_id!r} in {target_xmlfile}")
            continue
        # the conll lines of the sentence, joined by line breaks, become the text of the s element
        target_el.text = "\n".join(token.to_conll_text() for token in sent.tokens)

    # prune the tree: the w elements are not used again
    for w in input_tree.findall(".//w"):
        w.getparent().remove(w)
    input_tree.write(output_xmlconllu, encoding='UTF-8', pretty_print=True)
  

def make_bibdict(biblio_file):
    '''
    Make a dictionary from an html file to automate adding bibliographical info to TEI headers
    Inputs :
        biblio_file : str : absolute file path to an html file of bibliographic data prepared for header creation
    Returns :
        bib_dict : dict : a python dictionary with filenames (the `version`
        attribute of each work, without its extension) as keys and
        (author name, title) tuples as values
    '''
    bib_dict = {}
    # the `biblio_file` argument is parsed as given: it is no longer replaced
    # by a hard-coded path
    bib_tree = etree.parse(biblio_file)
    known_extensions = ('.pdf', '.doc', '.docx')

    for author in bib_tree.findall(".//author"):
        name_el = author.find(".//name")
        if name_el is None:
            # an author without a name cannot be used in a header
            continue
        author_name = name_el.text
        print(author_name)

        for work in author.findall(".//work"):
            version = work.get("version")
            if version is None:
                # no file name to use as a key
                continue
            # Drop the final extension only: chained replace('.doc', '') turned
            # 'name.docx' into 'namex'.
            base, extension = os.path.splitext(version)
            filename = base if extension.lower() in known_extensions else version
            bib_dict[filename] = (author_name, work.get("title"))
    return bib_dict
    
def make_and_insert_headers(xml_conllufile, bib_dict, error_list):    
  '''
  Make a teiHeader based on a custom dictionary and insert it into an XML-conllu file, which is exported.
  Inputs :
    xml_conllufile : string : absolute path to an xml_conllu file created as part of this pipeline
    bib_dict : dictionary : a python dictionary with filenames as keys and (author, title) pairs as values
    error_list : list : filenames for which no entry was found in `bib_dict` are appended to this list
  Returns : 
    No return object. An XML-CONLL file is written to the '09_xmlconllu' folder with the suffix incremented from v6 to v7.

  '''
  # raw text to parse into an etree element to define header
  empty_header = '''
        <teiHeader>
            <fileDesc>
                <titleStmt>
                    <title></title>
                    <date></date>
                    <author></author>
                    <respStmt>
                        <name />
                        <resp />
                    </respStmt>
                    <respStmt>
                        <name />
                        <resp />
                    </respStmt>
                </titleStmt>
                <publicationStmt>add custom string here</publicationStmt>
                        <profileDesc>
            <langUsage>
                <language ident="fr" />
            </langUsage>
      </profileDesc>
            </fileDesc>
        </teiHeader>
  '''
  # make file names based on replacements in the input path. The key for the
  # dictionary is the bare file name without its 'v6.xml' ending: it is taken
  # from the path itself rather than from a module-level `path` variable.
  outputname = xml_conllufile.replace('08_xmlconllu','09_xmlconllu').replace('v6.xml','v7.xml')
  filename = os.path.basename(xml_conllufile).replace('v6.xml','')

  # parse the empty header and the input xml-conllu file
  teiheader = etree.fromstring(empty_header)
  input_tree = etree.parse(xml_conllufile)

  # get the root and change its tag to text
  root = input_tree.getroot()
  root.tag = "text"
  ## iterate over s blocks, moving the value of the sentnum attribute to the id attribute
  for sblock in root.findall(".//s"):
    sentnum = sblock.attrib.pop('sentnum', None)
    if sentnum is not None:
      sblock.set("id", sentnum)

  # get the author and title from the dictionary based on the filename. If
  # there is no match, record the filename in `error_list` (once) and try
  # splitting the filename on the hyphen, as some filenames contain both the
  # author name and the title; failing that, use the filename for both.
  try:
    author, title = bib_dict[filename]
  except KeyError:
    error_list.append(filename)
    try:
      author, title = filename.split("-")
    except ValueError:
      author, title = filename, filename

  # set the author and title in the title statement
  teiheader.findall(".//titleStmt/title")[0].text = title
  teiheader.findall(".//titleStmt/author")[0].text = author
  # make a new top level element, and append the header and root elements to it
  new_tree = etree.Element("TEI.2")
  new_tree.append(teiheader)
  new_tree.append(root)

  # convert the new top level element into an ElementTree, then write to file
  output_tree = etree.ElementTree(new_tree)
  output_tree.write(outputname, encoding='UTF-8', pretty_print=True)
    

Once these cells have been run, step 1 is done, and we're ready to actually process some files.

# Step 2
Step 2 has three parts, to deal with ePub files, doc files and docx files.

In [None]:
# 2.1 collect the epub files to process : here they sit in a single folder of epubs
# NOTE(review): `username` is not defined in the cells shown here - define it before running
epub_files = glob.glob(f'/Users/{username}/Downloads/beq/epub/*.epub')

# Run the converter on each epub file : the XML is written next to its source
for epub_file in tqdm(epub_files):
  epub_to_xml(epub_file)


In [None]:
# 2.2
# In steps 2.2 and 2.3, we process documents according to their filetype. First, we'll run the processor for the docx files

# convert docx files to xml : each xml file is written next to its source
docx_files = glob.glob(f'/Users/{username}/Downloads/beq/doc/*.docx')
for docx_path in tqdm(docx_files):
  docx_source_to_xml(docx_path)


In [None]:
# 2.3
# And then we'll run the special processor for the doc files, which goes through docx as an interim format.

doc_filelist = glob.glob(f'/Users/{username}/Downloads/beq/doc/*.doc')
for input_file in tqdm(doc_filelist):
  doc_to_xml(input_file)


This now ends step 2 : we've got a folder of XML files extracted from epub, doc and docx files

# Step 3
In step 3, we'll tokenise, i.e. turn our paragraphs into words, for the XML files we created.

In [None]:
# tokenise every XML file of the epub folder, writing the result with a 'v2.xml' ending
# NOTE(review): the '*.xml' pattern also matches the 'v2.xml' files written by
# this loop into the same folder, so a second run would tokenise them again
current_files = glob.glob(f'/Users/{username}/Downloads/beq/epub/*.xml')
for current_file in current_files:
  current_tree = etree.parse(current_file)
  # the paragraphs to tokenise
  p_blocks = current_tree.findall(".//p")
  # the token counter restarts at 0 for each file
  tok_count = 0
  p_to_s_as_w(p_blocks, tok_count)
  current_tree.write(current_file.replace('.xml','v2.xml'), encoding='UTF-8', pretty_print=True)


# Step 4
In step 4, we'll sentencise: say where the sentence boundaries are

In [None]:

# sentencise every tokenised (v2) file : each one is written out again as a v3 file
input_files = glob.glob(f'/Users/{username}/Downloads/beq/epub/*v2.xml')
for input_file in tqdm(input_files):
    run_sentencisation_modifier(input_file)

# Step5
In step 5, we'll add sentences as an explicit level of tags between p and w based on the attributes, to get the needed XML structure

In [None]:
## use the sentence numbers to define sentences and create s elements as parents of w and children of p
## every w element now has an id and a sentnum attribute (and EOS where it was set), so we iterate over each tree
## (the stray `root = input_tree.getroot()` line was removed: it ran before any tree was parsed and its result was never used)
input_files = glob.glob(f'/Users/{username}/Downloads/beq/epub/*v3.xml')
for input_file in tqdm(input_files):
  input_tree = etree.parse(input_file)
  group_w_by_num_and_wrap_with_s(input_tree)

    

# Step 6
In step 6, we convert the XML to XML-conllu, and extract just the conllu to send it to the parser.

In [None]:
######### next step: XML to xml-conllu and conllu
input_files = glob.glob(f'/Users/{username}/Downloads/beq/epub/*v4.xml')
# for each v4 file : build the conllu string of every sentence (written as a v5 file),
# then extract those strings to a conllu document
for input_file in tqdm(input_files):
    make_xmlconllu(input_file)
    make_conll_docs(input_file)

# Step7
In step 7, the conll documents are parsed by a parser such as Stanza (not documented here, as it's entirely standard processing to feed Stanza models conll documents and get conll documents out).

# Step8
In step 8, we get the annotated conll files from the parser and reinsert them back into the XML tree.

In [None]:

# 8 send the conll annotations back to the xml : one annotated conllu file per text
input_conllfiles = glob.glob(f'/Users/{username}/Downloads/beq/epub/07_conlluout/*.conllu')
for conll_annotation_file in tqdm(input_conllfiles):
    reinsert_conll_annots(conll_annotation_file)

# Step 9 
In step 9, we add headers to the XML files based on a custom dictionary and export the XML-conll after some final structuring of the XML.

In [None]:

# step 9: build a dictionary of bibliographic data from the html file
biblio_file = f'/Users/{username}/Downloads/beq/biblio.html'
bib_dict = make_bibdict(biblio_file)
# folder of the files to process : built from `username`, like the glob pattern below
path = f'/Users/{username}/Downloads/beq/epub/08_xmlconllu/'
# files without a bibliographic entry are collected here. `error_list` is the
# name passed to the call below (it was previously undefined); `errorlist` is
# kept as a second name for the same list.
errorlist = []
error_list = errorlist
xml_conllufiles = glob.glob(f'/Users/{username}/Downloads/beq/epub/08_xmlconllu/*.xml')
for xml_conllufile in tqdm(xml_conllufiles):
    make_and_insert_headers(xml_conllufile, bib_dict, error_list)

Done! We've now got a corpus of texts formatted as xml-conllu for the Lexicoscope to ingest.