# Workbench

Notebook used for developing the Markdown-to-LaTeX/PDF conversion pipeline.

In [None]:
import pypandoc
import os
import subprocess
import yaml
import re

def compile_latex(latex_content: str
                               , output_directory: str='out'
                               , tex_filename: str='document.tex'
                               , pdf_filename: str='document.pdf') -> "str | None":
    """
    Write LaTeX source to disk and compile it to a PDF with pdflatex.

    Args:
        latex_content (str): The complete LaTeX source to compile.
        output_directory (str): Directory that receives the .tex file and all
            pdflatex output. Created if missing. Default is 'out'.
        tex_filename (str): Name of the .tex file written inside
            `output_directory`. Default is 'document.tex'.
        pdf_filename (str): Name of the generated PDF file. Its stem is handed
            to pdflatex as the job name, so the PDF really ends up under this
            name. Default is 'document.pdf'.

    Returns:
        str | None: The path to the generated PDF file if compilation is
        successful, None otherwise (the error is printed).

    Note: Written with ChatGPT
    """

    # Ensure the output directory exists
    os.makedirs(output_directory, exist_ok=True)

    # Write the LaTeX content to a .tex file. The encoding is explicit so the
    # result does not depend on the locale of the machine running this.
    tex_filepath = os.path.join(output_directory, tex_filename)
    with open(tex_filepath, 'w', encoding='utf-8') as tex_file:
        tex_file.write(latex_content)

    # pdflatex names its output after the job name (by default the stem of the
    # .tex file), so pass the requested PDF name explicitly.
    jobname = os.path.splitext(pdf_filename)[0]
    pdf_filepath = os.path.join(output_directory, jobname + '.pdf')
    command = ['pdflatex', '-halt-on-error'
               , f'-jobname={jobname}'
               , '-output-directory', output_directory, tex_filepath]

    # Compile the .tex file to a PDF using pdflatex
    try:
        # errors='replace': pdflatex logs may contain bytes that are not valid
        # in the current encoding; that must not turn a build into a failure.
        process = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
                                 , text=True, errors='replace')
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        # e.g. pdflatex is not installed / not on PATH
        print(f"An error occurred: {e}")
        return None

    # Check for errors
    if process.returncode != 0:
        print("Error during LaTeX compilation:")
        print(process.stdout)
        print(process.stderr)
        return None

    print("Compilation successful")

    # Return the path to the generated PDF
    return pdf_filepath

class SingleDocument:
    """Class holding information for a single document

    A document is a markdown file that is lazily read, pre-processed,
    converted to LaTeX with pandoc and post-processed. Every intermediate
    result is cached on the instance, so each step runs at most once.
    """

    ### CLASS VARIABLES
    filepath_source: str = ''
    """Path to which the filename is relative"""

    filepath_output: str = ''
    """Filepath relative to CWD where output files will lie
    
    Used to correctly link resources for Latex generation"""

    verbose: bool = True
    """Set to True for verbose info"""

    ### PRIVATE CONSTANTS

    # image scaling
    _CM_IN_INCH = 2.54 #conversion from CM to INCH
    _DPI_PICTURES = 400 #"true" DPI the pictures have
    _DPI_INTERNAL = 96 #amount of DPI that pandoc uses to convert px --> inch
    _MAX_SIZE_IMAGE_CM = 8.0 #maximal width of an image in CM

    # unicode characters that are replaced by a LaTeX math equivalent
    _UNICODE_TO_LATEX = str.maketrans({
        '\u219D': r'$\leadsto$',          #↝
        '\u21D2': r'$\Rightarrow$',       #⇒
        '\u21D0': r'$\Leftarrow$',        #⇐
        '\u2194': r'$\leftrightarrow$',   #↔
        '\u21D4': r'$\Leftrightarrow$',   #⇔
        '\u2264': r'$\leq$',              #≤
        '\u2265': r'$\geq$',              #≥
        '\u2260': r'$\neq$',              #≠
        '\u2154': r'$\frac{2}{3}$',       #⅔
        '\u2153': r'$\frac{1}{3}$',       #⅓
    })

    # YAML front matter at the very beginning of a document:
    # group 1 = opening line + content, group 2 = closing line
    _YAML_FRONT_MATTER = re.compile(r'\A(-{3}\s*\n.*?\n)(-{3}\s*)', flags=re.DOTALL)

    # \includegraphics[options]{file}; the options are optional
    _INCLUDEGRAPHICS = re.compile(r'(\\includegraphics)(\[[^\]]*\])?(\{[^}]*\})')
    # a width/height option (any unit) and one given in inches
    _ANY_DIMENSION = re.compile(r'(?<![A-Za-z])(width|height)=')
    _INCH_DIMENSION = re.compile(r'(?<![A-Za-z])(width|height)=(\d+\.?\d*)in\b')

    def __init__(self,filename: str):
        # set instance variables
        self.filename = filename
        self._markdown_raw: str | None = None #markdown file content as text (raw = unmodified)
        self._markdown_mod: str | None = None #markdown file content as text (modified)
        self._latex_raw: str | None = None #latex representation of file (raw = unmodified)
        self._latex_mod: str | None = None #latex representation of file (modified)

        self._metadata: dict | None = None #dictionary holding metadata
    
    ### PUBLIC
    
    def get_markdown_text(self) -> str:
        """return the (modified) markdown text of this document
        
        if it has not been loaded, it will do so"""

        if self._markdown_mod is not None:
            return self._markdown_mod
        
        if self._markdown_raw is None:
            self._read_markdown_text()

        return self._modify_markdown()
    
    def get_latex_text(self) -> str:
        """return the latex text of this document
        
        loads and converts it if needed"""

        if self._latex_mod is not None:
            return self._latex_mod
        
        if self._latex_raw is None:
            self.get_markdown_text() #get and modify markdown
            self._convert_to_latex()

        return self._modify_latex()

    
    ### PRIVATE

    def _read_markdown_text(self) -> str:
        """Reads the file specified by the filename, stores it, and returns it
        
        also loads the metadata

        Raises:
            AssertionError: if the file does not exist
        """
        if SingleDocument.verbose:
            print("Reading content of file "+self.filename+"...")

        # join instead of '+' so a source path without trailing separator works
        filepath = os.path.join(SingleDocument.filepath_source, self.filename)
        assert os.path.exists(filepath), \
            "Document "+filepath+" does not exist!"
        
        with open(filepath,'r',encoding='utf-8') as file:
            assert file.readable(), "File "+filepath+" is not readable!"
            content = file.read()

        ### Extract Metadata
        self._metadata = self._extract_yaml_header(content)

        self._markdown_raw = content
        return content
    
    def _modify_markdown(self) -> str:
        """Modifies the raw markdown string, stores it, and returns it

        Adds the resource paths to the YAML front matter (they are picked up
        by the pandoc filters) and replaces unicode symbols with LaTeX math."""
        if SingleDocument.verbose:
            print("Mopdifying raw markdown of file "+self.filename+"...")

        assert self._markdown_raw is not None, "need to load raw markdown first!"

        ### CONVERSIONS
        converted = self._markdown_raw

        # an empty source path means "current directory"; relpath('') would raise
        source_dir = self.filepath_source or os.curdir

        # add relative path to metadata
        relative_path = os.path.relpath(source_dir, start=self.filepath_output or os.curdir)
        converted = self._extend_yaml_header(content=converted, new_key="resource_path_mod"
                                             ,new_value=relative_path)
        # add relative path from current directory
        relative_path_curdir = os.path.relpath(source_dir)
        converted = self._extend_yaml_header(content=converted, new_key="resource_path_mod_curdir"
                                             ,new_value=relative_path_curdir)
        
        # take care of non-supported latex characters
        converted = converted.translate(self._UNICODE_TO_LATEX)

        ### STORING
        self._markdown_mod = converted
        return converted
    
    def _extend_yaml_header(self, content: str, new_key: str, new_value: str) -> str:
        """
        Add a new metadata field to the YAML front matter in a given string. Adds a YAML front matter in case none exists.

        Only a front matter block at the very beginning of the content is
        extended (the same block `_extract_yaml_header` reads); `---` lines
        further down, e.g. horizontal rules, are left alone.

        Args:
            content (str): The string containing the YAML front matter and document content.
            new_key (str): The new metadata key to add.
            new_value (str): The new metadata value to add. Written as a
                double-quoted YAML scalar; backslashes and double quotes are escaped.

        Returns:
            str: The updated string with the new metadata field added.
        """
        # escape for a double-quoted YAML scalar (matters e.g. for Windows paths)
        escaped_value = str(new_value).replace('\\', '\\\\').replace('"', '\\"')
        new_entry = f'{new_key}: "{escaped_value}"\n'

        front_matter = self._YAML_FRONT_MATTER.match(content)
        if front_matter is None:
            # If YAML front matter does not exist, add it at the beginning
            return f"---\n{new_entry}---\n" + content

        # insert the new entry as last line of the block, right before the closing '---'
        insert_at = front_matter.end(1)
        return content[:insert_at] + new_entry + content[insert_at:]
    
    def _extract_yaml_header(self, markdown_content: str):
        """Parses the YAML header at the beginning of the content

        Returns the parsed YAML (normally a dict), or None if there is no header"""
        # Use a regular expression to find the YAML header at the beginning of the file
        yaml_header = re.match(r'^---\n(.*?)\n---', markdown_content, re.DOTALL)
        if yaml_header:
            yaml_content = yaml_header.group(1)
            # Parse the YAML content
            return yaml.safe_load(yaml_content)
        return None
    
    # LATEX RELATED STUFF
    
    def _convert_to_latex(self) -> str:
        """Converts the modified markdown to latex, stores it, and returns it"""
        if SingleDocument.verbose:
            print("Converting modified markdown of file "+self.filename+" to Latex...")

        assert self._markdown_mod  is not None, "need to have a modified markdown text first!"

        converted = pypandoc.convert_text(source=self._markdown_mod, to='latex',format='md'
                                          ,filters=['pandocs_filters/curdir-reference-path-resources.lua'
                                                    ,'pandocs_filters/set_graphics_width.lua'
                                                    ,'pandocs_filters/mod-reference-path-resources.lua']
                                        ,extra_args=[]) #write '--standalone' to see full latex output

        self._latex_raw = converted

        return converted
    
    def _modify_latex(self) -> str:
        """Modifies the raw latex string, stores it, and returns it"""
        if SingleDocument.verbose:
            print("Modifying raw latex of file "+self.filename+"...")

        assert self._latex_raw is not None, "need to convert to latex first!"

        ### CONVERSIONS

        # use metadata if available
        header = ''

        metadata = self._metadata
        if isinstance(metadata, dict):
            doc_title = metadata.get("title")
            if doc_title is not None:
                # YAML may parse the title as number/date, hence str()
                header += "\\mezdoctitle{"+str(doc_title)+"}\n\n"

        converted = header + self._latex_raw

        # scale images to \columnwidth in case there is no width yet
        converted = self._add_width_to_includegraphics(converted)

        # make floats floating due to issues in multicolumn
        # (https://tex.stackexchange.com/questions/12262/multicol-and-figures)
        converted = self._add_option_H_to_figures(converted)

        # convert all internal references to the latex command
        converted = self._replace_zettler_internal_link_with_command(converted)

        ### STORING
        self._latex_mod = converted
        return converted
    
    def _add_width_to_includegraphics(self, content: str) -> str:
        r"""looks for all includegraphics and adds `width=0.8\columnwidth` to the
        graphics in case there is no defined width or height yet

        Sizes given in inches (e.g. '[width=2.66667in,height=2.66667in]') are
        scaled down by the DPI ratio and converted to cm; if the scaled width
        exceeds the maximum, `width=0.8\columnwidth` is used instead. Sizes in
        any other unit (e.g. `\textwidth`) are left untouched."""

        # inch --> cm, scaled down from the internal DPI to the picture DPI
        scale = self._CM_IN_INCH * self._DPI_INTERNAL / self._DPI_PICTURES

        def replace_match(match):
            prefix, options, filename = match.groups()
            column_wide = f'{prefix}[width=0.8\\columnwidth]{filename}'

            if options is None:
                # No options provided, add width=\columnwidth
                return column_wide

            if 'width=' not in options and 'height=' not in options:
                # Options provided but without width or height: prepend the
                # width and keep the existing options (comma separated)
                other_options = options[1:-1].strip()
                if not other_options:
                    return column_wide
                return f'{prefix}[width=0.8\\columnwidth,{other_options}]{filename}'

            # Options already contain width or height
            declared = set(self._ANY_DIMENSION.findall(options))
            in_inches = {key: float(value)
                         for key, value in self._INCH_DIMENSION.findall(options)}
            if not declared or declared != set(in_inches):
                # at least one size is not given in inches: nothing to rescale
                return match.group(0)

            # convert to cm and scale down according to DPI
            scaled = {key: value * scale for key, value in in_inches.items()}

            # check if width is too wide --> then return the columnwidth
            if scaled.get('width', 0.0) > self._MAX_SIZE_IMAGE_CM:
                return column_wide

            sizes = ','.join(f'{key}={scaled[key]}cm'
                             for key in ('width', 'height') if key in scaled)
            return f'{prefix}[{sizes}]{filename}'

        # Replace all matches in the content
        return self._INCLUDEGRAPHICS.sub(replace_match, content)
     
    def _add_option_H_to_figures(self, latex_text: str) -> str:
        """Appends the placement option `[H]` to every figure environment that has none"""
        # Regular expression to find figure environments without any options
        pattern = r'(\\begin{figure})(?!\[\w*\])'
        # Replace those occurrences with the same string but with [H] appended
        replacement = r'\1[H]'

        return re.sub(pattern, replacement, latex_text)
   
    def _replace_zettler_internal_link_with_command(self, latex_text: str) -> str:
        """replaces internal links like `[[20240719012226]]` with the latex command for it"""
        # note: pandoc emits the [[ ]] as {[}{[} ... {]}{]}
        # NOTE(review): the 14-digit ID is dropped by the replacement - confirm
        # that \mezintreference is meant to take no argument
        pattern = re.compile(r'\{\[\}\{\[\}\d{14}\{\]\}\{\]\}')

        return pattern.sub(r'\\mezintreference', latex_text)


def merge_documents(documents: "list[SingleDocument]"
                    , template_path: str='template/template_outputfile.tex') -> str:
    """Merges a bunch of documents into a single latex string

    The latex text of every document is concatenated in the given order and
    inserted into the template in place of `%%<content_placeholder>%%`.

    Args:
        documents: The documents to merge; each must provide `get_latex_text()`.
        template_path: Path of the latex template file, read as UTF-8.

    Returns:
        str: The template content with the placeholder replaced.
    """

    # concatenate the latex of all documents
    # (wrapping each one into a minipage was tried and is disabled:
    #  \begin{minipage}{\columnwidth} ... \end{minipage})
    concat = ''.join(document.get_latex_text() for document in documents)

    # put it into the template; explicit encoding to match how the markdown is read
    with open(template_path,'r',encoding='utf-8') as templatefile:
        template_latex = templatefile.read()

    return template_latex.replace('%%<content_placeholder>%%',concat)

def set_metadata_merged(text: str, author:str, title: str) -> str:
    """Fill the title and author placeholders of the merged latex text.

    `%%<TITLE>%%` is substituted first, `%%<AUTHOR>%%` second; the resulting
    text is returned.
    """

    substitutions = (
        ('%%<TITLE>%%', title),
        ('%%<AUTHOR>%%', author),
    )
    for placeholder, value in substitutions:
        text = text.replace(placeholder, value)

    return text

In [None]:
### Settings

# Author written into the merged output
author = "Ford Prefect"

# Sample configuration. All of these except `author` are overridden again by
# the configuration further below.
title = "Test Samples"
filepath_source = 'sample/'
filepath_output = 'out/'
filenames = [
    'sample1.md',
    'sample2.md',
    'sample3.md',
    'sample4.md',
]
# restrict the sample run to a single file
filenames = ['sample2.md']

# filenames = ['general_control_diagram.md' #Basics
#              ,'optimization-based-control.md'
#              ,'linear_time-invariant_system.md'
#              ,'bellmans-principle-optimality.md' 
#              ,'markovian_representation.md'
#              ,'SISO_system.md'
#              ,'MIMO_system.md'
#              ,'dpa_dynamic-programming-approch.md'
#              ,'optimal-linear-quadratic-regulation.md' #LQR
#              ,'lqr_finite-horizon.md'
#              ,'lqr_infinite-horizon.md'
#              ,'affine-lqr.md'
#              ,'model-predictive-control.md' # MPC
#              ,'mpc_stability.md'
#              ,'lyapunov_analysis.md'
#              ,'mpc_incremental-mpc.md'
#              ,'mpc_explicit.md'
#              ,'mpc_steady-state-selection.md'
#              ,'disturbances.md'
#              ,'mpc_disturbance-rejection.md'
#              ,'robust_mpc.md'
#              ,'robust-mpc_feedback-mpc.md'
#              ,'mpc_robust-mpc_soft-constrained-lqr.md'
#              ,'economic-mpc.md'
#              ,'behavioral-system-theory.md' #Identification
#              ,'system-identification.md'
#              ,'markov-parameters.md'
#              ,'controllability.md'
#              ,'controllability-matrix.md'
#              ,'observability.md'
#              ,'observability-matrix.md'
#              ,'impulse_response.md'
#              ,'kalman-ho_algorithm.md'
#              ,'data-enabled-predictive-control.md' #DeepC
#              ,'behavioral_K-step-predictor.md'
#              ,'hankel-matrix.md'
#              ,'deepc_regularized.md'
#              ,'markov-decision-process.md' #MDP
#              ,'markov-chain.md'
#              ,'policy_for-mdp.md'
#              ,'mdp_control-problem.md'
#              ,'mdp_q-function.md'
#              ,'mdp_value-function.md'
#              ,'mdp_finite-horizon.md'
#              ,'mdp_infinite-horizon.md'
#              ,'monte-carlo-learning.md' #Monte Carlo
#              ,'reinforcement-learning.md' #RL
#              ,'sarsa_temporal-difference.md'
#              ,'q-learning.md'
#              ,'policy-gradient.md'
#              ]
# filepath_source = '../../polybox/ZETTLR_STUDIES/CS2__computational-control/'
# filepath_output = 'out/'
# title = "Computational Control"

# filenames = ['antrieb_eisenbahn.md'
#              ,'antrieb-elektrisch.md'
#              ,'haupttransformator.md'
#              ,'drosselspule_dc.md'
#              ,'fahrmotoren.md'
#              ,'elektromotor.md'             
#              ,'asynchron-motor.md'
#              ,'stromrichter.md'
#              ,'netzstromrichter.md'
#              ,'zwischenkreis.md'
#              ,'motorstromrichter.md'
#              ,'hochspannung_eisenbahn.md'
#              ,'thermische_auslegung.md'
#              ,'diesel-antrieb.md'
#              ,'diesel-machanisch.md'
#              ,'diesel-hydrostatisch.md'
#              ,'diesel-hydrodynamisch.md'
#              ,'diesel-elektrisch_gleichstrom.md'
#              ,'diesel-elektrisch_umrichter.md'
#              ,'gasturbinen-antrieb.md'
#              ,'sicherheitssteuerung.md'
#              ,'zugbeeinflussung.md'
#              ,'etcs.md'
#              ,'2024-03-22_sbb-etcs.md'
#              ,'energieverbrauch.md'
#              ,'energiespeicher.md'
#              ,'2024-04-12_traktionsbaterien-gastvortrag.md'
#              ,'bahnstromversorgung.md'
#              ,'gleichstromsysteme.md'
#              ,'wechselstromsystem_sonderfrequenz.md'
#              ,'wechselstromsysteme_industriefrequenz.md'
#              ,'elektrische-systemkompatibilitaet.md'
#              ,'gleisstromkreis.md'
#              ,'achsenzaehler.md'
#              ,'netzimpendanz.md']
# filepath_source = '../../polybox/ZETTLR_STUDIES/EST2__eisenbahn-systemtechnik-2/'
# filepath_output = 'out/'
# title = "Eisenbahn-Systemtechnik 2"

# filenames = ['BDE_l01.md'
#              ,'BDE_l02.md'
#              ,'BDE_l03.md'
#              ,'BDE_l04.md'
#              ,'BDE_l05.md'
#              ,'BDE_l06.md'
#              ,'BDE_l07.md'
#              ,'BDE_l08.md'
#              ,'BDE_l09.md'
#              ,'BDE_l10.md'
#              ,'BDE_l11.md'
#              ,'BDE_l12.md'
#              ,'BDE_l13.md'
#              ,'BDE_l14.md']
# Active configuration: documents to merge, in output order
filenames = [
    # STACK
    'big-data_stack.md',
    # GENERAL
    'big_data.md',
    'relational-database.md',
    'sql.md',
    'data-types.md',
    # Storage
    'cloud-storage.md',
    'object-stores.md',
    'amazon_s3.md',
    'azure-blob-storage.md',
    'key-value-storage.md',
    # DFS
    'distributed-file-system.md',
    'hdfs.md',
    'googlefs.md',
    # SYNTAX
    'csv.md',
    'syntax_tree.md',
    'json.md',
    'xml.md',
    'data-formats.md',
    # Data Models
    'data-model.md',
    'json-data-model.md',
    'xml-information-set.md',
    # Validation
    'validation.md',
    'jsound.md',
    'json_schema.md',
    'xml-schema.md',
    'dataframe.md',
    # Processing
    'map-reduce.md',
    'yarn.md',
    'apache-spark.md',
    # DATA STORE, Wide Column Store
    'wide-column-store.md',
    'hbase.md',
    'spanner.md',
    # Document Store
    'document-stores.md',
    'mongo-db.md',
    # Querying
    'querying-trees.md',
    'jsoniq.md',
]
title = "Big Data for Engineers"
filepath_source = '../../polybox/ZETTLR_STUDIES/BDE__big-data-for-engineers/'
filepath_output = 'out/'

### Run

# Class-wide configuration shared by all documents
SingleDocument.verbose = True
SingleDocument.filepath_source = filepath_source
SingleDocument.filepath_output = filepath_output

# One (lazily loaded) document per configured file, in the configured order
documents = []
for filename in filenames:
    documents += [SingleDocument(filename)]

# Debugging aid: inspect the intermediate results of each document
# for document in documents:
#     print(document.get_markdown_text())
#     print(document.get_latex_text())

# Merge everything into the template and fill in author and title
latex_total = set_metadata_merged(
    merge_documents(documents=documents),
    author=author,
    title=title,
)

# print(latex_total)

# Write the merged latex and compile it to a PDF
compile_latex(latex_total, output_directory=filepath_output)
