In [9]:
from pathlib import Path
from typing import Dict, List, Optional, Union
from dataclasses import dataclass
import logging
from TexSoup import TexSoup
from TexSoup.data import TexNode

@dataclass
class LatexElement:
    """Represents a node in the LaTeX document tree."""
    type: str  # The type of element (section, subsection, text, etc.)
    name: str  # The name or title of the element
    content: str  # Raw content
    level: int  # Nesting level
    children: List['LatexElement']  # Child elements
    parent: Optional['LatexElement'] = None  # Parent element

class LatexParser:
    """Parser for LaTeX documents that creates a tree structure."""
    
    def __init__(self, base_dir: Union[str, Path]):
        self.base_dir = Path(base_dir)
        self.root = None
        self.current_node = None
        self.section_levels = {
            'chapter': 0,
            'section': 1,
            'subsection': 2,
            'subsubsection': 3,
            'paragraph': 4,
            'subparagraph': 5
        }
        
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def parse_main_file(self, main_file: str) -> LatexElement:
        """
        Parse the main LaTeX file and create a document tree.
        
        Args:
            main_file: Name of the main .tex file
            
        Returns:
            LatexElement: Root node of the document tree
        """
        try:
            file_path = self.base_dir / main_file
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            
            # Parse the LaTeX content
            soup = TexSoup(content)
            
            # Create root node
            self.root = LatexElement(
                type='document',
                name='root',
                content='',
                level=-1,
                children=[]
            )
            self.current_node = self.root
            
            # Process document body
            self._process_node(soup)
            
            return self.root
            
        except FileNotFoundError:
            self.logger.error(f"File not found: {file_path}")
            raise
        except Exception as e:
            self.logger.error(f"Error parsing LaTeX document: {str(e)}")
            raise

    def _process_node(self, node: TexNode) -> None:
        """
        Recursively process a TexSoup node and its children.
        
        Args:
            node: Current TexSoup node being processed
        """
        # Handle different types of LaTeX elements
        for child in node.contents:
            if hasattr(child, 'name'):
                name = child.name
                
                # Handle sections and their variants
                if name in self.section_levels:
                    level = self.section_levels[name]
                    title = str(child.string) if hasattr(child, 'string') else ''
                    
                    # Create new section node
                    new_node = LatexElement(
                        type=name,
                        name=title,
                        content=str(child),
                        level=level,
                        children=[]
                    )
                    
                    # Adjust current node based on section level
                    while (self.current_node != self.root and 
                           self.current_node.level >= level):
                        self.current_node = self.current_node.parent
                    
                    new_node.parent = self.current_node
                    self.current_node.children.append(new_node)
                    self.current_node = new_node
                
                # Handle included files
                elif name == 'input' or name == 'include':
                    included_file = str(child.string)
                    if not included_file.endswith('.tex'):
                        included_file += '.tex'
                    
                    try:
                        with open(self.base_dir / included_file, 'r', encoding='utf-8') as f:
                            included_content = f.read()
                        included_soup = TexSoup(included_content)
                        self._process_node(included_soup)
                    except FileNotFoundError:
                        self.logger.warning(f"Included file not found: {included_file}")
                
                # Process other environments and commands
                else:
                    self._process_node(child)
            
            # Handle text content
            elif str(child).strip():
                text_node = LatexElement(
                    type='text',
                    name='',
                    content=str(child),
                    level=self.current_node.level + 1,
                    children=[],
                    parent=self.current_node
                )
                self.current_node.children.append(text_node)

    def print_tree(self, node: Optional[LatexElement] = None, level: int = 0) -> None:
        """
        Print the document tree in a hierarchical format.
        
        Args:
            node: Current node to print (defaults to root)
            level: Current indentation level
        """
        if node is None:
            node = self.root
            
        indent = "  " * level
        if node.type == 'text':
            content_preview = node.content[:50] + "..." if len(node.content) > 50 else node.content
            print(f"{indent}└─ Text: {content_preview}")
        else:
            print(f"{indent}└─ {node.type}: {node.name}")
        
        for child in node.children:
            self.print_tree(child, level + 1)

def main():
    """Example usage of the LaTeX parser."""
    # Example usage
    parser = LatexParser("./sources/2203.16481v1")
    try:
        root = parser.parse_main_file("paper.tex")
        print("\nDocument Structure:")
        parser.print_tree()
    except Exception as e:
        print(f"Error: {str(e)}")
     
if __name__ == "__main__":
    main()


Document Structure:
└─ document: root
  └─ Text: twoside,11pt
  └─ Text: article
  └─ Text: algorithm
  └─ Text: abbrvbib,nohyperref,preprint
  └─ Text: jmlr2e
  └─ Text: ref,caption
  └─ Text: leaf
  └─ Text: listings
  └─ Text: xcolor
  └─ Text: codegreen
  └─ Text: rgb
  └─ Text: 0,0.6,0
  └─ Text: codegray
  └─ Text: rgb
  └─ Text: 0.5,0.5,0.5
  └─ Text: codepurple
  └─ Text: rgb
  └─ Text: 0.58,0,0.82
  └─ Text: backcolour
  └─ Text: rgb
  └─ Text: 0.95,0.95,0.92
  └─ Text: dark-blue
  └─ Text: rgb
  └─ Text: 0.15,0.15,0.4
  └─ Text: medium-blue
  └─ Text: rgb
  └─ Text: 0,0,0.5
  └─ Text: mystyle
  └─ Text: 
    backgroundcolor=
  └─ Text: backcolour
  └─ Text: ,   
    commentstyle=
  └─ Text: codegreen
  └─ Text: ,
    keywordstyle=
  └─ Text: magenta
  └─ Text: ,
    numberstyle=
  └─ Text: codegray
  └─ Text: ,
    stringstyle=
  └─ Text: codepurple
  └─ Text: ,
    basicstyle=
  └─ Text: ,
    breakatwhitespace=false,         
    breakl...
  └─ Text: 
  colorlinks, linkcol

In [5]:
!ls ./sources/2203.16481v1/paper.tex

./sources/2203.16481v1/paper.tex


"\\documentclass[twoside,11pt]{article}\n\n\\newcommand{\\theHalgorithm}{\\arabic{algorithm}}\n\n\\usepackage[abbrvbib,nohyperref,preprint]{jmlr2e}\n\\usepackage[ref,caption]{leaf}\n\n\\usepackage{listings}\n\\usepackage{xcolor}\n\n\\definecolor{codegreen}{rgb}{0,0.6,0}\n\\definecolor{codegray}{rgb}{0.5,0.5,0.5}\n\\definecolor{codepurple}{rgb}{0.58,0,0.82}\n\\definecolor{backcolour}{rgb}{0.95,0.95,0.92}\n\\definecolor{dark-blue}{rgb}{0.15,0.15,0.4}\n\\definecolor{medium-blue}{rgb}{0,0,0.5}\n\n\\lstdefinestyle{mystyle}{\n    backgroundcolor=\\color{backcolour},   \n    commentstyle=\\color{codegreen},\n    keywordstyle=\\color{magenta},\n    numberstyle=\\tiny\\color{codegray},\n    stringstyle=\\color{codepurple},\n    basicstyle=\\ttfamily\\footnotesize,\n    breakatwhitespace=false,         \n    breaklines=true,                 \n    captionpos=b,                    \n    keepspaces=true,                 \n    numbers=left,                    \n    numbersep=5pt,                  \n