<a href="https://colab.research.google.com/github/ak0586/Python_projects/blob/main/Embedding_technique.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install latexcodec

Collecting latexcodec
  Downloading latexcodec-3.0.0-py3-none-any.whl.metadata (4.9 kB)
Downloading latexcodec-3.0.0-py3-none-any.whl (18 kB)
Installing collected packages: latexcodec
Successfully installed latexcodec-3.0.0


In [None]:

import re
import latexcodec
from typing import List, Tuple, Dict

class LaTeXEmbeddingGenerator:
    def __init__(self):
        # Minimal fallback symbol mappings only for cases where latexcodec might fail
        self.symbol_map = {
            r'\int': '∫',
            r'\sum': '∑',
            r'\prod': '∏',
            r'\infty': '∞',
            r'\partial': '∂',
        }

    def convert_latex_to_symbol(self, latex_str: str) -> str:
        """Convert LaTeX commands to Unicode symbols using latexcodec"""
        result = latex_str

        # Use latexcodec for LaTeX command conversion
        try:
            # Extract LaTeX commands and convert them
            latex_commands = re.findall(r'\\[a-zA-Z]+', result)
            for cmd in latex_commands:
                try:
                    # Try different formats for latexcodec
                    symbol = latexcodec.latex_decode(cmd + '{}')
                    if symbol != cmd + '{}':  # If conversion was successful
                        result = result.replace(cmd, symbol)
                    else:
                        # Try without braces
                        symbol = latexcodec.latex_decode(cmd)
                        if symbol != cmd:
                            result = result.replace(cmd, symbol)
                except:
                    # Fallback to custom mapping only if latexcodec fails
                    if cmd in self.symbol_map:
                        result = result.replace(cmd, self.symbol_map[cmd])
        except:
            pass

        return result

    def parse_limits(self, expr: str) -> Tuple[str, str, str]:
        """Parse \limits_{lower}^{upper} or _{lower}^{upper}"""
        # Match limits pattern
        limits_pattern = r'\\limits_\{([^}]*)\}\^\{([^}]*)\}'
        match = re.search(limits_pattern, expr)

        if match:
            lower = match.group(1)
            upper = match.group(2)
            # Remove the limits part from expression
            expr_without_limits = expr[:match.start()] + expr[match.end():]
            return expr_without_limits, lower, upper

        # Check for subscript/superscript without \limits
        sub_sup_pattern = r'_\{([^}]*)\}\^\{([^}]*)\}'
        match = re.search(sub_sup_pattern, expr)

        if match:
            lower = match.group(1)
            upper = match.group(2)
            expr_without_limits = expr[:match.start()] + expr[match.end():]
            return expr_without_limits, lower, upper

        return expr, None, None

    def parse_superscript(self, expr: str) -> List[str]:
        """Parse nested superscripts like ^{2}^{2}^{2}"""
        result = []
        i = 0

        while i < len(expr):
            if expr[i] == '^' and i + 1 < len(expr) and expr[i + 1] == '{':
                # Find matching closing brace
                brace_count = 0
                start = i + 2
                end = start

                for j in range(start, len(expr)):
                    if expr[j] == '{':
                        brace_count += 1
                    elif expr[j] == '}':
                        if brace_count == 0:
                            end = j
                            break
                        brace_count -= 1

                if end > start:
                    superscript_content = expr[start:end]
                    result.append(f"TR{{{superscript_content}}}")
                    i = end + 1
                else:
                    result.append(expr[i])
                    i += 1
            else:
                if expr[i] not in '{}':
                    result.append(expr[i])
                i += 1

        return result

    def parse_subscript(self, expr: str) -> List[str]:
        """Parse subscripts like _{content}"""
        result = []
        i = 0

        while i < len(expr):
            if expr[i] == '_' and i + 1 < len(expr) and expr[i + 1] == '{':
                # Find matching closing brace
                brace_count = 0
                start = i + 2
                end = start

                for j in range(start, len(expr)):
                    if expr[j] == '{':
                        brace_count += 1
                    elif expr[j] == '}':
                        if brace_count == 0:
                            end = j
                            break
                        brace_count -= 1

                if end > start:
                    subscript_content = expr[start:end]
                    result.append(f"BR{{{subscript_content}}}")
                    i = end + 1
                else:
                    result.append(expr[i])
                    i += 1
            else:
                if expr[i] not in '{}':
                    result.append(expr[i])
                i += 1

        return result

    def parse_parentheses(self, expr: str) -> List[str]:
        """Parse parentheses and brackets"""
        result = []
        i = 0

        while i < len(expr):
            if expr[i] == '(':
                result.append('L{(}')
            elif expr[i] == ')':
                result.append('R{)}')
            elif expr[i] == '[':
                result.append('L{[}')
            elif expr[i] == ']':
                result.append('R{]}')
            elif expr[i] == '{':
                result.append('L{{}')
            elif expr[i] == '}':
                result.append('R{}}')
            else:
                result.append(expr[i])
            i += 1

        return result

    def tokenize_expression(self, expr: str) -> List[str]:
        """Tokenize mathematical expression into components"""
        # Remove spaces
        expr = expr.replace(' ', '').replace('\\,', '')

        # Convert LaTeX symbols to Unicode
        expr = self.convert_latex_to_symbol(expr)

        tokens = []
        i = 0

        while i < len(expr):
            char = expr[i]

            # Handle multi-character operators and symbols
            if i + 1 < len(expr):
                two_char = expr[i:i+2]
                if two_char in ['dx', 'dy', 'dz', 'dt', 'du', 'dv', 'dw']:
                    tokens.extend(['d', two_char[1]])
                    i += 2
                    continue

            # Single character
            if char not in ' \t\n':
                tokens.append(char)

            i += 1

        return tokens

    def generate_embedding(self, latex_expr: str) -> List[str]:
        """Generate embedding for LaTeX mathematical expression"""
        # Clean the expression
        expr = latex_expr.strip()

        # Handle integral with limits
        if '\\int' in expr:
            expr_without_limits, lower, upper = self.parse_limits(expr)

            embedding = []

            # Add integral symbol with limits
            if lower is not None and upper is not None:
                lower_converted = self.convert_latex_to_symbol(lower)
                upper_converted = self.convert_latex_to_symbol(upper)
                embedding.append(f"∫B{{{lower_converted}}}T{{{upper_converted}}}")
            else:
                embedding.append('∫')

            # Process the rest of the expression
            remaining = expr_without_limits.replace('\\int', '')
            tokens = self.tokenize_expression(remaining)

            for token in tokens:
                if token == '(':
                    embedding.append('L{(}')
                elif token == ')':
                    embedding.append('R{)}')
                else:
                    embedding.append(token)

            return embedding

        # Handle expressions with superscripts
        elif '^{' in expr:
            # Handle nested superscripts
            base_expr = re.split(r'\^{', expr)[0]
            base_tokens = self.tokenize_expression(base_expr)

            embedding = []
            for token in base_tokens:
                embedding.append(token)

            # Parse superscripts
            superscript_parts = self.parse_superscript(expr[len(base_expr):])
            embedding.extend(superscript_parts)

            return embedding

        # Handle simple expressions
        else:
            tokens = self.tokenize_expression(expr)
            embedding = []

            for token in tokens:
                if token == '(':
                    embedding.append('L{(}')
                elif token == ')':
                    embedding.append('R{)}')
                else:
                    embedding.append(token)

            return embedding

    def format_embedding(self, embedding: List[str]) -> str:
        """Format embedding as a clean string"""
        return '[' + ','.join(embedding) + ']'

# Example usage and testing
def main():
    generator = LaTeXEmbeddingGenerator()

    # Test cases
    test_cases = [
        r'\int\limits_{0}^{1} f(x)\,dx',
        r'x^{2}^{2}^{2} + 1',
        r'\int_{0}^{\infty} e^{-x} dx',
        r'x^{2} + y^{2} = r^{2}',
        r'\sum_{i=1}^{n} x_i',
        r'f(x) = ax^{2} + bx + c',
        r'\frac{dy}{dx} = y\prime',
    ]

    print("LaTeX Expression Embedding Generator")
    print("=" * 50)

    for expr in test_cases:
        try:
            embedding = generator.generate_embedding(expr)
            formatted = generator.format_embedding(embedding)
            print(f"Input:  {expr}")
            print(f"Output: {formatted}")
            print("-" * 30)
        except Exception as e:
            print(f"Error processing '{expr}': {e}")
            print("-" * 30)

    # Interactive mode
    print("\nInteractive Mode (type 'quit' to exit):")
    while True:
        try:
            user_input = input("\nEnter LaTeX expression: ").strip()
            if user_input.lower() in ['quit', 'exit', 'q']:
                break

            if user_input:
                embedding = generator.generate_embedding(user_input)
                formatted = generator.format_embedding(embedding)
                print(f"Embedding: {formatted}")

        except KeyboardInterrupt:
            print("\nGoodbye!")
            break
        except Exception as e:
            print(f"Error: {e}")

if __name__ == "__main__":
    main()

In [None]:

import re
import latexcodec
from typing import List, Tuple, Dict

class LaTeXEmbeddingGenerator:
    def __init__(self):
        # Minimal fallback symbol mappings only for cases where latexcodec might fail
        self.symbol_map = {
            r'\int': '∫',
            r'\sum': '∑',
            r'\prod': '∏',
            r'\infty': '∞',
            r'\partial': '∂',
        }

    def convert_latex_to_symbol(self, latex_str: str) -> str:
        """Convert LaTeX commands to Unicode symbols using latexcodec"""
        result = latex_str

        # Use latexcodec for LaTeX command conversion
        try:
            # Extract LaTeX commands and convert them
            latex_commands = re.findall(r'\\[a-zA-Z]+', result)
            for cmd in latex_commands:
                try:
                    # Try different formats for latexcodec
                    symbol = latexcodec.latex_decode(cmd + '{}')
                    if symbol != cmd + '{}':  # If conversion was successful
                        result = result.replace(cmd, symbol)
                    else:
                        # Try without braces
                        symbol = latexcodec.latex_decode(cmd)
                        if symbol != cmd:
                            result = result.replace(cmd, symbol)
                except:
                    # Fallback to custom mapping only if latexcodec fails
                    if cmd in self.symbol_map:
                        result = result.replace(cmd, self.symbol_map[cmd])
        except:
            pass

        return result

    def parse_limits(self, expr: str) -> Tuple[str, str, str, str]:
        """Parse \limits_{lower}^{upper} or _{lower}^{upper}"""
        # Match limits pattern for integrals
        limits_pattern = r'\\limits_\{([^}]*)\}\^\{([^}]*)\}'
        match = re.search(limits_pattern, expr)

        if match:
            lower = match.group(1)
            upper = match.group(2)
            # Remove the limits part from expression
            expr_without_limits = expr[:match.start()] + expr[match.end():]
            return expr_without_limits, lower, upper, 'integral'

        # Check for subscript/superscript without \limits (for sum, prod, etc.)
        sub_sup_pattern = r'_\{([^}]*)\}\^\{([^}]*)\}'
        match = re.search(sub_sup_pattern, expr)

        if match:
            lower = match.group(1)
            upper = match.group(2)
            expr_without_limits = expr[:match.start()] + expr[match.end():]
            return expr_without_limits, lower, upper, 'sum_prod'

        return expr, None, None, None

    def parse_fraction(self, expr: str) -> Tuple[str, str, str]:
        """Parse \frac{numerator}{denominator}"""
        frac_pattern = r'\\frac\{([^}]*)\}\{([^}]*)\}'
        match = re.search(frac_pattern, expr)

        if match:
            numerator = match.group(1)
            denominator = match.group(2)
            # Remove the fraction part from expression
            expr_without_frac = expr[:match.start()] + expr[match.end():]
            return expr_without_frac, numerator, denominator

        return expr, None, None
    def parse_superscript(self, expr: str) -> List[str]:
        """Parse nested superscripts like ^{2}^{2}^{2}"""
        result = []
        i = 0

        while i < len(expr):
            if expr[i] == '^' and i + 1 < len(expr) and expr[i + 1] == '{':
                # Find matching closing brace
                brace_count = 0
                start = i + 2
                end = start

                for j in range(start, len(expr)):
                    if expr[j] == '{':
                        brace_count += 1
                    elif expr[j] == '}':
                        if brace_count == 0:
                            end = j
                            break
                        brace_count -= 1

                if end > start:
                    superscript_content = expr[start:end]
                    # Tokenize the superscript content
                    tokens = self.tokenize_expression(superscript_content)
                    result.append(f"TR{{{','.join(tokens)}}}")
                    i = end + 1
                else:
                    result.append(expr[i])
                    i += 1
            else:
                if expr[i] not in '{}':
                    result.append(expr[i])
                i += 1

        return result

    def parse_subscript(self, expr: str) -> List[str]:
        """Parse subscripts like _{content}"""
        result = []
        i = 0

        while i < len(expr):
            if expr[i] == '_' and i + 1 < len(expr) and expr[i + 1] == '{':
                # Find matching closing brace
                brace_count = 0
                start = i + 2
                end = start

                for j in range(start, len(expr)):
                    if expr[j] == '{':
                        brace_count += 1
                    elif expr[j] == '}':
                        if brace_count == 0:
                            end = j
                            break
                        brace_count -= 1

                if end > start:
                    subscript_content = expr[start:end]
                    result.append(f"BR{{{subscript_content}}}")
                    i = end + 1
                else:
                    result.append(expr[i])
                    i += 1
            else:
                if expr[i] not in '{}':
                    result.append(expr[i])
                i += 1

        return result

    def parse_parentheses(self, expr: str) -> List[str]:
        """Parse parentheses and brackets"""
        result = []
        i = 0

        while i < len(expr):
            if expr[i] == '(':
                result.append('L{(}')
            elif expr[i] == ')':
                result.append('R{)}')
            elif expr[i] == '[':
                result.append('L{[}')
            elif expr[i] == ']':
                result.append('R{]}')
            elif expr[i] == '{':
                result.append('L{{}')
            elif expr[i] == '}':
                result.append('R{}}')
            else:
                result.append(expr[i])
            i += 1

        return result

    def tokenize_expression(self, expr: str) -> List[str]:
        """Tokenize mathematical expression into components"""
        # Remove spaces
        expr = expr.replace(' ', '').replace('\\,', '')

        # Convert LaTeX symbols to Unicode
        expr = self.convert_latex_to_symbol(expr)

        tokens = []
        i = 0

        while i < len(expr):
            char = expr[i]

            # Handle multi-character operators and symbols
            if i + 1 < len(expr):
                two_char = expr[i:i+2]
                if two_char in ['dx', 'dy', 'dz', 'dt', 'du', 'dv', 'dw']:
                    tokens.extend(['d', two_char[1]])
                    i += 2
                    continue

            # Handle parentheses
            if char == '(':
                tokens.append('L{(}')
            elif char == ')':
                tokens.append('R{)}')
            # Single character
            elif char not in ' \t\n':
                tokens.append(char)

            i += 1

        return tokens

    def generate_embedding(self, latex_expr: str) -> List[str]:
        """Generate embedding for LaTeX mathematical expression"""
        # Clean the expression
        expr = latex_expr.strip()

        # Handle fractions first
        if '\\frac' in expr:
            expr_without_frac, numerator, denominator = self.parse_fraction(expr)

            embedding = []

            # Process any remaining part before fraction
            if expr_without_frac.strip():
                remaining_tokens = self.tokenize_expression(expr_without_frac)
                embedding.extend(remaining_tokens)

            # Add fraction symbol with numerator on top, denominator on bottom
            if numerator is not None and denominator is not None:
                num_tokens = self.tokenize_expression(numerator)
                den_tokens = self.tokenize_expression(denominator)
                embedding.append(f"÷T{{{','.join(num_tokens)}}}B{{{','.join(den_tokens)}}}")

            return embedding

        # Handle integral with limits
        elif '\\int' in expr:
            expr_without_limits, lower, upper, limit_type = self.parse_limits(expr)

            embedding = []

            # Add integral symbol with limits (use B for bottom, T for top)
            if lower is not None and upper is not None:
                lower_converted = self.convert_latex_to_symbol(lower)
                upper_converted = self.convert_latex_to_symbol(upper)
                embedding.append(f"∫B{{{lower_converted}}}T{{{upper_converted}}}")
            else:
                embedding.append('∫')

            # Process the rest of the expression
            remaining = expr_without_limits.replace('\\int', '')
            tokens = self.tokenize_expression(remaining)

            # Add tokens
            embedding.extend(tokens)

            return embedding

        # Handle sum/product with limits
        elif '\\sum' in expr or '\\prod' in expr:
            expr_without_limits, lower, upper, limit_type = self.parse_limits(expr)

            embedding = []

            # Add sum/product symbol with limits (use BR for bottom-right, TR for top-right)
            if '\\sum' in expr:
                symbol = self.convert_latex_to_symbol('\\sum')
            else:
                symbol = self.convert_latex_to_symbol('\\prod')

            if lower is not None and upper is not None:
                lower_converted = self.convert_latex_to_symbol(lower)
                upper_converted = self.convert_latex_to_symbol(upper)
                embedding.append(f"{symbol}BR{{{lower_converted}}}TR{{{upper_converted}}}")
            else:
                embedding.append(symbol)

            # Process the rest of the expression
            remaining = expr_without_limits.replace('\\sum', '').replace('\\prod', '')

            # Handle subscripts in the remaining expression
            remaining = self.handle_remaining_subscripts(remaining)
            tokens = self.tokenize_expression(remaining)

            embedding.extend(tokens)

            return embedding

        # Handle expressions with superscripts
        elif '^{' in expr:
            # Find the base expression (everything before the first ^{)
            base_match = re.match(r'([^\\^]*?)(\^{.*)', expr)
            if base_match:
                base_expr = base_match.group(1)
                superscript_expr = base_match.group(2)

                embedding = []

                # Process base expression
                base_tokens = self.tokenize_expression(base_expr)
                embedding.extend(base_tokens)

                # Parse superscripts
                superscript_parts = self.parse_superscript(superscript_expr)
                embedding.extend(superscript_parts)

                return embedding

        # Handle simple expressions
        else:
            tokens = self.tokenize_expression(expr)
            return tokens

    def handle_remaining_subscripts(self, expr: str) -> str:
        """Handle subscripts like x_i in the remaining expression"""
        # Replace x_i with xBR{i}
        subscript_pattern = r'([a-zA-Z])_([a-zA-Z0-9])'
        result = re.sub(subscript_pattern, r'\1BR{\2}', expr)
        return result

    def format_embedding(self, embedding: List[str]) -> str:
        """Format embedding as a clean string"""
        return '[' + ','.join(embedding) + ']'

# Example usage and testing
def main():
    generator = LaTeXEmbeddingGenerator()

    # Test cases
    test_cases = [
        r'\int\limits_{0}^{1} f(x)\,dx',
        r'x^{2}^{2}^{2} + 1',
        r'\sum_{i=1}^{n} x_i',
        r'\frac{dy}{dx}',
        r'ax^{2} + bx + c',
        r'e^{-x}',
        r'f(x) = ax^{2} + bx + c',
        r'\int_{0}^{\infty} e^{-x} dx',
        r'x^{2} + y^{2} = r^{2}',
    ]

    print("LaTeX Expression Embedding Generator")
    print("=" * 50)

    for expr in test_cases:
        try:
            embedding = generator.generate_embedding(expr)
            formatted = generator.format_embedding(embedding)
            print(f"Input:  {expr}")
            print(f"Output: {formatted}")
            print("-" * 30)
        except Exception as e:
            print(f"Error processing '{expr}': {e}")
            print("-" * 30)

    # Interactive mode
    print("\nInteractive Mode (type 'quit' to exit):")
    while True:
        try:
            user_input = input("\nEnter LaTeX expression: ").strip()
            if user_input.lower() in ['quit', 'exit', 'q']:
                break

            if user_input:
                embedding = generator.generate_embedding(user_input)
                formatted = generator.format_embedding(embedding)
                print(f"Embedding: {formatted}")

        except KeyboardInterrupt:
            print("\nGoodbye!")
            break
        except Exception as e:
            print(f"Error: {e}")

if __name__ == "__main__":
    main()

In [9]:

import re
import latexcodec
from typing import List, Tuple, Dict

class LaTeXEmbeddingGenerator:
    def __init__(self):
        # Minimal fallback symbol mappings only for cases where latexcodec might fail
        self.symbol_map = {
            r'\int': '∫',
            r'\sum': '∑',
            r'\prod': '∏',
            r'\infty': '∞',
            r'\partial': '∂',
        }

    def convert_latex_to_symbol(self, latex_str: str) -> str:
        """Convert LaTeX commands to Unicode symbols using latexcodec"""
        result = latex_str

        # Use latexcodec for LaTeX command conversion
        try:
            # Extract LaTeX commands and convert them
            latex_commands = re.findall(r'\\[a-zA-Z]+', result)
            for cmd in latex_commands:
                try:
                    # Try different formats for latexcodec
                    symbol = latexcodec.latex_decode(cmd + '{}')
                    if symbol != cmd + '{}':  # If conversion was successful
                        result = result.replace(cmd, symbol)
                    else:
                        # Try without braces
                        symbol = latexcodec.latex_decode(cmd)
                        if symbol != cmd:
                            result = result.replace(cmd, symbol)
                except:
                    # Fallback to custom mapping only if latexcodec fails
                    if cmd in self.symbol_map:
                        result = result.replace(cmd, self.symbol_map[cmd])
        except:
            pass

        return result

    def parse_limits_or_subscript_superscript(self, expr: str) -> Tuple[str, str, str, str]:
        """Parse limits, subscripts, and superscripts"""
        # Match limits pattern for integrals
        limits_pattern = r'\\limits_\{([^}]*)\}\^\{([^}]*)\}'
        match = re.search(limits_pattern, expr)

        if match:
            lower = match.group(1)
            upper = match.group(2)
            expr_without_limits = expr[:match.start()] + expr[match.end():]
            return expr_without_limits, lower, upper, 'integral'

        # Check for subscript/superscript without \limits (for sum, prod, etc.)
        sub_sup_pattern = r'_\{([^}]*)\}\^\{([^}]*)\}'
        match = re.search(sub_sup_pattern, expr)

        if match:
            lower = match.group(1)
            upper = match.group(2)
            expr_without_limits = expr[:match.start()] + expr[match.end():]
            return expr_without_limits, lower, upper, 'sum_prod'

        return expr, None, None, None

    def parse_fraction(self, expr: str) -> Tuple[str, str, str]:
        """Parse \frac{numerator}{denominator}"""
        frac_pattern = r'\\frac\{([^}]*)\}\{([^}]*)\}'
        match = re.search(frac_pattern, expr)

        if match:
            numerator = match.group(1)
            denominator = match.group(2)
            expr_without_frac = expr[:match.start()] + expr[match.end():]
            return expr_without_frac, numerator, denominator

        return expr, None, None

    def parse_nested_superscripts(self, expr: str) -> str:
        """Parse nested superscripts like x^{2}^{2}^{2} into xTR{2TR{2TR{2}}}"""
        result = ""
        i = 0

        while i < len(expr):
            if expr[i:i+2] == '^{':
                # Start building nested superscript
                superscript_content = ""
                brace_count = 1
                j = i + 2

                while j < len(expr) and brace_count > 0:
                    if expr[j] == '{':
                        brace_count += 1
                    elif expr[j] == '}':
                        brace_count -= 1

                    if brace_count > 0:
                        superscript_content += expr[j]
                    j += 1

                # Check if there are more superscripts following
                remaining = expr[j:]
                if remaining.startswith('^{'):
                    # Parse the remaining superscripts recursively
                    nested_super = self.parse_nested_superscripts(remaining)
                    result += f"TR{{{superscript_content}{nested_super}}}"
                    break
                else:
                    result += f"TR{{{superscript_content}}}"
                    i = j
            else:
                result += expr[i]
                i += 1

        return result

    def process_subscripts_and_superscripts(self, expr: str) -> str:
        """Process subscripts and superscripts in expressions"""
        result = expr

        # Handle nested superscripts first
        if '^{' in result:
            # Find base expression and superscripts
            base_match = re.match(r'([^\\^]*?)(\^{.*)', result)
            if base_match:
                base = base_match.group(1)
                superscripts = base_match.group(2)
                nested_super = self.parse_nested_superscripts(superscripts)
                result = base + nested_super

        # Handle simple subscripts like x_i
        subscript_pattern = r'([a-zA-Z])_([a-zA-Z0-9])'
        result = re.sub(subscript_pattern, r'\1BR{\2}', result)

        return result

    def tokenize_basic_expression(self, expr: str) -> List[str]:
        """Tokenize a basic expression into individual components"""
        # Remove spaces and \,
        expr = expr.replace(' ', '').replace('\\,', '')

        # Convert LaTeX symbols to Unicode
        expr = self.convert_latex_to_symbol(expr)

        # Process subscripts and superscripts
        expr = self.process_subscripts_and_superscripts(expr)

        tokens = []
        i = 0

        while i < len(expr):
            char = expr[i]

            # Handle multi-character patterns
            if i + 1 < len(expr):
                two_char = expr[i:i+2]
                if two_char in ['dx', 'dy', 'dz', 'dt', 'du', 'dv', 'dw']:
                    tokens.extend(['d', two_char[1]])
                    i += 2
                    continue

            # Handle function calls like f(x) -> f,xL{(}R{)}
            if char.isalpha() and i + 1 < len(expr) and expr[i + 1] == '(':
                # Find matching closing parenthesis
                paren_count = 0
                j = i + 1
                while j < len(expr):
                    if expr[j] == '(':
                        paren_count += 1
                    elif expr[j] == ')':
                        paren_count -= 1
                        if paren_count == 0:
                            break
                    j += 1

                if paren_count == 0:  # Found matching parenthesis
                    func_name = expr[i]
                    args = expr[i+2:j]  # Content between parentheses
                    tokens.append(func_name)
                    tokens.append(args + 'L{(}R{)}')
                    i = j + 1
                    continue

            # Handle TR{...} patterns
            if expr[i:i+3] == 'TR{':
                brace_count = 1
                j = i + 3
                content = ""
                while j < len(expr) and brace_count > 0:
                    if expr[j] == '{':
                        brace_count += 1
                    elif expr[j] == '}':
                        brace_count -= 1

                    if brace_count > 0:
                        content += expr[j]
                    j += 1

                tokens.append(f"TR{{{content}}}")
                i = j
                continue

            # Handle BR{...} patterns
            if expr[i:i+3] == 'BR{':
                brace_count = 1
                j = i + 3
                content = ""
                while j < len(expr) and brace_count > 0:
                    if expr[j] == '{':
                        brace_count += 1
                    elif expr[j] == '}':
                        brace_count -= 1

                    if brace_count > 0:
                        content += expr[j]
                    j += 1

                tokens.append(f"BR{{{content}}}")
                i = j
                continue

            # Handle regular parentheses
            if char == '(':
                tokens.append('L{(}')
            elif char == ')':
                tokens.append('R{)}')
            elif char not in ' \t\n{}':
                tokens.append(char)

            i += 1

        return tokens

    def generate_embedding(self, latex_expr: str) -> List[str]:
        """Generate embedding for LaTeX mathematical expression"""
        expr = latex_expr.strip()

        # Handle fractions first
        if '\\frac' in expr:
            expr_without_frac, numerator, denominator = self.parse_fraction(expr)

            embedding = []

            # Process any remaining part before fraction
            if expr_without_frac.strip():
                remaining_tokens = self.tokenize_basic_expression(expr_without_frac)
                embedding.extend(remaining_tokens)

            # Add fraction
            if numerator and denominator:
                num_tokens = self.tokenize_basic_expression(numerator)
                den_tokens = self.tokenize_basic_expression(denominator)
                embedding.append(f"÷T{{{','.join(num_tokens)}}}B{{{','.join(den_tokens)}}}")

            return embedding

        # Handle integral with limits
        elif '\\int' in expr:
            expr_without_limits, lower, upper, limit_type = self.parse_limits_or_subscript_superscript(expr)

            embedding = []

            # Add integral symbol with limits
            if lower and upper:
                lower_converted = self.convert_latex_to_symbol(lower)
                upper_converted = self.convert_latex_to_symbol(upper)
                embedding.append(f"∫B{{{lower_converted}}}T{{{upper_converted}}}")
            else:
                embedding.append('∫')

            # Process remaining expression
            remaining = expr_without_limits.replace('\\int', '')
            if remaining.strip():
                tokens = self.tokenize_basic_expression(remaining)
                embedding.extend(tokens)

            return embedding

        # Handle sum/product with limits
        elif '\\sum' in expr or '\\prod' in expr:
            expr_without_limits, lower, upper, limit_type = self.parse_limits_or_subscript_superscript(expr)

            embedding = []

            # Add sum/product symbol with limits
            if '\\sum' in expr:
                symbol = self.convert_latex_to_symbol('\\sum')
            else:
                symbol = self.convert_latex_to_symbol('\\prod')

            if lower and upper:
                lower_converted = self.convert_latex_to_symbol(lower)
                upper_converted = self.convert_latex_to_symbol(upper)
                embedding.append(f"{symbol}BR{{{lower_converted}}}TR{{{upper_converted}}}")
            else:
                embedding.append(symbol)

            # Process remaining expression
            remaining = expr_without_limits.replace('\\sum', '').replace('\\prod', '')
            if remaining.strip():
                tokens = self.tokenize_basic_expression(remaining)
                embedding.extend(tokens)

            return embedding

        # Handle other expressions
        else:
            return self.tokenize_basic_expression(expr)

    def format_embedding(self, embedding: List[str]) -> str:
        """Format embedding as a clean string"""
        return '[' + ','.join(embedding) + ']'

# Example usage and testing
def main():
    generator = LaTeXEmbeddingGenerator()

    # Test cases
    test_cases = [
           r'\int\limits_{0}^{1} f(x)\,dx',        r'x^{2}^{2}^{2} + 1',
        r'\int_{0}^{\infty} e^{x} dx',
        r'x^{2} + y^{2} = r^{2}',
        r'\sum_{i=1}^{n} x_i',
        r'f(x) = ax^{2} + bx + c',
        r'\frac{dy}{dx} = y\prime',    ]

    print("LaTeX Expression Embedding Generator")
    print("=" * 50)

    for expr in test_cases:
        try:
            embedding = generator.generate_embedding(expr)
            formatted = generator.format_embedding(embedding)
            print(f"Input:  {expr}")
            print(f"Output: {formatted}")
            print("-" * 30)
        except Exception as e:
            print(f"Error processing '{expr}': {e}")
            print("-" * 30)

    # Interactive mode
    print("\nInteractive Mode (type 'quit' to exit):")
    while True:
        try:
            user_input = input("\nEnter LaTeX expression: ").strip()
            if user_input.lower() in ['quit', 'exit', 'q']:
                break

            if user_input:
                embedding = generator.generate_embedding(user_input)
                formatted = generator.format_embedding(embedding)
                print(f"Embedding: {formatted}")

        except KeyboardInterrupt:
            print("\nGoodbye!")
            break
        except Exception as e:
            print(f"Error: {e}")

if __name__ == "__main__":
    main()

LaTeX Expression Embedding Generator
Input:  \int\limits_{0}^{1} f(x)\,dx
Output: [∫B{0}T{1},f,xL{(}R{)},d,x]
------------------------------
Input:  x^{2}^{2}^{2} + 1
Output: [x,TR{2TR{2TR{2}+1}}]
------------------------------
Input:  \int_{0}^{\infty} e^{x} dx
Output: [∫B{0}T{∞},e,TR{x},d,x]
------------------------------
Input:  x^{2} + y^{2} = r^{2}
Output: [x,TR{2},+,y,TR{2},=,r,TR{2}]
------------------------------
Input:  \sum_{i=1}^{n} x_i
Output: [∑BR{i=1}TR{n},x,BR{i}]
------------------------------
Input:  f(x) = ax^{2} + bx + c
Output: [f,xL{(}R{)},=,a,x,TR{2},+,b,x,+,c]
------------------------------
Input:  \frac{dy}{dx} = y\prime
Output: [=,y,\,p,r,i,m,e,÷T{d,y}B{d,x}]
------------------------------

Interactive Mode (type 'quit' to exit):

Enter LaTeX expression: quit
