In [1]:
#import zipfile
import tarfile
import io
import importlib
import os
import regex as re
import glob
import pandas as pd
import itertools as itr
import pprint

In [2]:
import pyperclip   #copy text to clipboard for inspecting

In [3]:
from tqdm.auto import tqdm

In [4]:
from IPython.core.interactiveshell import InteractiveShell
# pretty print all cell's output and not just the last one
InteractiveShell.ast_node_interactivity = "all"

In [5]:
import TexSoup as TS
from TexSoup.tokens import MATH_ENV_NAMES
# -

In [6]:
TS.__file__

'/Users/cjc73/miniconda3/envs/cforge/envs/orig_texsoup/lib/python3.11/site-packages/TexSoup/__init__.py'

In [7]:
TS.__file__

'/Users/cjc73/miniconda3/envs/cforge/envs/orig_texsoup/lib/python3.11/site-packages/TexSoup/__init__.py'

In [8]:
#importlib.reload(TS)

In [9]:
LOCAL_DATA_PATH = './data/2201_00_all/'

In [10]:
def pre_format(text):
    '''Insert separating spaces into LaTeX source so it is easier to parse.

    Three literal substitutions are applied, in this order:

    1. an escaped closing brace directly followed by a backslash gets a
       space between the two,
    2. a closing parenthesis directly followed by a closing brace gets a
       space between the two,
    3. a closing parenthesis directly followed by a dollar sign gets a
       space between the two.

    Args:
        text: LaTeX source as a single string.

    Returns:
        A new string with the substitutions applied; ``text`` itself is
        not modified.
    '''
    # (needle, replacement) pairs. These are ordinary (non-raw) literals,
    # so each '\\' below stands for one backslash character.
    substitutions = (
        ('\\}\\', '\\} \\'),
        (')}', ') }'),
        (')$', ') $'),
    )
    spaced = text
    for needle, replacement in substitutions:
        spaced = spaced.replace(needle, replacement)
    return spaced

In [11]:
def find_doc_class(wrapped_file, name_match=False):
    '''Scan a file for a document class declaration and return a score code.

    Args:
        wrapped_file: Iterable of text lines (e.g. an open text file).
        name_match: Whether the caller considers the file's name to look
            like a main file. Only then can a non-zero code be returned.

    Returns:
        ``1`` if ``name_match`` is true and the first document class/style
        line is an ordinary one, ``-99999`` if ``name_match`` is true and
        that line names the ``standalone`` or ``subfiles`` class, and ``0``
        otherwise.

    Only the first document class line is inspected when ``name_match`` is
    true, so a later standalone/subfiles declaration is not noticed.
    NOTE(review): with ``name_match=False`` every line is still read but
    the result is always ``0``, even when a document class line is
    present -- confirm that this is intended.
    '''
    doc_class_pat = re.compile(r"^\s*\\document(?:style|class)")
    sub_doc_class = re.compile(r"^\s*\\document(?:style|class).*(?:\{standalone\}|\{subfiles\})")

    for text_line in wrapped_file:
        if not doc_class_pat.search(text_line):
            continue
        if not name_match:
            # Keep scanning; nothing is returned for this line.
            continue
        # First document class line decides the result.
        return -99999 if sub_doc_class.search(text_line) else 1

    return 0


def find_main_tex_source_in_tar(tar_path, encoding='utf-8'):
    '''Identify the main TeX file in a tar archive.

    Every ``.tex`` member is scored as ``find_doc_class(...) - depth``,
    where ``depth`` is the number of ``/`` separators in the member name,
    and the member with the highest score is returned. On a tie the member
    that appears first in the archive wins.

    Args:
        tar_path: Path to a tar archive (opened with transparent
            compression detection) of a directory containing TeX source
            and support files.
        encoding: Text encoding used to decode the ``.tex`` members while
            they are scanned.

    Returns:
        The archive member name of the chosen ``.tex`` file.

    Raises:
        ValueError: If the archive contains no readable ``.tex`` file.
        UnicodeDecodeError: If a scanned member is not valid ``encoding``.
    '''
    # Substrings of the member path that suggest a main file.
    tex_names = ("paper", "main", "ms.", "article")

    with tarfile.open(tar_path, 'r') as in_tar:
        # Skip directories that happen to end in ".tex" (they cannot be
        # read) and collapse repeated member names, keeping archive order.
        tex_files = list(dict.fromkeys(
            member.name
            for member in in_tar.getmembers()
            if member.name.endswith('.tex') and not member.isdir()
        ))

        if not tex_files:
            raise ValueError(f"no .tex file found in {tar_path!r}")

        # got one file
        if len(tex_files) == 1:
            return tex_files[0]

        main_files = {}
        for tf in tex_files:
            fp = in_tar.extractfile(tf)
            if fp is None:
                # Not a regular file or link; nothing to scan.
                continue
            depth = len(tf.split('/')) - 1
            has_main_name = any(kw in tf for kw in tex_names)
            # newline=None enables universal newlines; the context manager
            # closes the wrapper even if decoding fails part-way through.
            with io.TextIOWrapper(fp, newline=None, encoding=encoding) as wrapped_file:
                main_files[tf] = find_doc_class(wrapped_file, name_match=has_main_name) - depth

        if not main_files:
            raise ValueError(f"no readable .tex file found in {tar_path!r}")

        # account for multi-file submissions
        return max(main_files, key=main_files.get)

In [12]:
def soup_from_tar(tar_path, encoding='utf-8', tolerance=0):
    '''Parse the main TeX file of a tar archive into a TexSoup tree.

    Args:
        tar_path: Path to the tar archive.
        encoding: Encoding used both to pick and to decode the main file.
        tolerance: Passed straight through to ``TS.TexSoup``.

    Returns:
        The object returned by ``TS.TexSoup`` for the pre-formatted source.
    '''
    main_name = find_main_tex_source_in_tar(tar_path, encoding=encoding)
    with tarfile.open(tar_path, 'r') as archive:
        # newline=None turns on universal-newline translation while decoding.
        reader = io.TextIOWrapper(
            archive.extractfile(main_name), newline=None, encoding=encoding
        )
        latex = pre_format(reader.read())
        # skip_envs presumably leaves the math environments unparsed -- confirm
        # against the installed TexSoup version.
        return TS.TexSoup(latex, tolerance=tolerance, skip_envs=MATH_ENV_NAMES)

In [13]:
def source_from_tar(tar_path, encoding='utf-8'):
    '''Return the pre-formatted text of the main TeX file in a tar archive.

    Args:
        tar_path: Path to the tar archive.
        encoding: Encoding used both to pick and to decode the main file.

    Returns:
        The decoded source (universal newlines) after ``pre_format``.
    '''
    main_name = find_main_tex_source_in_tar(tar_path, encoding=encoding)
    with tarfile.open(tar_path, 'r') as archive:
        member = archive.extractfile(main_name)
        # newline=None turns on universal-newline translation while decoding.
        raw_text = io.TextIOWrapper(member, newline=None, encoding=encoding).read()
        return pre_format(raw_text)

In [14]:
# Alternates which half find_bad() tries first on successive calls.
swap = itr.cycle((True, False))

def find_bad(current_text_lines, tolerance=0):
    '''Split the lines in two and return the first half that fails to parse.

    Args:
        current_text_lines: List of source lines to bisect.
        tolerance: Passed to ``TS.TexSoup``. The original code referred to
            an undefined name here, so every call raised ``NameError``,
            which the bare ``except`` swallowed; the first half was then
            always reported as bad.

    Returns:
        The half (a list of lines) whose parse raised an exception, or the
        string ``"--"`` when both halves parse on their own.
    '''
    mid = len(current_text_lines) // 2
    part_a = current_text_lines[:mid]
    part_b = current_text_lines[mid:]
    if next(swap):
        part_b, part_a = part_a, part_b

    for part in (part_a, part_b):
        try:
            TS.TexSoup("\n".join(part), tolerance=tolerance, skip_envs=MATH_ENV_NAMES)
        except Exception:
            # Any parser failure marks this half as the culprit.
            # KeyboardInterrupt is not an Exception subclass, so it still
            # propagates and interrupts the search.
            return part
    return "--"
    

def find_bad_lines(tar_path, encoding='utf-8'):
    '''Bisect the main TeX file of an archive down to lines that fail to parse.

    The pre-formatted source is split into lines and repeatedly halved with
    ``find_bad`` until a single line remains or ``find_bad`` returns its
    input unchanged.

    Args:
        tar_path: Path to the tar archive.
        encoding: Encoding used to pick and decode the main file.

    Returns:
        The last value produced by ``find_bad`` (a list of lines, or the
        string ``"--"`` when both halves parsed on their own). If the
        source has fewer than two lines, the lines themselves are returned.
    '''
    current_text = source_from_tar(tar_path, encoding=encoding).splitlines()

    # Start from the full text so the result is defined even when the loop
    # body never runs (empty or single-line source).
    bad_half = current_text
    while len(current_text) > 1:
        bad_half = find_bad(current_text)
        if current_text == bad_half:
            break
        current_text = bad_half

    return bad_half

In [15]:
def show_context(text_path, offset, context_size=50):
    '''Return the text that follows a character offset in a text file.

    The file is decoded as UTF-8, falling back to latin-1 when it is not
    valid UTF-8, and the result is sliced by character position. Seeking a
    text-mode file to an arbitrary integer (as the previous version did)
    can land in the middle of a multi-byte character.

    Args:
        text_path: Path to a plain text file.
        offset: Zero-based character offset at which the context starts.
        context_size: Maximum number of characters to return; ``None`` or
            a negative value returns everything after ``offset``.

    Returns:
        Up to ``context_size`` characters starting at ``offset``; an empty
        string if ``offset`` is past the end of the file.

    Raises:
        ValueError: If ``offset`` is negative.
        OSError: If the file cannot be opened.
    '''
    if offset < 0:
        raise ValueError(f"offset must be non-negative, got {offset}")

    for encoding in ('utf-8', 'latin-1'):
        try:
            with open(text_path, 'r', encoding=encoding) as file:
                text = file.read()
        except UnicodeDecodeError:
            # Not valid UTF-8. latin-1 maps every byte to a character, so
            # the second attempt cannot fail to decode.
            continue
        if context_size is None or context_size < 0:
            return text[offset:]
        return text[offset:offset + context_size]

# file_path = 'Ising_v2.tex'
# offset_position = 805
# context = show_context(file_path, offset_position)
# print("Error context at offset 805:", context)

In [16]:
show_context(infile_path, 8584)

NameError: name 'infile_path' is not defined

## Check a file with parse errors

In [16]:
# Minimal reproduction: a bare command inside a brace group.
min_example=r"""
{\subsection}
""".strip()
try:
    TS.TexSoup(pre_format(min_example), tolerance=0)
except (AssertionError, TypeError) as e:
    # The failure recorded for this input is a TypeError ("Malformed
    # argument"), which `except AssertionError` alone did not catch.
    print(f"{type(e).__name__}: {e}")

TypeError: [Line: 0, Offset 0] Malformed argument. First and last elements must match a valid argument format. In this case, TexSoup could not find matching punctuation for: {.
Just finished parsing: ['{', TexCmd('subsection', [BraceGroup('}')])]

In [17]:
# Minimal reproduction: \renewcommand whose replacement text is a bare command.
min_example=r"""
\renewcommand{\tilde}{\widetilde}
""".strip()
try:
    TS.TexSoup(pre_format(min_example), tolerance=0)
except (AssertionError, TypeError) as e:
    # The failure recorded for this input is a TypeError ("Malformed
    # argument"), which `except AssertionError` alone did not catch.
    print(f"{type(e).__name__}: {e}")

TypeError: [Line: 0, Offset 21] Malformed argument. First and last elements must match a valid argument format. In this case, TexSoup could not find matching punctuation for: {.
Just finished parsing: ['{', TexCmd('widetilde', [BraceGroup('}')])]

In [18]:
# Parse one archive and list its title and section headings.
infile_path = "./data/2201_00_all/2201.00001v1.tar.gz" #'./data/2201_samp/2201.00048v1.tar.gz'

# Put the pre-formatted source on the clipboard so it can be inspected
# in an external editor.
text = source_from_tar(infile_path)
pyperclip.copy(text)
# tolerance=1 is forwarded to TS.TexSoup by soup_from_tar.
soup = soup_from_tar(infile_path, tolerance=1)


# find() may return nothing when the document has no \title, hence the guard.
title = soup.find('title')
if title: print(f"{title.name}: {title.text}")
for sec in soup.find_all('section'):
    print(f' {sec.name}: {sec.text}')

title: ['Modeling Advection on Directed Graphs using  Mat', "\\'", 'e', 'rn Gaussian Processes for Traffic Flow']
 section: ['Introduction']
 section: ['Understanding the directed graph advection operator']
 section: ['Directed Graph Advection Mat', "\\'", 'e', 'rn Gaussian Process (DGAMGP) ']
 section: ['Numerical Results']
 section: ['Conclusions']
 section: ['Upwinding discretizations of linear advection']
 section: ['Examples of ', '$', 'L_', 'adv', '$', ' on balanced graphs resulting in finite difference discretizations of linear advection']
 section: ['Additional Experiments']


## Quick check a folder of tar files

In [None]:
# Try to parse every archive under LOCAL_DATA_PATH and tally the outcome:
#   utf_count   - parsed after decoding as UTF-8
#   latin_count - parsed after decoding as latin-1 (only tried when UTF-8
#                 decoding itself failed)
#   err_files   - maps archive path -> exception class for every failure
files = glob.glob(f'{LOCAL_DATA_PATH}/*.tar.gz')
files_count = len(files)
utf_count = 0
latin_count = 0 
err_files = {}

TOLERANCE = 0

# Two progress bars: the outer one counts failures, the inner one counts
# archives processed.
with tqdm(total=files_count, desc="errors") as err_prog:
    for tar_file in tqdm(files, desc="Progress", display=True):
        # First attempt: decode as UTF-8.
        try:
            soup = soup_from_tar(tar_file, encoding='utf-8', tolerance=TOLERANCE)
            utf_count += 1
            continue
        except EOFError as eof:
            # Recorded as a failure; latin-1 is not attempted.
            err_files[tar_file] = type(eof)
            _ = err_prog.update(1)
            continue
        except UnicodeDecodeError as ue:
            # Not valid UTF-8: fall through to the latin-1 attempt below.
            pass
        except KeyboardInterrupt as KB_err:
            # Stop the whole run but keep the counts gathered so far.
            break
        except Exception as e:
            # Any other failure is recorded; latin-1 is not attempted.
            err_files[tar_file] = type(e)
            _ = err_prog.update(1)
            continue

        # Second attempt: decode as latin-1.
        try:
            soup = soup_from_tar(tar_file, encoding='latin-1', tolerance=TOLERANCE)
            latin_count += 1
            continue
        except KeyboardInterrupt as KB_err:
            break
        except Exception as e:
            err_files[tar_file] = type(e)
            _ = err_prog.update(1)
            pass

In [None]:
print(f"{files_count} processed, {len(err_files)} failures.")
print(f"UTF8: {utf_count}; Latin1: {latin_count}")
err_files

## Scratch below here

In [24]:
show_context(infile_path, 8584)

AttributeError: 'str' object has no attribute 'decode'

In [28]:
TOLERANCE = 0
infile_path = "/Volumes/Neptune/scratch/2404/2404.08812v1.tar.gz" #'./data/2201_samp/2201.00048v1.tar.gz'

text = source_from_tar(infile_path)
pyperclip.copy(text)
soup = soup_from_tar(infile_path, tolerance=TOLERANCE)


title = soup.find('title')
if title: print(f"{title.name}: {title.text}")
for sec in soup.find_all('section'):
    print(f' {sec.name}: {sec.text}')

TypeError: [Line: 0, Offset 7484] Malformed argument. First and last elements must match a valid argument format. In this case, TexSoup could not find matching punctuation for: {.
Just finished parsing: ['{', TexCmd('cite', [BraceGroup('}'), BracketGroup('1')]), BraceGroup(TexCmd('ifnotes'), TexCmd('mbox', [BraceGroup(TexCmd('origcite', [BraceGroup('#1')]))]), TexCmd('else'), ' ', TexCmd('origcite', [BraceGroup('#1')]), TexCmd('fi')), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('strike')), BraceGroup('[')]), '1', ']', BraceGroup(TexCmd('ifnotes', [BraceGroup(TexCmd('color', [BraceGroup('gray'), BraceGroup(TexCmd('texorpdfstring', [BraceGroup(TexCmd('sout', [BraceGroup('#1')])), BraceGroup('#1')]))]))]), TexCmd('fi')), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('add')), BraceGroup('[')]), '1', ']', BraceGroup(TexCmd('ifnotes', [BraceGroup(TexCmd('leavevmode'), TexCmd('color', [BraceGroup('purple'), BraceGroup('#1')]))]), TexCmd('else', [BraceGroup('#1')]), TexCmd('fi')), '\n', TexCmd('newcommand', [BraceGroup(TexCmd('replace')), BraceGroup('[')]), '2', ']', BraceGroup(TexCmd('ifnotes', [BraceGroup(TexCmd('strike', [BraceGroup('#1')]), TexCmd('add', [BraceGroup('#2')]))]), TexCmd('else', [BraceGroup('#2')]), TexCmd('fi')), '\n', '\n', '\n', '% \\newcommand{\\chooseI}{\\scalerel*{\\includegraphics{figs/tasks/choose.pdf}~~~}{B}}', '\n', '% \\newcommand{\\activateI}{\\scalerel*{\\includegraphics{figs/tasks/activate.pdf}~~~}{B}}', '\n', '% \\newcommand{\\createI}{\\scalerel*{\\includegraphics{figs/tasks/create.pdf}~~~}{B}}', '\n', '\n', '\n', '%%%%%%%%%%%', '\n', '\n', '\n', TexNamedEnv('document', ["\n\n%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n%%%%%%%%%%%%%%%%%%%%%% START OF THE PAPER %%%%%%%%%%%%%%%%%%%%%%\n%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n\n%% The ``\\maketitle'' command must be the first command after the\n%% ``\\begin{document}'' command. 
It prepares and prints the title block.\n%% the only exception to this rule is the \\firstsection command\n% \\firstsection{Introduction}\n\n\\maketitle\n\n\n\\input{tex/Introduction}\n\\input{tex/Background}\n\\input{tex/Methodology}\n\\input{tex/TypologyOfDecisionMakingTasks}\n\\input{tex/Composability}\n\\input{tex/CaseStudies}\n\\input{tex/Evaluation}\n\\input{tex/Discussion}\n% \\input{tex/archives/FutureWork}\n\\input{tex/Conclusion}\n% \\input{tex/archives/quotes}\n\n%% if specified like this the section will be omitted in review mode\n% \\acknowledgments{%\n% \tThe authors wish to thank A, B, and C.\n%   This work was supported in part by a grant from XYZ (\\# 12345-67890).%\n% }\n\n\\bibliographystyle{abbrv-doi-hyperref}\n%\\bibliographystyle{abbrv-doi-hyperref-narrow}\n%\\bibliographystyle{abbrv-doi}\n%\\bibliographystyle{abbrv-doi-narrow}\n\n\\bibliography{main}\n\\appendix % You can use the `hideappendix` class option to skip everything after \\appendix\n\n"], []), '\n', '\n']

In [None]:
soup.find_all('section')

In [None]:
soup

In [None]:
find_bad_lines(infile_path, encoding='utf-8')

In [None]:
# Scratch copy of the body of find_main_tex_source_in_tar, with the returns
# replaced by `pass` so that `main_files` can be inspected afterwards.
# Unlike the function, it does not subtract the path depth and calls
# find_doc_class without name_match, which returns 0 for every file.
tar_path = "./data/2201_samp/2201.00008v2.tar.gz"
encoding = "utf-8"
with tarfile.open(tar_path, 'r') as in_tar:
    tex_files = [f for f in in_tar.getnames() if f.endswith('.tex')]

    # got one file
    if len(tex_files) == 1:
        pass #return tex_files[0]

    main_files = {}
    for tf in tex_files:
        fp = in_tar.extractfile(tf)
        wrapped_file = io.TextIOWrapper(fp, newline=None, encoding=encoding) #universal newlines
        # does it have a doc class?
        # get the type
        main_files[tf] = find_doc_class(wrapped_file)
        wrapped_file.close() 

    # got one file with doc class
    if len(main_files) == 1:
        pass #return(main_files.keys()[0])

    # account for multi-file submissions
    #return(max(main_files, key=main_files.get))

In [None]:
main_files

In [None]:
doc_class_pat = re.compile(r"^\s*\\document(?:style|class)")

with tarfile.open(tar_path, 'r', encoding='utf-8') as in_tar:
    #in_tar.getnames()
    fp = in_tar.extractfile('main.tex')
    wrapped_file = io.TextIOWrapper(fp, newline=None, encoding='utf-8') #universal newlines
    for line in wrapped_file:
        if doc_class_pat.search(line):
            print(line)
            break

In [None]:
next(wrapped_file)

In [None]:
min_example=r"""
\documentclass{article}
\begin{document}
% \renewcommand{\shorttitle}{Avoiding Catastrophe}
\end{document}
""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example))
#print(min_example)
TS.TexSoup(r'\newcommand{\bra}[1]{\left\langle#1\right|}')

In [None]:
TS.TexSoup(r'\def\be{\foo{equation}}')

In [None]:
TS.TexSoup(r'\renewcommand{\shorttitle}{Avoiding Catastrophe}')
min_example = r"\newenvironment{inlinemath}{$}{$}".strip()
TS.TexSoup(pre_format(min_example))
#print(min_example)

In [None]:
min_example = r"In practice, the matrix $\left [ 4 \right]\Inv\M{D}^{(1)}_n $".strip()
TS.TexSoup(pre_format(min_example))
#print(min_example)

In [None]:
min_example = r"In practice, the matrix $\left[ 4 \right]\Inv\M{D}^{(1)}_n $"


cats = TS.category.categorize(min_example)
tokens = list(TS.tokens.tokenize(cats))

char_codes = list(TS.category.categorize(min_example))

buf = TS.reader.Buffer(TS.tokens.tokenize(TS.category.categorize(r'\left[ 4 \right]')))
TS.reader.read_command(buf, n_required_args=-1, mode='mode:math', skip=1 )

In [17]:
min_example=r"""
\renewcommand{\subsection}[1]{{\textit{#1.~}}}""".strip()

cats = TS.category.categorize(min_example)
tokens = list(TS.tokens.tokenize(cats))

char_codes = list(TS.category.categorize(min_example))

with pd.option_context('display.max.columns', None, 'display.max_colwidth', 0):
    pd.DataFrame({'char':char_codes, 'code':(x.category for x in char_codes)}).transpose()
    pd.DataFrame({'tokens':tokens})

print("Cmd Math Mode")
buf = TS.reader.Buffer(TS.tokens.tokenize(TS.category.categorize(min_example)))
rd_cmd = TS.reader.read_command(buf, n_required_args=2, n_optional_args=2, mode='mode:math', tolerance=1)
pprint.pp(rd_cmd)

print("Cmd Non Math")
buf = TS.reader.Buffer(TS.tokens.tokenize(TS.category.categorize(min_example)))
rd_cmd = TS.reader.read_command(buf,  n_required_args=2, n_optional_args=2, tolerance=1)
pprint.pp(rd_cmd)

print("Reader")
buf = TS.reader.Buffer(TS.tokens.tokenize(TS.category.categorize(min_example)))
rd = TS.read(buf, tolerance=1)
pprint.pp(rd)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45
char,\,r,e,n,e,w,c,o,m,m,a,n,d,{,\,s,u,b,s,e,c,t,i,o,n,},[,1,],{,{,\,t,e,x,t,i,t,{,#,1,.,~,},},}
code,1,12,12,12,12,12,12,12,12,12,12,12,12,2,1,12,12,12,12,12,12,12,12,12,12,3,19,13,20,2,2,1,12,12,12,12,12,12,2,7,13,13,14,3,3,3


Unnamed: 0,tokens
0,\
1,renewcommand
2,{
3,\
4,subsection
5,}
6,[
7,1
8,]
9,{


Cmd Math Mode
('\\',
 [BraceGroup('renewcommand'), BraceGroup(TexCmd('subsection')), BracketGroup('1')])
Cmd Non Math
('\\',
 [BraceGroup('renewcommand'), BraceGroup(TexCmd('subsection')), BracketGroup('1')])
Reader
([TexCmd('renewcommand', [BraceGroup(TexCmd('subsection')), BracketGroup('1'), BraceGroup(BraceGroup(TexCmd('textit', [BraceGroup('#1.~')])))])],
 '\\renewcommand{\\subsection}[1]{{\\textit{#1.~}}}')


In [18]:
# Exercise TexSoup's read_args directly. The indented comment under each
# call is the expected repr of the parsed arguments; print() shows the
# plain string form instead.
from TexSoup.category import categorize
from TexSoup.tokens import tokenize
from TexSoup.reader import read_args
# Helper: tokenize a string and hand it to read_args, forwarding any
# extra positional/keyword arguments unchanged.
test = lambda s, *a, **k: read_args(tokenize(categorize(s)), *a, **k)
print(test('[walla]{walla}{ba]ng}'))  # 'regular' arg parse
    #[BracketGroup('walla'), BraceGroup('walla'), BraceGroup('ba', ']', 'ng')]
print(test('\t[wa]\n{lla}\n\n{b[ing}'))  # interspersed spacers + 2 newlines
    #[BracketGroup('wa'), BraceGroup('lla')]
print(test('\t[\t{a]}bs', 2, 0))  # use char as arg, since no opt args
    #[BraceGroup('['), BraceGroup('a', ']')]
print(test('\n[hue]\t[\t{a]}', 2, 1))  # check stop opt arg capture
    #[BracketGroup('hue'), BraceGroup('['), BraceGroup('a', ']')]
print(test('\t\\item'))
    #[]
print(test('   \t    \n\t \n{bingbang}'))
    #[]
print(test('\t[wa]\n{lla}\n\n{b[ing}'))  # interspersed spacers + 2 newlines (same input as the second call)
    #[BracketGroup('wa'), BraceGroup('lla')]
print(test('[tempt]{ing}[WITCH]{doctorrrr}', 0, 0))
    #[]

[walla]{walla}{ba]ng}
[wa]{lla}
{[}{a]}
[hue]{[}{a]}


[wa]{lla}



In [23]:
foo  = test('[walla]{walla}{ba]ng}')
print(foo.all)

[BracketGroup('walla'), BraceGroup('walla'), BraceGroup('ba', ']', 'ng')]


In [None]:
dir(foo)

[BracketGroup('walla'), BraceGroup('walla'), BraceGroup('ba', ']', 'ng')]

['_TexArgs__coerce',
 '__add__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__rmul__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'all',
 'append',
 'clear',
 'copy',
 'count',
 'extend',
 'index',
 'insert',
 'pop',
 'remove',
 'reverse',
 'sort']

In [35]:
min_example = r"\subsection[Background Info]{Background}"


cats = TS.category.categorize(min_example)
tokens = list(TS.tokens.tokenize(cats))

char_codes = list(TS.category.categorize(min_example))

with pd.option_context('display.max.columns', None, 'display.max_colwidth', 0):
    pd.DataFrame({'char':char_codes, 'code':(x.category for x in char_codes)}).transpose()
    pd.DataFrame({'tokens':tokens})

buf = TS.reader.Buffer(TS.tokens.tokenize(TS.category.categorize(min_example)))
TS.reader.read_command(buf, n_required_args=-1, mode='mode:math', skip=3, tolerance=1)

buf = TS.reader.Buffer(TS.tokens.tokenize(TS.category.categorize(min_example)))
TS.read(buf, tolerance=1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
char,\,s,u,b,s,e,c,t,i,o,n,[,B,a,c,k,g,r,o,u,n,d,,I,n,f,o,],{,B,a,c,k,g,r,o,u,n,d,}
code,1,12,12,12,12,12,12,12,12,12,12,19,12,12,12,12,12,12,12,12,12,12,11.0,12,12,12,12,20,2,12,12,12,12,12,12,12,12,12,12,3


Unnamed: 0,tokens
0,\
1,subsection
2,[
3,Background Info
4,]
5,{
6,Background
7,}


('Background Info', [])

([TexCmd('subsection', [BracketGroup('Background Info'), BraceGroup('Background')])],
 '\\subsection[Background Info]{Background}')

In [29]:
min_example = r"$ t \in [0,1] $$ t \in [0,1] $"


cats = TS.category.categorize(min_example)
tokens = list(TS.tokens.tokenize(cats))

char_codes = list(TS.category.categorize(min_example))

with pd.option_context('display.max.columns', None, 'display.max_colwidth', 0):
    pd.DataFrame({'char':char_codes, 'code':(x.category for x in char_codes)}).transpose()
    pd.DataFrame({'tokens':tokens})

buf = TS.reader.Buffer(TS.tokens.tokenize(TS.category.categorize(min_example)))
TS.reader.read_command(buf, n_required_args=-1, mode='mode:math', skip=3, tolerance=1)

buf = TS.reader.Buffer(TS.tokens.tokenize(TS.category.categorize(min_example)))
TS.read(buf, tolerance=1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
char,$,,t,,\,i,n,,[,0,",",1,],,$,$,,t,,\,i,n,,[,0,",",1,],,$
code,4,11.0,12,11.0,1,12,12,11.0,19,13,13,13,20,11.0,4,4,11.0,12,11.0,1,12,12,11.0,19,13,13,13,20,11.0,4


Unnamed: 0,tokens
0,$
1,t
2,\
3,in
4,
5,[
6,01
7,]
8,
9,$


('in', [])

EOFError: [Line: 0, Offset: 19] "$" env expecting $. Reached end of file.

In [None]:
with pd.option_context('display.max.columns', None, 'display.max_colwidth', 0):
    pd.DataFrame({'char':char_codes, 'code':(x.category for x in char_codes)}).transpose()
    pd.DataFrame({'tokens':tokens})

In [None]:
min_example = r"In practice, the matrix $\left [\M{D}^{(1)}_n(\M{D}^{(1)}_n)\Tra\right]\Inv\M{D}^{(1)}_n $"
print(min_example)
TS.TexSoup(pre_format(min_example))
TS.TexSoup(min_example)

In [None]:
min_example=r"""
\documentclass{article}
\begin{document}
% \renewcommand{\shorttitle}{Avoiding Catastrophe}
\end{document}
""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example))
#print(min_example)

In [None]:
# Reproduction 1: \def bodies that contain an unbalanced \begin / \end.
min_example=r"""
\def\bean {\begin{foo}}  \def\eean {\end{foo}}
""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example))
TS.TexSoup(min_example)
print(min_example)
# Reproduction 2: a sentence fragment with an unclosed "{" and a ")" right
# before "}", shown before and after pre_format.
min_example=r"""
we {use $A=8B$ and $s=1$, then the scalar field becomes same with (\Ref{scalarfield}) and
""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#TS.TexSoup(min_example)
print(min_example)
print(pre_format(min_example))
# Build the set of size-prefixed delimiter commands for inspection.
# NOTE(review): these definitions look like they were copied from TexSoup's
# token module -- confirm against the installed version.
# NOTE: '.' '|' below has no comma between the two literals, so Python
# concatenates them into the single element '.|'. The separate '|' and '.'
# entries are only added by the .union(...) call further down.
BRACKETS_DELIMITERS = {
    '(', ')', '<', '>', '[', ']', '{', '}', r'\{', r'\}', '.' '|', r'\langle',
    r'\rangle', r'\lfloor', r'\rfloor', r'\lceil', r'\rceil', r'\ulcorner',
    r'\urcorner', r'\lbrack', r'\rbrack'
}
# TODO: looks like left-right do have to match
SIZE_PREFIX = ('left', 'right', 'big', 'Big', 'bigg', 'Bigg')
# Every prefix, with and without a separating space, combined with every
# delimiter.
PUNCTUATION_COMMANDS = {command + opt_space + bracket
                        for command in SIZE_PREFIX
                        for opt_space in {'', ' '}
                        for bracket in BRACKETS_DELIMITERS.union({'|', '.'})}
PUNCTUATION_COMMANDS

In [None]:
min_example=r"""
\def\bean {\begin{eqnarray*}}  \def\eean {\end{eqnarray*}}
""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example))
#print(min_example)
min_example=r"""
the interval $t\in[0,1)$. 
""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example))
#print(min_example)
min_example=r"""







\beq
[\chF,\chG\}=\{\partial\chF,\chG\}.
\eeq

derivation $\CA\mapsto [\CB,\CA]$. 







The following characterizations of UAL chains are all equivalent:
\begin{itemize}
    \item[(1)] A skew-symmetric function $\cha:\Lambda^{q+1}\ra\mfkdal$ defines an element of $C_{q}(\mfkdal) $ if $\|\cha\|_{\alpha}<\infty$ for any $\alpha \in \NN$.
    \item[(2)] A skew-symmetric function $\cha:\Lambda^{q+1}\ra\mfkdal$ defines an element of $C_{q}(\mfkdal) $ if there is a function $b(r) \in \Orf$  such that for any $j_0,...,j_q$ the observable $\cha_{j_0...j_q}$ is $b$-localized at $j_a$ for any $a \in \{0,1,...,q\}$.
    \item[(3)] $C_{q}(\mfkdal) $ is the completion of $C_q(\mfkdl) $ with respect to the norms $\|\cdot\|_{\alpha}$.
\end{itemize}
\end{lemma}





""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
min_example=r"""
\newcommand\const{\operatorname{const}}
""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
min_example=r"""
\newcommand{\beq}{\begin{equation}}
\newcommand{\eeq}{\end{equation}}
\newcommand{\chF}{{\mathsf f}}
\newcommand{\chG}{{\mathsf g}}
\beq  
[\chF,\chG\}=\{\partial\chF,\chG\}.
\eeq
derivation $\CA\mapsto [\CB,\CA]$. 
""".strip().replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
min_example=r"""
\[
r_p=d(p,\cdot)\colon \Gamma \to [0,\infty)|~ p \in M\}
\]
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
min_example=r"""
$\bigl[ a \bigr)$
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
min_example=r"""

$\varepsilon\in]0,\varepsilon_\star[$,  

""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
min_example=r"""
\[
i\colon [0,\infty) 
\]
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)

In [None]:
min_example=r"""
\newcommand\1{{\mathds 1}}
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# !! This bug was specific to my fork
min_example=r"""
\newcommand{\linebreakand}{%
    \end{@IEEEauthorhalign}
    \hfill\mbox{}\par
    \mbox{}\hfill\begin{@IEEEauthorhalign}
    }
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
min_example=r"""
 $S \subseteq \{0\} \bigcup [1,\infty) $ if $z^*_2=1$.  
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# two inline math envs next to each other
# !! probably not fixable given the approach used in TexSoup (needs stateful tokenization)
min_example=r"""
$\rm{W_{cyc} }\geq 0$$\;\;\square$
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# \verb{char}...{char} is also an issue for parser
# !! probably not fixable given the approach used in TexSoup (needs stateful tokenization)
min_example=r"""
\verb+$TEXMF/tex/latex/elsevier/+, %$%%%%%%%%%%%%%%%%%%%%%%%%%%%%
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# does not handle missing optional braces around arguments
min_example=r"""
$\sqrt {\frac 3 2} >p >1$
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# two inline math envs next to each other again ($...$$...$)
# !! probably not fixable given the approach used in TexSoup (needs stateful tokenization)
min_example=r"""
&$\rm{N_{Diskbb}}$$(\times 10^4) $
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# \verb{char}...{char} is also an issue for parser
# !! probably not fixable given the approach used in TexSoup (needs stateful tokenization)
min_example=r"""
$\frac{j+1+\epsilon}{m^{\alpha}}[$
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# arguments given without braces (\frac n2), like the \sqrt {\frac 3 2} example above
# NOTE(review): the "needs stateful tokenization" remark was copied from the \verb example -- confirm it applies here
min_example=r"""
$1\le k< \frac n2 $ 
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# \verb{char}...{char} is also an issue for parser
# !! probably not fixable given the approach used in TexSoup (needs stateful tokenization)
min_example=r"""
\begin{equation}
\begin{aligned}[t]
[T\tensor*[]{]}{_{\CT}^{\sp}} \\
[T]{_{\CT}^{\sp}}
\end{aligned}
\end{equation}
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)

In [None]:
# \verb{char}...{char} is also an issue for parser
# !! probably not fixable given the approach used in TexSoup (needs stateful tokenization)
with open('./data/test.txt', 'r') as infile:
    min_example=infile.read().strip()

TS.TexSoup(pre_format(min_example), tolerance=0)
#print(min_example)

In [26]:
# \def whose replacement text is a bare command. In the recorded error the
# parser reports \frac with the closing brace as its argument, leaving the
# opening "{" unmatched.
# NOTE(review): the earlier "\verb" comment here was copied from another
# example and did not describe this input.
min_example=r"""
\def\f{\frac}
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=0)
#print(min_example)
# NOTE(review): the lines below are unrelated to the TeX parsing above; they
# write a random 10x3 frame to a CSV under the home directory.
import pandas as pd
import numpy as np
pd.DataFrame(np.random.randint(0,100,size=(10, 3)), columns=list('ABC')).to_csv('~/Expire/test_console_upload.csv')

TypeError: [Line: 0, Offset 6] Malformed argument. First and last elements must match a valid argument format. In this case, TexSoup could not find matching punctuation for: {.
Just finished parsing: ['{', TexCmd('frac', [BraceGroup('}')])]

In [None]:
min_example=r"""
\renewcommand{\subsection}[1]{{\textit{#1.~}}}
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=0)
#print(min_example)