In [32]:
#import zipfile
import tarfile
import io
import importlib
import os
import regex as re
import glob
import pandas as pd

In [2]:
from tqdm.auto import tqdm

In [3]:
from IPython.core.interactiveshell import InteractiveShell
# pretty print all cell's output and not just the last one
InteractiveShell.ast_node_interactivity = "all"

In [4]:
import TexSoup as TS
#importlib.reload(TS)

In [5]:
LOCAL_DATA_PATH = './data/2201_samp/'

In [6]:
def pre_format(text):
    '''Apply some substititions to make LaTeX easier to parse'''
    source_text = (
        text
        .replace('\\}\\', '\\} \\')  # Due to escape rules \\ is equivalent to \
        .replace(')}', ') }')
        .replace(')$', ') $')
    )
    return source_text
    #clean_lines = []
    #for line in source_text.splitlines(False):
    #    cleanline = line.strip()
    #    if cleanline.startswith(r'\newcommand'):
    #        cleanline = r'%' + cleanline
    #    elif cleanline.startswith(r'\def'):
    #        cleanline = r'%' + cleanline
    #    clean_lines.append(cleanline)
    #return '\n'.join(clean_lines)

In [7]:
def find_doc_class(wrapped_file, name_match=False):
    '''Search for document class related lines in a file  and return a code to represent the type'''
    doc_class_pat = re.compile(r"^\s*\\document(?:style|class)")
    sub_doc_class = re.compile(r"^\s*\\document(?:style|class).*(?:\{standalone\}|\{subfiles\})")

    for line in wrapped_file:
        if doc_class_pat.search(line):
            if name_match:
                # we can miss if there are two or more lines with documentclass 
                # and the first one is not the one that has standalone/subfile
                if sub_doc_class.search(line):
                    return -99999
                return 1 #main_files[tf] = 1
            
    return 0 #main_files[tf] = 0


def find_main_tex_source_in_tar(tar_path, encoding='uft-8'):
    '''Identify the main Tex file in a tarfile.
    
    Args:
        tar_path: A gzipped tar archive of a directory containing tex source and support files.
    '''
    
    tex_names = set(["paper.tex", "main.tex", "ms.tex", "article.tex"])

    with tarfile.open(tar_path, 'r') as in_tar:
        tex_files = [f for f in in_tar.getnames() if f.endswith('.tex')]
        
        # got one file
        if len(tex_files) == 1:
            return tex_files[0]

        main_files = {}
        for tf in tex_files:
            has_main_name = tf in tex_names
            fp = in_tar.extractfile(tf)
            wrapped_file = io.TextIOWrapper(fp, newline=None, encoding=encoding) #universal newlines
            # does it have a doc class?
            # get the type
            main_files[tf] = find_doc_class(wrapped_file, name_match = has_main_name)
            wrapped_file.close() 
        
        # got one file with doc class
        if len(main_files) == 1:
            return(main_files.keys()[0])
        
        # account for multi-file submissions
        return(max(main_files, key=main_files.get))

In [8]:
def soup_from_tar(tar_path, encoding='utf-8'):
    tex_main = find_main_tex_source_in_tar(tar_path, encoding=encoding)
    with tarfile.open(tar_path, 'r') as in_tar:
        fp = in_tar.extractfile(tex_main)
        wrapped_file = io.TextIOWrapper(fp, newline=None, encoding=encoding) #universal newlines
        source_text = pre_format(wrapped_file.read())
        soup = TS.TexSoup(source_text)
        return soup

## Quick check a folder of tar files

In [9]:
files = glob.glob(f'{LOCAL_DATA_PATH}/*.tar.gz')
files_count = len(files)
utf_count = 0
latin_count = 0 
err_files = {}

with tqdm(total=files_count, desc="errors") as err_prog:
    for tar_file in tqdm(files, desc="Progress", display=True):
        # Is it unicode?
        try:
            soup = soup_from_tar(tar_file, encoding='utf-8')
            utf_count += 1
            continue
        except EOFError as eof:
            err_files[tar_file] = type(eof)
            _ = err_prog.update(1)
            continue
        except UnicodeDecodeError as ue:
            pass
        except KeyboardInterrupt as KB_err:
            break
        except Exception as e:
            err_files[tar_file] = type(e)
            _ = err_prog.update(1)
            continue

        # Is it something else?
        try:
            soup = soup_from_tar(tar_file, encoding='latin-1')
            latin_count += 1
            continue
        except KeyboardInterrupt as KB_err:
            break
        except Exception as e:
            err_files[tar_file] = type(e)
            _ = err_prog.update(1)
            pass

errors:   0%|          | 0/46 [00:00<?, ?it/s]

Progress:   0%|          | 0/46 [00:00<?, ?it/s]

In [10]:
print(f"{files_count} processed, {len(err_files)} failures.")
print(f"UTF8: {utf_count}; Latin1: {latin_count}")
err_files

46 processed, 21 failures.
UTF8: 25; Latin1: 0


{'./data/2201_samp/2201.00048v1.tar.gz': EOFError,
 './data/2201_samp/2201.00092v1.tar.gz': TypeError,
 './data/2201_samp/2201.00042v1.tar.gz': AssertionError,
 './data/2201_samp/2201.00091v2.tar.gz': TypeError,
 './data/2201_samp/2201.00035v1.tar.gz': TypeError,
 './data/2201_samp/2201.00082v1.tar.gz': TypeError,
 './data/2201_samp/2201.00058v1.tar.gz': AssertionError,
 './data/2201_samp/2201.00065v2.tar.gz': EOFError,
 './data/2201_samp/2201.00070v1.tar.gz': EOFError,
 './data/2201_samp/2201.00045v1.tar.gz': AssertionError,
 './data/2201_samp/2201.00044v1.tar.gz': EOFError,
 './data/2201_samp/2201.00056v1.tar.gz': AssertionError,
 './data/2201_samp/2201.00068v1.tar.gz': AssertionError,
 './data/2201_samp/2201.00087v1.tar.gz': EOFError,
 './data/2201_samp/2201.00062v1.tar.gz': EOFError,
 './data/2201_samp/2201.00036v1.tar.gz': TypeError,
 './data/2201_samp/2201.00091v1.tar.gz': TypeError,
 './data/2201_samp/2201.00078v1.tar.gz': AssertionError,
 './data/2201_samp/2201.00072v1.tar.gz':

## Scratch below here

In [None]:
infile_path = "./data/2201_samp/2201.00092v1.tar.gz" #'./data/2201_samp/2201.00048v1.tar.gz'

soup = soup_from_tar(infile_path)


title = soup.find('title')
print(f"{title.name}: {title.text}")
for sec in soup.find_all('section'):
    print(f' {sec.name}: {sec.text}')

In [None]:
tar_path = "./data/2201_samp/2201.00008v2.tar.gz"
encoding = "utf-8"
with tarfile.open(tar_path, 'r') as in_tar:
    tex_files = [f for f in in_tar.getnames() if f.endswith('.tex')]

    # got one file
    if len(tex_files) == 1:
        pass #return tex_files[0]

    main_files = {}
    for tf in tex_files:
        fp = in_tar.extractfile(tf)
        wrapped_file = io.TextIOWrapper(fp, newline=None, encoding=encoding) #universal newlines
        # does it have a doc class?
        # get the type
        main_files[tf] = find_doc_class(wrapped_file)
        wrapped_file.close() 

    # got one file with doc class
    if len(main_files) == 1:
        pass #return(main_files.keys()[0])

    # account for multi-file submissions
    #return(max(main_files, key=main_files.get))

In [None]:
main_files

In [None]:
doc_class_pat = re.compile(r"^\s*\\document(?:style|class)")

with tarfile.open(tar_path, 'r', encoding='utf-8') as in_tar:
    #in_tar.getnames()
    fp = in_tar.extractfile('main.tex')
    wrapped_file = io.TextIOWrapper(fp, newline=None, encoding='utf-8') #universal newlines
    for line in wrapped_file:
        if doc_class_pat.search(line):
            print(line)
            break

In [None]:
next(wrapped_file)

In [9]:
min_example=r"""
\documentclass{article}
\begin{document}
% \renewcommand{\shorttitle}{Avoiding Catastrophe}
\end{document}
""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example))
#print(min_example)

\documentclass{article}
\begin{document}
% \renewcommand{\shorttitle}{Avoiding Catastrophe}
\end{document}

In [10]:
TS.TexSoup(r'\newcommand{\bra}[1]{\left\langle#1\right|}')

\newcommand{\bra}[1]{\left\langle#1\right|}

In [11]:
TS.TexSoup(r'\def\be{\foo{equation}}')

\def\be{\foo{equation}}

In [12]:
TS.TexSoup(r'\renewcommand{\shorttitle}{Avoiding Catastrophe}')

\renewcommand{\shorttitle}{Avoiding Catastrophe}

In [52]:
min_example = r"In practice, the matrix $\left [ 4 \right]\Inv\M{D}^{(1)}_n $"


cats = TS.category.categorize(min_example)
tokens = list(TS.tokens.tokenize(cats))

char_codes = list(TS.category.categorize(min_example))

print(tokens)
print(char_codes)




['In practice, the matrix ', '$', '\\', 'left', ' ', '[', ' 4 ', '\\', 'right]', '\\', 'Inv', '\\', 'M', '{', 'D', '}', '^', '{', '(1)', '}', '_n ', '$']
['I', 'n', ' ', 'p', 'r', 'a', 'c', 't', 'i', 'c', 'e', ',', ' ', 't', 'h', 'e', ' ', 'm', 'a', 't', 'r', 'i', 'x', ' ', '$', '\\', 'l', 'e', 'f', 't', ' ', '[', ' ', '4', ' ', '\\', 'r', 'i', 'g', 'h', 't', ']', '\\', 'I', 'n', 'v', '\\', 'M', '{', 'D', '}', '^', '{', '(', '1', ')', '}', '_', 'n', ' ', '$']


In [51]:
with pd.option_context('display.max.columns', None, 'display.max_colwidth', 0):
    pd.DataFrame({'char':char_codes, 'code':(x.category for x in char_codes)}).transpose()
    pd.DataFrame({'tokens':tokens})

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59
char,I,n,,p,r,a,c,t,i,c,e,",",,t,h,e,,m,a,t,r,i,x,,$,\,l,e,f,t,[,,4,,\,r,i,g,h,t,],\,I,n,v,\,M,{,D,},^,{,(,1,),},_,n,,$
code,12,12,11.0,12,12,12,12,12,12,12,12,13,11.0,12,12,12,11.0,12,12,12,12,12,12,11.0,4,1,12,12,12,12,19,11.0,13,11.0,1,12,12,12,12,12,20,1,12,12,12,1,12,2,12,3,8,2,21,13,22,3,9,12,11.0,4


Unnamed: 0,tokens
0,"In practice, the matrix"
1,$
2,\
3,left[
4,4
5,\
6,right]
7,\
8,Inv
9,\


In [19]:
min_example = r"In practice, the matrix $\left [\M{D}^{(1)}_n(\M{D}^{(1)}_n)\Tra\right]\Inv\M{D}^{(1)}_n $"
print(min_example)
TS.TexSoup(pre_format(min_example))
TS.TexSoup(min_example)

In practice, the matrix $\left [\M{D}^{(1)}_n(\M{D}^{(1)}_n)\Tra\right]\Inv\M{D}^{(1)}_n $


EOFError: [Line: 0, Offset: 43] "$" env expecting $. Reached end of file.

In [21]:
min_example=r"""
\documentclass{article}
\begin{document}
% \renewcommand{\shorttitle}{Avoiding Catastrophe}
\end{document}
""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example))
#print(min_example)

\documentclass{article}
\begin{document}
% \renewcommand{\shorttitle}{Avoiding Catastrophe}
\end{document}

In [20]:
min_example=r"""
In practice, the matrix $\left[\M{D}^{(1)}_n(\M{D}^{(1)}_n)\Tra\right]\Inv\M{D}^{(1)}_n$ 
""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example))
TS.TexSoup(min_example)
print(min_example)

In practice, the matrix $\left[\M{D}^{(1) }_n(\M{D}^{(1) }_n)\Tra\right]\Inv\M{D}^{(1) }_n$

In practice, the matrix $\left[\M{D}^{(1)}_n(\M{D}^{(1)}_n)\Tra\right]\Inv\M{D}^{(1)}_n$

In practice, the matrix $\left[\M{D}^{(1)}_n(\M{D}^{(1)}_n)\Tra\right]\Inv\M{D}^{(1)}_n$


In [15]:
print(pre_format(min_example))

In practice, the matrix $\left[\M{D}^{(1) }_n(\M{D}^{(1) }_n)\Tra\right]\Inv\M{D}^{(1) }_n$


In [29]:
BRACKETS_DELIMITERS = {
    '(', ')', '<', '>', '[', ']', '{', '}', r'\{', r'\}', '.' '|', r'\langle',
    r'\rangle', r'\lfloor', r'\rfloor', r'\lceil', r'\rceil', r'\ulcorner',
    r'\urcorner', r'\lbrack', r'\rbrack'
}
# TODO: looks like left-right do have to match
SIZE_PREFIX = ('left', 'right', 'big', 'Big', 'bigg', 'Bigg')
PUNCTUATION_COMMANDS = {command + bracket
                        for command in SIZE_PREFIX
                        for bracket in BRACKETS_DELIMITERS.union({'|', '.'})}
PUNCTUATION_COMMANDS

{'Big(',
 'Big)',
 'Big.',
 'Big.|',
 'Big<',
 'Big>',
 'Big[',
 'Big\\langle',
 'Big\\lbrack',
 'Big\\lceil',
 'Big\\lfloor',
 'Big\\rangle',
 'Big\\rbrack',
 'Big\\rceil',
 'Big\\rfloor',
 'Big\\ulcorner',
 'Big\\urcorner',
 'Big\\{',
 'Big\\}',
 'Big]',
 'Bigg(',
 'Bigg)',
 'Bigg.',
 'Bigg.|',
 'Bigg<',
 'Bigg>',
 'Bigg[',
 'Bigg\\langle',
 'Bigg\\lbrack',
 'Bigg\\lceil',
 'Bigg\\lfloor',
 'Bigg\\rangle',
 'Bigg\\rbrack',
 'Bigg\\rceil',
 'Bigg\\rfloor',
 'Bigg\\ulcorner',
 'Bigg\\urcorner',
 'Bigg\\{',
 'Bigg\\}',
 'Bigg]',
 'Bigg{',
 'Bigg|',
 'Bigg}',
 'Big{',
 'Big|',
 'Big}',
 'big(',
 'big)',
 'big.',
 'big.|',
 'big<',
 'big>',
 'big[',
 'big\\langle',
 'big\\lbrack',
 'big\\lceil',
 'big\\lfloor',
 'big\\rangle',
 'big\\rbrack',
 'big\\rceil',
 'big\\rfloor',
 'big\\ulcorner',
 'big\\urcorner',
 'big\\{',
 'big\\}',
 'big]',
 'bigg(',
 'bigg)',
 'bigg.',
 'bigg.|',
 'bigg<',
 'bigg>',
 'bigg[',
 'bigg\\langle',
 'bigg\\lbrack',
 'bigg\\lceil',
 'bigg\\lfloor',
 'bigg\\rangle