In [1]:
#import zipfile
import tarfile
import io
import importlib
import os
import regex as re
import glob

In [2]:
from tqdm.auto import tqdm

In [3]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = "all"

In [4]:
import TexSoup as TS
#importlib.reload(TS)

In [5]:
# Directory that is globbed for *.tar.gz archives further down in this notebook.
LOCAL_DATA_PATH = './data/2201_01_all'

In [6]:
def pre_format(text):
    '''Insert spaces into LaTeX source at spots that are awkward to parse.

    Three literal substitutions are applied, in this order:
      1. a backslash, closing brace, backslash run gets a space after the brace,
      2. ")}" becomes ") }",
      3. ")$" becomes ") $".

    Args:
        text: LaTeX source as a string.

    Returns:
        The source string with the substitutions applied.
    '''
    # In the first pair the doubled backslashes are Python escapes: the text
    # actually searched for is the three characters  \ } \  in a row.
    substitutions = (
        ('\\}\\', '\\} \\'),
        (')}', ') }'),
        (')$', ') $'),
    )
    source_text = text
    for old, new in substitutions:
        source_text = source_text.replace(old, new)
    return source_text
    # A disabled experiment used to follow here: it prefixed every line that
    # starts with \newcommand or \def with '%' to comment the definition out.

In [7]:
def find_doc_class(wrapped_file, name_match=False, missing=0):
    '''Score a TeX source by its first document class declaration.

    Args:
        wrapped_file: Iterable of text lines, e.g. an open text-mode file.
        name_match: Whether the file carries one of the conventional
            main-file names; only then are the 1 / -99999 scores used.
        missing: Value returned when no documentclass/documentstyle line is
            found. Defaults to 0, which is also the score of a declaration in
            a file without a main-file name; pass a distinct sentinel (for
            example None) to tell the two cases apart.

    Returns:
        1 for a declaration in a main-named file, -99999 when that declaration
        selects the standalone or subfiles class, 0 for a declaration in any
        other file, and ``missing`` when there is no declaration at all.
    '''
    doc_class_pat = re.compile(r"^\s*\\document(?:style|class)")
    sub_doc_class = re.compile(r"^\s*\\document(?:style|class).*(?:\{standalone\}|\{subfiles\})")

    for line in wrapped_file:
        if not doc_class_pat.search(line):
            continue
        if not name_match:
            return 0
        # Only the first declaration is inspected, so a standalone/subfiles
        # declaration on a later line is not detected.
        if sub_doc_class.search(line):
            return -99999
        return 1

    return missing


def find_main_tex_source_in_tar(tar_path, encoding='utf-8'):
    '''Identify the main TeX file in a tar archive.

    Args:
        tar_path: A gzipped tar archive of a directory containing tex source and support files.
        encoding: Text encoding used to read the .tex members while scanning.

    Returns:
        The archive member name of the chosen file. With a single .tex member
        that member is returned unread. Otherwise the best-scoring member that
        declares a document class wins (see find_doc_class); if none declares
        one, the first .tex member is returned.

    Raises:
        ValueError: If the archive contains no .tex member.
        UnicodeDecodeError: If a scanned member cannot be decoded with ``encoding``.
    '''
    tex_names = {"paper.tex", "main.tex", "ms.tex", "article.tex"}

    with tarfile.open(tar_path, 'r') as in_tar:
        tex_files = [f for f in in_tar.getnames() if f.endswith('.tex')]

        if not tex_files:
            raise ValueError(f"no .tex file found in {tar_path!r}")

        # got one file
        if len(tex_files) == 1:
            return tex_files[0]

        # Only members that actually declare a document class become candidates.
        main_files = {}
        for tf in tex_files:
            fp = in_tar.extractfile(tf)
            if fp is None:
                # Not a regular file (e.g. a directory whose name ends in .tex).
                continue
            # Archives built from "." store members as "./name".
            bare_name = tf[2:] if tf.startswith('./') else tf
            has_main_name = bare_name in tex_names
            # newline=None gives universal newlines; the with-block closes the
            # wrapper even when decoding fails part-way through.
            with io.TextIOWrapper(fp, newline=None, encoding=encoding) as wrapped_file:
                score = find_doc_class(wrapped_file, name_match=has_main_name, missing=None)
            if score is not None:
                main_files[tf] = score

        if not main_files:
            # No document class anywhere: fall back to the first .tex member.
            return tex_files[0]

        # account for multi-file submissions; ties go to the earliest member
        return max(main_files, key=main_files.get)

In [8]:
# Math environment names; soup_from_tar hands this tuple to TexSoup as skip_envs.
MATH_ENV_NAMES = tuple(
    """
    align align* alignat array displaymath eqnarray
    eqnarray* equation equation* flalign flalign* gather
    gather* math multline multline* split
    """.split()
)

In [9]:
def soup_from_tar(tar_path, encoding='utf-8', tolerance=0):
    '''Parse the main TeX file of an archive with TexSoup.

    Args:
        tar_path: Path to the tar archive holding the TeX sources.
        encoding: Text encoding used to read the TeX file.
        tolerance: Passed through unchanged to TS.TexSoup.

    Returns:
        Whatever TS.TexSoup returns for the pre-formatted source text, parsed
        with the environments in MATH_ENV_NAMES given as skip_envs.
    '''
    # source_from_tar performs the same locate / read / pre_format steps that
    # were previously repeated inline here.
    source_text = source_from_tar(tar_path, encoding=encoding)
    return TS.TexSoup(source_text, tolerance=tolerance, skip_envs=MATH_ENV_NAMES)

In [10]:
def source_from_tar(tar_path, encoding='utf-8'):
    '''Return the pre-formatted text of the main TeX file in an archive.

    Args:
        tar_path: Path to the tar archive holding the TeX sources.
        encoding: Text encoding used to locate and read the main TeX file.

    Returns:
        The main file's content after pre_format has been applied.
    '''
    tex_main = find_main_tex_source_in_tar(tar_path, encoding=encoding)
    with tarfile.open(tar_path, 'r') as in_tar:
        fp = in_tar.extractfile(tex_main)
        # newline=None gives universal newlines; the with-block closes the
        # wrapper even if decoding raises.
        with io.TextIOWrapper(fp, newline=None, encoding=encoding) as wrapped_file:
            return pre_format(wrapped_file.read())

## Quick check of a folder of tar files

In [11]:
# Show the glob pattern for the archives under LOCAL_DATA_PATH.
os.path.join(LOCAL_DATA_PATH, '*.tar.gz')

'./data/2201_01_all/*.tar.gz'

In [12]:
# glob returns matches in arbitrary order; sort so repeated runs process the
# archives in the same sequence.
files = sorted(glob.glob(os.path.join(LOCAL_DATA_PATH, '*.tar.gz')))

In [14]:
# Archives that failed in an earlier run, kept as a fixed list so a debugging
# pass is repeatable from a fresh kernel.
DEBUG_SUBSET = """
    ./data/2201_01_all/2201.01576v2.tar.gz
    ./data/2201_01_all/2201.01664v2.tar.gz
    ./data/2201_01_all/2201.01050v1.tar.gz
    ./data/2201_01_all/2201.01445v1.tar.gz
    ./data/2201_01_all/2201.01073v1.tar.gz
    ./data/2201_01_all/2201.01782v1.tar.gz
    ./data/2201_01_all/2201.01576v1.tar.gz
    ./data/2201_01_all/2201.01647v3.tar.gz
    ./data/2201_01_all/2201.01207v1.tar.gz
    ./data/2201_01_all/2201.01664v1.tar.gz
    ./data/2201_01_all/2201.01980v1.tar.gz
    ./data/2201_01_all/2201.01445v2.tar.gz
""".split()

# Set to False to keep the full list of archives already stored in `files`.
USE_DEBUG_SUBSET = True

# The previous guard ('err_files' in locals()) was always true because
# err_files was assigned on the line before it; an explicit flag replaces it.
err_files = {}
if USE_DEBUG_SUBSET:
    files = list(DEBUG_SUBSET)

In [15]:
# Try to parse every archive: UTF-8 first, then Latin-1 if decoding fails.
# Archives that cannot be parsed are recorded in err_files by exception type.
files_count = len(files)
utf_count = 0
latin_count = 0
err_files = {}

TOLERANCE = 1

with tqdm(total=files_count, desc="errors") as err_prog:
    for tar_file in tqdm(files, desc="Progress", display=True):
        # First attempt: read the sources as UTF-8.
        try:
            soup = soup_from_tar(tar_file, encoding='utf-8', tolerance=TOLERANCE)
        except UnicodeDecodeError:
            # Not UTF-8: fall through to the Latin-1 attempt below.
            pass
        except KeyboardInterrupt:
            break
        except Exception as e:
            # Any other failure (EOFError included) is final for this archive.
            err_files[tar_file] = type(e)
            _ = err_prog.update(1)
            continue
        else:
            utf_count += 1
            continue

        # Second attempt: read the sources as Latin-1.
        try:
            soup = soup_from_tar(tar_file, encoding='latin-1', tolerance=TOLERANCE)
        except KeyboardInterrupt:
            break
        except Exception as e:
            err_files[tar_file] = type(e)
            _ = err_prog.update(1)
        else:
            latin_count += 1

errors:   0%|          | 0/12 [00:00<?, ?it/s]

Progress:   0%|          | 0/12 [00:00<?, ?it/s]

In [16]:
# Summarise the run, then display the per-archive exception types.
print(
    f"{files_count} processed, {len(err_files)} failures.",
    f"UTF8: {utf_count}; Latin1: {latin_count}",
    sep="\n",
)
err_files

12 processed, 8 failures.
UTF8: 4; Latin1: 0


{'./data/2201_01_all/2201.01576v2.tar.gz': AssertionError,
 './data/2201_01_all/2201.01664v2.tar.gz': AssertionError,
 './data/2201_01_all/2201.01050v1.tar.gz': AssertionError,
 './data/2201_01_all/2201.01576v1.tar.gz': AssertionError,
 './data/2201_01_all/2201.01647v3.tar.gz': AssertionError,
 './data/2201_01_all/2201.01207v1.tar.gz': AssertionError,
 './data/2201_01_all/2201.01664v1.tar.gz': AssertionError,
 './data/2201_01_all/2201.01980v1.tar.gz': AssertionError}

## Scratch below here

In [None]:
# Scratch: load one sample archive and list its title and section headings.
infile_path = "./data/2201_samp/2201.00092v1.tar.gz" #'./data/2201_samp/2201.00048v1.tar.gz'

text = source_from_tar(infile_path)
soup = soup_from_tar(infile_path)

# Guard against documents without a \title so the cell does not raise.
title = soup.find('title')
if title is None:
    print("title: <not found>")
else:
    print(f"{title.name}: {title.text}")
for sec in soup.find_all('section'):
    print(f' {sec.name}: {sec.text}')

In [None]:
# Scratch: run find_doc_class over every .tex member of one archive and keep
# the per-file results in main_files for inspection.
tar_path = "./data/2201_samp/2201.00008v2.tar.gz"
encoding = "utf-8"
with tarfile.open(tar_path, 'r') as in_tar:
    tex_files = [member for member in in_tar.getnames() if member.endswith('.tex')]

    main_files = {}
    for tf in tex_files:
        fp = in_tar.extractfile(tf)
        # newline=None gives universal newlines
        wrapped_file = io.TextIOWrapper(fp, newline=None, encoding=encoding)
        # name_match is left at its default, so the name-dependent scores are not used
        main_files[tf] = find_doc_class(wrapped_file)
        wrapped_file.close()

In [None]:
# Show the per-file values collected in main_files.
main_files

In [None]:
# Scratch: print the first document class line of main.tex in the archive at tar_path.
doc_class_pat = re.compile(r"^\s*\\document(?:style|class)")

# NOTE(review): the encoding given to tarfile.open presumably applies to archive
# metadata rather than member content; the content encoding is set on the wrapper — confirm.
with tarfile.open(tar_path, 'r', encoding='utf-8') as in_tar:
    #in_tar.getnames()
    fp = in_tar.extractfile('main.tex')
    wrapped_file = io.TextIOWrapper(fp, newline=None, encoding='utf-8') #universal newlines
    for line in wrapped_file:
        if doc_class_pat.search(line):
            # Print the first matching line and stop scanning.
            print(line)
            break

In [None]:
# NOTE(review): relies on wrapped_file left over from an earlier cell whose
# with-block has already closed the archive — confirm this still reads.
next(wrapped_file)

In [None]:
# Scratch: minimal document whose only body line is a commented-out \renewcommand.
min_example = "\n".join([
    r"\documentclass{article}",
    r"\begin{document}",
    r"% \renewcommand{\shorttitle}{Avoiding Catastrophe}",
    r"\end{document}",
])
TS.TexSoup(pre_format(min_example))

In [None]:
# Scratch: parse a \newcommand definition that takes one argument.
TS.TexSoup(r'\newcommand{\bra}[1]{\left\langle#1\right|}')

In [None]:
# Scratch: parse a plain-TeX \def macro definition.
TS.TexSoup(r'\def\be{\foo{equation}}')

In [None]:
# Scratch: parse a \renewcommand on its own, outside any document.
TS.TexSoup(r'\renewcommand{\shorttitle}{Avoiding Catastrophe}')

In [None]:
# Scratch: source fragment with inline math containing ")}" sequences, kept for reference.
r"In practice, the matrix $\left [\M{D}^{(1)}_n(\M{D}^{(1)}_n)\Tra\right]\Inv\M{D}^{(1)}_n$"

In [None]:
In practice, the matrix $\left [\M{D}^{(1) }_n(\M{D}^{(1) }_n)\Tra\right]\Inv\M{D}^{(1) }_n$ is pre-computed and cached for repeated use.


In [None]:
# Scratch: a single inline-math expression that already has spaces after ")".
min_example = r"$\left [\M{D}^{(1) }_n(\M{D}^{(1) }_n)\Tra \right] $"
TS.TexSoup(pre_format(min_example))

In [None]:
# Scratch: show what pre_format does to the current min_example.
print(pre_format(min_example))

In [None]:
# NOTE(review): leftover scratch cell; it only prints an empty line.
print()

In [None]:
# Scratch: minimal document whose body is a \catcode\day\month sequence.
min_example = "\n".join([
    r"\documentclass{article}",
    r"\begin{document}",
    r"\catcode\day\month",
    r"\end{document}",
])
TS.TexSoup(pre_format(min_example))

In [None]:
# Scratch: parse a standalone \renewcommand (same input as an earlier scratch cell).
TS.TexSoup(r'\renewcommand{\shorttitle}{Avoiding Catastrophe}')