In [1]:
import os
import tarfile
import logging
from tqdm.auto import tqdm

In [2]:
logging.basicConfig(
   filename='extraction.log',
   filemode='a',
   format='%(asctime)s - %(levelname)s - %(message)s',
   level=logging.INFO
)

In [3]:
def extract_tex_files(tar_path, output_dir):
    """
    Extract all .tex files from a .tar.gz archive to a dedicated folder in output_dir.
    """
    archive_name = os.path.splitext(os.path.basename(tar_path))[0]
    if archive_name.endswith('.tar'):
        archive_name = os.path.splitext(os.path.basename(archive_name))[0]
    extract_folder = os.path.join(output_dir, archive_name)
    
    # Create directory for extracted files if it doesn't exist
    os.makedirs(extract_folder, exist_ok=True)
    
    try:
        with tarfile.open(tar_path, "r:gz") as tar:
           for member in tar.getmembers():
               if member.isfile() and member.name.endswith(".tex"):
                   member.name = os.path.basename(member.name)  # Avoid nested directory extraction
                   tar.extract(member, path=extract_folder)
                   logging.info(f"Extracted {member.name} from {tar_path} to {extract_folder}")
        logging.info(f"All .tex files from {tar_path} have been extracted to {extract_folder}")
    except Exception as e:
        logging.error(f"Failed to extract .tex files from {tar_path}, Error: {e}")


def extract_all_tex_files(dataset_dir, output_dir):
    """
    Extract .tex files from all .tar.gz files in dataset_dir to output_dir.
    Each archive's .tex files are placed in a separate subfolder.
    """
    for root, _, files in os.walk(dataset_dir):
       for file in tqdm(files, position=1):
           if file.endswith(".tar.gz"):
               tar_path = os.path.join(root, file)
               extract_tex_files(tar_path, output_dir)
    

# if __name__ == "__main__":
#    dataset_dir = '/Users/sancho/arxiv/spacy/dataset/2201_samp'  #change it to your dataset address
#    output_dir = 'all_tex_files'  #output
#    os.makedirs(output_dir, exist_ok=True)
#   
#    extract_all_tex_files(dataset_dir, output_dir)
#    print(f"every .tex output to the: {output_dir}")

In [4]:
!ls /Volumes/Neptune/scratch/

[34m2301[m[m          [34m2310[m[m          [34m2311[m[m          [34m2312[m[m          [34m2401[m[m
2301.zip      2310.zip      2311.zip      2312.zip      2401.zip
2301_tex.zip  2310_tex.zip  2311_tex.zip  2312_tex.zip  [34m2402[m[m
[34m2301_tex_mix[m[m  [34m2310_tex_mix[m[m  [34m2311_tex_mix[m[m  [34m2312_tex_mix[m[m  [34m2403[m[m
[34m2301_text[m[m     [34m2310_text[m[m     [34m2311_text[m[m     [34m2312_text[m[m     [34m2404[m[m
2301_text.zip 2310_text.zip 2311_text.zip 2312_text.zip


In [5]:
folders = [
    ("/Volumes/Neptune/scratch/2301_tex_mix", "/Volumes/Neptune/scratch/2301_tex"),
    ("/Volumes/Neptune/scratch/2310_tex_mix", "/Volumes/Neptune/scratch/2310_tex"),
    ("/Volumes/Neptune/scratch/2311_tex_mix", "/Volumes/Neptune/scratch/2311_tex"),
    ("/Volumes/Neptune/scratch/2312_tex_mix", "/Volumes/Neptune/scratch/2312_tex"),
]

for in_dir, out_dir in tqdm(folders, position=0):
    extract_all_tex_files(in_dir, out_dir)


  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/12507 [00:00<?, ?it/s]

  0%|          | 0/18984 [00:00<?, ?it/s]

  0%|          | 0/17060 [00:00<?, ?it/s]

  0%|          | 0/16320 [00:00<?, ?it/s]

In [3]:
#import zipfile
import tarfile
import io
import importlib
import os
import regex as re
import glob
import pandas as pd

In [17]:
import pathlib
import collections as coll

In [4]:
import pyperclip   #copy text to clipboard for inspecting

In [5]:
from tqdm.auto import tqdm

In [21]:
from IPython.core.interactiveshell import InteractiveShell
# pretty print all cell's output and not just the last one
InteractiveShell.ast_node_interactivity = "all"

In [7]:
import TexSoup as TS
from TexSoup.tokens import MATH_ENV_NAMES

In [8]:
#importlib.reload(TS)

In [9]:
def pre_format(text):
    '''Apply some substititions to make LaTeX easier to parse'''
    source_text = (
        text
        .replace('\\}\\', '\\} \\')  # Due to escape rules \\ is equivalent to \
        .replace(')}', ') }')
        .replace(')$', ') $')
        #.replace(r'\left [', r'\left[ ')
        #.replace(r'\left (', r'\left( ')
        #.replace(r'\left \{', r'\left\{ ')
    )
    return source_text
    #clean_lines = []
    #for line in source_text.splitlines(False):
    #    cleanline = line.strip()
    #    if cleanline.startswith(r'\newcommand'):
    #        cleanline = r'%' + cleanline
    #    elif cleanline.startswith(r'\def'):
    #        cleanline = r'%' + cleanline
    #    clean_lines.append(cleanline)
    #return '\n'.join(clean_lines)

In [10]:
def find_doc_class(wrapped_file, name_match=False):
    '''Search for document class related lines in a file  and return a code to represent the type'''
    doc_class_pat = re.compile(r"^\s*\\document(?:style|class)")
    sub_doc_class = re.compile(r"^\s*\\document(?:style|class).*(?:\{standalone\}|\{subfiles\})")

    for line in wrapped_file:
        if doc_class_pat.search(line):
            if name_match:
                # we can miss if there are two or more lines with documentclass 
                # and the first one is not the one that has standalone/subfile
                if sub_doc_class.search(line):
                    return -99999
                return 1 #main_files[tf] = 1
            
    return 0 #main_files[tf] = 0


def find_main_tex_source_in_tar(tar_path, encoding='uft-8'):
    '''Identify the main Tex file in a tarfile.
    
    Args:
        tar_path: A gzipped tar archive of a directory containing tex source and support files.
    '''
    
    tex_names = set(["paper.tex", "main.tex", "ms.tex", "article.tex"])

    with tarfile.open(tar_path, 'r') as in_tar:
        tex_files = [f for f in in_tar.getnames() if f.endswith('.tex')]
        
        # got one file
        if len(tex_files) == 1:
            return tex_files[0]

        main_files = {}
        for tf in tex_files:
            has_main_name = tf in tex_names
            fp = in_tar.extractfile(tf)
            wrapped_file = io.TextIOWrapper(fp, newline=None, encoding=encoding) #universal newlines
            # does it have a doc class?
            # get the type
            main_files[tf] = find_doc_class(wrapped_file, name_match = has_main_name)
            wrapped_file.close() 
        
        # got one file with doc class
        if len(main_files) == 1:
            return(main_files.keys()[0])
        
        # account for multi-file submissions
        return(max(main_files, key=main_files.get))

In [11]:
def soup_from_tar(tar_path, encoding='utf-8', tolerance=0):
    tex_main = find_main_tex_source_in_tar(tar_path, encoding=encoding)
    with tarfile.open(tar_path, 'r') as in_tar:
        fp = in_tar.extractfile(tex_main)
        wrapped_file = io.TextIOWrapper(fp, newline=None, encoding=encoding) #universal newlines
        source_text = pre_format(wrapped_file.read())
        soup = TS.TexSoup(source_text, tolerance=tolerance, skip_envs=MATH_ENV_NAMES)
        return soup

In [11]:
def source_from_tar(tar_path, encoding='utf-8', tolerance=None):
    tex_main = find_main_tex_source_in_tar(tar_path, encoding=encoding)
    with tarfile.open(tar_path, 'r') as in_tar:
        fp = in_tar.extractfile(tex_main)
        wrapped_file = io.TextIOWrapper(fp, newline=None, encoding=encoding) #universal newlines
        source_text = pre_format(wrapped_file.read())
        return source_text

## Check a file with parse errors

In [34]:
infile_path = "./data/2201_00_all/2201.00001v1.tar.gz" #'./data/2201_samp/2201.00048v1.tar.gz'

text = source_from_tar(infile_path)
pyperclip.copy(text)
soup = soup_from_tar(infile_path, tolerance=1)


title = soup.find('title')
if title: print(f"{title.name}: {title.text}")
for sec in soup.find_all('section'):
    print(f' {sec.name}: {sec.text}')

title: ['Modeling Advection on Directed Graphs using  Mat', "\\'", 'e', 'rn Gaussian Processes for Traffic Flow']
 section: ['Introduction']
 section: ['Understanding the directed graph advection operator']
 section: ['Directed Graph Advection Mat', "\\'", 'e', 'rn Gaussian Process (DGAMGP) ']
 section: ['Numerical Results']
 section: ['Conclusions']
 section: ['Upwinding discretizations of linear advection']
 section: ['Examples of ', '$', 'L_', 'adv', '$', ' on balanced graphs resulting in finite difference discretizations of linear advection']
 section: ['Additional Experiments']


In [12]:
infile_path = "./data/2201_00_all/2201.00430v1.tar.gz" #'./data/2201_samp/2201.00048v1.tar.gz'

text = source_from_tar(infile_path)
pyperclip.copy(text)
soup = soup_from_tar(infile_path, tolerance=1)


title = soup.find('title')
if title: print(f"{title.name}: {title.text}")
for sec in soup.find_all('section'):
    print(f' {sec.name}: {sec.text}')

title: ['Classifying Subset Feedback Vertex Set', '\\\\', ' for ', '$', 'H', '$', '-Free Graphs']
 section: ['Introduction']
 section: ['Preliminaries']
 section: ['The Weighted Variant']
 section: ['The Unweighted Variant']
 section: ['Conclusions']
 section: ['Preliminaries']


## Quick check a folder of tar files

In [27]:
LOCAL_DATA_PATH = '/Volumes/Neptune/scratch/2311'

In [28]:
files = glob.glob(f'{LOCAL_DATA_PATH}/*.tar.gz')
files_count = len(files)
files_count
ufiles = set(pathlib.Path(x).name.strip("tar.gz").split('v')[0] for x in files)
len(ufiles)

0

0

In [29]:
files = glob.glob(f'{LOCAL_DATA_PATH}/*.tar.gz')
files_count = len(files)
utf_count = 0
latin_count = 0 
inc_graphics_count = 0
inc_alt_count = 0
err_files = {}

TOLERANCE = 1

def update_counts(text, tar_file):
    global inc_alt_count
    global inc_graphics_count
    if "alt=" in text:
        inc_alt_count += 1
        print(f"Found alt in {tar_file}")
        
    if r"\usepackage{graphicx}" in text:
        inc_graphics_count += 1

with tqdm(total=files_count, desc="errors") as err_prog:
    for tar_file in tqdm(files, desc="Progress", display=True):
        # Is it unicode?
        text = ""
        try:
            text = source_from_tar(tar_file, encoding='utf-8', tolerance=TOLERANCE)
            utf_count += 1
            update_counts(text, tar_file)
            continue
        except EOFError as eof:
            err_files[tar_file] = type(eof)
            _ = err_prog.update(1)
            continue
        except UnicodeDecodeError as ue:
            pass
        except KeyboardInterrupt as KB_err:
            break
        except Exception as e:
            err_files[tar_file] = type(e)
            _ = err_prog.update(1)
            continue

        # Is it something else?
        try:
            text = source_from_tar(tar_file, encoding='latin-1', tolerance=TOLERANCE)
            latin_count += 1
            update_counts(text, tar_file)
            continue
        except KeyboardInterrupt as KB_err:
            break
        except Exception as e:
            err_files[tar_file] = type(e)
            _ = err_prog.update(1)
            pass
            

            

errors:   0%|          | 0/16840 [00:00<?, ?it/s]

Progress:   0%|          | 0/16840 [00:00<?, ?it/s]

Found alt in /Volumes/Neptune/scratch/2401/2401.04531v1.tar.gz
Found alt in /Volumes/Neptune/scratch/2403/2403.11782v1.tar.gz
Found alt in /Volumes/Neptune/scratch/2403/2403.11782v2.tar.gz
Found alt in /Volumes/Neptune/scratch/2404/2404.05317v3.tar.gz
Found alt in /Volumes/Neptune/scratch/2404/2404.05317v1.tar.gz
Found alt in /Volumes/Neptune/scratch/2404/2404.05317v2.tar.gz
Found alt in /Volumes/Neptune/scratch/2404/2404.08812v1.tar.gz


In [32]:
print(f"{files_count} processed, {len(err_files)} failures.")
print(f"UTF8: {utf_count}; Latin1: {latin_count}")
err_files

16840 processed, 14 failures.
UTF8: 16705; Latin1: 121


{'/Volumes/Neptune/scratch/2312/2312.14430v2.tar.gz': ValueError,
 '/Volumes/Neptune/scratch/2312/2312.07078v1.tar.gz': ValueError,
 '/Volumes/Neptune/scratch/2312/2312.14430v4.tar.gz': ValueError,
 '/Volumes/Neptune/scratch/2312/2312.03652v1.tar.gz': AttributeError,
 '/Volumes/Neptune/scratch/2312/2312.14430v1.tar.gz': ValueError,
 '/Volumes/Neptune/scratch/2312/2312.14430v3.tar.gz': ValueError,
 '/Volumes/Neptune/scratch/2312/2312.05574v1.tar.gz': ValueError,
 '/Volumes/Neptune/scratch/2401/2401.07831v2.tar.gz': ValueError,
 '/Volumes/Neptune/scratch/2401/2401.00660v1.tar.gz': ValueError,
 '/Volumes/Neptune/scratch/2401/2401.07831v1.tar.gz': ValueError,
 '/Volumes/Neptune/scratch/2401/2401.14152v1.tar.gz': ValueError,
 '/Volumes/Neptune/scratch/2403/2403.19693v1.tar.gz': ValueError,
 '/Volumes/Neptune/scratch/2403/2403.09803v1.tar.gz': ValueError,
 '/Volumes/Neptune/scratch/2403/2403.14585v1.tar.gz': ValueError}

In [33]:
print(f"{files_count} processed, {inc_graphics_count} used graphicx package, {inc_alt_count} used alt.")

16840 processed, 10854 used graphicx package, 7 used alt.


In [112]:
source_from_tar('./data/2301/2301.01083v2.tar.gz', encoding='utf-8', tolerance=TOLERANCE)

ValueError: max() arg is an empty sequence

## Scratch below here

In [19]:
infile_path = "./data/2201_00_all/2201.00740v1.tar.gz" #'./data/2201_samp/2201.00048v1.tar.gz'
#infile_path = "./data/2201_01_all/2201.01050v1.tar.gz" #'./data/2201_samp/2201.00048v1.tar.gz'

text = source_from_tar(infile_path)
pyperclip.copy(text)
soup = soup_from_tar(infile_path, tolerance=1)

title = soup.find('title')
if title: print(f"{title.name}: {title.text}")
for sec in soup.find_all('section'):
    print(f' {sec.name}: {sec.text}')

EOFError: [Line: 0, Offset: 42281] "displaymath" env expecting \]. Reached end of file.

In [32]:
infile_path = "./data/2201_00_all/2201.00430v1.tar.gz" #'./data/2201_samp/2201.00048v1.tar.gz'

text = source_from_tar(infile_path)
pyperclip.copy(text)
soup = soup_from_tar(infile_path, encoding='utf-8', tolerance=TOLERANCE)


title = soup.find('title')
if title: print(f"{title.name}: {title.text}")
for sec in soup.find_all('section'):
    print(f' {sec.name}: {sec.text}')

NameError: name 'TOLERANCE' is not defined

In [None]:
tar_path = "./data/2201_samp/2201.00008v2.tar.gz"
encoding = "utf-8"
with tarfile.open(tar_path, 'r') as in_tar:
    tex_files = [f for f in in_tar.getnames() if f.endswith('.tex')]

    # got one file
    if len(tex_files) == 1:
        pass #return tex_files[0]

    main_files = {}
    for tf in tex_files:
        fp = in_tar.extractfile(tf)
        wrapped_file = io.TextIOWrapper(fp, newline=None, encoding=encoding) #universal newlines
        # does it have a doc class?
        # get the type
        main_files[tf] = find_doc_class(wrapped_file)
        wrapped_file.close() 

    # got one file with doc class
    if len(main_files) == 1:
        pass #return(main_files.keys()[0])

    # account for multi-file submissions
    #return(max(main_files, key=main_files.get))

In [None]:
main_files

In [None]:
doc_class_pat = re.compile(r"^\s*\\document(?:style|class)")

with tarfile.open(tar_path, 'r', encoding='utf-8') as in_tar:
    #in_tar.getnames()
    fp = in_tar.extractfile('main.tex')
    wrapped_file = io.TextIOWrapper(fp, newline=None, encoding='utf-8') #universal newlines
    for line in wrapped_file:
        if doc_class_pat.search(line):
            print(line)
            break

In [None]:
next(wrapped_file)

In [None]:
min_example=r"""
\documentclass{article}
\begin{document}
% \renewcommand{\shorttitle}{Avoiding Catastrophe}
\end{document}
""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example))
#print(min_example)
TS.TexSoup(r'\newcommand{\bra}[1]{\left\langle#1\right|}')

In [None]:
TS.TexSoup(r'\def\be{\foo{equation}}')

In [None]:
TS.TexSoup(r'\renewcommand{\shorttitle}{Avoiding Catastrophe}')
min_example = r"\newenvironment{inlinemath}{$}{$}".strip()
TS.TexSoup(pre_format(min_example))
#print(min_example)

In [None]:
min_example = r"In practice, the matrix $\left [ 4 \right]\Inv\M{D}^{(1)}_n $".strip()
TS.TexSoup(pre_format(min_example))
#print(min_example)

In [None]:
min_example = r"In practice, the matrix $\left[ 4 \right]\Inv\M{D}^{(1)}_n $"


cats = TS.category.categorize(min_example)
tokens = list(TS.tokens.tokenize(cats))

char_codes = list(TS.category.categorize(min_example))

buf = TS.reader.Buffer(TS.tokens.tokenize(TS.category.categorize(r'\left[ 4 \right]')))
TS.reader.read_command(buf, n_required_args=-1, mode='mode:math', skip=1 )

In [None]:
min_example = r"$ t \in [0,1] $$ t \in [0,1] $"


cats = TS.category.categorize(min_example)
tokens = list(TS.tokens.tokenize(cats))

char_codes = list(TS.category.categorize(min_example))

with pd.option_context('display.max.columns', None, 'display.max_colwidth', 0):
    pd.DataFrame({'char':char_codes, 'code':(x.category for x in char_codes)}).transpose()
    pd.DataFrame({'tokens':tokens})

buf = TS.reader.Buffer(TS.tokens.tokenize(TS.category.categorize(min_example)))
TS.reader.read_command(buf, n_required_args=-1, mode='mode:math', skip=3, tolerance=1)

buf = TS.reader.Buffer(TS.tokens.tokenize(TS.category.categorize(min_example)))
TS.read(buf, tolerance=1)

In [None]:
with pd.option_context('display.max.columns', None, 'display.max_colwidth', 0):
    pd.DataFrame({'char':char_codes, 'code':(x.category for x in char_codes)}).transpose()
    pd.DataFrame({'tokens':tokens})

In [None]:
min_example = r"In practice, the matrix $\left [\M{D}^{(1)}_n(\M{D}^{(1)}_n)\Tra\right]\Inv\M{D}^{(1)}_n $"
print(min_example)
TS.TexSoup(pre_format(min_example))
TS.TexSoup(min_example)

In [None]:
min_example=r"""
\documentclass{article}
\begin{document}
% \renewcommand{\shorttitle}{Avoiding Catastrophe}
\end{document}
""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example))
#print(min_example)

In [None]:
min_example=r"""
\def\bean {\begin{foo}}  \def\eean {\end{foo}}
""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example))
TS.TexSoup(min_example)
print(min_example)
min_example=r"""
we {use $A=8B$ and $s=1$, then the scalar field becomes same with (\Ref{scalarfield}) and
""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#TS.TexSoup(min_example)
print(min_example)
print(pre_format(min_example))
BRACKETS_DELIMITERS = {
    '(', ')', '<', '>', '[', ']', '{', '}', r'\{', r'\}', '.' '|', r'\langle',
    r'\rangle', r'\lfloor', r'\rfloor', r'\lceil', r'\rceil', r'\ulcorner',
    r'\urcorner', r'\lbrack', r'\rbrack'
}
# TODO: looks like left-right do have to match
SIZE_PREFIX = ('left', 'right', 'big', 'Big', 'bigg', 'Bigg')
PUNCTUATION_COMMANDS = {command + opt_space + bracket
                        for command in SIZE_PREFIX
                        for opt_space in {'', ' '}
                        for bracket in BRACKETS_DELIMITERS.union({'|', '.'})}
PUNCTUATION_COMMANDS

In [None]:
min_example=r"""
\def\bean {\begin{eqnarray*}}  \def\eean {\end{eqnarray*}}
""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example))
#print(min_example)
min_example=r"""
the interval $t\in[0,1)$. 
""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example))
#print(min_example)
min_example=r"""







\beq
[\chF,\chG\}=\{\partial\chF,\chG\}.
\eeq

derivation $\CA\mapsto [\CB,\CA]$. 







The following characterizations of UAL chains are all equivalent:
\begin{itemize}
    \item[(1)] A skew-symmetric function $\cha:\Lambda^{q+1}\ra\mfkdal$ defines an element of $C_{q}(\mfkdal) $ if $\|\cha\|_{\alpha}<\infty$ for any $\alpha \in \NN$.
    \item[(2)] A skew-symmetric function $\cha:\Lambda^{q+1}\ra\mfkdal$ defines an element of $C_{q}(\mfkdal) $ if there is a function $b(r) \in \Orf$  such that for any $j_0,...,j_q$ the observable $\cha_{j_0...j_q}$ is $b$-localized at $j_a$ for any $a \in \{0,1,...,q\}$.
    \item[(3)] $C_{q}(\mfkdal) $ is the completion of $C_q(\mfkdl) $ with respect to the norms $\|\cdot\|_{\alpha}$.
\end{itemize}
\end{lemma}





""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
min_example=r"""
\newcommand\const{\operatorname{const}}
""".strip() #.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
min_example=r"""
\newcommand{\beq}{\begin{equation}}
\newcommand{\eeq}{\end{equation}}
\newcommand{\chF}{{\mathsf f}}
\newcommand{\chG}{{\mathsf g}}
\beq  
[\chF,\chG\}=\{\partial\chF,\chG\}.
\eeq
derivation $\CA\mapsto [\CB,\CA]$. 
""".strip().replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
min_example=r"""
\[
r_p=d(p,\cdot)\colon \Gamma \to [0,\infty)|~ p \in M\}
\]
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
min_example=r"""
$\bigl[ a \bigr)$
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
min_example=r"""

$\varepsilon\in]0,\varepsilon_\star[$,  

""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
min_example=r"""
\[
i\colon [0,\infty) 
\]
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)

In [None]:
min_example=r"""
\newcommand\1{{\mathds 1}}
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# !! This bug was specific to my fork
min_example=r"""
\newcommand{\linebreakand}{%
    \end{@IEEEauthorhalign}
    \hfill\mbox{}\par
    \mbox{}\hfill\begin{@IEEEauthorhalign}
    }
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
min_example=r"""
 $S \subseteq \{0\} \bigcup [1,\infty) $ if $z^*_2=1$.  
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# two inline math envs next to eachother
# !! probably not fixable given the approach used in TexSoup (needs stateful tokenization)
min_example=r"""
$\rm{W_{cyc} }\geq 0$$\;\;\square$
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# \verb{char}...{char} is also an issue for parser
# !! probably not fixable given the approach used in TexSoup (needs stateful tokenization)
min_example=r"""
\verb+$TEXMF/tex/latex/elsevier/+, %$%%%%%%%%%%%%%%%%%%%%%%%%%%%%
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# does not handle missing optional braces around arguments
min_example=r"""
$\sqrt {\frac 3 2} >p >1$
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# \verb{char}...{char} is also an issue for parser
# !! probably not fixable given the approach used in TexSoup (needs stateful tokenization)
min_example=r"""
&$\rm{N_{Diskbb}}$$(\times 10^4) $
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# \verb{char}...{char} is also an issue for parser
# !! probably not fixable given the approach used in TexSoup (needs stateful tokenization)
min_example=r"""
$\frac{j+1+\epsilon}{m^{\alpha}}[$
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# \verb{char}...{char} is also an issue for parser
# !! probably not fixable given the approach used in TexSoup (needs stateful tokenization)
min_example=r"""
$1\le k< \frac n2 $ 
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# math nested in text in math
# !! probably not fixable given the approach used in TexSoup (needs stateful tokenization)
min_example=r"""
$$
\sum_{S: |S|=\lfloor q/2 \rfloor,\lceil q/2 \rceil} \beta_S \geq  \begin{cases} 
0.76 & \quad \text{if $q=5$}\\
0.80  & \quad \text{if $q\geq 7$}.
\end{cases},
$$
""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
# math nested in text in math
# !! probably not fixable given the approach used in TexSoup (needs stateful tokenization)
min_example=r"""




\begin{figure}
\centering
\begin{minipage}[b]{0.4\textwidth}

\end{minipage}
\qquad
\begin{minipage}[b]{0.2\textwidth}
\begin{tikzpicture}[scale=0.8] \draw (-2,-1)--(-2,0)--(0,1) (-3,-2.2)--(-3,0.2)--(-1,0.2)--(-1,-2.2)--(-3,-2.2) (2,0)--(2,-1) (-2.5,-2) to[out=120,in=220] (-2,0) (-1.5,-2) to[out=60,in=320] (-2,0); \draw[dotted] (-2.5,-2)--(-1.5,-2); \draw[very thick] (0,-1)--(0,1)--(2,0); \draw[dashed] (-2.5,-2)--(-2,-1)--(-1.5,-2); \draw[fill=white] (-2,0) circle [radius=3pt] (-2,-1) circle [radius=3pt] (2,-1) circle [radius=3pt]; \draw[fill=black] (2,0) circle [radius=3pt] (0,1) circle [radius=3pt] (0,0) circle [radius=3pt] (0,-1) circle [radius=3pt] (-2.5,-2) circle [radius=3pt] (-1.5,-2) circle [radius=3pt]; \node[above] at (0,1) {$u\in T$}; \node[left] at (-2,0) {$v_1$}; \node[right] at (0,0) {$v_2$}; \node[right] at (2,0) {$v_3$}; \node[left] at (-2,-1) {$x_1$}; \node[right] at (0,-1) {$x_2$}; \node[right] at (2,-1) {$x_3$}; \node[left] at (-2.5,-2) {$y$}; \node[right] at (-1.5,-2) {$y'$};\node[above] at (-2,0.3) {$$};
\end{tikzpicture}
\end{minipage}

\end{figure}


We first compute a $\leq$$1$-part solution, $2$-part solution and $3$-part solution for $(G,T,w) $ of maximum weight. By Lemmas~\ref{l-1part},~\ref{l-2part} and~\ref{l-3part}, respectively, this takes polynomial time. 





""".strip()#.replace('\\}\\', '\\} \\').replace(')}', ') }')
TS.TexSoup(pre_format(min_example), tolerance=1)
#print(min_example)
import pandas as pd
import numpy as np
pd.DataFrame(np.random.randint(0,100,size=(10, 3)), columns=list('ABC')).to_csv('~/Expire/test_console_upload.csv')

In [88]:
byte_string = b"Hello World"
byte_string.lower?

[0;31mDocstring:[0m
B.lower() -> copy of B

Return a copy of B with all ASCII characters converted to lowercase.
[0;31mType:[0m      builtin_function_or_method

In [93]:
b'\Xc3\x80'.lower()

b'\\xc3\x80'

In [94]:
b_string = b"Hello World"

In [98]:
b_string.hex()

'48656c6c6f20576f726c64'

In [106]:
bin(int(b'A'.hex(),16))

'0b1000001'

In [107]:
bin(int(b'a'.hex(),16))

'0b1100001'