In [52]:
import os
import re
import io
import shutil
import TexSoup
import tarfile

In [120]:
# path to .tar.gz files
LOCAL_DATA_PATH = "./data"
LOCAL_FILE_PATH = "2201_01_all"
LOCAL_GAR_PATH = os.path.join(LOCAL_DATA_PATH, LOCAL_FILE_PATH)

In [4]:
from IPython.core.interactiveshell import InteractiveShell
# pretty print all cell's output and not just the last one
InteractiveShell.ast_node_interactivity = "all"

In [6]:
# directly written here because parse_bad_file.py syntax has issues
def pre_format(text):
    '''Apply some substititions to make LaTeX easier to parse'''
    source_text = (
        text
        .replace('\\}\\', '\\} \\')  # Due to escape rules \\ is equivalent to \
        .replace(')}', ') }')
        .replace(')$', ') $')
    )
    return source_text

In [80]:
def find_main_tex_source(path):
    # assuming path is a directory containing unzipped tex source etc
    tex_files = [f for f in os.listdir(path) if f.endswith('.tex')]
    if len(tex_files) == 1:
        return(os.path.join(path, tex_files[0]))
    else:
        main_files = {}
        for tf in tex_files:
            file = open(os.path.join(path, tf), "r")
            for line in file:
                if re.search(r"^\s*\\document(?:style|class)", line):
                    # https://arxiv.org/help/faq/mistakes#wrongtex
                    # according to this page, there should only be one tex file with a \documentclass - the main file ?
                    if tf == "paper.tex" or tf == "main.tex" or tf == "ms.tex" or tf == "article.tex":
                        main_files[tf] = 1
                    else:
                        main_files[tf] = 0
                    break
            file.close
        if len(main_files) == 1:
           return(os.path.join(path, list(main_files)[0]))
        else:
            # account for the two main ways of creating multi-file submissions on overleaf (standalone, subfiles)
            for mf in main_files:
                file = open(os.path.join(path, mf), "r")
                for line in file:
                    if re.search(r"^\s*\\document(?:style|class).*(?:\{standalone\}|\{subfiles\})", line):
                        main_files[mf] = -99999
                        break
                        # document class of main should not be standalone or subfiles (the main file is just {article} or something else)
                file.close
            return(os.path.join(path, max(main_files, key=main_files.get)))

In [67]:
# uncompress all .tar.gz files and get all tex files
LOCAL_TEX_PATH = os.path.join(LOCAL_DATA_PATH, LOCAL_FILE_PATH+"_tex")

for file_name in os.listdir(LOCAL_GAR_PATH):
    path = os.path.join(LOCAL_GAR_PATH, file_name)
    if tarfile.is_tarfile(path):        # .tar.gz check to avoid error
        file = tarfile.open(path, "r")
        for name in file.getnames():
            if name.endswith(".tex"):   # tex file check
                if not os.path.exists(LOCAL_TEX_PATH):      # tex folder check
                    os.mkdir(LOCAL_TEX_PATH)
                sub_tex_path = os.path.join(LOCAL_TEX_PATH, file_name.strip(".tar.gz"))
                if not os.path.exists(sub_tex_path):       # sub tex folder check
                    os.mkdir(sub_tex_path)
                file.extract(name, sub_tex_path)       # tex file extraction
        file.close()

In [160]:
# remove earlier version of articles
# shuold probably do it before uncompression, will reformat in the future if needed
file_list = []
for file in os.listdir(LOCAL_TEX_PATH):
    r = re.compile("^"+file[:-1]+"\d+$")
    file_list.append(file[:-1]+max([f[-1] for f in filter(r.match, os.listdir(LOCAL_TEX_PATH))]))

['2201.01576v2', '2201.01228v3', '2201.01829v2', '2201.01287v1', '2201.01797v2', '2201.01120v1', '2201.01064v1', '2201.01840v1', '2201.01823v2', '2201.01778v2', '2201.01324v1', '2201.01347v2', '2201.01322v3', '2201.01734v1', '2201.01360v1', '2201.01804v1', '2201.01020v2', '2201.01888v1', '2201.01538v2', '2201.01105v2', '2201.01551v1', '2201.01415v2', '2201.01385v4', '2201.01925v1', '2201.01222v2', '2201.01305v1', '2201.01389v4', '2201.01411v1', '2201.01886v1', '2201.01026v2', '2201.01228v3', '2201.01364v1', '2201.01385v4', '2201.01264v1', '2201.01320v1', '2201.01774v1', '2201.01385v4', '2201.01511v1', '2201.01289v1', '2201.01341v1', '2201.01322v3', '2201.01961v1', '2201.01825v1', '2201.01902v2', '2201.01812v3', '2201.01871v2', '2201.01741v1', '2201.01666v2', '2201.01251v2', '2201.01647v3', '2201.01689v2', '2201.01810v1', '2201.01664v2', '2201.01683v1', '2201.01424v1', '2201.01070v1', '2201.01549v3', '2201.01293v6', '2201.01330v1', '2201.01274v1', '2201.01353v2', '2201.01620v1', '2201.0

In [None]:
# get all main tex files
# bit a of dumb way to do it, but works for now
# shutil.copy seemingly always print out copy results
LOCAL_MAIN_TEX_PATH = os.path.join(LOCAL_DATA_PATH, LOCAL_FILE_PATH+"_main_tex")
for folder_name in file_list:
    if not os.path.exists(LOCAL_MAIN_TEX_PATH):
        os.mkdir(LOCAL_MAIN_TEX_PATH)
    main_tex_file_path = find_main_tex_source(os.path.join(LOCAL_TEX_PATH, folder_name))
    main_tex_sub_folder_path = os.path.join(LOCAL_MAIN_TEX_PATH, folder_name)
    if not os.path.exists(main_tex_sub_folder_path):
        os.mkdir(main_tex_sub_folder_path)
    shutil.copy(main_tex_file_path, main_tex_sub_folder_path)

In [135]:
# parse main tex files
import logging
import traceback

exception_ct = 0
for folder in os.listdir(LOCAL_MAIN_TEX_PATH):
    for file in os.listdir(os.path.join(LOCAL_MAIN_TEX_PATH, folder)):
        with open(os.path.join(LOCAL_MAIN_TEX_PATH, folder, file), "r") as in_tex:
            # in_tex = open(os.path.join(LOCAL_MAIN_TEX_PATH, folder, file), "r")
            wrapped_file = io.TextIOWrapper(in_tex, newline=None, encoding="utf-8") # universal newlines
            print(wrapped_file)
            print(wrapped_file.read())
#         source_text = pre_format(wrapped_file.read())
#         try: 
#             soup = TexSoup.TexSoup(source_text)
#         except Exception as e: 
#             exception_ct += 1
#             # logging.error(traceback.format_exc())
# print(exception_ct)

<_io.TextIOWrapper name='./data/2201_01_all_main_tex/2201.01576v2/KitaevTable_v8.tex' encoding='utf-8'>


TypeError: can't concat str to bytes