In [17]:
import requests
import os
import tarfile
import bibtexparser
import arxiv
import json
import re

# Working directories, relative to the notebook's current directory:
# downloaded source archives go in one, their unpacked contents in the other.
download_dir, extracted_dir = "arxiv_downloads", "arxiv_extracted"
for workdir in (download_dir, extracted_dir):
    # exist_ok=True keeps the cell idempotent when re-run.
    os.makedirs(workdir, exist_ok=True)

In [27]:
# Harvest the most recent submissions of every listed arXiv category.
# For each paper the metadata is stored in `data[paper_id]`; the source archive
# is then downloaded and unpacked, and the bibliography ('bib' or 'bbl') and
# the main LaTeX file ('tex') are attached when they can be found and read.
# Everything is finally dumped to data.json.


def _iter_files_with_suffix(root_dir, suffix):
    """Yield the path of every file under `root_dir` whose name ends with `suffix`."""
    for root, _, files in os.walk(root_dir):
        for file in files:
            if file.endswith(suffix):
                yield os.path.join(root, file)


def _extract_source(tar_path, extracted_path):
    """Unpack the gzipped tarball `tar_path` into `extracted_path`.

    Raises whatever `tarfile` raises when the file is not a gzipped tar
    archive or contains members rejected by the extraction filter.
    """
    os.makedirs(extracted_path, exist_ok=True)
    with tarfile.open(tar_path, "r:gz") as tar:
        # The archives come from the network: when the interpreter supports
        # extraction filters, use the "data" filter so members cannot escape
        # `extracted_path` (absolute paths, "..", links pointing outside).
        if hasattr(tarfile, "data_filter"):
            tar.extractall(path=extracted_path, filter="data")
        else:
            tar.extractall(path=extracted_path)


def _load_bib_entries(bib_paths):
    """Parse every BibTeX file in `bib_paths` and merge their entries.

    Files that cannot be read or parsed are reported and skipped.
    Returns the merged list of entries, or None when no file could be parsed.
    """
    entries = []
    parsed_any = False
    for bib_file_path in bib_paths:
        try:
            with open(bib_file_path, encoding='utf-8') as bibtex_file:
                parser = bibtexparser.bparser.BibTexParser()
                bib_database = bibtexparser.load(bibtex_file, parser=parser)
            # Accumulate instead of overwriting, so papers shipping several
            # .bib files keep all of their references.
            entries.extend(bib_database.entries)
            parsed_any = True
        except Exception as e:
            print(f"{bib_file_path}: {e}")
    return entries if parsed_any else None


def _read_first_bbl(extracted_path):
    """Return the text of the first .bbl file under `extracted_path`, or None.

    None is returned both when there is no .bbl file and when the first one
    cannot be read (the error is printed).
    """
    bbl_file = next(_iter_files_with_suffix(extracted_path, '.bbl'), None)
    if bbl_file is None:
        return None
    try:
        with open(bbl_file, 'r', encoding='utf-8') as bblfile:
            return bblfile.read()
    except Exception as e:
        print(f"{bbl_file}: {e}")
        return None


def _read_main_tex(extracted_path):
    """Return the text of the first readable .tex file containing \\documentclass.

    Unreadable files are reported and skipped; None is returned when no file
    qualifies.
    """
    for tex_path in _iter_files_with_suffix(extracted_path, '.tex'):
        try:
            with open(tex_path, 'r', encoding='utf-8') as texfile:
                tex = texfile.read()
        except Exception as e:
            print(f"{tex_path}: {e}")
            continue
        if re.search(r"\*?\\documentclass\*?", tex):
            return tex
    return None


data = {}

client = arxiv.Client()

categories = [  "cs.AI", "cs.AR", "cs.CC", "cs.CE", "cs.CG", "cs.CL", "cs.CR", "cs.CV", "cs.CY", "cs.DB",
                "cs.DC", "cs.DL", "cs.DM", "cs.DS", "cs.ET", "cs.FL", "cs.GL", "cs.GR", "cs.GT", "cs.HC",
                "cs.IR", "cs.IT", "cs.LG", "cs.LO", "cs.MA", "cs.MM", "cs.MS", "cs.NA", "cs.NE", "cs.NI",
                "cs.OH", "cs.OS", "cs.PF", "cs.PL", "cs.RO", "cs.SC", "cs.SD", "cs.SE", "cs.SI", "cs.SY",
                "econ.EM", "econ.GN", "econ.TH", "eess.AS", "eess.IV", "eess.SP", "eess.SY", "math.AC", "math.AG", "math.AP",
                "math.AT", "math.CA", "math.CO", "math.CT", "math.CV", "math.DG", "math.DS", "math.FA", "math.GM", "math.GN",
                "math.GR", "math.GT", "math.HO", "math.IT", "math.KT", "math.LO", "math.MG", "math.MP", "math.NA", "math.NT",
                "math.OA", "math.OC", "math.PR", "math.QA", "math.RA", "math.RT", "math.SG", "math.SP", "math.ST", "astro-ph.CO",
                "astro-ph.EP", "astro-ph.GA", "astro-ph.HE", "astro-ph.IM", "astro-ph.SR", "cond-mat.dis-nn", "cond-mat.mes-hall", "cond-mat.mtrl-sci", "cond-mat.other",
                "cond-mat.quant-gas", "cond-mat.soft", "cond-mat.stat-mech", "cond-mat.str-el", "cond-mat.supr-con", "gr-qc", "hep-ex", "hep-lat", "hep-ph", "hep-th",
                "math-ph", "nlin.AO", "nlin.CD", "nlin.CG", "nlin.PS", "nlin.SI", "nucl-ex", "nucl-th", "physics.acc-ph", "physics.ao-ph",
                "physics.app-ph", "physics.atm-clus", "physics.atom-ph", "physics.bio-ph", "physics.chem-ph", "physics.class-ph", "physics.comp-ph", "physics.data-an", "physics.ed-ph", "physics.flu-dyn",
                "physics.gen-ph", "physics.geo-ph", "physics.hist-ph", "physics.ins-det", "physics.med-ph", "physics.optics", "physics.plasm-ph", "physics.pop-ph", "physics.soc-ph", "physics.space-ph",
                "quant-ph", "q-bio.BM", "q-bio.CB", "q-bio.GN", "q-bio.MN", "q-bio.NC", "q-bio.OT", "q-bio.PE", "q-bio.QM", "q-bio.SC",
                "q-bio.TO", "q-fin.CP", "q-fin.EC", "q-fin.GN", "q-fin.MF", "q-fin.PM", "q-fin.PR", "q-fin.RM", "q-fin.ST", "q-fin.TR",
                "stat.AP", "stat.CO", "stat.ME", "stat.ML", "stat.OT", "stat.TH"
            ]

for n, c in enumerate(categories, 1):

    print(f"{n} / {len(categories)}")

    search = arxiv.Search(
        query = f"cat:{c}",
        max_results = 5,
        sort_by = arxiv.SortCriterion.SubmittedDate
    )

    try:
        for r in client.results(search):
            # Last path segment of the entry URL is used as the paper key.
            paper_id = r.entry_id.split('/')[-1]

            data[paper_id] = {
                'link': r.entry_id,
                'last_update': r.updated.isoformat(),
                'published': r.published.isoformat(),
                'title': r.title,
                'authors': [str(a) for a in r.authors],
                'summary': r.summary,
                'primary_category': r.primary_category,
                'categories': r.categories,
                'pre-print_link': f"http://arxiv.org/e-print/{paper_id}",
            }

            # Source handling is best-effort: any failure is reported and the
            # paper keeps its metadata only.
            try:
                # `tar_path` must be defined here; it was previously only
                # assigned in commented-out code, so extraction either raised
                # NameError or reused a stale path from an earlier run.
                tar_path = os.path.join(download_dir, f"{paper_id}.tar.gz")
                # `r` is already the search result for this paper, so download
                # through it rather than issuing a second lookup by id.
                r.download_source(dirpath=download_dir, filename=f"{paper_id}.tar.gz")

                extracted_path = os.path.join(extracted_dir, paper_id)
                _extract_source(tar_path, extracted_path)

                bib_files = list(_iter_files_with_suffix(extracted_path, '.bib'))
                if bib_files:
                    bib_entries = _load_bib_entries(bib_files)
                    if bib_entries is not None:
                        data[paper_id]['bib'] = bib_entries
                else:
                    # No BibTeX database shipped: fall back to the compiled
                    # bibliography, stored as raw text.
                    bbl_text = _read_first_bbl(extracted_path)
                    if bbl_text is not None:
                        data[paper_id]['bbl'] = bbl_text

                tex = _read_main_tex(extracted_path)
                if tex is not None:
                    data[paper_id]['tex'] = tex

            except Exception as e:
                print(f"{paper_id}: {e}")
                continue
    except Exception as e:
        # A failing category query must not abort the remaining categories.
        print(e)


with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

1 / 1


  tar.extractall(path=extracted_path)
Overwriting existing string for key: pami.
Overwriting existing string for key: ijcv.
Overwriting existing string for key: cvpr.
Overwriting existing string for key: iccv.
Overwriting existing string for key: eccv.
Overwriting existing string for key: nips.
Overwriting existing string for key: icpr.
Overwriting existing string for key: bmvc.
Overwriting existing string for key: tog.
Overwriting existing string for key: tip.
Overwriting existing string for key: tvcg.
Overwriting existing string for key: tmm.
Overwriting existing string for key: acmmm.
Overwriting existing string for key: icme.
Overwriting existing string for key: icassp.
Overwriting existing string for key: icip.
Overwriting existing string for key: accv.
Overwriting existing string for key: iclr.
Overwriting existing string for key: ijcai.
Overwriting existing string for key: pr.
Overwriting existing string for key: aaai.
Overwriting existing string for key: cvprw.
Overwriting exis