In [71]:
from bs4 import BeautifulSoup
import boto3
from codecarbon import EmissionsTracker
import configparser
import tarfile
import glob
import fitz  # PyMuPDF
from multiprocessing import cpu_count, Process
import os
import pandas as pd
import re
from time import time
import shutil

tracker = EmissionsTracker(log_level="critical")

In [111]:
def download_arxiv_bucket(file_key):
    """Grabs tar files from arXiv AWS S3 bucket"""
    config = configparser.ConfigParser()
    config.read('config.ini')

    aws_access_key_id = config['DEFAULT']['ACCESS_KEY']
    aws_secret_access_key = config['DEFAULT']['SECRET_KEY']
    
    print(f"Preparing to download ArXiv files from file {file_key}..", flush=True)

    # Create an S3 client with your AWS credentials
    s3 = boto3.client(
        's3',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        region_name='us-east-1'  # same region arxiv bucket is in
    )

    # Set the bucket and file key for the desired tar file
    bucket_name = 'arxiv'
    local_path = file_key.replace('pdf/', '')

    # Download the tar file from S3
    s3.download_file(bucket_name, file_key, file_key.replace('pdf/', ''), ExtraArgs={'RequestPayer': 'requester'})

    print(f"Downloaded {file_key} from {bucket_name} to {local_path}", flush=True)

In [110]:
def extract_tar_file(bucket_num, file_key):
    """Extracts the pdfs from the tar_files downloaded from arXiv S3 bucket, and remove the arxiv files"""
    
    tar_path = file_key.replace('pdf/', '')
    output_path = 'pdf_files'
    
    with tarfile.open(tar_path) as tar:
        tar.extractall(output_path)
        print(f"Extracted {tar_path} to {output_path}", flush=True)
        
    os.remove(tar_path)

In [109]:
def extract_text_from_pdfs(bucket_num, file_key):
    "Extracts the text from the pdfs, adds them to a csv file, and removes the pdf files"
    
    print("Beginning to extract text from PDF's", flush=True)

    # Use re.search to find the pdf file number in the file_key
    pdf_file_num = re.search(r'pdf_(\d+)_', file_key).group(1)
    
    all_texts = []
    csv_file = 'arxiv_text.csv'
    pdf_files_path = os.path.join('pdf_files', pdf_file_num)
    pdf_files = glob.glob(os.path.join(pdf_files_path, '*.pdf'))

    # Extract text from the PDF's using PyMuDF
    for pdf_file in pdf_files:
        with fitz.open(pdf_file) as doc:
            text = ""
            for page in doc:
                text += page.get_text()
            all_texts.append(text)

    print("Texts extracted from PDF files with length ", len(all_texts), flush=True)

    # Save data to a dataframe, then to a csv
    data = {'index': [i for i in range(len(all_texts))], 'raw_text': all_texts, 'pdf_file': pdf_files}
    df = pd.DataFrame(data)
    df.to_csv(csv_file, mode='a', header=False)
    
    print(f"Text succesfully added to csv from bucket {bucket_num}", flush=True)
        
    # Delete the 'pdf_files' folder and its contents
    shutil.rmtree(f'pdf_files/{pdf_file_num}')

In [108]:
def num_tar_from_arxiv():
    """Grabs tar files from arXiv AWS S3 bucket"""
    
    print("Attempting to retrieve the number of tar buckets available.")
    
    config = configparser.ConfigParser()
    config.read('config.ini')

    aws_access_key_id = config['DEFAULT']['ACCESS_KEY']
    aws_secret_access_key = config['DEFAULT']['SECRET_KEY']
    
    s3 = boto3.client(
        's3',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        region_name='us-east-1'  # same region arxiv bucket is in
    )

    s3.download_file(
            Bucket='arxiv', 
            Key='pdf/arXiv_pdf_manifest.xml',
            Filename='arXiv_pdf_manifest.xml',
            ExtraArgs={'RequestPayer':'requester'}
    )
    manifest = open('arXiv_pdf_manifest.xml', 'r')
    soup = BeautifulSoup(manifest, 'xml')
    num_buckets = len(soup.find_all('file'))
    filenames = soup.find_all('filename')

    # Pattern to match the text between <filename> and </filename>
    pattern = r'<filename>(.+)</filename>'

    # Use a list comprehension to apply the regex to each string in the list
    file_keys = [re.search(pattern, str(s)).group(1) for s in filenames if re.search(pattern, str(s))]

    
    print(f"There are currently {num_buckets} S3 tar files available from arXiv..", flush=True)
    
    return num_buckets, file_keys

In [107]:
def arxiv_to_csv(bucket_num, file_key):
    download_arxiv_bucket(file_key)
    extract_tar_file(bucket_num, file_key)
    extract_text_from_pdfs(bucket_num, file_key)
    
    print(f"Text extraction success from bucket {bucket_num}\n", flush=True)

In [113]:
tracker.start()
start = time()

print(f"Beginning extraction of arXiv PDF's on S3 into text files in a csv.")

batch_size = cpu_count()
num_tar_buckets, file_keys = num_tar_from_arxiv()

print(f"Utilizing {batch_size} cores to pull data from {num_tar_buckets} S3 buckets.\n")

for i in range(0, 32, batch_size):
    # Ensure we don't exceed the number of arxiv tar's available by checking the minimum
    processes = [
        Process(target=arxiv_to_csv,
        args=(j, file_keys[j], )) for j in range(i, min(i+batch_size, num_tar_buckets))
    ]
    
    for process in processes:
        process.start()
        
    for process in processes:
        process.join()
                
    
tracker.stop()
end = time()

print('Process completed, successfully extracted all of the text!', flush=True)
print(f"This process took {(end - start)/1000} seconds", flush=True)

Beginning extraction of arXiv PDF's on S3 into text files in a csv.
Attempting to retrieve the number of tar buckets available.
There are currently 5922 S3 tar files available from arXiv..
Utilizing 16 cores to pull data from 5922 S3 buckets.

Preparing to download ArXiv files from file pdf/arXiv_pdf_0001_001.tar..
Preparing to download ArXiv files from file pdf/arXiv_pdf_0001_002.tar..
Preparing to download ArXiv files from file pdf/arXiv_pdf_0002_001.tar..
Preparing to download ArXiv files from file pdf/arXiv_pdf_0002_002.tar..
Preparing to download ArXiv files from file pdf/arXiv_pdf_0003_001.tar..
Preparing to download ArXiv files from file pdf/arXiv_pdf_0003_002.tar..
Preparing to download ArXiv files from file pdf/arXiv_pdf_0004_001.tar..
Preparing to download ArXiv files from file pdf/arXiv_pdf_0004_002.tar..
Preparing to download ArXiv files from file pdf/arXiv_pdf_0005_001.tar..
Preparing to download ArXiv files from file pdf/arXiv_pdf_0005_002.tar..
Preparing to download ArXi

Process Process-102:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_4126/2623834023.py", line 4, in arxiv_to_csv
    extract_text_from_pdfs(bucket_num, file_key)
  File "/tmp/ipykernel_4126/3149344953.py", line 16, in extract_text_from_pdfs
    with fitz.open(pdf_file) as doc:
  File "/home/branislav/.local/lib/python3.10/site-packages/fitz/fitz.py", line 3988, in __init__
    raise EmptyFileError(msg)
fitz.fitz.EmptyFileError: cannot open empty document


Downloaded pdf/arXiv_pdf_0006_002.tar from arxiv to arXiv_pdf_0006_002.tar
Extracted arXiv_pdf_0006_002.tar to pdf_files
Beginning to extract text from PDF's
Texts extracted from PDF files with length  548
Text succesfully added to csv from bucket 1
Text extraction success from bucket 1

Texts extracted from PDF files with length  627
Text succesfully added to csv from bucket 13
Text extraction success from bucket 13

Texts extracted from PDF files with length  761
Text succesfully added to csv from bucket 5
Text extraction success from bucket 5

Downloaded pdf/arXiv_pdf_0005_002.tar from arxiv to arXiv_pdf_0005_002.tar
Extracted arXiv_pdf_0005_002.tar to pdf_files
Beginning to extract text from PDF's
Texts extracted from PDF files with length  575
Text succesfully added to csv from bucket 11
Text extraction success from bucket 11



Process Process-108:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_4126/2623834023.py", line 4, in arxiv_to_csv
    extract_text_from_pdfs(bucket_num, file_key)
  File "/tmp/ipykernel_4126/3149344953.py", line 16, in extract_text_from_pdfs
    with fitz.open(pdf_file) as doc:
  File "/home/branislav/.local/lib/python3.10/site-packages/fitz/fitz.py", line 3988, in __init__
    raise EmptyFileError(msg)
fitz.fitz.EmptyFileError: cannot open empty document


Downloaded pdf/arXiv_pdf_0008_002.tar from arxiv to arXiv_pdf_0008_002.tar
Extracted arXiv_pdf_0008_002.tar to pdf_files
Beginning to extract text from PDF's
Texts extracted from PDF files with length  936
Text succesfully added to csv from bucket 15
Text extraction success from bucket 15

Downloaded pdf/arXiv_pdf_0002_001.tar from arxiv to arXiv_pdf_0002_001.tar
Extracted arXiv_pdf_0002_001.tar to pdf_files
Beginning to extract text from PDF's
Downloaded pdf/arXiv_pdf_0006_001.tar from arxiv to arXiv_pdf_0006_001.tar
Extracted arXiv_pdf_0006_001.tar to pdf_files
Beginning to extract text from PDF's
Downloaded pdf/arXiv_pdf_0005_001.tar from arxiv to arXiv_pdf_0005_001.tar
Extracted arXiv_pdf_0005_001.tar to pdf_files
Beginning to extract text from PDF's
Downloaded pdf/arXiv_pdf_0007_001.tar from arxiv to arXiv_pdf_0007_001.tar
Extracted arXiv_pdf_0007_001.tar to pdf_files
Beginning to extract text from PDF's
Downloaded pdf/arXiv_pdf_0001_001.tar from arxiv to arXiv_pdf_0001_001.tar
Do

Process Process-107:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_4126/2623834023.py", line 4, in arxiv_to_csv
    extract_text_from_pdfs(bucket_num, file_key)
  File "/tmp/ipykernel_4126/3149344953.py", line 16, in extract_text_from_pdfs
    with fitz.open(pdf_file) as doc:
  File "/home/branislav/.local/lib/python3.10/site-packages/fitz/fitz.py", line 3988, in __init__
    raise EmptyFileError(msg)
fitz.fitz.EmptyFileError: cannot open empty document
Process Process-105:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_4126/2623834023.py"

KeyboardInterrupt: 

Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/ipykernel_4126/2623834023.py", line 4, in arxiv_to_csv
    extract_text_from_pdfs(bucket_num, file_key)
Process Process-101:
  File "/tmp/ipykernel_4126/3149344953.py", line 16, in extract_text_from_pdfs
    with fitz.open(pdf_file) as doc:
  File "/tmp/ipykernel_4126/2623834023.py", line 4, in arxiv_to_csv
    extract_text_from_pdfs(bucket_num, file_key)
Traceback (most recent call last):
  File "/home/branislav/.local/lib/python3.10/site-packages/fitz/fitz.py", line 5736, in __exit__
    self.close()
  File "/tmp

In [114]:
manifest = open('arXiv_pdf_manifest.xml', 'r')
soup = BeautifulSoup(manifest, 'xml')

In [115]:
df = pd.read_csv('arxiv_text.csv')