In [13]:
import boto3, configparser

configs = configparser.SafeConfigParser()
configs.read('config.ini')
    
# Create an S3 client with your AWS credentials
s3 = boto3.client(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name='us-east-1'  # same region arxiv bucket is in
)

# Set the bucket and file key for the desired tar file
bucket_name = 'arxiv'
file_key = 'pdf/arXiv_pdf_1001_001.tar'

# Set the local path where you want to save the downloaded tar file
local_path = 'arXiv_pdf_1001_001.tar'

# Download the tar file from S3
s3.download_file(bucket_name, file_key, local_path, ExtraArgs={'RequestPayer': 'requester'})

print(f"Downloaded {file_key} from {bucket_name} to {local_path}")

Downloaded pdf/arXiv_pdf_1001_001.tar from arxiv to arXiv_pdf_1001_001.tar


In [14]:
import tarfile

def extract_tar_file(tar_path, output_path):
    with tarfile.open(tar_path) as tar:
        tar.extractall(output_path)
        print(f"Extracted {tar_path} to {output_path}")

tar_path = 'arXiv_pdf_1001_001.tar'
output_path = 'pdf_files'

extract_tar_file(tar_path, output_path)

Extracted arXiv_pdf_1001_001.tar to pdf_files


In [8]:
import os
import glob
import fitz  # PyMuPDF

pdf_files_path = 'pdf_files'
subfolders = sorted(glob.glob(os.path.join(pdf_files_path, '*')))

all_texts = []

for subfolder in subfolders:
    pdf_files = glob.glob(os.path.join(subfolder, '*.pdf'))
    for pdf_file in pdf_files:
        with fitz.open(pdf_file) as doc:
            text = ""
            for page in doc:
                text += page.get_text()
            all_texts.append(text)

print("Texts extracted from PDF files:", len(all_texts))

Texts extracted from PDF files: 920


In [10]:
import pandas as pd

data = {'index': [i for i in range(len(all_texts))], 'raw_text': all_texts, 'pdf_file': pdf_files}

df = pd.DataFrame(data)
df.to_csv('1001.csv')
df.head()

Unnamed: 0,index,raw_text,pdf_file
0,0,arXiv:1001.0487v2 [astro-ph.SR] 5 Jan 2010\n...,pdf_files/1001/1001.0487.pdf
1,1,1 \nTESTING RELATIVISTIC GRAVITY AND \nDETECT...,pdf_files/1001/1001.0213.pdf
2,2,arXiv:1001.0912v1 [hep-th] 6 Jan 2010\nFIELD...,pdf_files/1001/1001.0912.pdf
3,3,arXiv:1001.0043v2 [astro-ph.EP] 13 Jan 2010\...,pdf_files/1001/1001.0043.pdf
4,4,arXiv:1001.0371v3 [hep-th] 14 May 2010\nAcou...,pdf_files/1001/1001.0371.pdf


In [41]:
MB = 1024 * 1024

pdfs_size = 0
for root, _, files in os.walk('./pdf_files/'):
    pdfs_size += sum(os.path.getsize(os.path.join(root, f)) for f in files if not os.path.islink(os.path.join(root, f)))

csv_size = os.path.getsize("./1001.csv")

print(f"PDF Folder Size: \n\t{pdfs_size / MB:.2f} MB\n")
print(f"CSV Output Size: \n\t{csv_size / MB:.2f} MB\n")
print(f"Total Space Change: \n\t{(csv_size - pdfs_size) / MB:.2f} MB")

PDF Folder Size: 
	500.25 MB

CSV Output Size: 
	39.77 MB

Total Space Change: 
	-460.48 MB
