In [4]:
import boto3, configparser, os, botocore

s3resource = None

def setup():
    """Creates the global S3 resource used by the download helpers.

    Reads the AWS credentials from ``config.ini`` in the current working
    directory (keys ``ACCESS_KEY`` and ``SECRET_KEY`` of the ``DEFAULT``
    section) and stores the resulting boto3 resource in the module-level
    ``s3resource`` variable.

    Raises
    ------
    KeyError
        If ``config.ini`` is missing or does not define both keys
        (``ConfigParser.read`` silently skips files it cannot open).
    """

    # Load credentials from the private config file so they never appear in
    # the notebook itself. ConfigParser is used instead of SafeConfigParser:
    # the latter is a deprecated alias that was removed in Python 3.12.
    configs = configparser.ConfigParser()
    configs.read('config.ini')

    # Create S3 resource & set configs
    global s3resource
    s3resource = boto3.resource(
        's3',  # the AWS resource we want to use
        aws_access_key_id=configs['DEFAULT']['ACCESS_KEY'],
        aws_secret_access_key=configs['DEFAULT']['SECRET_KEY'],
        region_name='us-east-1'  # same region arxiv bucket is in
    )
    
def download_file(key):
    """
    Downloads the given key from the ``arxiv`` bucket to a local file.

    The key doubles as the local destination path (relative to the current
    working directory), so ``'src/foo.xml'`` is saved as ``src/foo.xml``.
    ``setup()`` must have been called first so that ``s3resource`` is set.

    Parameters
    ----------
    key : str
        Name of file to download

    Raises
    ------
    botocore.exceptions.ClientError
        For any S3 error other than a 404 (e.g. access denied). A 404 is
        reported with a printed message instead of an exception.
    """

    # Ensure the destination's parent directory exists. It is derived from
    # the key itself (rather than hard-coding 'src') so keys under any
    # prefix can be written; keys without a directory part need nothing.
    parent_dir = os.path.dirname(key)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Download file
    print('\nDownloading s3://arxiv/{} to {}...'.format(key, key))

    try:
        s3resource.meta.client.download_file(
            Bucket='arxiv',
            Key=key,  # name of key to download from
            Filename=key,  # path to file to download to
            ExtraArgs={'RequestPayer':'requester'})  # sent as RequestPayer=requester
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print('ERROR: ' + key + " does not exist in arxiv bucket")
        else:
            # Anything else (bad credentials, access denied, ...) must not
            # be swallowed, or the failure only shows up later as a
            # missing local file.
            raise

# Build the global S3 resource first; download_file() relies on it.
setup()
# Fetch the manifest; the key is also used as the local destination path.
download_file('src/arXiv_src_manifest.xml')

  if __name__ == '__main__':



Downloading s3://arxiv/src/arXiv_src_manifest.xml to src/arXiv_src_manifest.xml...


In [10]:
from bs4 import BeautifulSoup

# Parse the downloaded manifest; the file only needs to stay open for the
# BeautifulSoup constructor call.
with open('src/arXiv_src_manifest.xml') as f:
    soup = BeautifulSoup(f, "xml")

# Add up the integer text of every <size> element in the manifest.
sizes = soup.find_all('size')
total_size = sum(int(tag.text) for tag in sizes)

# Scale the total down by 10**9 for display (presumably bytes -> GB; confirm
# the manifest's units).
print(total_size / 10**9)

1052.209982638
