Initialize globals that will signal the download whether or not to proceed. Ensure the globals are set to False if you do not want to pay for data transfer. If they are set to False, the program will skip over any downloads and proceed to analysis.

In [17]:
DOWNLOAD_METADATA = False
DOWNLOAD_TARS = True

#### Import all dependencies:

In [55]:
import boto3, configparser, os, botocore, numpy as np, pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
import xml.etree.ElementTree as ET, tarfile
from enum import Enum 

#### Connect to Amazon S3 resource:

In [18]:
s3resource = None

def setup():
    """Creates S3 resource & sets configs to enable download."""

    # Securely import configs from private AWS config file
    configs = configparser.ConfigParser()
    configs.read('config.ini')

    # Create S3 resource & set configs
    global s3resource
    s3resource = boto3.resource(
        's3',  # the AWS resource we want to use
        aws_access_key_id=configs['DEFAULT']['ACCESS_KEY'],
        aws_secret_access_key=configs['DEFAULT']['SECRET_KEY'],
        region_name='us-east-1'  # same region the arxiv bucket is in
    )
    
setup()

#### Get `arxiv` bucket metadata

Define a function to download a file from the `arxiv` bucket, then download `arXiv_src_manifest.xml`, which contains bucket metadata.

In [26]:
def download_file(key):
    """
    Downloads given filename from source bucket to destination directory.

    Parameters
    ----------
    key : str
        Name of file to download
    """

    # Ensure src directory exists 
    if not os.path.isdir('src'):
        os.makedirs('src')

    # Download file

    try:
        s3resource.meta.client.download_file(
            Bucket='arxiv', 
            Key=key,  # name of key to download from
            Filename=key,  # path to file to download to
            ExtraArgs={'RequestPayer':'requester'})
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print('ERROR: ' + key + " does not exist in arxiv bucket")
            
    print('Successfully downloaded s3://arxiv/{} to {}'.format(key, key))

In [43]:
if DOWNLOAD_METADATA:
    download_file('src/arXiv_src_manifest.xml')
else:
    print('You have opted not to download any metadata at this time')

You have opted not to download any metadata at this time


#### Convert metadata file from XML to Pandas DataFrame:

In [5]:
# From https://www.kaggle.com/ysviru/simple-analysis-of-healthcare-job-postings
class XMLTagsUpperLevel:
    """
    This class defines the XML tag constants at the higher level of XML tree. The tag <file> is found below the root tag
    <arXivSRC> in the tree hierarchy.
    """
    FILE = "file"
    
class XMLTagsLowerLevel(Enum):
    """
    This class defines all the XML tag constants that are one level below the <file> tag. This is defined as an
    enumerated type for ease of iterating over all tags.
    """
    CONTENT_MD5SUM = "content_md5sum"
    FILENAME = "filename"
    FIRST_ITEM = "first_item"
    LAST_ITEM = "last_item"
    MD5SUM = "md5sum"
    NUM_ITEMS = "num_items"
    SEQ_NUM = "seq_num"
    SIZE = "size"
    TIMESTAMP = "timestamp"
    YYMM = "yymm"

class XMLParser:
    def __init__(self, file_path):
        """
        Initializes the XMLParser class instance.
        :param file_path: Path to input xml file containing all the jobs data.
        """
        self.file_path = file_path


    def xml_to_pandas_df(self):
        """
        Using the standard xml python library, we parse the data xml file and convert the xml data to a pandas
        data frame.
        :return: A pandas data frame instance containing all the manifest data.
        """
        tree = ET.parse(self.file_path)
        root = tree.getroot()

        manifest_data = dict()
        for tag in XMLTagsLowerLevel:
            manifest_data[tag.value] = []
    
        for i, record in enumerate(root.findall(XMLTagsUpperLevel.FILE)):
            for tag in XMLTagsLowerLevel:
                temp = record.find(tag.value)
                if temp is not None:
                    manifest_data[tag.value].append(temp.text)
                else:
                    manifest_data[tag.value].append("")

        return pd.DataFrame(data=manifest_data)
    

In [6]:
parser = XMLParser("src/arXiv_src_manifest.xml")
manifest_df = parser.xml_to_pandas_df()
manifest_df

Unnamed: 0,content_md5sum,filename,first_item,last_item,md5sum,num_items,seq_num,size,timestamp,yymm
0,cacbfede21d5dfef26f367ec99384546,src/arXiv_src_0001_001.tar,astro-ph0001001,quant-ph0001119,949ae880fbaf4649a485a8d9e07f370b,2364,1,225605507,2010-12-23 00:13:59,0001
1,d90df481661ccdd7e8be883796539743,src/arXiv_src_0002_001.tar,astro-ph0002001,quant-ph0002094,4592ab506cf775afecf4ad560d982a00,2365,1,227036528,2010-12-23 00:18:09,0002
2,3388afd7bfb2dfd9d3f3e6b353357b33,src/arXiv_src_0003_001.tar,astro-ph0003001,quant-ph0003151,b5bf5e52ae8532cdf82b606b42df16ea,2600,1,230986882,2010-12-23 00:22:15,0003
3,46abb309d77065fed44965cc26a4ae2e,src/arXiv_src_0004_001.tar,astro-ph0004001,quant-ph0004109,9bf1b55890dceec9535ef723a2aea16b,2076,1,191559408,2010-12-23 00:26:31,0004
4,ea665c7b62eaac91110fa344f6ba3fc4,src/arXiv_src_0005_001.tar,astro-ph0005001,quant-ph0005134,b49af416746146eca13c5a6a76bc7193,2724,1,255509072,2010-12-23 00:30:11,0005
5,b5bea3821e15af75b831250e6ea0a031,src/arXiv_src_0006_001.tar,astro-ph0006001,quant-ph0006136,ea268ff2bc89161c6998146fbb2d7515,2438,1,242543048,2010-12-23 00:34:56,0006
6,ddfb7cb7cd9f413f08fe9b4db6ebd0a5,src/arXiv_src_0007_001.tar,astro-ph0007001,quant-ph0007124,79bf35aabec474f66dfa00004ae13660,2461,1,245531787,2010-12-23 00:39:28,0007
7,0587b74278ded853b6bc3e26e216e9f8,src/arXiv_src_0008_001.tar,astro-ph0008001,quant-ph0008134,76fbdc83a3e966b8b3d729115b28cfca,2613,1,286991432,2010-12-23 00:44:17,0008
8,31d1373aaf9fe1b6e12ccd5c47dae004,src/arXiv_src_0009_001.tar,astro-ph0009001,quant-ph0009124,c7b01e4886802e34e327403078740d95,2522,1,230749480,2010-12-23 00:49:20,0009
9,2015fdfcea56c7c9b28e07883d7d8909,src/arXiv_src_0010_001.tar,astro-ph0010001,quant-ph0010117,2c92f09cf1a4f742b3ed0cd54f17112c,2910,1,278679438,2010-12-23 00:53:38,0010


#### Calculate some information based on metadata

The last time the manifest file was updated (not included in `manifest_df` since there is only a single `<timestamp>` element in the upper level):

In [3]:
with open('src/arXiv_src_manifest.xml', 'r') as manifest:
    soup = BeautifulSoup(manifest, 'lxml-xml')
    timestamp = soup.arXivSRC.find('timestamp', recursive=False).string
    print('Manifest was last edited on ' + timestamp)

Manifest was last edited on Tue Feb  5 07:21:43 2019


Check if there are any missing values: 

In [7]:
manifest_df.isnull().sum()

content_md5sum    0
filename          0
first_item        0
last_item         0
md5sum            0
num_items         0
seq_num           0
size              0
timestamp         0
yymm              0
dtype: int64

Get total size of bucket:

In [15]:
print(str(len(manifest_df)) + " files")
manifest_df['size'] = pd.to_numeric(manifest_df['size'])
print(str(round(manifest_df['size'].sum() / 1000000000, 2)) + ' GB')

2182 files
1069.97 GB


Get range of dates:

In [11]:
manifest_df['timestamp'] = pd.to_datetime(manifest_df['timestamp'])
print('Oldest file was uploaded on ' + manifest_df['timestamp'].min().strftime('%m/%d/%Y'))
print('Most recent file was uploaded on ' +  manifest_df['timestamp'].max().strftime('%m/%d/%Y'))

Oldest file was uploaded on 12/22/2010
Most recent file was uploaded on 02/05/2019


Note: Since the timestamp just represents when the .tar file was uploaded, it's not a reliable indicator of the year the preprint was published in. 

`yymm` indicates the items' age. Docs: Two digit year and month of items in the tar package. Starts with 9108 for 1991-08, rolls past y2k to 0001 for 2000-01, 1008 for 2010-08 etc.

Get number of files uploaded in 2018:

In [50]:
given_year_df = manifest_df[manifest_df['yymm'].str.slice(0,2) == '18']
print(str(len(given_year_df)) + " .tars uploaded for given year")
print(str(round(given_year_df['size'].sum(), 2) / 1000000000) + " GB")

392 .tars uploaded for given year
204.823664304 GB


#### Get astro-ph preprints

The total size of the bucket exceeds 1 terabyte, but we won't use all of this data. The bucket contains preprints from all categories, and we're only interested in the astro-ph category. 

As suggested by this [SO answer](https://stackoverflow.com/questions/51276201/how-to-extract-files-in-s3-on-the-fly-with-boto3), I wanted to extract each .tar while downloading using [AWS Lambda](https://docs.aws.amazon.com/lambda/latest/dg/limits.html), but I need to check if there are any .tar files in the bucket whose size exceeds 512 MB, which is the limit of AWS Lambda temporary storage:

In [73]:
print(str(len(manifest_df[manifest_df['size'] > 512000000])) + " files")
print("Maximum size: " + str(round(manifest_df['size'].max()/1000000, 2)) + " MB")

1803 files
Maximum size: 1910.6 MB


There are many files that we won't be able to extract using Lambda, so we will just go the traditional download route.

In [174]:
def extract_tar(filename):
    """
    Extracts specified file.

    Parameters
    ----------
    filename : str
        Name of file to extract
    """

    total_gz = 0
    total_tex = 0

    # Quit file extraction if given file is not .tar
    if not tarfile.is_tarfile(filename):
        print('can\'t unzip ' + filename + ', not a .tar file')
        return
    
    # Proceed with file extraction if .tar
    print('Opening ' + filename + '...')
    # Open .tar, read-only
    tar = tarfile.open(filename)
    # Iterate over .tar subfiles
    for subfile in tar.getmembers():
        # Open subfile only if .gz and begins with 'astro-ph'
        if subfile.name.endswith('.gz') and 'astro-ph' in subfile.name:
            total_gz += 1
            try: 
                print('Processing ' + filename + '/' + subfile.name + '...')
                # Open .gz, read-only
                gz_obj = tar.extractfile(subfile) 
                gz = tarfile.open(fileobj=gz_obj) 
                # Iterate over .gz subfiles
                for subsubfile in gz.getmembers():
                    # Check if current subfile is .tex or .ltx 
                    if subsubfile.name.endswith('.tex') or subsubfile.name.endswith('.ltx'):
                        # Check if there has been another file with the same name
                        if os.path.exists('latex/' + subsubfile.name):
                            # If there is, rename that file 
                            basename = os.path.splitext(os.path.basename(subsubfile.name))[0]
                            os.rename('latex/' + subsubfile.name, 'latex/' + basename + '_' + str(total_tex) + '.tex')
                        # Extract the file
                        gz.extract(subsubfile, path='latex')
                        total_tex += 1
            except tarfile.ReadError:
                # Extract the entire .gz because we cannot read it using tarfile 
                # Note that these .gzs are single .tex files with no extension specified
                tar.extract(subfile, path='temp')
                # Uncompress the .gz file using gzip instead and place it with the other .tex files
                with gzip.open('temp/' + subfile.name, 'rb') as f_in:
                    basename = os.path.splitext(os.path.basename(subfile.name))[0]
                    with open('latex/' + basename + '.tex', 'wb+') as f_out:
                        shutil.copyfileobj(f_in, f_out)
                        total_tex += 1
    
    # Delete the temporary folder for those wonky gz files
    shutil.rmtree('temp/', ignore_errors=True)
    # Close tar
    tar.close()

    print(filename + ' extraction complete')
    print('Total number of astro-ph .gz files extracted: ' + str(total_gz))
    print('Number of .tex files obtained: ' + str(total_tex) + '\n')
    
def download_tars():
    '''
    Downloads zipped .tars from source bucket to computer.

    Parameters
    ----------
    source_bucket : str
        Name of source bucket
    key : str
        Name of file to download
    '''

    print('Beginning tar download & extraction...')

    # Create a reusable Paginator
    paginator = s3resource.meta.client.get_paginator('list_objects_v2')

    # Create a PageIterator from the Paginator
    page_iterator = paginator.paginate(
        Bucket='arxiv',
        RequestPayer='requester',
        Prefix='src/'
    )

    # Download and extract tars
    numFiles = 0
    for page in page_iterator:
        numFiles = numFiles + len(page['Contents'])
        for file in page['Contents']:
            key = file['Key']
            # If current file is a tar 
            if key.endswith('.tar'):
                download_file(key)
                # Extract astrophysics contents
                extract_tar(key)
                # upload_to_google_drive()
                # remove_tar(key)
            
    print('Processed ' + str(numFiles - 1) + ' tars')  # -1 

In [175]:
if DOWNLOAD_TARS:
    #download_tars()
    extract_tar('src/arXiv_src_0001_001.tar')

Opening src/arXiv_src_0001_001.tar...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001001.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001002.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001003.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001004.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001005.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001006.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001007.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001008.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001009.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001010.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001011.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001012.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001013.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001014.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0

Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001141.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001142.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001143.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001144.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001145.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001146.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001147.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001149.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001150.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001151.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001152.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001153.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001154.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001155.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001156.gz...
Processing src/arXiv_src_

Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001273.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001274.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001275.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001276.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001277.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001278.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001279.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001280.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001281.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001282.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001283.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001284.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001285.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001286.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001287.gz...
Processing src/arXiv_src_

Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001405.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001406.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001407.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001408.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001409.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001410.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001411.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001412.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001413.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001414.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001415.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001416.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001417.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001418.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001419.gz...
Processing src/arXiv_src_

Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001541.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001542.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001543.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001544.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001545.gz...
Processing src/arXiv_src_0001_001.tar/0001/astro-ph0001546.gz...
src/arXiv_src_0001_001.tar extraction complete
Total number of astro-ph .gz files extracted: 541
Number of .tex files obtained: 606

