#### Import all dependencies:

In [28]:
import boto3, configparser, os, botocore, numpy as np, pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
import xml.etree.ElementTree as ET
from enum import Enum 

#### Connect to S3 resource:

In [9]:
# Module-level handle to the S3 resource; assigned by setup() below
s3resource = None

def setup():
    """Creates S3 resource & sets configs to enable download."""

    # Securely import configs from private AWS config file
    configs = configparser.ConfigParser()
    configs.read('config.ini')

    # Create S3 resource & set configs
    global s3resource
    s3resource = boto3.resource(
        's3',  # the AWS resource we want to use
        aws_access_key_id=configs['DEFAULT']['ACCESS_KEY'],
        aws_secret_access_key=configs['DEFAULT']['SECRET_KEY'],
        region_name='us-east-1'  # same region the arxiv bucket is in
    )
    
# Populate the s3resource global so it is available to the download code
setup()

#### Get `arxiv` bucket metadata

Define a function to download a file from the `arxiv` bucket, then download `arXiv_src_manifest.xml`, which contains bucket metadata.

In [12]:
def download_file(key):
    """
    Downloads given key from the `arxiv` bucket to a local file whose path
    is the same as the key.

    A missing key (HTTP 404) is reported with an error message and the
    function returns normally; the success message is only printed when the
    download actually completed.

    Parameters
    ----------
    key : str
        Name of file to download; also used as the local destination path

    Raises
    ------
    botocore.exceptions.ClientError
        If the download fails for any reason other than a 404
    """

    # Ensure the destination directory exists (derived from the key, so keys
    # outside of 'src/' work too)
    dest_dir = os.path.dirname(key)
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)

    # Download file
    try:
        s3resource.meta.client.download_file(
            Bucket='arxiv',
            Key=key,  # name of key to download from
            Filename=key,  # path to file to download to
            ExtraArgs={'RequestPayer': 'requester'})  # requester is billed for the transfer
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print('ERROR: ' + key + " does not exist in arxiv bucket")
        else:
            # Anything else (permissions, throttling, ...) must not be
            # reported as a success
            raise
    else:
        print('Successfully downloaded s3://arxiv/{} to {}'.format(key, key))


In [13]:
# Fetch the manifest file that describes the contents of the bucket
download_file('src/arXiv_src_manifest.xml')

Successfully downloaded s3://arxiv/src/arXiv_src_manifest.xml to src/arXiv_src_manifest.xml


#### Convert metadata file from XML to Pandas DataFrame:

In [64]:
# From https://www.kaggle.com/ysviru/simple-analysis-of-healthcare-job-postings
class XMLTagsUpperLevel:
    """
    XML tag constants for the upper level of the manifest tree, i.e. the
    elements located directly below the root tag <arXivSRC>.
    """
    # Tag looked up under the root element; one record per <file>
    FILE = "file"
    
class XMLTagsLowerLevel(Enum):
    """
    XML tag constants found one level below each <file> tag.

    Declared as an Enum so that all tags can be iterated over; each member's
    value is the literal tag name used in the XML.
    """
    # Members are kept in alphabetical order; iteration follows this order
    CONTENT_MD5SUM = "content_md5sum"
    FILENAME = "filename"
    FIRST_ITEM = "first_item"
    LAST_ITEM = "last_item"
    MD5SUM = "md5sum"
    NUM_ITEMS = "num_items"
    SEQ_NUM = "seq_num"
    SIZE = "size"
    TIMESTAMP = "timestamp"
    YYMM = "yymm"

class XMLParser:
    """Converts an XML manifest file into a pandas data frame."""

    def __init__(self, file_path):
        """
        Remembers where the XML file to parse is located.
        :param file_path: Path to the input xml file.
        """
        self.file_path = file_path

    def xml_to_pandas_df(self):
        """
        Parses the xml file with the standard xml library and collects, for
        every <file> record, the text of each lower-level tag into one column
        per tag. A tag that is absent from a record contributes an empty
        string to its column.
        :return: A pandas data frame with one row per record and one column
                 per tag in XMLTagsLowerLevel.
        """
        root = ET.parse(self.file_path).getroot()
        tag_names = [tag.value for tag in XMLTagsLowerLevel]

        # One (initially empty) column per tag, in enum order
        columns = {name: [] for name in tag_names}

        for record in root.findall(XMLTagsUpperLevel.FILE):
            for name in tag_names:
                node = record.find(name)
                # Explicit None check: an Element without children is falsy
                columns[name].append("" if node is None else node.text)

        return pd.DataFrame(data=columns)
    

In [65]:
# Parse the downloaded manifest into a DataFrame and display it
parser = XMLParser("src/arXiv_src_manifest.xml")
manifest_df = parser.xml_to_pandas_df()
manifest_df

Unnamed: 0,content_md5sum,filename,first_item,last_item,md5sum,num_items,seq_num,size,timestamp,yymm
0,cacbfede21d5dfef26f367ec99384546,src/arXiv_src_0001_001.tar,astro-ph0001001,quant-ph0001119,949ae880fbaf4649a485a8d9e07f370b,2364,1,225605507,2010-12-23 00:13:59,0001
1,d90df481661ccdd7e8be883796539743,src/arXiv_src_0002_001.tar,astro-ph0002001,quant-ph0002094,4592ab506cf775afecf4ad560d982a00,2365,1,227036528,2010-12-23 00:18:09,0002
2,3388afd7bfb2dfd9d3f3e6b353357b33,src/arXiv_src_0003_001.tar,astro-ph0003001,quant-ph0003151,b5bf5e52ae8532cdf82b606b42df16ea,2600,1,230986882,2010-12-23 00:22:15,0003
3,46abb309d77065fed44965cc26a4ae2e,src/arXiv_src_0004_001.tar,astro-ph0004001,quant-ph0004109,9bf1b55890dceec9535ef723a2aea16b,2076,1,191559408,2010-12-23 00:26:31,0004
4,ea665c7b62eaac91110fa344f6ba3fc4,src/arXiv_src_0005_001.tar,astro-ph0005001,quant-ph0005134,b49af416746146eca13c5a6a76bc7193,2724,1,255509072,2010-12-23 00:30:11,0005
5,b5bea3821e15af75b831250e6ea0a031,src/arXiv_src_0006_001.tar,astro-ph0006001,quant-ph0006136,ea268ff2bc89161c6998146fbb2d7515,2438,1,242543048,2010-12-23 00:34:56,0006
6,ddfb7cb7cd9f413f08fe9b4db6ebd0a5,src/arXiv_src_0007_001.tar,astro-ph0007001,quant-ph0007124,79bf35aabec474f66dfa00004ae13660,2461,1,245531787,2010-12-23 00:39:28,0007
7,0587b74278ded853b6bc3e26e216e9f8,src/arXiv_src_0008_001.tar,astro-ph0008001,quant-ph0008134,76fbdc83a3e966b8b3d729115b28cfca,2613,1,286991432,2010-12-23 00:44:17,0008
8,31d1373aaf9fe1b6e12ccd5c47dae004,src/arXiv_src_0009_001.tar,astro-ph0009001,quant-ph0009124,c7b01e4886802e34e327403078740d95,2522,1,230749480,2010-12-23 00:49:20,0009
9,2015fdfcea56c7c9b28e07883d7d8909,src/arXiv_src_0010_001.tar,astro-ph0010001,quant-ph0010117,2c92f09cf1a4f742b3ed0cd54f17112c,2910,1,278679438,2010-12-23 00:53:38,0010


#### Calculate some information based on metadata

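Check if there are any missing values. Note that `xml_to_pandas_df` fills in an empty string when a tag is absent from a record, so `isnull()` only flags tags that are present but empty; an absent tag would appear as `''`, not as a null: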

In [69]:
# Count null entries in each column
manifest_df.isnull().sum()

content_md5sum    0
filename          0
first_item        0
last_item         0
md5sum            0
num_items         0
seq_num           0
size              0
timestamp         0
yymm              0
dtype: int64

Get total size of bucket:

In [80]:
# Values come out of the XML parser as text; convert to numbers before summing
manifest_df['size'] = pd.to_numeric(manifest_df['size'])
# Divide by 10^9 to report gigabytes (assumes `size` is in bytes — TODO confirm)
print(str(manifest_df['size'].sum() / 1000000000) + ' GB')

1052.209982638 GB


Get range of dates:

In [90]:
# Values come out of the XML parser as text; convert to datetimes so that
# min()/max() compare chronologically
manifest_df['timestamp'] = pd.to_datetime(manifest_df['timestamp'])
# Dates are printed as month/day/year
print('Oldest file was uploaded on ' + manifest_df['timestamp'].min().strftime('%m/%d/%Y'))
print('Most recent file was uploaded on ' +  manifest_df['timestamp'].max().strftime('%m/%d/%Y'))

Oldest file was uploaded on 12/22/2010
Most recent file was uploaded on 01/05/2019


Note: Since the timestamp just represents when the .tar file was uploaded, it's not a reliable indicator of the year the preprint was published in. 

#### Get astro-ph preprints

The total size of the bucket exceeds 1 terabyte, but we won't use all of this data. The bucket contains preprints from all categories, and we're only interested in the astro-ph category. 

To find astro-ph preprints...