Initialize globals that will signal the download whether or not to proceed. Ensure the globals are set to False if you do not want to pay for data transfer. If they are set to False, the program will skip over any downloads and proceed to analysis.

In [106]:
DOWNLOAD_METADATA = False
PROCESS_TARS = True

#### Import all dependencies:

In [24]:
import boto3, configparser, os, botocore, numpy as np, pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
import xml.etree.ElementTree as ET, tarfile, oauth2client
from enum import Enum 
from apiclient.http import MediaFileUpload
from apiclient.discovery import build
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
import os, gzip, shutil

#### Connect to Amazon S3 resource:

In [19]:
s3resource = None

def setup():
    """Creates S3 resource & sets configs to enable download."""

    # Securely import configs from private AWS config file
    configs = configparser.ConfigParser()
    configs.read('config.ini')

    # Create S3 resource & set configs
    global s3resource
    s3resource = boto3.resource(
        's3',  # the AWS resource we want to use
        aws_access_key_id=configs['DEFAULT']['ACCESS_KEY'],
        aws_secret_access_key=configs['DEFAULT']['SECRET_KEY'],
        region_name='us-east-1'  # same region the arxiv bucket is in
    )
    
setup()

#### Get `arxiv` bucket metadata

Define a function to download a file from the `arxiv` bucket, then download `arXiv_src_manifest.xml`, which contains bucket metadata.

In [20]:
def download_file(key):
    """
    Downloads given filename from source bucket to destination directory.

    Parameters
    ----------
    key : str
        Name of file to download
    """

    # Ensure src directory exists 
    if not os.path.isdir('src'):
        os.makedirs('src')

    # Download file
    try:
        s3resource.meta.client.download_file(
            Bucket='arxiv', 
            Key=key,  # name of key to download from
            Filename=key,  # path to file to download to
            ExtraArgs={'RequestPayer':'requester'})
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print('ERROR: ' + key + " does not exist in arxiv bucket")
            
    print('Successfully downloaded s3://arxiv/{} to {}'.format(key, key))

In [43]:
if DOWNLOAD_METADATA:
    download_file('src/arXiv_src_manifest.xml')
else:
    print('You have opted not to download any metadata at this time')

You have opted not to download any metadata at this time


#### Convert metadata file from XML to Pandas DataFrame:

In [5]:
# From https://www.kaggle.com/ysviru/simple-analysis-of-healthcare-job-postings
class XMLTagsUpperLevel:
    """
    This class defines the XML tag constants at the higher level of XML tree. The tag <file> is found below the root tag
    <arXivSRC> in the tree hierarchy.
    """
    FILE = "file"
    
class XMLTagsLowerLevel(Enum):
    """
    This class defines all the XML tag constants that are one level below the <file> tag. This is defined as an
    enumerated type for ease of iterating over all tags.
    """
    CONTENT_MD5SUM = "content_md5sum"
    FILENAME = "filename"
    FIRST_ITEM = "first_item"
    LAST_ITEM = "last_item"
    MD5SUM = "md5sum"
    NUM_ITEMS = "num_items"
    SEQ_NUM = "seq_num"
    SIZE = "size"
    TIMESTAMP = "timestamp"
    YYMM = "yymm"

class XMLParser:
    def __init__(self, file_path):
        """
        Initializes the XMLParser class instance.
        :param file_path: Path to input xml file containing all the jobs data.
        """
        self.file_path = file_path


    def xml_to_pandas_df(self):
        """
        Using the standard xml python library, we parse the data xml file and convert the xml data to a pandas
        data frame.
        :return: A pandas data frame instance containing all the manifest data.
        """
        tree = ET.parse(self.file_path)
        root = tree.getroot()

        manifest_data = dict()
        for tag in XMLTagsLowerLevel:
            manifest_data[tag.value] = []
    
        for i, record in enumerate(root.findall(XMLTagsUpperLevel.FILE)):
            for tag in XMLTagsLowerLevel:
                temp = record.find(tag.value)
                if temp is not None:
                    manifest_data[tag.value].append(temp.text)
                else:
                    manifest_data[tag.value].append("")

        return pd.DataFrame(data=manifest_data)
    

In [6]:
parser = XMLParser("src/arXiv_src_manifest.xml")
manifest_df = parser.xml_to_pandas_df()
manifest_df

Unnamed: 0,content_md5sum,filename,first_item,last_item,md5sum,num_items,seq_num,size,timestamp,yymm
0,cacbfede21d5dfef26f367ec99384546,src/arXiv_src_0001_001.tar,astro-ph0001001,quant-ph0001119,949ae880fbaf4649a485a8d9e07f370b,2364,1,225605507,2010-12-23 00:13:59,0001
1,d90df481661ccdd7e8be883796539743,src/arXiv_src_0002_001.tar,astro-ph0002001,quant-ph0002094,4592ab506cf775afecf4ad560d982a00,2365,1,227036528,2010-12-23 00:18:09,0002
2,3388afd7bfb2dfd9d3f3e6b353357b33,src/arXiv_src_0003_001.tar,astro-ph0003001,quant-ph0003151,b5bf5e52ae8532cdf82b606b42df16ea,2600,1,230986882,2010-12-23 00:22:15,0003
3,46abb309d77065fed44965cc26a4ae2e,src/arXiv_src_0004_001.tar,astro-ph0004001,quant-ph0004109,9bf1b55890dceec9535ef723a2aea16b,2076,1,191559408,2010-12-23 00:26:31,0004
4,ea665c7b62eaac91110fa344f6ba3fc4,src/arXiv_src_0005_001.tar,astro-ph0005001,quant-ph0005134,b49af416746146eca13c5a6a76bc7193,2724,1,255509072,2010-12-23 00:30:11,0005
5,b5bea3821e15af75b831250e6ea0a031,src/arXiv_src_0006_001.tar,astro-ph0006001,quant-ph0006136,ea268ff2bc89161c6998146fbb2d7515,2438,1,242543048,2010-12-23 00:34:56,0006
6,ddfb7cb7cd9f413f08fe9b4db6ebd0a5,src/arXiv_src_0007_001.tar,astro-ph0007001,quant-ph0007124,79bf35aabec474f66dfa00004ae13660,2461,1,245531787,2010-12-23 00:39:28,0007
7,0587b74278ded853b6bc3e26e216e9f8,src/arXiv_src_0008_001.tar,astro-ph0008001,quant-ph0008134,76fbdc83a3e966b8b3d729115b28cfca,2613,1,286991432,2010-12-23 00:44:17,0008
8,31d1373aaf9fe1b6e12ccd5c47dae004,src/arXiv_src_0009_001.tar,astro-ph0009001,quant-ph0009124,c7b01e4886802e34e327403078740d95,2522,1,230749480,2010-12-23 00:49:20,0009
9,2015fdfcea56c7c9b28e07883d7d8909,src/arXiv_src_0010_001.tar,astro-ph0010001,quant-ph0010117,2c92f09cf1a4f742b3ed0cd54f17112c,2910,1,278679438,2010-12-23 00:53:38,0010


#### Calculate some information based on metadata

The last time the manifest file was updated (not included in `manifest_df` since there is only a single `<timestamp>` element in the upper level):

In [3]:
with open('src/arXiv_src_manifest.xml', 'r') as manifest:
    soup = BeautifulSoup(manifest, 'lxml-xml')
    timestamp = soup.arXivSRC.find('timestamp', recursive=False).string
    print('Manifest was last edited on ' + timestamp)

Manifest was last edited on Tue Feb  5 07:21:43 2019


Check if there are any missing values: 

In [7]:
manifest_df.isnull().sum()

content_md5sum    0
filename          0
first_item        0
last_item         0
md5sum            0
num_items         0
seq_num           0
size              0
timestamp         0
yymm              0
dtype: int64

Get total size of bucket:

In [15]:
print(str(len(manifest_df)) + " files")
manifest_df['size'] = pd.to_numeric(manifest_df['size'])
print(str(round(manifest_df['size'].sum() / 1000000000, 2)) + ' GB')

2182 files
1069.97 GB


Get range of dates:

In [11]:
manifest_df['timestamp'] = pd.to_datetime(manifest_df['timestamp'])
print('Oldest file was uploaded on ' + manifest_df['timestamp'].min().strftime('%m/%d/%Y'))
print('Most recent file was uploaded on ' +  manifest_df['timestamp'].max().strftime('%m/%d/%Y'))

Oldest file was uploaded on 12/22/2010
Most recent file was uploaded on 02/05/2019


Note: Since the timestamp just represents when the .tar file was uploaded, it's not a reliable indicator of the year the preprint was published in. 

`yymm` indicates the items' age. Docs: Two digit year and month of items in the tar package. Starts with 9108 for 1991-08, rolls past y2k to 0001 for 2000-01, 1008 for 2010-08 etc.

Get number of files uploaded in 2018:

In [179]:
given_year_df = manifest_df[manifest_df['yymm'].str.slice(0,2) == '18']
given_year_df

Unnamed: 0,content_md5sum,filename,first_item,last_item,md5sum,num_items,seq_num,size,timestamp,yymm
1655,82782e703090bce2de95cddcb68dd721,src/arXiv_src_1801_001.tar,1801.00001,1801.00495,b8383e8eaa5e7edad8753b3871d649fb,495,1,559759332,2019-01-05 06:19:35,1801
1656,471e50e084d1ab66954ff61a58c22b24,src/arXiv_src_1801_002.tar,1801.00496,1801.00932,9187fccb8c6fe7f19d71333367c9a3c4,437,2,539244150,2019-01-05 06:20:32,1801
1657,5f9ef3cb7b019c8d403ff25a895d5412,src/arXiv_src_1801_003.tar,1801.00933,1801.01391,14fecffd1fea044ee8eea7036e46a941,459,3,546809800,2019-01-05 06:21:28,1801
1658,9dea1e19da8623b067449f95c67f170c,src/arXiv_src_1801_004.tar,1801.01392,1801.01762,783c0c2526bb083a721ad57c72050932,371,4,518973052,2019-01-05 06:22:19,1801
1659,58f9adcd3fd71d2e0a6fc3ec50bf7ea2,src/arXiv_src_1801_005.tar,1801.01763,1801.02170,7f5b79ce2f03a4d937b96ff48390bb96,408,5,641502583,2019-01-05 06:23:11,1801
1660,16f6b6fe4e3ee39bfaa762fa27cbf7ea,src/arXiv_src_1801_006.tar,1801.02171,1801.02610,0e12342af6ef4e4826ed55e5992395cd,440,6,513178861,2019-01-05 06:24:07,1801
1661,5ff8a71f2ca0d836b243587ceea72773,src/arXiv_src_1801_007.tar,1801.02611,1801.02990,3cfb296a4a0c186dfa1b2ea5308eebbd,380,7,523193441,2018-03-05 04:49:13,1801
1662,5fac97fec6e788a3c04cf4d78f4356b5,src/arXiv_src_1801_008.tar,1801.02991,1801.03416,0cac37d2a92864c20591d984bbfc2b77,426,8,528241677,2019-01-05 06:25:00,1801
1663,a85c5f11738f91979f81a86dbdfa9831,src/arXiv_src_1801_009.tar,1801.03417,1801.03834,cfea3162b6be1fcfdc7c710f29bc4a95,418,9,548732182,2019-01-05 06:25:57,1801
1664,6bac6078417aa6dd2e490d247d00b911,src/arXiv_src_1801_010.tar,1801.03835,1801.04206,87094d99353837f643bea5420fd7467c,372,10,547292201,2019-01-05 06:26:49,1801


In [180]:
print(str(len(given_year_df)) + " .tars uploaded for given year")
print(str(round(given_year_df['size'].sum(), 2) / 1000000000) + " GB")

392 .tars uploaded for given year
204.823664304 GB


### Connect to Google Drive

So that I don't need to pay to request the same data from the arxiv S3 bucket in the future.

PyDrive docs: https://pythonhosted.org/PyDrive/

https://medium.com/@annissouames99/how-to-upload-files-automatically-to-drive-with-python-ee19bb13dda

In [65]:
# Set up Google Drive according to docs
def connect_to_google_drive():
    g_login = GoogleAuth()
    g_login.LocalWebserverAuth()
    drive = GoogleDrive(g_login)

In [None]:
connect_to_google_drive()

In [13]:
arxiv_folder_id = None 

# Check if arxiv folder exists in Google Drive
files = drive.ListFile({'q': "'root' in parents and trashed=false"}).GetList()
for f in files:
    if f['mimeType'] == 'application/vnd.google-apps.folder' and f['title'] == 'arxiv':
        arxiv_folder_id = f['id']

# If folder doesn't exist, create and upload it 
if not arxiv_folder_id:
    arxiv_folder = drive.CreateFile({'title': 'arxiv', 
                                "mimeType": "application/vnd.google-apps.folder"}) 
    arxiv_folder.Upload()
    arxiv_folder_id = arxiv_folder['id']

print(arxiv_folder_id)

1f2WO6FlQhT3NyyfuBL6UEvkX1RX3cO_6


In [40]:
def upload_to_google_drive(filepath):
    print("Uploading " + filepath + " to Google Drive...")
    file = drive.CreateFile({'title': os.path.basename(filepath),
                             'parents': [{'id': arxiv_folder_id}], # place it into arxiv folder
                             'mimeType': 'application/gzip'})
    file.SetContentFile(filepath)
    file.Upload()
    print(filepath + ' uploaded.')

#### Get astro-ph preprints

The total size of the bucket exceeds 1 terabyte, but we won't use all of this data. The bucket contains preprints from all categories, and we're only interested in the astro-ph category. 

As suggested by this [SO answer](https://stackoverflow.com/questions/51276201/how-to-extract-files-in-s3-on-the-fly-with-boto3), I wanted to extract each .tar while downloading using [AWS Lambda](https://docs.aws.amazon.com/lambda/latest/dg/limits.html), but I need to check if there are any .tar files in the bucket whose size exceeds 512 MB, which is the limit of AWS Lambda temporary storage:

In [73]:
print(str(len(manifest_df[manifest_df['size'] > 512000000])) + " files")
print("Maximum size: " + str(round(manifest_df['size'].max()/1000000, 2)) + " MB")

1803 files
Maximum size: 1910.6 MB


There are many files that we won't be able to extract using Lambda, so we will just go the traditional download route.

In [108]:
def extract_tar(filename):
    """
    Extracts specified file.

    Parameters
    ----------
    filename : str
        Name of file to extract
    """

    # Quit file extraction if given file is not .tar
    if not tarfile.is_tarfile(filename):
        print('can\'t unzip ' + filename + ', not a .tar file')

    total_tex = 0
    tar_dir = 'latex/' + os.path.splitext(os.path.basename(filename))[0] + '/'

    # Create .tar directory if it doesn't exist
    if not os.path.isdir(tar_dir):
        os.makedirs(tar_dir)

    # Proceed with file extraction if .tar
    print('Opening ' + filename + '...')
    # Open .tar, read-only
    tar = tarfile.open(filename)
    # Iterate over .tar subfiles
    for subfile in tar.getmembers():
        # Open subfile only if .gz and begins with 'astro-ph'
        name = os.path.splitext(os.path.basename(subfile.name))[0]
        if subfile.name.endswith('.gz') and identifiers.str.contains(name).any():
            # Create submission directory if it doesn't exist
            if not os.path.isdir(tar_dir + name):
                os.makedirs(tar_dir + name)
            try:
                print('Processing ' + filename + '/' + subfile.name + '...')
                # Open .gz, read-only
                gz_obj = tar.extractfile(subfile) 
                gz = tarfile.open(fileobj=gz_obj) 
                # Iterate over .gz subfiles
                for subsubfile in gz.getmembers():
                    # Check if current subfile is .tex or .ltx 
                    if subsubfile.name.endswith('.tex') or subsubfile.name.endswith('.ltx'):
                        # Extract the file
                        gz.extract(subsubfile, path=tar_dir + name)
                        total_tex += 1
            except tarfile.ReadError:
                # Extract the entire .gz because we cannot read it using tarfile 
                # Note that these .gzs are single .tex files with no extension specified
                tar.extract(subfile, path='temp')
                # Uncompress the .gz file using gzip instead and place it with the other .tex files
                with gzip.open('temp/' + subfile.name, 'rb') as f_in:
                    basename = os.path.splitext(os.path.basename(subfile.name))[0]
                    with open(tar_dir + name + '/' + basename + '.tex', 'wb+') as f_out:
                        shutil.copyfileobj(f_in, f_out)
                        total_tex += 1

    # Delete the temporary folder for those wonky gz files
    shutil.rmtree('temp/', ignore_errors=True)
    # Close tar
    tar.close()

    print(filename + ' extraction complete')
    print('Number of .tex files obtained: ' + str(total_tex) + '\n')
    
def process_tars():
    '''
    Downloads zipped .tars from source bucket to computer.

    Parameters
    ----------
    source_bucket : str
        Name of source bucket
    key : str
        Name of file to download
    '''

    print('Beginning tar download & extraction...')

    # Create a reusable Paginator
    paginator = s3resource.meta.client.get_paginator('list_objects_v2')

    # Create a PageIterator from the Paginator
    page_iterator = paginator.paginate(
        Bucket='arxiv',
        RequestPayer='requester',
        Prefix='src/'
    )

    # Download and extract tars
    numFiles = 0
    for page in page_iterator:
        numFiles = numFiles + len(page['Contents'])
        for file in page['Contents']:
            key = file['Key']
            # If current file is a tar 
            if key.endswith('.tar'):
                # Skip tar download if we already have it (in Google Drive)
                if os.path.basename(key) in uploaded_tars_list:
                    continue
                else:
                    # Download .tar
                    download_file(key)
                    # Extract astrophysics preprints from the .tar
                    extract_tar(key)
                    # Upload .tar to Google Drive for storage
                    upload_to_google_drive(key)
                    # Remove tar from local storage/the computer)
                    os.remove(key)
                    print('\n\n\n')
            
    print('Processed ' + str(numFiles - 1) + ' tars')  # -1 

In [None]:
if PROCESS_TARS:
    process_tars()

Beginning tar download & extraction...
Successfully downloaded s3://arxiv/src/arXiv_src_1310_006.tar to src/arXiv_src_1310_006.tar
Opening src/arXiv_src_1310_006.tar...
Processing src/arXiv_src_1310_006.tar/1310/1310.2608.gz...
Processing src/arXiv_src_1310_006.tar/1310/1310.2695.gz...
Processing src/arXiv_src_1310_006.tar/1310/1310.2820.gz...
Processing src/arXiv_src_1310_006.tar/1310/1310.2790.gz...
Processing src/arXiv_src_1310_006.tar/1310/1310.2845.gz...
Processing src/arXiv_src_1310_006.tar/1310/1310.2655.gz...
Processing src/arXiv_src_1310_006.tar/1310/1310.2852.gz...
Processing src/arXiv_src_1310_006.tar/1310/1310.2633.gz...
Processing src/arXiv_src_1310_006.tar/1310/1310.2607.gz...
Processing src/arXiv_src_1310_006.tar/1310/1310.2679.gz...
Processing src/arXiv_src_1310_006.tar/1310/1310.2600.gz...
Processing src/arXiv_src_1310_006.tar/1310/1310.2659.gz...
Processing src/arXiv_src_1310_006.tar/1310/1310.2780.gz...
Processing src/arXiv_src_1310_006.tar/1310/1310.2840.gz...
Proce

I need to take note of the time I began my download: 13:00 on February 7. So this means anything that is uploaded to arxiv was not downloaded. In the future, i can screen arxiv bucket for any files that are younger than 13:00 February 7 to add to my dataset. Screen inside process_tars

Note that I see that not all articles may be identified by astro-ph in the .gz name, so we will have to add code that goes back over all the .tars to identify metadata and whether it is an astrophysics article. 

Iterate through example .tar to see if any contain files that have been identified as astro-ph article in the metadata:

In [49]:
metadata_df = pd.read_csv('arXiv_metadata_oai.csv', dtype={'filename': str, 'filename_parsed': str})
identifiers = metadata_df['filename_parsed']
identifiers

0               0704.0009
1               0704.0017
2               0704.0023
3               0704.0044
4               0704.0048
5               0704.0059
6               0704.0080
7               0704.0094
8               0704.0128
9               0704.0133
10              0704.0138
11              0704.0139
12              0704.0144
13              0704.0155
14              0704.0156
15              0704.0160
16              0704.0168
17              0704.0171
18              0704.0175
19              0704.0184
20              0704.0187
21              0704.0192
22              0704.0203
23              0704.0205
24              0704.0207
25              0704.0209
26              0704.0212
27              0704.0219
28              0704.0221
29              0704.0222
               ...       
250017    quant-ph0007104
250018    quant-ph0101091
250019    quant-ph0104067
250020    quant-ph0106059
250021    quant-ph0106076
250022    quant-ph0107011
250023    quant-ph0107070
250024    qu

Opening src/arXiv_src_1310_006.tar...
1310
1310.2608
Processing src/arXiv_src_1310_006.tar/1310/1310.2608.gz...
1310.2854
1310.2715
1310.2846
1310.2818
1310.2695
Processing src/arXiv_src_1310_006.tar/1310/1310.2695.gz...
1310.2820
Processing src/arXiv_src_1310_006.tar/1310/1310.2820.gz...
1310.2790
Processing src/arXiv_src_1310_006.tar/1310/1310.2790.gz...
1310.2663
1310.2704
1310.2631
1310.2580
1310.2708
1310.2768
1310.2772
1310.2845
Processing src/arXiv_src_1310_006.tar/1310/1310.2845.gz...
1310.2813
1310.2710
1310.2655
Processing src/arXiv_src_1310_006.tar/1310/1310.2655.gz...
1310.2754
1310.2852
Processing src/arXiv_src_1310_006.tar/1310/1310.2852.gz...
1310.2699
1310.2589
1310.2554
1310.2660
1310.2819
1310.2648
1310.2803
1310.2633
Processing src/arXiv_src_1310_006.tar/1310/1310.2633.gz...
1310.2713
1310.2745
1310.2757
1310.2761
1310.2689
1310.2832
1310.2632
1310.2643
1310.2793
1310.2797
1310.2568
1310.2712
1310.2788
1310.2825
1310.2675
1310.2739
1310.2810
1310.2697
1310.2777
1310.

Identify, based on files in Google Drive, which files we've not yet downloaded from S3:

In [66]:
connect_to_google_drive()

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=205689913441-4qvumj04tvu7o2h0j1cth62qhp0ck9ld.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&access_type=offline&response_type=code

Authentication successful.


In [96]:
# Get ID of arxiv folder
file_list = drive.ListFile({'q': "'root' in parents and trashed=false"}).GetList()
for file in file_list:  
    if file.metadata['kind'] == 'application/vnd.google-apps.folder' and file.metadata['title'] == 'arxiv':
        arxiv_folder_id = file.metadata['id']

print(arxiv_folder_id)

1f2WO6FlQhT3NyyfuBL6UEvkX1RX3cO_6


In [100]:
# Get list of files in arxiv folder
uploaded_tars_list = drive.ListFile({'q': "'" + arxiv_folder_id + "' in parents and trashed=false"}).GetList()
uploaded_tars_list = [x.metadata['title'] for x in uploaded_tars_list]
print('Number of uploaded tars: ' + str(len(uploaded_tars_list)))
uploaded_tars_list

Number of uploaded tars: 664


['arXiv_src_1310_005.tar',
 'arXiv_src_1310_004.tar',
 'arXiv_src_1310_003.tar',
 'arXiv_src_1310_002.tar',
 'arXiv_src_1310_001.tar',
 'arXiv_src_1309_013.tar',
 'arXiv_src_1309_012.tar',
 'arXiv_src_1309_011.tar',
 'arXiv_src_1309_010.tar',
 'arXiv_src_1309_009.tar',
 'arXiv_src_1309_008.tar',
 'arXiv_src_1309_007.tar',
 'arXiv_src_1309_006.tar',
 'arXiv_src_1309_005.tar',
 'arXiv_src_1309_004.tar',
 'arXiv_src_1309_003.tar',
 'arXiv_src_1309_002.tar',
 'arXiv_src_1309_001.tar',
 'arXiv_src_1308_012.tar',
 'arXiv_src_1308_011.tar',
 'arXiv_src_1308_010.tar',
 'arXiv_src_1308_009.tar',
 'arXiv_src_1308_008.tar',
 'arXiv_src_1308_007.tar',
 'arXiv_src_1308_006.tar',
 'arXiv_src_1308_005.tar',
 'arXiv_src_1308_004.tar',
 'arXiv_src_1308_003.tar',
 'arXiv_src_1308_002.tar',
 'arXiv_src_1308_001.tar',
 'arXiv_src_1307_015.tar',
 'arXiv_src_1307_014.tar',
 'arXiv_src_1307_013.tar',
 'arXiv_src_1307_012.tar',
 'arXiv_src_1307_011.tar',
 'arXiv_src_1307_010.tar',
 'arXiv_src_1307_009.tar',
 

In [103]:
# In download_tars() check that the next title don't already exist in uploaded_tars_list
if os.path.basename('arXiv_src_1310_005.tar') in uploaded_tars_list:
    print('yes')

yes
