I found out halfway through my download from S3 that the identifiers (filenames) changes from "astro-ph" to entirely numerical, so I found all identifiers by querying metadata through the OAI2 and now need to re-access the older files I downloaded so that I can re-extract correct arXiv articles for astro-ph category. Also put them into folders representing their submission, instead of lumped together in one folder.

In [2]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

In [10]:
# Set up Google Drive according to docs
def connect_to_google_drive():
    g_login = GoogleAuth()
    g_login.LocalWebserverAuth()
    drive = GoogleDrive(g_login)
    return drive

In [11]:
drive = connect_to_google_drive()

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=205689913441-4qvumj04tvu7o2h0j1cth62qhp0ck9ld.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&access_type=offline&response_type=code

Authentication successful.


In [34]:
# Get ID of arxiv folder
query = "'root' in parents and trashed=false and title='arxiv' and mimeType='application/vnd.google-apps.folder'"
arxiv_folder_id = drive.ListFile({'q': query}).GetList()[0].metadata['id']

# Get list of tars in Google Drive, sorted by numerical order
uploaded_tars_list = drive.ListFile({'q': "'" + arxiv_folder_id + "' in parents and trashed=false"}).GetList()
uploaded_tars_list = [x.metadata['title'] for x in uploaded_tars_list].sort()
print('Number of uploaded tars: ' + str(len(uploaded_tars_list)))
uploaded_tars_list

Number of uploaded tars: 1112


['arXiv_src_1602_001.tar',
 'arXiv_src_1601_017.tar',
 'arXiv_src_1601_016.tar',
 'arXiv_src_1601_015.tar',
 'arXiv_src_1601_014.tar',
 'arXiv_src_1601_013.tar',
 'arXiv_src_1601_012.tar',
 'arXiv_src_1601_011.tar',
 'arXiv_src_1601_010.tar',
 'arXiv_src_1601_009.tar',
 'arXiv_src_1601_008.tar',
 'arXiv_src_1601_007.tar',
 'arXiv_src_1601_006.tar',
 'arXiv_src_1601_005.tar',
 'arXiv_src_1601_004.tar',
 'arXiv_src_1601_003.tar',
 'arXiv_src_1601_002.tar',
 'arXiv_src_1601_001.tar',
 'arXiv_src_1512_019.tar',
 'arXiv_src_1512_018.tar',
 'arXiv_src_1512_017.tar',
 'arXiv_src_1512_016.tar',
 'arXiv_src_1512_015.tar',
 'arXiv_src_1512_014.tar',
 'arXiv_src_1512_013.tar',
 'arXiv_src_1512_012.tar',
 'arXiv_src_1512_011.tar',
 'arXiv_src_1512_010.tar',
 'arXiv_src_1512_009.tar',
 'arXiv_src_1512_008.tar',
 'arXiv_src_1512_007.tar',
 'arXiv_src_1512_006.tar',
 'arXiv_src_1512_005.tar',
 'arXiv_src_1512_004.tar',
 'arXiv_src_1512_003.tar',
 'arXiv_src_1512_002.tar',
 'arXiv_src_1512_001.tar',
 

In [37]:
# Truncate the list at target (the .tar we resumed download at, extracting correctly)
target = 'arXiv_src_1310_006.tar'
truncated_tars_list = uploaded_tars_list[0:uploaded_tars_list.index(target)]
truncated_tars_list

['arXiv_src_0001_001.tar',
 'arXiv_src_0002_001.tar',
 'arXiv_src_0003_001.tar',
 'arXiv_src_0004_001.tar',
 'arXiv_src_0005_001.tar',
 'arXiv_src_0006_001.tar',
 'arXiv_src_0007_001.tar',
 'arXiv_src_0008_001.tar',
 'arXiv_src_0009_001.tar',
 'arXiv_src_0010_001.tar',
 'arXiv_src_0011_001.tar',
 'arXiv_src_0012_001.tar',
 'arXiv_src_0101_001.tar',
 'arXiv_src_0102_001.tar',
 'arXiv_src_0103_001.tar',
 'arXiv_src_0104_001.tar',
 'arXiv_src_0105_001.tar',
 'arXiv_src_0106_001.tar',
 'arXiv_src_0107_001.tar',
 'arXiv_src_0108_001.tar',
 'arXiv_src_0109_001.tar',
 'arXiv_src_0110_001.tar',
 'arXiv_src_0111_001.tar',
 'arXiv_src_0112_001.tar',
 'arXiv_src_0201_001.tar',
 'arXiv_src_0202_001.tar',
 'arXiv_src_0203_001.tar',
 'arXiv_src_0204_001.tar',
 'arXiv_src_0205_001.tar',
 'arXiv_src_0206_001.tar',
 'arXiv_src_0207_001.tar',
 'arXiv_src_0208_001.tar',
 'arXiv_src_0209_001.tar',
 'arXiv_src_0210_001.tar',
 'arXiv_src_0211_001.tar',
 'arXiv_src_0212_001.tar',
 'arXiv_src_0301_001.tar',
 

In [44]:
# Set up extract and download tar code (from other notebook) and download_file changed here

def download_file(file_id, title):
    # Ensure src directory exists 
    if not os.path.isdir('src'):
        os.makedirs('src')
    
    # Download file
    try:
        file_obj = drive.CreateFile({'id': file_id})
        file_obj.GetContentFile(title) # Download file as 'cats.png'.
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print('ERROR: ' + key + " does not exist in Google Drive")
            
    print('Successfully downloaded s3://arxiv/{} to {}'.format(key, key))
    
def extract_tar(filename):
    """
    Extracts specified file.

    Parameters
    ----------
    filename : str
        Name of file to extract
    """

    # Quit file extraction if given file is not .tar
    if not tarfile.is_tarfile(filename):
        print('can\'t unzip ' + filename + ', not a .tar file')

    total_tex = 0
    tar_dir = 'latex/' + os.path.splitext(os.path.basename(filename))[0] + '/'

    # Create .tar directory if it doesn't exist
    if not os.path.isdir(tar_dir):
        os.makedirs(tar_dir)

    # Proceed with file extraction if .tar
    print('Opening ' + filename + '...')
    # Open .tar, read-only
    tar = tarfile.open(filename)
    # Iterate over .tar subfiles
    for subfile in tar.getmembers():
        # Open subfile only if .gz and begins with 'astro-ph'
        name = os.path.splitext(os.path.basename(subfile.name))[0]
        if subfile.name.endswith('.gz') and identifiers.str.contains(name).any():
            # Create submission directory if it doesn't exist
            if not os.path.isdir(tar_dir + name):
                os.makedirs(tar_dir + name)
            try:
                print('Processing ' + filename + '/' + subfile.name + '...')
                # Open .gz, read-only
                gz_obj = tar.extractfile(subfile) 
                gz = tarfile.open(fileobj=gz_obj) 
                # Iterate over .gz subfiles
                for subsubfile in gz.getmembers():
                    # Check if current subfile is .tex or .ltx 
                    if subsubfile.name.endswith('.tex') or subsubfile.name.endswith('.ltx'):
                        # Extract the file
                        gz.extract(subsubfile, path=tar_dir + name)
                        total_tex += 1
            except tarfile.ReadError:
                # Extract the entire .gz because we cannot read it using tarfile 
                # Note that these .gzs are single .tex files with no extension specified
                tar.extract(subfile, path='temp')
                # Uncompress the .gz file using gzip instead and place it with the other .tex files
                with gzip.open('temp/' + subfile.name, 'rb') as f_in:
                    basename = os.path.splitext(os.path.basename(subfile.name))[0]
                    with open(tar_dir + name + '/' + basename + '.tex', 'wb+') as f_out:
                        shutil.copyfileobj(f_in, f_out)
                        total_tex += 1

    # Delete the temporary folder for those wonky gz files
    shutil.rmtree('temp/', ignore_errors=True)
    # Close tar
    tar.close()

    print(filename + ' extraction complete')
    print('Number of .tex files obtained: ' + str(total_tex) + '\n')
    
def process_tars():
    '''
    Downloads zipped .tars from source bucket to computer.

    Parameters
    ----------
    source_bucket : str
        Name of source bucket
    key : str
        Name of file to download
    '''

    print('Beginning tar download & extraction...')

    query = "'root' in parents and trashed=false and title='arxiv' and mimeType='application/vnd.google-apps.folder'"
    arxiv_folder_id = drive.ListFile({'q': query}).GetList()[0].metadata['id']
    uploaded_tars_list = drive.ListFile({'q': "'" + arxiv_folder_id + "' in parents and trashed=false"}).GetList()
    for uploaded_tar in uploaded_tars_list:
        title = uploaded_tar['title']
        if title < target:
            # Download .tar
            download_file(uploaded_tar.metadata['id'], 'src/' + title)
            # Extract astrophysics preprints from the .tar
            extract_tar('src/' + title)
            # Upload .tar to Google Drive for storage
            upload_to_google_drive('src/' + title)
            # Remove tar from local storage/the computer)
            os.remove('src/' + title)
            print('\n\n\n')
            
    print('Processed ' + str(numFiles - 1) + ' tars')  # -1 

In [45]:
# Download all of the .tars again, this time extracting them correctly
# We need to get the ids of these tars, so we will loop through each 
process_tars()

Beginning tar download & extraction...


NameError: name 'os' is not defined