# archive.org scraper

Import dependencies:

In [2]:
from selenium import webdriver
import time
import os

Get a list of the tars that I have already downloaded:

In [19]:
# Directory that holds the tar files downloaded so far (relative to this notebook).
ARCHIVE_DIR = '../data/archive'
archive_entries = os.listdir(ARCHIVE_DIR)

# Firefox keeps a '<file>.part' next to a download that has not finished, so any
# name that still has a '.part' file must not be counted as downloaded.
in_progress = {entry.split('.')[0] for entry in archive_entries
               if entry.endswith('.part')}

# Strip the extension to get the archive.org identifier. Hidden files
# ('.DS_Store', '._*') are skipped: splitting them on '.' would yield an empty
# name. A set removes duplicates (e.g. 'x.tar' plus 'x.tar.part').
downloaded_tars = sorted(
    {entry.split('.')[0] for entry in archive_entries if not entry.startswith('.')}
    - in_progress
)
print('Number of downloaded tars: ' + str(len(downloaded_tars)))

Number of downloaded tars: 2929


Define scraping functions:

In [9]:
def initialize(download_dir='/Volumes/BRIENNAKH/Thesis/data/archive',
               driver_path='/usr/local/bin/geckodriver'):
    """Create a Firefox WebDriver that saves tar files without a download dialog.

    Args:
        download_dir: Directory Firefox saves downloads into. Defaults to the
            location originally hard-coded in this notebook.
        driver_path: Path to the geckodriver executable.

    Returns:
        A ``webdriver.Firefox`` instance using the configured profile.
    """
    # Construct browser and profile (to prevent download dialog)
    # https://stackoverflow.com/questions/25251583/downloading-file-to-specified-location-with-selenium-and-python
    profile = webdriver.FirefoxProfile()
    # folderList = 2 tells Firefox to use the custom 'browser.download.dir' below
    profile.set_preference('browser.download.folderList', 2)
    profile.set_preference('browser.download.manager.showWhenStarting', False)
    profile.set_preference('browser.download.dir', download_dir)
    # Save files of this MIME type straight to disk instead of asking
    profile.set_preference('browser.helperApps.neverAsk.saveToDisk', 'application/x-tar')
    # Firefox is driven by geckodriver (e.g. `brew install geckodriver`); pass the
    # path it was installed to as `driver_path`.
    return webdriver.Firefox(profile, executable_path=driver_path)

def scroll(browser, pause_time=2, max_scrolls=None):
    """Scroll to the bottom of the page until its height stops growing.

    Args:
        browser: Object exposing ``execute_script(js)`` (a Selenium WebDriver).
        pause_time: Seconds to wait after each scroll so new content can load.
        max_scrolls: Optional cap on the number of height-increasing scrolls;
            ``None`` (default) keeps scrolling until the height is stable.

    Returns:
        None. The page is scrolled as a side effect.
    """
    # https://stackoverflow.com/questions/20986631/how-can-i-scroll-a-web-page-using-selenium-webdriver-in-python
    height_script = 'return document.body.scrollHeight'
    last_height = browser.execute_script(height_script)

    scrolls_done = 0
    while max_scrolls is None or scrolls_done < max_scrolls:
        # Scroll down to bottom
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        # Wait to load page
        time.sleep(pause_time)
        # An unchanged height after the pause means nothing more was loaded
        new_height = browser.execute_script(height_script)
        if new_height == last_height:
            break
        last_height = new_height
        scrolls_done += 1

Run scraper. Keep browser open and run the next cell.

In [15]:
# Start the Firefox session (kept open for the download cell below)
browser = initialize()

# Load the archive.org search results for the arXiv source tars
browser.get('https://archive.org/search.php?query=arXiv_src&sort=-publicdate')

# Keep scrolling until the page height stops changing
scroll(browser)

# Read the 'data-id' of every <div> that carries one and keep the arXiv items
results = browser.find_elements_by_xpath('//div[@data-id]')
data_ids = [tile.get_attribute('data-id') for tile in results]
names = [data_id for data_id in data_ids if 'arXiv' in data_id]

# Compare what the archive offers with what is already on disk
archived = set(names)
on_disk = set(downloaded_tars)

only_on_disk = on_disk - archived
print("Files in my downloaded tars that aren't in the archive: " + str(len(only_on_disk)))

only_in_archive = archived - on_disk
print("Files in the archive that aren't in my downloaded tars: " + str(len(only_in_archive)))

Files in my downloaded tars that aren't in the archive: 0
Files in the archive that aren't in my downloaded tars: 336


The Internet Archive has all of the tar files that I downloaded from Amazon S3. And they're free!

For each file I haven't already downloaded, open its download page and click the `.tar` link to start the download:

In [16]:
# Directory Firefox downloads into, as seen from this notebook.
# NOTE(review): assumed to be the same folder as the profile's
# 'browser.download.dir' -- confirm. If it is not, the loop simply waits the
# full MAX_WAIT_SECONDS per file, exactly as before.
DOWNLOAD_DIR = '../data/archive'
MAX_WAIT_SECONDS = 120  # upper bound per file; throttles concurrent downloads
POLL_SECONDS = 5        # how often to check whether the download has finished


def _download_finished(tar_path):
    """Return True once `tar_path` exists, is non-empty and has no '.part' file."""
    try:
        return (os.path.isfile(tar_path)
                and os.path.getsize(tar_path) > 0
                and not os.path.exists(tar_path + '.part'))
    except OSError:
        # File vanished between the checks (e.g. renamed by the browser)
        return False


already_downloaded = set(downloaded_tars)  # O(1) membership tests

for name in names:
    if name in already_downloaded:
        continue

    print('Downloading ' + name + '...')
    browser.get('https://archive.org/download/' + name)
    browser.find_element_by_partial_link_text('.tar').click()

    # Wait until the tar shows up complete in the download directory, but never
    # longer than MAX_WAIT_SECONDS so slow downloads keep running in the
    # background while the next one starts.
    # Presumably the file is saved as '<name>.tar' -- verify against the folder.
    tar_path = os.path.join(DOWNLOAD_DIR, name + '.tar')
    deadline = time.time() + MAX_WAIT_SECONDS
    while time.time() < deadline:
        time.sleep(POLL_SECONDS)
        if _download_finished(tar_path):
            break

Downloading arXiv_src_2006_060...
Downloading arXiv_src_2006_061...
Downloading arXiv_src_2006_059...
Downloading arXiv_src_2006_057...
Downloading arXiv_src_2006_058...
Downloading arXiv_src_2006_055...
Downloading arXiv_src_2006_054...
Downloading arXiv_src_2006_056...
Downloading arXiv_src_2006_053...
Downloading arXiv_src_2006_052...
Downloading arXiv_src_2006_051...
Downloading arXiv_src_2006_050...
Downloading arXiv_src_2006_049...
Downloading arXiv_src_2006_048...
Downloading arXiv_src_2006_047...
Downloading arXiv_src_2006_046...
Downloading arXiv_src_2006_045...
Downloading arXiv_src_2006_044...
Downloading arXiv_src_2006_043...
Downloading arXiv_src_2006_042...
Downloading arXiv_src_2006_041...
Downloading arXiv_src_2006_040...
Downloading arXiv_src_2006_038...
Downloading arXiv_src_2006_039...
Downloading arXiv_src_2006_037...
Downloading arXiv_src_2006_034...
Downloading arXiv_src_2006_035...
Downloading arXiv_src_2006_036...
Downloading arXiv_src_2006_033...
Downloading ar

Downloading arXiv_src_2002_013...
Downloading arXiv_src_2002_012...
Downloading arXiv_src_2002_011...
Downloading arXiv_src_2002_009...
Downloading arXiv_src_2002_008...
Downloading arXiv_src_2002_010...
Downloading arXiv_src_2002_007...
Downloading arXiv_src_2002_006...
Downloading arXiv_src_2002_005...
Downloading arXiv_src_2002_003...
Downloading arXiv_src_2002_004...
Downloading arXiv_src_2002_002...
Downloading arXiv_src_2002_001...
Downloading arXiv_src_2001_039...
Downloading arXiv_src_2001_040...
Downloading arXiv_src_2001_036...
Downloading arXiv_src_2001_037...
Downloading arXiv_src_2001_038...
Downloading arXiv_src_2001_034...
Downloading arXiv_src_2001_035...
Downloading arXiv_src_2001_033...
Downloading arXiv_src_2001_032...
Downloading arXiv_src_2001_031...
Downloading arXiv_src_2001_030...
Downloading arXiv_src_2001_029...
Downloading arXiv_src_2001_027...
Downloading arXiv_src_2001_028...
Downloading arXiv_src_2001_026...
Downloading arXiv_src_2001_025...
Downloading ar