# archive.org scraper

Import dependencies:

In [88]:
from selenium import webdriver
import time
import os

Get a list of the tars that I have already downloaded:

In [89]:
# Names of the tars already on disk, reduced to the text before the first '.'
downloaded_tars = [
    filename.split('.')[0]
    for filename in os.listdir('../data/2020_03_03_original_arxiv_tars')
]

Define scraping functions:

In [99]:
def initialize(download_dir='/Volumes/BRIENNAKH/Thesis/data/2020_03_07_update_tars',
               driver_path='/usr/local/bin/geckodriver'):
    """Create a Firefox WebDriver that saves tar downloads without a dialog.

    The profile preferences follow
    https://stackoverflow.com/questions/25251583/downloading-file-to-specified-location-with-selenium-and-python

    Args:
        download_dir: Directory Firefox should save downloaded files into.
        driver_path: Path to the geckodriver executable (for example, run
            `brew install geckodriver`, then check in a terminal where it
            was installed and pass that path).

    Returns:
        The webdriver.Firefox instance built from the configured profile.
    """
    profile = webdriver.FirefoxProfile()
    # folderList = 2 makes Firefox use the custom folder given in browser.download.dir
    profile.set_preference('browser.download.folderList', 2)
    # Do not open the download manager window when a download starts
    profile.set_preference('browser.download.manager.showWhenStarting', False)
    profile.set_preference('browser.download.dir', download_dir)
    # Save tar files straight to disk instead of asking what to do with them
    profile.set_preference('browser.helperApps.neverAsk.saveToDisk', 'application/x-tar')
    return webdriver.Firefox(profile, executable_path=driver_path)

def scroll(browser, pause_time=2):
    """Scroll to the bottom of the page repeatedly until it stops growing.

    Approach taken from
    https://stackoverflow.com/questions/20986631/how-can-i-scroll-a-web-page-using-selenium-webdriver-in-python

    Args:
        browser: Object exposing execute_script(script), e.g. the driver
            returned by initialize().
        pause_time: Seconds to wait after each scroll before re-reading the
            page height. Defaults to 2, the previously hard-coded pause.

    Returns:
        None. The loop ends as soon as two consecutive height readings match.
    """
    height_script = 'return document.body.scrollHeight'

    # Height before the first scroll, used as the baseline for comparison
    last_height = browser.execute_script(height_script)

    while True:
        # Jump to the current bottom of the page
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight);')

        # Give the page time to load more content
        time.sleep(pause_time)

        # An unchanged height means nothing new was added: stop scrolling
        new_height = browser.execute_script(height_script)
        if new_height == last_height:
            break
        last_height = new_height

In [100]:
# Start the Firefox session configured by initialize()
browser = initialize()

# Open the archive.org search results for arXiv_src (sort=-publicdate)
search_url = 'https://archive.org/search.php?query=arXiv_src&sort=-publicdate'
browser.get(search_url)

# Keep scrolling down until the page height stops changing
scroll(browser)

In [94]:
# Gather the data-id of every matching div, keeping only ids that contain 'arXiv'
names = []
results = browser.find_elements_by_xpath('//div[@data-id]')
names.extend(
    data_id
    for data_id in (element.get_attribute('data-id') for element in results)
    if 'arXiv' in data_id
)

In [96]:
# Count the names present locally but absent from the archive listing
only_local = set(downloaded_tars) - set(names)
print("Files in my downloaded tars that aren't in the archive: " + str(len(only_local)))

# Count the names present in the archive listing but not downloaded locally
only_in_archive = set(names) - set(downloaded_tars)
print("Files in the archive that aren't in my downloaded tars: " + str(len(only_in_archive)))

Files in my downloaded tars that aren't in the archive: 0
Files in the archive that aren't in my downloaded tars: 431


In [98]:
# Display the archive names that are missing from the local tars (set order is arbitrary)
list(set(names) - set(downloaded_tars))

['arXiv_src_1908_012',
 'arXiv_src_1910_009',
 'arXiv_src_1911_040',
 'arXiv_src_1910_027',
 'arXiv_src_1909_040',
 'arXiv_src_1905_026',
 'arXiv_src_1904_035',
 'arXiv_src_1907_016',
 'arXiv_src_1908_004',
 'arXiv_src_1909_026',
 'arXiv_src_1906_027',
 'arXiv_src_1908_006',
 'arXiv_src_1707_016',
 'arXiv_src_1802_013',
 'arXiv_src_1810_012',
 'arXiv_src_1910_028',
 'arXiv_src_1909_044',
 'arXiv_src_1804_020',
 'arXiv_src_1907_024',
 'arXiv_src_1911_019',
 'arXiv_src_1905_016',
 'arXiv_src_1904_012',
 'arXiv_src_1911_008',
 'arXiv_src_1908_009',
 'arXiv_src_1910_018',
 'arXiv_src_1709_008',
 'arXiv_src_1703_025',
 'arXiv_src_1904_017',
 'arXiv_src_1910_010',
 'arXiv_src_1906_034',
 'arXiv_src_1901_010',
 'arXiv_src_1910_015',
 'arXiv_src_1906_015',
 'arXiv_src_1905_004',
 'arXiv_src_1908_005',
 'arXiv_src_1908_026',
 'arXiv_src_1907_025',
 'arXiv_src_1702_018',
 'arXiv_src_1910_035',
 'arXiv_src_1910_038',
 'arXiv_src_1905_040',
 'arXiv_src_1906_031',
 'arXiv_src_1904_040',
 'arXiv_src

The Internet Archive actually has all of the tar files that I downloaded from Amazon S3, and they're available for free.

Go to the download page for each file I haven't already downloaded:

In [101]:
# Build the lookup set once so each membership test is O(1) instead of
# rescanning the downloaded_tars list for every archive item
already_downloaded = set(downloaded_tars)

for name in names:
    if name in already_downloaded:
        continue

    # Open the item's download page and click the first link whose text contains '.tar'
    browser.get('https://archive.org/download/' + name)
    browser.find_element_by_partial_link_text('.tar').click()

    # Fixed 2-minute wait to let the download progress before moving on.
    # TODO: wait until the file appears in the download directory instead.
    time.sleep(120)