# Scientific American

Scrapes all issues and ebooks from Scientific American.

**Scraper needs an update that will enable it to notice a duplicated issue or ebook while it is scraping, and it will quit scraping any further pages.**

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import configparser
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import time
import re
import pandas as pd

configs = configparser.ConfigParser()
configs.read('../config.ini')

['../config.ini']

## Define scraping functions:

In [98]:
def initialize(download_dir):
    options = webdriver.ChromeOptions()
    prefs = {'download.default_directory': download_dir,
             'plugins.always_open_pdf_externally': True}
    options.add_experimental_option('prefs', prefs)
    return webdriver.Chrome(options=options, executable_path='../chromedriver')

def login(username, password):
    signin_button_present = EC.presence_of_element_located((By.ID, 'signin-click'))
    wait.until(signin_button_present)
    signin_button = browser.find_element(By.ID, 'signin-click')
    signin_button.click()
    email_input = browser.find_element(By.ID, 'emailAddress1')
    password_input = browser.find_element(By.ID, 'password1')
    browser.execute_script('arguments[0].value = "' + username, email_input) 
    browser.execute_script('arguments[0].value = "' + password + '";', password_input)
    login_button = browser.find_element_by_class_name('signin-flyout__form__submit')
    time.sleep(2)
    login_button.click()
    signed_in = EC.invisibility_of_element_located((By.CLASS_NAME, 'signin-flyout__form__loading'))
    wait.until(signed_in)
    
def scrape_issue_links():
    view_details_links =  [x.get_attribute('href') for x in browser.find_elements_by_class_name('store-listing__cta')]
    for btn in view_details_links:
        browser.get(btn)
        link = browser.find_element_by_class_name('download-issue-btn-counter').get_attribute('href')
        # If link is already in previous data, quit
        # Add code here
        meta = re.search('Volume (\d+), Issue (\d+s?)', browser.find_element_by_class_name('product-detail__meta').text)
        volume = meta[1]
        issue = meta[2]
        date = browser.find_element_by_class_name('product-detail__title').text
        category = re.search('(CURRENT ISSUE)?(.+)', browser.find_element_by_class_name('product-detail__category').text)[2]
        data['category'].append(category)
        data['date'].append(date)
        data['volume'].append(volume)
        data['issue'].append(issue)
        data['link'].append(link)
        # Maybe return a boolean (all successful, false - had duplicates, then quit)
    
def scrape_ebook_links():
    view_details_links =  [x.get_attribute('href') for x in browser.find_elements_by_class_name('store-listing__cta')]
    for idx, btn in enumerate(view_details_links):
        browser.get(btn)
        title = browser.find_element_by_class_name('product-detail__title').text 
        data['title'].append(title)
        try:
            epub_link = browser.find_element_by_xpath('//*[contains(text(), "Download Epub/Other")]').get_attribute('href')
            mobi_link = browser.find_element_by_xpath('//*[contains(text(), "Download Mobi/Kindle")]').get_attribute('href')
            summary = browser.find_elements_by_class_name('product-detail__meta')[1].find_element_by_tag_name('p').text        
            date = browser.find_elements_by_class_name('product-detail__meta')[1].text.split('On Sale Date: ')[1]
            data['date'].append(date)
            data['epub_link'].append(epub_link) 
            data['mobi_link'].append(mobi_link)
            data['pdf_link'].append(None)
            data['summary'].append(summary)
        except NoSuchElementException:
            print('NoSuchElementException...')
            pdf_link = browser.find_element_by_xpath('//*[contains(text(), "Download")]').get_attribute('href')
            summary = browser.find_element_by_class_name('product-detail__body').text
            data['epub_link'].append(None) 
            data['mobi_link'].append(None)
            data['pdf_link'].append(pdf_link)
            data['summary'].append(summary)
            data['date'].append(None)

## Log in, navigating to issue archive

Log into account.

In [99]:
# Initialize browser
download_path = '/Volumes/ARCHIVES/GitHub/scientific-american/issues'
browser = initialize(download_path)
wait = WebDriverWait(browser, 15)

# Navigate to page with search results for issues
current_url = 'https://www.scientificamerican.com/store/archive/?page=1'
browser.get(current_url)

# Log into my account 
login(configs['SCIAM']['EMAIL'] + '";', configs['SCIAM']['PASSWORD'])

## Collect download links for magazines 

**Add code here that reads in issues.csv. Update the scraper function above to check this data for duplicates.**

Scrape.

In [100]:
data = {'category': [], 
        'volume': [],
        'issue': [],
        'date': [],
        'link': []}

# Collect all download links on the first page
total_pages = int(browser.find_elements_by_class_name('pagination__nums__item')[-1].find_element_by_tag_name('a').text)
pages_left = total_pages - 1
current_page = 2
print('Scraping page 1/' + str(total_pages) + '...')
scrape_issue_links()

# Paginate until the end, scraping all download links
while pages_left > 0: 
    try: 
        browser.get('https://www.scientificamerican.com/store/archive/?page=' + str(current_page))
        current_total_pages = int(browser.find_elements_by_class_name('pagination__nums__item')[-1].find_element_by_tag_name('a').text)
        print('Scraping page ' + str(current_page) + '/' + str(total_pages) + '...')
        scrape_issue_links()
        if total_pages != current_total_pages:
            print('Detected change in total pages.')
            pages_left = current_total_pages - (total_pages - pages_left)
            total_pages = current_total_pages
        current_page += 1
        pages_left -= 1
        time.sleep(5) # according to robots.txt
    except IndexError: # Your connection was interrupted. 
        print('Connection interrupted, reconnecting...')
        continue

Scraping page 1/441...
Scraping page 2/441...
Scraping page 3/441...
Scraping page 4/441...
Scraping page 5/441...
Scraping page 6/441...
Scraping page 7/441...
Scraping page 8/441...
Scraping page 9/441...
Scraping page 10/441...
Scraping page 11/441...
Scraping page 12/441...
Scraping page 13/441...
Scraping page 14/441...
Scraping page 15/441...
Scraping page 16/441...
Scraping page 17/441...
Scraping page 18/441...
Scraping page 19/441...
Scraping page 20/441...
Scraping page 21/441...
Scraping page 22/441...
Scraping page 23/441...
Scraping page 24/441...
Scraping page 25/441...
Scraping page 26/441...
Scraping page 27/441...
Scraping page 28/441...
Scraping page 29/441...
Scraping page 30/441...
Scraping page 31/441...
Scraping page 32/441...
Scraping page 33/441...
Scraping page 34/441...
Scraping page 35/441...
Scraping page 36/441...
Scraping page 37/441...
Scraping page 38/441...
Scraping page 39/441...
Scraping page 40/441...
Scraping page 41/441...
Scraping page 42/441...
S

Scraping page 334/441...
Scraping page 335/441...
Scraping page 336/441...
Scraping page 337/441...
Scraping page 338/441...
Scraping page 339/441...
Scraping page 340/441...
Scraping page 341/441...
Scraping page 342/441...
Scraping page 343/441...
Scraping page 344/441...
Scraping page 345/441...
Scraping page 346/441...
Scraping page 347/441...
Scraping page 348/441...
Scraping page 349/441...
Scraping page 350/441...
Scraping page 351/441...
Scraping page 352/441...
Scraping page 353/441...
Scraping page 354/441...
Scraping page 355/441...
Scraping page 356/441...
Scraping page 357/441...
Scraping page 358/441...
Scraping page 359/441...
Scraping page 360/441...
Scraping page 361/441...
Scraping page 362/441...
Scraping page 363/441...
Scraping page 364/441...
Scraping page 365/441...
Scraping page 366/441...
Scraping page 367/441...
Scraping page 368/441...
Scraping page 369/441...
Scraping page 370/441...
Scraping page 371/441...
Scraping page 372/441...
Scraping page 373/441...


### Clean magazine data.

In [136]:
issues_df = pd.DataFrame(data)
issues_df

Unnamed: 0,category,volume,issue,date,link
0,SCIENTIFIC AMERICAN SPACE & PHYSICS,3,6,December 2020,https://www.scientificamerican.com/index.cfm/_...
1,SCIENTIFIC AMERICAN HEALTH & MEDICINE,2,6,December 2020,https://www.scientificamerican.com/index.cfm/_...
2,SCIENTIFIC AMERICAN,323,6,December 2020,https://www.scientificamerican.com/index.cfm/_...
3,SCIENTIFIC AMERICAN,323,5,November 2020,https://www.scientificamerican.com/index.cfm/_...
4,SCIENTIFIC AMERICAN MIND,31,6,November 2020,https://www.scientificamerican.com/index.cfm/_...
...,...,...,...,...,...
5287,SCIENTIFIC AMERICAN,1,5,"September 25, 1845",https://www.scientificamerican.com/index.cfm/_...
5288,SCIENTIFIC AMERICAN,1,4,"September 18, 1845",https://www.scientificamerican.com/index.cfm/_...
5289,SCIENTIFIC AMERICAN,1,3,"September 11, 1845",https://www.scientificamerican.com/index.cfm/_...
5290,SCIENTIFIC AMERICAN,1,2,"September 04, 1845",https://www.scientificamerican.com/index.cfm/_...


Check duplicates.

In [137]:
dupes = issues_df[issues_df['link'].duplicated(keep=False)]
dupes

Unnamed: 0,category,volume,issue,date,link
922,SCIENTIFIC AMERICAN,200,1,January 1959,https://www.scientificamerican.com/index.cfm/_...
923,SCIENTIFIC AMERICAN,200,1,January 1959,https://www.scientificamerican.com/index.cfm/_...
1607,SCIENTIFIC AMERICAN,116,13,"March 31, 1917",https://www.scientificamerican.com/index.cfm/_...
1608,SCIENTIFIC AMERICAN,116,13,"March 31, 1917",https://www.scientificamerican.com/index.cfm/_...
4055,SCIENTIFIC AMERICAN,21,19,"November 06, 1869",https://www.scientificamerican.com/index.cfm/_...
4056,SCIENTIFIC AMERICAN,21,19,"November 06, 1869",https://www.scientificamerican.com/index.cfm/_...


In [138]:
issues_df.drop_duplicates(inplace=True)
print('Dropped', int(len(dupes)/2), 'duplicates')

Dropped 3 duplicates


Check categories.

In [139]:
issues_df['category'].value_counts()

SCIENTIFIC AMERICAN                       5153
SCIENTIFIC AMERICAN MIND                   106
SCIENTIFIC AMERICAN SPACE & PHYSICS         16
SCIENTIFIC AMERICAN HEALTH & MEDICINE       11
 SCIENTIFIC AMERICAN                         1
 SCIENTIFIC AMERICAN SPACE & PHYSICS         1
 SCIENTIFIC AMERICAN HEALTH & MEDICINE       1
Name: category, dtype: int64

In [141]:
issues_df['category'] = issues_df['category'].apply(lambda x: x.strip())
issues_df['category'].value_counts()

SCIENTIFIC AMERICAN                      5154
SCIENTIFIC AMERICAN MIND                  106
SCIENTIFIC AMERICAN SPACE & PHYSICS        17
SCIENTIFIC AMERICAN HEALTH & MEDICINE      12
Name: category, dtype: int64

In [145]:
issues_df.to_csv('issues.csv', index=False)

## Collect download links for ebooks.

In [132]:
# Set up data
data = {'title': [], 
        'date': [],
        'summary': [],
        'pdf_link': [],
        'epub_link': [],
        'mobi_link': []}

# Navigate to URL for ebooks
current_url = 'https://www.scientificamerican.com/store/ebooks/'
browser.get(current_url)

# Collect all download links on the first page
total_pages = int(browser.find_elements_by_class_name('pagination__nums__item')[-1].find_element_by_tag_name('a').text)
pages_left = total_pages - 1
current_page = 2
print('Scraping page 1/' + str(total_pages) + '...')
scrape_ebook_links()

# Paginate until the end, scraping all download links
while pages_left > 0: 
    try: 
        browser.get('https://www.scientificamerican.com/store/ebooks/?page=' + str(current_page))
        current_total_pages = int(browser.find_elements_by_class_name('pagination__nums__item')[-1].find_element_by_tag_name('a').text)
        print('Scraping page ' + str(current_page) + '/' + str(total_pages) + '...')
        scrape_ebook_links()      
        if total_pages != current_total_pages:
            print('Detected change in total pages.')
            pages_left = current_total_pages - (total_pages - pages_left)
            total_pages = current_total_pages
        current_page += 1
        pages_left -= 1
        time.sleep(5) # according to robots.txt
    except IndexError: # Your connection was interrupted. 
        print('Connection interrupted, reconnecting...')
        continue

Scraping page 1/9...
Scraping page 2/9...
Scraping page 3/9...
Scraping page 4/9...
NoSuchElementException...
Scraping page 5/9...
Scraping page 6/9...
Scraping page 7/9...
Scraping page 8/9...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
Scraping page 9/9...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...


Save ebooks data.

In [133]:
ebooks_df = pd.DataFrame(data)
ebooks_df

Unnamed: 0,title,date,summary,pdf_link,epub_link,mobi_link
0,Hacking the Immune System,11/2/20,"The immune system is a marvel, but sometimes t...",,https://www.scientificamerican.com/index.cfm/_...,https://www.scientificamerican.com/index.cfm/_...
1,Technology vs. Truth: Deception in the Digital...,10/5/20,"In the digital age, information, both true and...",,https://www.scientificamerican.com/index.cfm/_...,https://www.scientificamerican.com/index.cfm/_...
2,"Stressed Out: Causes, Effects and Keeping Calm",8/31/20,Chronic stress makes people sick. The fight-or...,,https://www.scientificamerican.com/index.cfm/_...,https://www.scientificamerican.com/index.cfm/_...
3,Black Holes: Going to Extremes,8/3/20,"Once dismissed as a mathematical curiosity, bl...",,https://www.scientificamerican.com/index.cfm/_...,https://www.scientificamerican.com/index.cfm/_...
4,Quantum Universe,7/6/20,"Strange and probabilistic, physics at the smal...",,https://www.scientificamerican.com/index.cfm/_...,https://www.scientificamerican.com/index.cfm/_...
...,...,...,...,...,...,...
103,Nature Books and Arts Special: 2014. No 1,,A collection of book reviews for the scientifi...,https://www.scientificamerican.com/index.cfm/_...,,
104,Books & Arts Special 2015 No.1,,In Nature’s first book reviews supplement for ...,https://www.scientificamerican.com/index.cfm/_...,,
105,Nature Collections: Stem Cells – Breaking Barr...,,The most versatile of stem cells can be made f...,https://www.scientificamerican.com/index.cfm/_...,,
106,Nature Collections: Homo Floresiensis – the ‘h...,,The remains of the tiny hominin Homo floresien...,https://www.scientificamerican.com/index.cfm/_...,,


In [134]:
ebooks_df.to_csv('ebooks.csv', index=False)

## Download all issues and ebooks.

Download ebooks.

Download issues.

In [127]:
download_links = issues_df['link']
download_links

0       https://www.scientificamerican.com/index.cfm/_...
1       https://www.scientificamerican.com/index.cfm/_...
2       https://www.scientificamerican.com/index.cfm/_...
3       https://www.scientificamerican.com/index.cfm/_...
4       https://www.scientificamerican.com/index.cfm/_...
                              ...                        
5287    https://www.scientificamerican.com/index.cfm/_...
5288    https://www.scientificamerican.com/index.cfm/_...
5289    https://www.scientificamerican.com/index.cfm/_...
5290    https://www.scientificamerican.com/index.cfm/_...
5291    https://www.scientificamerican.com/index.cfm/_...
Name: link, Length: 5289, dtype: object

In [129]:
print(len(download_links), 'issues to download...')
for link in download_links:
    browser.get(link)
    time.sleep(10) # so that we don't download too many at once

5289 issues to download...
