# Scientific American

Scrapes download links for issues and ebooks from the Scientific American site.

In [119]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import configparser
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import time
import pandas as pd

# Load settings from ../config.ini; later cells read the account e-mail and
# password from its [SCIAM] section.
configs = configparser.ConfigParser()
# read() returns the list of files it parsed, which is what this cell displays.
configs.read('../config.ini')

['../config.ini']

## Define scraping functions:

In [129]:
def initialize(download_dir, driver_path='../chromedriver'):
    """Start a Chrome session whose downloads are saved to ``download_dir``.

    Args:
        download_dir: Directory Chrome should use as its default download
            location.
        driver_path: Path to the chromedriver executable. Defaults to the
            location this notebook has always used, so existing calls are
            unaffected.

    Returns:
        The newly created ``webdriver.Chrome`` instance.
    """
    prefs = {
        'download.default_directory': download_dir,
        # Save PDFs to disk instead of opening them in Chrome's viewer.
        'plugins.always_open_pdf_externally': True,
    }
    options = webdriver.ChromeOptions()
    options.add_experimental_option('prefs', prefs)
    return webdriver.Chrome(options=options, executable_path=driver_path)

def login(username, password):
    """Sign in through the site's sign-in flyout.

    Uses the module-level ``browser`` and ``wait`` objects.

    Args:
        username: Account e-mail address. A trailing ``";`` is stripped,
            because earlier versions of this function required callers to
            append that JavaScript terminator themselves.
        password: Account password.
    """
    # Stay compatible with callers that still append the legacy terminator.
    if username.endswith('";'):
        username = username[:-2]

    # until() returns the element located by the condition.
    signin_button = wait.until(
        EC.presence_of_element_located((By.ID, 'signin-click')))
    signin_button.click()

    email_input = browser.find_element(By.ID, 'emailAddress1')
    password_input = browser.find_element(By.ID, 'password1')

    # Hand the credentials over as script arguments rather than splicing them
    # into the JavaScript source, so quotes or backslashes in either value
    # cannot break (or alter) the script.
    set_value_script = 'arguments[0].value = arguments[1];'
    browser.execute_script(set_value_script, email_input, username)
    browser.execute_script(set_value_script, password_input, password)

    login_button = browser.find_element(
        By.CLASS_NAME, 'signin-flyout__form__submit')
    time.sleep(2)
    login_button.click()

    # Block until the flyout's loading indicator is no longer visible.
    wait.until(EC.invisibility_of_element_located(
        (By.CLASS_NAME, 'signin-flyout__form__loading')))
    
def scrape_issue_links():
    """Record the download link of every issue listed on the current page.

    Reads the page through the module-level ``browser`` and appends to the
    module-level ``data`` dict (keys ``link``, ``date``, ``category`` and
    ``comments``).

    A link that has not been seen before gets a new row. A link that is
    already recorded is not added again; instead the title of the duplicate
    listing is stored in the existing row's ``comments``. When the duplicate
    carries the same title as the recorded row (for example because the same
    page was scraped twice) nothing is changed.
    """
    for section in browser.find_elements(By.CLASS_NAME, 'store-listing'):
        link = section.find_element(By.CLASS_NAME, 'download').get_attribute('href')
        title_block = section.find_element(By.CLASS_NAME, 'store-listing__title')
        title = title_block.find_element(By.TAG_NAME, 'a').text  # usually a date

        try:
            link_idx = data['link'].index(link)
        except ValueError:
            # First time this file is seen: add a full row.
            category = section.find_element(By.CLASS_NAME, 'store-listing__category').text
            data['link'].append(link)
            data['date'].append(title)
            data['category'].append(category)
            data['comments'].append(None)
        else:
            # Same file listed again: note the alternative title, but do not
            # annotate a row with its own title when a page is re-scraped.
            if data['date'][link_idx] != title:
                data['comments'][link_idx] = title
            
def scrape_ebook_links():
    """Visit every ebook listed on the current page and record its details.

    Reads pages through the module-level ``browser`` and appends one value
    per ebook to each list in the module-level ``data`` dict (keys ``title``,
    ``date``, ``summary``, ``pdf_link``, ``epub_link`` and ``mobi_link``).

    Two detail-page layouts are handled: one offering Epub and Mobi
    downloads, and a fallback offering a single "Download" link, which is
    stored as ``pdf_link``. All fields of an ebook are appended together, so
    the lists in ``data`` always keep equal lengths even if a page fails
    part-way through.
    """
    # Collect the URLs first: navigating away invalidates the listing elements.
    detail_urls = [cta.get_attribute('href')
                   for cta in browser.find_elements(By.CLASS_NAME, 'store-listing__cta')]

    for detail_url in detail_urls:
        browser.get(detail_url)
        record = {
            'title': browser.find_element(By.CLASS_NAME, 'product-detail__title').text,
            'date': None,
            'summary': None,
            'pdf_link': None,
            'epub_link': None,
            'mobi_link': None,
        }
        try:
            epub_link = browser.find_element(
                By.XPATH, '//*[contains(text(), "Download Epub/Other")]').get_attribute('href')
            mobi_link = browser.find_element(
                By.XPATH, '//*[contains(text(), "Download Mobi/Kindle")]').get_attribute('href')
            meta = browser.find_elements(By.CLASS_NAME, 'product-detail__meta')[1]
            summary = meta.find_element(By.TAG_NAME, 'p').text
            date_parts = meta.text.split('On Sale Date: ')
        except NoSuchElementException:
            print('NoSuchElementException...')
            # Fallback layout: a single download link and a body summary.
            record['pdf_link'] = browser.find_element(
                By.XPATH, '//*[contains(text(), "Download")]').get_attribute('href')
            record['summary'] = browser.find_element(
                By.CLASS_NAME, 'product-detail__body').text
        else:
            record['epub_link'] = epub_link
            record['mobi_link'] = mobi_link
            record['summary'] = summary
            # Leave the date empty when the page shows no "On Sale Date".
            record['date'] = date_parts[1] if len(date_parts) > 1 else None

        # Append all fields at once so the columns stay aligned.
        for field, value in record.items():
            data[field].append(value)

## Scrape download links for all issues.

Log into account.

In [131]:
# Start Chrome with downloads directed at the local issues folder
download_path = '/Volumes/ARCHIVES/GitHub/scientific-american/issues'
browser = initialize(download_dir=download_path)
wait = WebDriverWait(driver=browser, timeout=15)

# Open the first page of the issue archive
current_url = 'https://www.scientificamerican.com/store/archive/?page=1'
browser.get(current_url)

# Sign in with the credentials from the config file. login() receives the
# e-mail with '";' appended: the closing quote and terminator for the
# JavaScript assignment it issues.
login(username=configs['SCIAM']['EMAIL'] + '";',
      password=configs['SCIAM']['PASSWORD'])

### Collect download links for magazines. 

In [19]:
# Accumulators filled by scrape_issue_links(); one entry per issue in each list
data = {'category': [],
        'date': [],
        'link': [],
        'comments': []}

# The last pagination item carries the number of the final page
total_pages = int(browser.find_elements(By.CLASS_NAME, 'pagination__nums__item')[-1]
                  .find_element(By.TAG_NAME, 'a').text)
pages_left = total_pages - 1
current_page = 2
print('Scraping page 1/' + str(total_pages) + '...')
scrape_issue_links()

# Paginate until the end, scraping all download links
while pages_left > 0:
    browser.get('https://www.scientificamerican.com/store/archive/?page=' + str(current_page))
    try:
        # No pagination items means the page did not load (connection
        # interrupted). Check this before scraping so a failed load never
        # leads to the same page being scraped twice.
        current_total_pages = int(browser.find_elements(By.CLASS_NAME, 'pagination__nums__item')[-1]
                                  .find_element(By.TAG_NAME, 'a').text)
    except IndexError:
        print('Connection interrupted, reconnecting...')
        time.sleep(5)  # keep the crawl delay on retries as well
        continue

    if total_pages != current_total_pages:
        print('Detected change in total pages.')
        # total_pages - pages_left is the number of pages already scraped
        pages_left = current_total_pages - (total_pages - pages_left)
        total_pages = current_total_pages

    print('Scraping page ' + str(current_page) + '/' + str(total_pages) + '...')
    scrape_issue_links()
    current_page += 1
    pages_left -= 1
    time.sleep(5)  # according to robots.txt

Scraping page 1/441...
Scraping page 2/441...


KeyboardInterrupt: 

Save magazine data.

In [192]:
# Turn the scraped columns into a table and write it out without the row index
data_df = pd.DataFrame.from_dict(data)
data_df.to_csv(path_or_buf='issues.csv', index=False)

Unnamed: 0,category,date,link,comments
0,MIND,September 2020,https://www.scientificamerican.com/index.cfm/_...,
1,SCIENTIFIC AMERICAN,September 2020,https://www.scientificamerican.com/index.cfm/_...,
2,SA SPECIAL EDITIONS,Climate Change,https://www.scientificamerican.com/index.cfm/_...,
3,SPACE & PHYSICS,August 2020,https://www.scientificamerican.com/index.cfm/_...,
4,SCIENTIFIC AMERICAN,August 2020,https://www.scientificamerican.com/index.cfm/_...,
...,...,...,...,...
5275,SCIENTIFIC AMERICAN,"September 25, 1845",https://www.scientificamerican.com/index.cfm/_...,
5276,SCIENTIFIC AMERICAN,"September 18, 1845",https://www.scientificamerican.com/index.cfm/_...,
5277,SCIENTIFIC AMERICAN,"September 11, 1845",https://www.scientificamerican.com/index.cfm/_...,
5278,SCIENTIFIC AMERICAN,"September 04, 1845",https://www.scientificamerican.com/index.cfm/_...,


In [194]:
# Write the issues table to issues.csv again (same call as the earlier save cell).
data_df.to_csv('issues.csv', index=False)

### Collect download links for ebooks.

In [132]:
# Accumulators filled by scrape_ebook_links(); one entry per ebook in each list
data = {'title': [],
        'date': [],
        'summary': [],
        'pdf_link': [],
        'epub_link': [],
        'mobi_link': []}

# Navigate to URL for ebooks
current_url = 'https://www.scientificamerican.com/store/ebooks/'
browser.get(current_url)

# The last pagination item carries the number of the final page
total_pages = int(browser.find_elements(By.CLASS_NAME, 'pagination__nums__item')[-1]
                  .find_element(By.TAG_NAME, 'a').text)
pages_left = total_pages - 1
current_page = 2
print('Scraping page 1/' + str(total_pages) + '...')
scrape_ebook_links()

# Paginate until the end, scraping all download links
while pages_left > 0:
    browser.get('https://www.scientificamerican.com/store/ebooks/?page=' + str(current_page))
    try:
        # No pagination items means the page did not load (connection
        # interrupted). Only this lookup is guarded, so an IndexError raised
        # while scraping is not mistaken for a lost connection and does not
        # cause the page to be scraped a second time.
        current_total_pages = int(browser.find_elements(By.CLASS_NAME, 'pagination__nums__item')[-1]
                                  .find_element(By.TAG_NAME, 'a').text)
    except IndexError:
        print('Connection interrupted, reconnecting...')
        time.sleep(5)  # keep the crawl delay on retries as well
        continue

    if total_pages != current_total_pages:
        print('Detected change in total pages.')
        # total_pages - pages_left is the number of pages already scraped
        pages_left = current_total_pages - (total_pages - pages_left)
        total_pages = current_total_pages

    print('Scraping page ' + str(current_page) + '/' + str(total_pages) + '...')
    scrape_ebook_links()
    current_page += 1
    pages_left -= 1
    time.sleep(5)  # according to robots.txt

Scraping page 1/9...
Scraping page 2/9...
Scraping page 3/9...
Scraping page 4/9...
NoSuchElementException...
Scraping page 5/9...
Scraping page 6/9...
Scraping page 7/9...
Scraping page 8/9...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
Scraping page 9/9...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...
NoSuchElementException...


Save ebooks data.

In [133]:
# Build the ebooks table from the scraped columns and display it
ebooks_df = pd.DataFrame.from_dict(data)
ebooks_df

Unnamed: 0,title,date,summary,pdf_link,epub_link,mobi_link
0,Hacking the Immune System,11/2/20,"The immune system is a marvel, but sometimes t...",,https://www.scientificamerican.com/index.cfm/_...,https://www.scientificamerican.com/index.cfm/_...
1,Technology vs. Truth: Deception in the Digital...,10/5/20,"In the digital age, information, both true and...",,https://www.scientificamerican.com/index.cfm/_...,https://www.scientificamerican.com/index.cfm/_...
2,"Stressed Out: Causes, Effects and Keeping Calm",8/31/20,Chronic stress makes people sick. The fight-or...,,https://www.scientificamerican.com/index.cfm/_...,https://www.scientificamerican.com/index.cfm/_...
3,Black Holes: Going to Extremes,8/3/20,"Once dismissed as a mathematical curiosity, bl...",,https://www.scientificamerican.com/index.cfm/_...,https://www.scientificamerican.com/index.cfm/_...
4,Quantum Universe,7/6/20,"Strange and probabilistic, physics at the smal...",,https://www.scientificamerican.com/index.cfm/_...,https://www.scientificamerican.com/index.cfm/_...
...,...,...,...,...,...,...
103,Nature Books and Arts Special: 2014. No 1,,A collection of book reviews for the scientifi...,https://www.scientificamerican.com/index.cfm/_...,,
104,Books & Arts Special 2015 No.1,,In Nature’s first book reviews supplement for ...,https://www.scientificamerican.com/index.cfm/_...,,
105,Nature Collections: Stem Cells – Breaking Barr...,,The most versatile of stem cells can be made f...,https://www.scientificamerican.com/index.cfm/_...,,
106,Nature Collections: Homo Floresiensis – the ‘h...,,The remains of the tiny hominin Homo floresien...,https://www.scientificamerican.com/index.cfm/_...,,


In [134]:
# Write the ebooks table to ebooks.csv without the row index.
ebooks_df.to_csv('ebooks.csv', index=False)

## Download all issues and ebooks.

Download ebooks.

Download issues.

In [71]:
# NOTE(review): download_links is not assigned in any cell shown here;
# presumably a list of issue URLs left over from an earlier session. Confirm.
print(f'Scraped download links for {len(download_links)} issues.')
# Open each link in the logged-in browser, one at a time
for issue_url in download_links:
    browser.get(issue_url)
    time.sleep(10)  # so that we don't download too many at once

Scraped download links for 5283 issues.


KeyboardInterrupt: 

## Verify scraper is working properly (delete later)

In [161]:
# Flag rows whose link repeats an earlier row (True marks the repeat).
data_df['link'].duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
5267    False
5268    False
5269    False
5270    False
5271    False
Name: link, Length: 5272, dtype: bool

In [84]:
# Print the link and its position for every occurrence of one specific file URL
target_link = 'https://www.scientificamerican.com/index.cfm/_api/render/file/?method=inline&amp;fileID=ABF5DC40-BF47-4CEB-9FCC5BA0EA567959'
for position, candidate in enumerate(download_links):
    if candidate != target_link:
        continue
    print(candidate)
    print(position)

https://www.scientificamerican.com/index.cfm/_api/render/file/?method=inline&amp;fileID=ABF5DC40-BF47-4CEB-9FCC5BA0EA567959
4
https://www.scientificamerican.com/index.cfm/_api/render/file/?method=inline&amp;fileID=ABF5DC40-BF47-4CEB-9FCC5BA0EA567959
12


Data fields:
- Publication name
- Date
- Link 
- Comments

If a link already exists in the collected data, record the title of the duplicate listing as a comment on the existing entry and skip adding the link again. For example, the March 31, 1917 issue is listed twice for download: once as the March 31, 1917 issue and once as the WW1 Anniversary Issue.

In [168]:
# Count how often each comment value occurs (rows without a comment are not counted).
data_df['comments'].value_counts()

October 2019              1
January 2020              1
WW1 Anniversary Issu..    1
December 2019             1
Scientific American ..    1
November 2019             1
April 2020                1
March 2020                1
January 1959              1
February 2020             1
May 2020                  1
Name: comments, dtype: int64

In [188]:
# Select the rows whose category contains 'Quantum'; no rows are shown in the output below.
data_df[data_df['category'].str.contains('Quantum')]

Unnamed: 0,category,date,link,comments


In [12]:
# Reload the issues table from disk.
# NOTE(review): the save cells in this notebook write 'issues.csv', not
# 'data.csv'. Confirm this is the intended file.
data_df = pd.read_csv('data.csv')

In [108]:
# Inspect the pdf_link column of whatever is currently held in `data`.
pd.DataFrame(data)['pdf_link']

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
23    None
24    None
25    None
26    None
27    None
28    None
29    None
30    None
31    None
32    None
33    None
34    None
35    None
36    None
37    None
38    None
39    None
40    None
41    None
42    None
43    None
44    None
45    None
46    None
47    None
48    None
49    None
50    None
51    None
52    None
53    None
54    None
55    None
Name: pdf_link, dtype: object