# Web Scraping a Dynamic JavaScript Web Page

## Environment setup

Libraries

pip install BeautifulSoup4

pip install selenium

pip install pandas

Browser: Firefox (driven by geckodriver) or Chrome/Chromium (driven by ChromeDriver)

## Scraping using Selenium with geckodriver

In [25]:
# import libraries
import time
import urllib.request
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from lxml import html
from selenium import webdriver
from selenium.webdriver.common.by import By

In [39]:
# Page to scrape: the "new and hot" books listing at Chapters Indigo.
# The URL is echoed so the notebook records which page was fetched.
urlpage = "https://www.chapters.indigo.ca/en-ca/books/new-and-hot/"
print(urlpage)

https://www.chapters.indigo.ca/en-ca/books/new-and-hot/


In [105]:
# Start a Firefox browser session controlled through geckodriver.
#
# If geckodriver is NOT on your PATH, point Selenium at it explicitly instead:
# driver = webdriver.Firefox(executable_path = 'GECKODRIVER_PATH')
# NOTE(review): newer Selenium releases expect a Service object rather than
# `executable_path` -- confirm against the installed Selenium version.

# geckodriver is on the PATH, so the default constructor is sufficient
driver = webdriver.Firefox()

In [87]:
# Load the target page in the browser session.
# NOTE(review): the page is described as JavaScript-driven, so the product
# elements queried afterwards may still be loading when this call returns.
driver.get(urlpage)

In [89]:
# Collect one WebElement per book tile inside the product grid.
#
# `find_elements_by_xpath` was removed in Selenium 4.3; the locator-based
# `find_elements(By.XPATH, ...)` form is available in both Selenium 3 and 4.
#
# The number of results depends on how many elements the page has loaded
# at the moment of the query.
results = driver.find_elements(
    By.XPATH,
    '//*[@id="grid-view-product-list-7731"]'
    '//*[@class="product-list__product-details--grid"]',
)
print('Number of results', len(results))

Number of results 83


In [103]:
# Turn every product tile into a {"title", "author", "price"} record.
#
# The `find_element_by_*` helpers were removed in Selenium 4.3, so the
# locator-based `find_element(By.<KIND>, value)` form is used instead; it is
# available in both Selenium 3 and 4.
data = []

for res in results:
    # title and author are the link text inside the tile's <h4> and <p>
    title = res.find_element(By.TAG_NAME, 'h4').find_element(By.TAG_NAME, 'a').text
    author = res.find_element(By.TAG_NAME, 'p').find_element(By.TAG_NAME, 'a').text

    # get the discounted price
    price = res.find_element(By.CLASS_NAME, 'product-list__price--grid').text
    # Remove the leading dollar sign only if it is actually present; a blind
    # `price[1:]` would eat the first digit of a price rendered without '$'.
    price = price.strip().lstrip('$')

    data.append({"title": title, "author": author, "price": price})

In [110]:
# Preview only the first few records instead of dumping the whole list.
data[:10]

[{'title': 'THE PERFECT GIRLFRIEND',
  'author': 'Karen Hamilton',
  'price': '15.00'},
 {'title': 'THE GOWN: A NOVEL OF THE ROYAL WEDDING',
  'author': 'Jennifer Robson',
  'price': '15.00'},
 {'title': 'FIRE & BLOOD: 300 YEARS BEFORE A GAME OF…',
  'author': 'George R. R. Martin',
  'price': '25.35'},
 {'title': 'HOMEBODY: A GUIDE TO CREATING SPACES YOU NEVER…',
  'author': 'Joanna Gaines',
  'price': '28.66'},
 {'title': 'THE RECKONING: A NOVEL',
  'author': 'John Grisham',
  'price': '20.06'},
 {'title': 'KINGDOM OF THE BLIND: INDIGO EXCLUSIVE…',
  'author': 'Louise Penny',
  'price': '25.00'},
 {'title': 'BECOMING', 'author': 'Michelle Obama', 'price': '22.30'},
 {'title': 'WASHINGTON BLACK: A NOVEL',
  'author': 'Esi Edugyan',
  'price': '20.90'},
 {'title': 'THEN SHE WAS GONE: A NOVEL',
  'author': 'Lisa Jewell',
  'price': '13.66'},
 {'title': 'THE MELTDOWN (DIARY OF A WIMPY KID BOOK 13)',
  'author': 'Jeff Kinney',
  'price': '9.10'},
 {'title': 'RICK MERCER FINAL REPORT',
  '

In [107]:
# Close the driver: the values needed later were already copied out of the
# page elements into `data`, so the browser session is no longer required.
driver.quit()

In [118]:
# Save the scraped records to a pandas DataFrame.
#
# Passing `columns=` fixes the column order in the same step and still yields
# a (0-row) frame with the expected columns when `data` is empty, whereas
# selecting df[['title', 'author', 'price']] afterwards raises KeyError on an
# empty frame.
df = pd.DataFrame(data, columns=['title', 'author', 'price'])

In [119]:
# Show a bounded preview rather than rendering the entire frame.
df.head(10)

Unnamed: 0,title,author,price
0,THE PERFECT GIRLFRIEND,Karen Hamilton,15.00
1,THE GOWN: A NOVEL OF THE ROYAL WEDDING,Jennifer Robson,15.00
2,FIRE & BLOOD: 300 YEARS BEFORE A GAME OF…,George R. R. Martin,25.35
3,HOMEBODY: A GUIDE TO CREATING SPACES YOU NEVER…,Joanna Gaines,28.66
4,THE RECKONING: A NOVEL,John Grisham,20.06
5,KINGDOM OF THE BLIND: INDIGO EXCLUSIVE…,Louise Penny,25.00
6,BECOMING,Michelle Obama,22.30
7,WASHINGTON BLACK: A NOVEL,Esi Edugyan,20.90
8,THEN SHE WAS GONE: A NOVEL,Lisa Jewell,13.66
9,THE MELTDOWN (DIARY OF A WIMPY KID BOOK 13),Jeff Kinney,9.10


In [122]:
# Write the table to books.csv without the index column.
#
# STORE_DATA_PATH is the output directory; the empty string means the current
# working directory. Joining with Path (instead of string concatenation) keeps
# the result correct whether or not the directory ends with a path separator.
STORE_DATA_PATH = ''
df.to_csv(Path(STORE_DATA_PATH) / 'books.csv', index=False)