In [1]:
import pandas as pd
import numpy as np
import re
import time
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import random

# ECON 323 FINAL PROJECT: WEB-SCRAPING CODE

In this workbook I use web-scraping techniques to create the data set used for my analysis. I am scraping from Amazon's e-book featured listings to get data on books that appear in the first 30 pages of the [Kindle Store](https://www.amazon.ca/s?rh=n%3A2980423011&fs=true&ref=lp_2980423011_sar). The code is loosely inspired by [this tutorial](https://www.datacamp.com/community/tutorials/amazon-web-scraping-using-beautifulsoup). I use the Beautiful Soup package to navigate the html I extracted from Amazon.

### 1. HTML Extraction

The first step is to write a function that requests the needed html for all the listings in a specific page range. Not all of the attributes I am interested in scraping are shown on the listings page, so for each book on the page I need to request the html for that book's individual page. We want to do this for more than one page of listings, as there are only around 16 non-sponsored listings per page.

In the images below, for each of the books listed on the page on the right we want to extract the html for the page on the left.

![alt text](listings.png "Title")

For each page in the range, the `get_soup` function performs the following major steps:
- First it requests the html for the listings page and applies the BeautifulSoup function to create a navigable object.
- Then it finds all of the non-sponsored listings on the page.
- Then, iterating through each listing, it finds the link to the individual page for the book and requests the html for that page.
- Finally, it applies the BeautifulSoup function to the individual book page and adds it to a list.

The function returns a list of soup objects for all the books listed.

Unsurprisingly, Amazon has some bot detection, so we need to make our requests look realistic; to do this we send some headers with our requests. If you submit too many requests in a row from the same User Agent, you encounter a reCAPTCHA. To work around this I set up a list of possible user agents, and for each page the function randomly selects a new user agent from the list to use in the header. Nevertheless, I still encounter issues relatively frequently (~ 1 in 10 requests fail to pull any data), but this is a huge improvement from where I started. From what I can find, there are a number of other steps you can take to reduce the chances of running into issues with bot detection, but they are beyond my ability to implement within the time frame. As it is, we are not reliably getting data on all the books in the requested page range, but this is not of material consequence for the issue at hand.

In [2]:
#set possible user agents
# One entry per line, each followed by a comma: without the comma Python
# silently joins two adjacent string literals into a single (invalid) entry.
user_agent_list = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299",
                   "Mozilla/5.0 (Windows NT 5.1; rv:33.0) Gecko/20100101 Firefox/33.0",
                   "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
                   "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
                   "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
                   "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/601.7.7 (KHTML, like Gecko) Version/9.1.2 Safari/601.7.7",
                   "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36",
                   "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/601.5.17 (KHTML, like Gecko) Version/9.1 Safari/601.5.17",
                   "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/600.6.3 (KHTML, like Gecko) Version/8.0.6 Safari/600.6.3",
                   "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Safari/602.1.50",
                   "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/600.4.10 (KHTML, like Gecko) Version/8.0.4 Safari/600.4.10",
                   "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15",
                  ]

def get_soup(page_start,page_end):
    """
    Fetch the individual product page of every book listed on a range of
    Kindle Store listing pages.

    For each listings page a user agent is drawn at random from
    ``user_agent_list``; the same headers are reused for the book pages
    linked from that listings page.

    Parameters
    ----------
    page_start : int
        First listings page to request (inclusive).
    page_end : int
        Last listings page to request (inclusive).

    Returns
    -------
    list
        One BeautifulSoup object per book page requested, in the order the
        listings were encountered. Listings that expose no product link are
        skipped, so the list may be shorter than the number of listings.
    """
    book_soups = []
    for pageNo in range(page_start, page_end + 1):
        #randomly select a user agent and set header
        headers = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
                   "Accept-Encoding": "gzip, deflate, br",
                   "Accept-Language": "en-US,en;q=0.9",
                   "User-Agent": random.choice(user_agent_list)}

        #request html for listings page
        r = requests.get('https://www.amazon.ca/s?i=digital-text&rh=n%3A2980423011&fs=true&page='+str(pageNo)+'&qid=1640233972&ref=sr_pg_'+str(pageNo), headers=headers)
        # NOTE(review): no parser is named, so bs4 uses whichever one is
        # installed; consider pinning one for reproducible parsing.
        soup = BeautifulSoup(r.content)

        #iterate over individual listings
        for d in soup.find_all('div', class_ = "s-result-item s-asin sg-col-0-of-12 sg-col-16-of-20 sg-col s-widget-spacing-small sg-col-12-of-16"):
            #find link to individual listing
            links = d.find('a', class_ = "a-link-normal s-no-outline")
            # A listing without the expected anchor (or without an href)
            # cannot be followed; skip it instead of aborting the whole run.
            if links is None or not links.get('href'):
                continue
            link = links['href']

            #request html for individual listing
            book_r = requests.get('https://www.amazon.ca'+link, headers=headers)
            book_soups.append(BeautifulSoup(book_r.content))

    return book_soups

### 2. Scraping Data

Next I define a function that will extract all the wanted attributes from the html object acquired for each book. I am interested in the following information:
- Title
- Author
- Price
- Inclusion in Kindle Unlimited
- Ranking
- Rating
- Number of Ratings
- Blurb
- Reviews (title, rating, text content)

![alt text](attribute.png "Title")

For each of these attributes, the `get_data` function uses the infrastructure of the BeautifulSoup package to navigate the html of the web page to find the given attribute and add it to a list. If the attribute cannot be found, then NaN is appended instead. The function returns a list of all the attributes.

For the reviews, I extract only the reviews shown in the "top reviews" section. There is a varying number of reviews that show up in this section for each listing. The review related attributes are stored as lists.

In [3]:
def _text_or_nan(node):
    """Return ``node.text`` when the element was found, otherwise NaN."""
    return node.text if node is not None else np.nan


def _list_or_nan(values):
    """Return ``values`` unless the list is empty, in which case return NaN."""
    return values if values else np.nan


def get_data(s):
    """
    Extract the attributes of one book from its parsed product page.

    Parameters
    ----------
    s : BeautifulSoup
        Parsed html of an individual book page.

    Returns
    -------
    list
        Always exactly 11 items, in this order: title, authors (list),
        price, Kindle Unlimited flag (1/0), ranking text, overall rating
        text, number-of-ratings text, blurb, review ratings (list), review
        titles (list), review texts (list). Any attribute that cannot be
        found is NaN; inside the review lists a missing value for a single
        review is NaN as well.
    """
    data_list = []

    #book title
    data_list.append(_text_or_nan(s.find('span', id = 'productTitle')))

    #author (allows for more than one)
    authors = []
    for a in s.find_all('span', class_ = 'author notFaded'):
        author = a.find('a', class_ = 'a-link-normal contributorNameID')
        if author is not None:
            authors.append(author.text)
    data_list.append(_list_or_nan(authors))

    #price
    data_list.append(_text_or_nan(s.find('span', id = 'kindle-price')))

    #kindle unlimited
    ku_icon = s.find('i', class_ = 'a-icon a-icon-kindle-unlimited a-icon-medium')
    data_list.append(1 if ku_icon is not None else 0)

    #ranking and overall rating
    # Both values are appended unconditionally so every row keeps the same
    # number of fields even when the product-details section is missing or
    # has fewer bullet groups than expected.
    product_details = s.find('div', id="detailBullets_feature_div")
    detail_groups = []
    if product_details is not None:
        detail_groups = product_details.find_all('ul', class_ = "a-unordered-list a-nostyle a-vertical a-spacing-none detail-bullet-list")
    ranks = detail_groups[1].find('span', class_ = "a-list-item") if len(detail_groups) > 1 else None
    rating = detail_groups[2].find('span', class_ = 'a-icon-alt') if len(detail_groups) > 2 else None
    data_list.append(_text_or_nan(ranks))
    data_list.append(_text_or_nan(rating))

    #number of ratings
    rating_count = s.find('span', attrs={'id':'acrCustomerReviewText', 'class':'a-size-base'})
    data_list.append(_text_or_nan(rating_count))

    #blurb
    data_list.append(_text_or_nan(s.find('div', id = 'bookDescription_feature_div')))

    #reviews
    reviews = s.find('div', class_="a-section a-spacing-large reviews-content filterable-reviews-content celwidget")

    reviews_ratings = []
    reviews_titles = []
    reviews_texts = []

    if reviews is not None:
        for r in reviews.find_all('div', class_="a-section review aok-relative"):
            #get rating from all top reviews
            review_rating = r.find('a', class_ = "a-link-normal")
            # .get() so an anchor without a title attribute yields NaN
            # rather than a KeyError.
            reviews_ratings.append(review_rating.get('title', np.nan) if review_rating is not None else np.nan)

            #get titles from all top reviews
            review_title = r.find('a', class_ = "a-size-base a-link-normal review-title a-color-base review-title-content a-text-bold")
            reviews_titles.append(_text_or_nan(review_title))

            #get text from all top reviews
            review_text = r.find('div', class_ = 'a-expander-content reviewText review-text-content a-expander-partial-collapse-content')
            reviews_texts.append(_text_or_nan(review_text))

    data_list.append(_list_or_nan(reviews_ratings))
    data_list.append(_list_or_nan(reviews_titles))
    data_list.append(_list_or_nan(reviews_texts))

    return data_list


### 3. Apply Functions

Next we apply our two functions. I do this in separate 10-page intervals (one cell per interval) because I was running into issues with the kernel restarting randomly in the middle of the code executing and losing all the progress.

For each group of ten pages, I first apply the `get_soup` function I defined previously to extract the html for all the individual listings. Then I apply the `get_data` function to each listing. Finally, I convert the data to a data frame and export it as a csv.

In [4]:
# Listing pages 1-10: download every book's product page.
book_soups_1_10 = get_soup(1, 10)

# One row of scraped attributes per book page.
data_1_10 = [get_data(b) for b in book_soups_1_10]

# Tabulate the rows and persist this batch to disk.
book_data_1_10 = pd.DataFrame(
    data_1_10,
    columns=[
        'Book Name', 'Author', 'Price', 'KU', 'Rankings', 'Rating',
        'Customers_Rated', 'Blurb',
        'Reviews_Ratings', 'Reviews_Titles', 'Reviews_Texts',
    ],
)
book_data_1_10.to_csv('amazon_books_1_10.csv', index=False, encoding='utf-8')

In [5]:
# Listing pages 11-20: download every book's product page.
book_soups_11_20 = get_soup(11, 20)

# One row of scraped attributes per book page.
data_11_20 = [get_data(b) for b in book_soups_11_20]

# Tabulate the rows and persist this batch to disk.
book_data_11_20 = pd.DataFrame(
    data_11_20,
    columns=[
        'Book Name', 'Author', 'Price', 'KU', 'Rankings', 'Rating',
        'Customers_Rated', 'Blurb',
        'Reviews_Ratings', 'Reviews_Titles', 'Reviews_Texts',
    ],
)
book_data_11_20.to_csv('amazon_books_11_20.csv', index=False, encoding='utf-8')

In [4]:
# Listing pages 21-30: download every book's product page.
book_soups_21_30 = get_soup(21, 30)

# One row of scraped attributes per book page.
data_21_30 = [get_data(b) for b in book_soups_21_30]

# Tabulate the rows and persist this batch to disk.
book_data_21_30 = pd.DataFrame(
    data_21_30,
    columns=[
        'Book Name', 'Author', 'Price', 'KU', 'Rankings', 'Rating',
        'Customers_Rated', 'Blurb',
        'Reviews_Ratings', 'Reviews_Titles', 'Reviews_Texts',
    ],
)
book_data_21_30.to_csv('amazon_books_21_30.csv', index=False, encoding='utf-8')

In [6]:
# Listing pages 31-40: download every book's product page.
book_soups_31_40 = get_soup(31, 40)

# One row of scraped attributes per book page.
data_31_40 = [get_data(b) for b in book_soups_31_40]

# Tabulate the rows and persist this batch to disk.
book_data_31_40 = pd.DataFrame(
    data_31_40,
    columns=[
        'Book Name', 'Author', 'Price', 'KU', 'Rankings', 'Rating',
        'Customers_Rated', 'Blurb',
        'Reviews_Ratings', 'Reviews_Titles', 'Reviews_Texts',
    ],
)
book_data_31_40.to_csv('amazon_books_31_40.csv', index=False, encoding='utf-8')

In [7]:
# Listing pages 41-50: download every book's product page.
book_soups_41_50 = get_soup(41, 50)

# One row of scraped attributes per book page.
data_41_50 = [get_data(b) for b in book_soups_41_50]

# Tabulate the rows and persist this batch to disk.
book_data_41_50 = pd.DataFrame(
    data_41_50,
    columns=[
        'Book Name', 'Author', 'Price', 'KU', 'Rankings', 'Rating',
        'Customers_Rated', 'Blurb',
        'Reviews_Ratings', 'Reviews_Titles', 'Reviews_Texts',
    ],
)
book_data_41_50.to_csv('amazon_books_41_50.csv', index=False, encoding='utf-8')