<a href="https://colab.research.google.com/github/banned-books/project_banned_books/blob/main/data/scripts_data_scripting/custom_amazon_reviews_web_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Mount GDrive

In [None]:
# Mount Google Drive at /content/drive; the dataset read later in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Import Libraries

In [None]:
import pandas as pd
from requests_html import HTMLSession
import time
from bs4 import BeautifulSoup 
import re
import os
from tqdm import tqdm

## Banned Books List with Amazon URLs

In [None]:
# Load the banned-book list (which carries the Amazon review links), keep only
# the columns used downstream, and collapse it to one row per title so each
# book is scraped once.
ala_df = (
    pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/original_data/pen_dataset.csv')
    .loc[:, ['Title', 'Author', 'Type of Ban', 'State', 'District',
             'Date of Challenge/Removal', 'Origin of Challenge', 'amazon_url']]
    .drop_duplicates(subset=['Title'])
)

# Books with no Amazon review link cannot be scraped, so drop them.
clean_df = ala_df.dropna(subset=['amazon_url'])

# The remaining review URLs are the scraper's work list.
urls = clean_df['amazon_url'].tolist()

In [None]:
# Report how many review URLs will be fed to the scraper.
print("The number of unique books with Amazon.com reviews is", len(urls))

The number of unique books with Amazon.com reviews is 1547


## Scrape Amazon Reviews

In [None]:
class Reviews:
    """Scrape the paginated review listing of a single Amazon product.

    ``url`` is the product-review URL up to and including the
    ``pageNumber=`` query parameter; the page number is appended to it for
    every request.

    Typical use: call ``pagination(page)`` to fetch a page and learn whether
    it contains reviews, then ``parse(reviews, page)`` to turn that page into
    a list of review dicts.
    """

    # Matches dates such as "January 5, 2021" or "Jan 5, 2021" inside the
    # review-date text. Raw strings avoid invalid-escape warnings for \d.
    review_date_pattern = re.compile(r'(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?) \d+, \d{4}')
    # Captures the product slug that sits between the host and "/product-reviews".
    product_name_pattern = re.compile(r'^https:\/{2}www\.amazon\.com\/(.+)\/product-reviews')

    def __init__(self, url) -> None:
        """Initialize a session.

        Args:
            url: Review URL prefix to which the page number is appended.
        """

        self.session = HTMLSession()
        self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36'}
        # Actually send the User-Agent: previously it was stored on the
        # instance but never attached to the session.
        self.session.headers.update(self.headers)
        self.session.headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        self.session.headers['Accept-Language'] = 'en-US,en;q=0.5'
        self.session.headers['Connection'] = 'keep-alive'
        self.session.headers['Upgrade-Insecure-Requests'] = '1'
        self.url = url
        # (page, response) of the most recent pagination() fetch, reused by
        # parse() so the same page is not downloaded twice.
        self._last_fetch = (None, None)

    def pagination(self, page):
        """Work through pagination.

        Fetches page ``page`` and looks for review containers on it.

        Args:
            page: Page number appended to ``self.url``.

        Returns:
            The list of matched review elements, or ``False`` when the page
            contains none (used by callers as the end-of-pages signal).
        """

        page_url = self.url + str(page)
        response = self.session.get(page_url)
        print(page_url)

        # Remember the response so parse() can reuse it for the same page.
        self._last_fetch = (page, response)

        found = response.html.find('div[data-hook=review]')
        return found if found else False

    def parse(self, reviews, page):
        """Parse the html.

        Args:
            reviews: Review elements returned by ``pagination``; accepted for
                interface compatibility, the page HTML itself is parsed.
            page: Page number to parse. When it is the page most recently
                fetched by ``pagination`` that response is reused; otherwise
                the page is requested.

        Returns:
            A list of dicts with keys ``product_name``, ``title``, ``body``,
            ``rating``, ``verified_purchase`` and ``review_date``. The list is
            empty when the URL does not look like an Amazon product-review
            URL or the page has no review list.
        """

        # Validate the URL first: no point fetching a page we cannot label.
        match = self.product_name_pattern.search(self.url)
        if match is None:
            print('url is invalid. Please check the url.')
            # Return an empty list (not None) so callers can always iterate.
            return []
        product_name = match.group(1).replace('-', ' ')

        # getattr keeps this working on instances created without __init__.
        cached_page, cached_response = getattr(self, '_last_fetch', (None, None))
        if cached_response is not None and cached_page == page:
            response = cached_response
        else:
            response = self.session.get(self.url + str(page))

        soup = BeautifulSoup(response.content, 'html.parser')
        review_list = soup.find('div', {'id': 'cm_cr-review_list'})
        if review_list is None:
            # find() returns None when the container is absent; bail out
            # instead of failing on .find_all.
            print('No review list found on page', page)
            return []

        total = []
        for review in review_list.find_all('div', {'data-hook': 'review'}):
            data = self._extract_review(review, product_name)
            # An incomplete review is skipped; the remaining reviews on the
            # page are still collected.
            if data is not None:
                total.append(data)

        return total

    @staticmethod
    def _extract_review(review, product_name):
        """Build the record for one review tag, or return None if a field is missing."""

        # find() yields None for a missing element, so attribute access on it
        # raises AttributeError; that is the only failure handled here.
        try:
            title = review.find('a', {'data-hook': 'review-title'}).text.strip()
        except AttributeError:
            print('No title')
            return None

        try:
            body = review.find('span', {'data-hook': 'review-body'}).text.strip()
        except AttributeError:
            print('No body')
            return None

        try:
            rating = review.find('i', {'data-hook': 'review-star-rating'}).text
        except AttributeError:
            print('No rating')
            return None

        # Presence of the badge element marks a verified purchase.
        verified_purchase = bool(review.find('span', {'data-hook': 'avp-badge'}))

        try:
            # Covers both a missing date element and text without a date match.
            review_date = Reviews.review_date_pattern.search(
                review.find('span', {'data-hook': 'review-date'}).text
            ).group(0)
        except AttributeError:
            print('No review date')
            return None

        return {
            'product_name': product_name,
            'title': title,
            'body': body,
            'rating': rating,
            'verified_purchase': verified_purchase,
            'review_date': review_date
        }

In [None]:
if __name__ == '__main__':

    # Scraping limits: highest page number requested per book, and the pause
    # (in seconds) before each page request.
    max_pages = 999
    request_delay = 0.6

    # One DataFrame of reviews per book; concatenated and written at the end.
    dfs = []

    for url in tqdm(urls):

        # NOTE(review): this assumes every amazon_url already ends with '?' or
        # '&' so the query parameters can be appended directly; confirm
        # against the dataset.
        full_url = url + 'sortBy=recent&pageNumber='

        amz = Reviews(full_url)

        results = []

        try:
            for x in range(1, max_pages + 1):

                print('getting page ', x)
                time.sleep(request_delay)

                reviews = amz.pagination(x)

                if reviews is False:
                    print('No more review pages.')
                    break

                # parse() may hand back None (e.g. for an invalid url); treat
                # that as "no reviews" instead of failing while flattening.
                results.extend(amz.parse(reviews, x) or [])

        except Exception as exc:
            # A failure on one book (network error, unexpected page layout)
            # must not discard everything scraped so far: report it, keep the
            # pages already collected for this book, and move on.
            print('Scraping failed for', full_url, '-', repr(exc))

        dfs.append(pd.DataFrame(results))

    if dfs:
        pd.concat(dfs).to_csv(r'out_amz_full.csv', index=False)
    else:
        # pd.concat raises on an empty list, so there is nothing to write.
        print('No urls to scrape; nothing written.')