# Load library

In [1]:
import requests # send request to website
from bs4 import BeautifulSoup as bs # convert the web content to bs object
from bs4 import Comment # search if we are caught by Amazon as a robot
from fake_useragent import UserAgent #create fake user agent from different browser
import re # regular expression
import pandas as pd # output dataframe
import numpy as np # fast data manipulation
import random # randomly use agent header for sending request
import time #If access is denied, sleep 5s and then request again
from collections import defaultdict #Used to declare a dictionary with emply 
print(requests.__version__)
import os
import csv
from string import punctuation

2.28.1


# How to create headers for request
1. Some Tutorials I used:
    - https://www.crummy.com/software/BeautifulSoup/bs4/doc/#comments-and-other-special-strings
    - https://www.blog.datahut.co/post/web-scraping-best-practices-tips
    - https://stackoverflow.com/questions/63305902/why-cant-i-scrape-amazon-products-by-beautifulsoup
    - https://www.digitalocean.com/community/tutorials/scrape-amazon-product-information-beautiful-soup
    - https://stackoverflow.com/questions/63615686/how-to-scrape-data-from-amazon-canada
    - https://stackoverflow.com/questions/33138937/how-to-find-all-comments-with-beautiful-soup
    - https://pypi.org/project/fake-useragent/
    - https://github.com/jhnwr/scrape-amazon-reviews/blob/main/review-scraper.py
    - https://www.fullstaxx.com/2021/05/23/multipage-scraping-amazon-python/
    - https://github.com/sergioteula/python-amazon-paapi
    
2. Depends on where Amazon location you are scraping, you need to use different headers. The following are just 2 examples:

    - For Amazon Canada: you use:

    `headers = {
        'content-type': 'text/html;charset=UTF-8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'en-US,en;q=0.8',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    }`

    - For Amazon Indian, you use:

    `headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'}`

    - For Amazon UK, you use:
    
    `headers = {
    'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.8.0.8) Gecko/20061025 Firefox/1.5.0.8'}`
    
    
3. Here is a list of User-Agent strings for different browsers: https://www.useragentstring.com/pages/useragentstring.php
4. I will use fake-useragent (pip3 install fake-useragent)to generate a list of fake user agent.

# Fetch data from individual website using a list of fake User Agent to disguise our IP

In [2]:
#Create a class to deal with web request and convert it to beautiful soup
class get_soup:
    header = None
    #When the class is initiated, a list of user agent will be generated
    '''
    There is a pretty useful third-party package called fake-useragent 
    that provides a nice abstraction layer over user agents: https://pypi.org/project/fake-useragent/

    If you don't want to use the local data, you can use the external data source to retrieve the user-agents. 
    #Set use_external_data to True:
    '''
    def __init__(self, total_user_agent = 1000):
        ua = UserAgent(browsers=["chrome", "edge", "internet explorer", "firefox", "safari", "opera"])
        # I will generate a lsit of fake agent string with total number of total_user_agent
        self.user_agent_set = set()
        # Set a cap for user_agent_set to prevent endless loop
        while(len(self.user_agent_set)<total_user_agent and len(self.user_agent_set) < 4500):
            self.user_agent_set.add(ua.random)
    '''
    Define the function to get contents from each page. 
    Each header_attempts will use the same header until it is caught by the weg server.
    In each header_attempts, we will try request_attempts times to request contents until we get the right contents
    '''
    def get_individual_soup(self, url, header_attempts = 10, request_attempts = 10):
        self.soup = 'No Data Returned'
        for _ in range(header_attempts):
            request_count = 0
            page = ''
            notDenied = True
            # We want to keep using the same header if that one particular header is working
            # We change it unless it is recognized and banned by Web server
            if get_soup.header is None:
                user_agent = random.choice(list(self.user_agent_set))
                get_soup.header = {'content-type': 'text/html;charset=UTF-8',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'Accept-Language': 'en-US,en;q=0.8',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                "User-Agent": user_agent}

            while page == '' and request_count < request_attempts and notDenied:
                try:
                    request_count += 1
                    page = requests.get(url, headers=get_soup.header, timeout=10)
                    self.soup = bs(page.content, "lxml")
                    '''If the page returns a message like To discuss automated access 
                        to Amazon data please contact api-services-support@amazon.com.
                        We know we are denied access to the web page.
                        Or,
                        Amazon page blocks you by returning a login page
                        In either case, lets try again using different header
                    '''
                    comments = self.soup.find_all(string=lambda text: isinstance(text, Comment))
                    login_page = self.soup.find('a', id = 'createAccountSubmit', class_ = 'a-button-text')                    
                    for comment in comments:
                        if ("api-services-support@amazon.com" in comment) or login_page:
                            notDenied = False
                            get_soup.header = None
                            self.soup = 'No Data Returned'
                            break
                            
                    if (notDenied):
                        return self.soup
                    #We are caught by Web server as a bot, break this while and try a new header
                    break
                except:
                    get_soup.header = None
                    print("Connection refused by the server..")
                    print("Let me sleep for 5 seconds")
                    time.sleep(5)
                    print("Now I will use a different header to request data...")
                    #The server does not respond to our request, break this while and try a new header
                    break
        return self.soup
    '''
    Customer Reviews, including Product Star Ratings, 
    help customers to learn more about the product and decide whether it is the right product for them.
    To calculate the overall star rating and percentage breakdown by star, we don’t use a simple average. 
    Instead, our system considers things like how recent a review is and if the reviewer bought the item on Amazon. 
    It also analyses reviews to verify trustworthiness.
    Learn more from
    https://www.amazon.co.uk/gp/help/customer/display.html/ref=cm_cr_arp_d_omni_lm_btn?nodeId=G8UYX7LALQC8V9KA'''
    #Define a function to get the review of a product on one page only
    def get_page_reviews(self, ASIN, soup = None):
        reviewlist = []
        if soup is not None:
            for item in soup.find_all('div', {'data-hook': 'review'}):
                try:
                    #This is domenstic review
                    review = {
                                'ASIN': ASIN,
                                'product Name': soup.title.text.replace('Amazon.co.uk:Customer reviews:', '').strip(),
                                'Review Title': item.find('a', {'data-hook': 'review-title'}).get_text().strip(),
                                'Review Rating':  float(item.find('i', {'data-hook': 'review-star-rating'}).get_text().replace('out of 5 stars', '').strip()),
                                'Review Body': item.find('span', {'data-hook': 'review-body'}).get_text().strip(),
                                'Review Date': item.find('span', {'data-hook': 'review-date'}).get_text().strip(),
                                }
                except AttributeError:
                    #This is international review
                    try:
                        review = {
                                'ASIN': ASIN,
                                'product Name': soup.title.text.replace('Amazon.co.uk:Customer reviews:', '').strip(),
                                'Review Title': item.find('span', {'data-hook': 'review-title'}).get_text().strip(),
                                'Review Rating':  float(item.find('i', {'data-hook': 'cmps-review-star-rating'}).get_text().replace('out of 5 stars', '').strip()),
                                'Review Body': item.find('span', {'data-hook': 'review-body'}).get_text().strip(),
                                'Review Date': item.find('span', {'data-hook': 'review-date'}).get_text().strip(),
                                }
                    except:
                        #If there is still error, return None
                        review = {
                                'ASIN': None,
                                'product Name': None,
                                'Review Title': None,
                                'Review Rating': None,
                                'Review Body': None,
                                'Review Date': None,
                                }
                reviewlist.append(review)
        return reviewlist

#Create a class to handle all the file I/O
class Review_file_io:
    '''
    This method is to get the root link for each product
    '''
    @classmethod
    def get_review_link(cls, file_loc):
        #Get the review entrance link for all the product items
        review_links = {}
        with open (file_loc, mode = "r") as f:
            for link in f:
                entry_link = link.strip().split(",")[0]
                if (not re.search("product-reviews/.*/ref", entry_link)):
                    continue
                ASIN = re.search("product-reviews/.*/ref", entry_link).group(0).split("/")[1]
                '''Need to think this again, this is mainly for empty page loc'''
                if re.search(r'&pageNumber=\d+$', entry_link):
                    review_links[ASIN] = entry_link
                else:
                    review_links[ASIN] = entry_link + "&pageNumber="
        return review_links
    '''
    This method is to get all the reviews on every page of a product
    '''
    def get_product_reviews(self, file_loc, reviews_loc, empty_page_loc, total_page = 999, header_attempts = 3, request_attempts = 1):
        review_links = Review_file_io.get_review_link(file_loc)
        mySoup = get_soup()
        empty_page = defaultdict(list)
        reviews = []
        #loop through each page and get reviews on each page
        for ASIN, review_link in review_links.items():
            for page_number in range(1,total_page):
                print(f"You are on product {ASIN} page {page_number}")
                page_url = f"{review_link}{page_number}"
                page_soup = mySoup.get_individual_soup(page_url,header_attempts = header_attempts, request_attempts = request_attempts)
                '''
                There are 3 cases page_soup equals 'No Data Returned'.
                1st is when you get caught by Amazon as a bot;
                2nd is Amazon returns you a login page
                3rd is when our scrapper has tried header_attempts*request_attempts times to reach the page,
                    but still got nothing, either rejected or caught by the server;

                There are case that you do get the page content from our web scrapper,
                but there are no reviews on that page. For example, 
                1. You get the page, but the page 
                2. you hit the last review page;
                3. the product item just does not have any reviews at all.
                '''
                if page_soup != 'No Data Returned':
                    review = mySoup.get_page_reviews(ASIN, page_soup)
                    #There are simply no reviews for this product item, break the loop
                    if not review:
                        break 
                    reviews.extend(review)
                    #Last page is hit, we break the for loop
                    if page_soup.find('li', {'class': 'a-disabled a-last'}):
                        break
                    else:
                        continue
                #When we failed to get the content for this page, record this page, and go to the next page
                else:
                    empty_page[ASIN].append(page_url)
                    continue
        #Save the reviews and empty page link
        try:
            with open (reviews_loc, mode = "w") as f:
                csv_columns = ['ASIN', 'product Name', 'Review Title', 'Review Rating', 'Review Body', 'Review Date']
                writer = csv.DictWriter(f, fieldnames=csv_columns)
                writer.writeheader()
                for prod_info in reviews:
                    writer.writerow(prod_info)

            with open (empty_page_loc, mode = "w") as f:
                writer = csv.writer(f)
                writer.writerow(['URLs', 'ASIN'])
                for key, page in empty_page.items():
                    for link in page:
                        writer.writerow([link, key])
        except:
            print("I/O error")
    

# Example how you can iterate through each page to get the item link

In [3]:
# Get the link for each product in the home page
mySoup = get_soup()
#Grab the item link from each page and save them in a text file
item_link = []
# root_url = "https://www.amazon.ca/s?k=headphones&i=electronics&page="
# root_url = "https://www.amazon.in/s?k=headphones&page="
root_url = "https://www.amazon.co.uk/s?k=headphones&i=electronics&s=review-rank&page="

for page_number in range(200,250):
    print(f"You are on page {page_number}")
    home_soup = mySoup.get_individual_soup(root_url+str(page_number),
                                          header_attempts = 2, request_attempts = 1)
    #If there is nothing return from the website, go to next page
    if home_soup != 'No Data Returned':
        if (mySoup.header is not None):
            print("You are using " + mySoup.header["User-Agent"] + " to retrieve data")
    else:
        print(f"No data returned. You are using `{mySoup.header}` to retrieve data")
        continue
    for link in home_soup.select("h2 a.a-link-normal.s-underline-text.s-underline-link-text.s-link-style"):
        item_link.append(link['href'])

with open ("./Dataset/partial items link CA7.txt", mode = "wt") as f:
    for link in item_link:
        f.write(link+"\n\n")

You are on page 200
You are using Mozilla/4.0 (compatible; MSIE 5.5b1; Mac_PowerPC) to retrieve data
You are on page 201
You are using Mozilla/4.0 (compatible; MSIE 5.5b1; Mac_PowerPC) to retrieve data
You are on page 202
You are using Mozilla/4.0 (compatible; MSIE 5.5b1; Mac_PowerPC) to retrieve data
You are on page 203
You are using Mozilla/4.0 (compatible; MSIE 5.5b1; Mac_PowerPC) to retrieve data
You are on page 204
You are using Mozilla/4.0 (compatible; MSIE 5.5b1; Mac_PowerPC) to retrieve data
You are on page 205
You are using Mozilla/4.0 (compatible; MSIE 5.5b1; Mac_PowerPC) to retrieve data
You are on page 206
You are using Mozilla/4.0 (compatible; MSIE 5.5b1; Mac_PowerPC) to retrieve data
You are on page 207
You are using Mozilla/4.0 (compatible; MSIE 5.5b1; Mac_PowerPC) to retrieve data
You are on page 208
You are using Mozilla/4.0 (compatible; MSIE 5.5b1; Mac_PowerPC) to retrieve data
You are on page 209
You are using Mozilla/4.0 (compatible; MSIE 5.5b1; Mac_PowerPC) to retr

# Generate a csv of links to each of those items, the price and the #of reviews From Stu

In [4]:
# Initiate a new soup object
mySoup = get_soup()

# home_soup = mySoup.get_individual_soup(root_url+str(page_number),
#                                           header_attempts = 2, request_attempts = 1)

linklist = []
duplicates = []
Skipped_pages = []
for x in range(2,10):
    soup = mySoup.get_individual_soup(f'https://www.amazon.co.uk/s?k=heaphones&page={x}',
                                          header_attempts = 2, request_attempts = 1)
    
    #If there is nothing return from the website, go to next page
    if soup != 'No Data Returned':
        for link in soup.find_all('a', href=True):
            href = link['href']
            if 'keywords=heaphones' in href:
                if 'offer-listing' not in href:
                    if '#customerReviews' not in href:
                        duplicates.append(href)
    else:
        print(f"No data returned. You are using `{mySoup.header}` to retrieve data")
        Skipped_pages.append(x)
        continue

duplicates = [x.split('/ref')[0] for x in duplicates]
duplicates = [x.split('?keywords')[0] for x in duplicates]
for i in duplicates:
    # Add to the new list
    # only if not present
    if i not in linklist:
        linklist.append(i)

finalList = ['https://www.amazon.co.uk' + s for s in linklist]

In [5]:
price = []
for x in finalList:
    soup = mySoup.get_individual_soup(x, header_attempts = 2, request_attempts = 1)
    spans = soup.find('span', attrs = {'class' : 'a-price-whole'})
    if spans == None:
        price.append('')
    else:
        price.append(spans.text.strip(punctuation))

In [6]:
review = []
for x in finalList:
    soup = mySoup.get_individual_soup(x, header_attempts = 2, request_attempts = 1)
    spans = soup.find('span',id ="acrCustomerReviewText", attrs = {'class' : 'a-size-base'})
    if spans == None:
        review.append('')
    else:
        review.append(spans.text.strip(punctuation))

finalList

['https://www.amazon.co.uk/Apple-EarPods-with-Lightning-Connector/dp/B01M1EEPOB',
 'https://www.amazon.co.uk/Sony-WH-1000XM3-Wireless-Cancelling-Headphones-Black/dp/B07GDR2LYK',
 'https://www.amazon.co.uk/Earphones-Blukar-Headphones-Sensitivity-Microphone-Silver/dp/B07QLWMDLC',
 'https://www.amazon.co.uk/JVC-HA-L50-B-E-Lightweight-Headphones-Black/dp/B000I2J4S4',
 'https://www.amazon.co.uk/Sony-MDR-ZX310AP-Foldable-Headphones-Smartphone-Metallic-Red/dp/B00I3LV3EU',
 'https://www.amazon.co.uk/EarFun-Wireless-Bluetooth-Detection-Headphones-Matte-Black/dp/B088H7GMHZ',
 'https://www.amazon.co.uk/Sony-MDR-EX15AP-Earphones-Smartphone-Control-Black/dp/B00I3LV1HE',
 'https://www.amazon.co.uk/Betron-AX5-Headphones-Microphone-Smartphones-Black-Gold/dp/B0786S43W4',
 'https://www.amazon.co.uk/JVC-Headphones-Earphones-Compatible-Samsung-Black/dp/B00ZAT03S0',
 'https://www.amazon.co.uk/Isolating-Headphones-Microphone-Lightweight-Earphones/dp/B083J88QRS',
 'https://www.amazon.co.uk/OneOdio-Bluetooth-

In [7]:
headers = ['URLs', 'Price', '#Ratings']

finalList = [s+ '/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews' for s in finalList]
finalList = [s.replace("/dp/", "/product-reviews/") for s in finalList]

with open('links.csv', 'w', newline='') as csvfile:
    file_is_empty = os.stat('links.csv').st_size == 0
    writer = csv.writer(csvfile)
    if file_is_empty:
        writer.writerow(headers)
    writer.writerows(zip(finalList, price, review))

# Extract product information from multiple product items
There are a few product information we can get from a single product item.

- Product Name
- Review Title
- Review Rating
- Review Body
- Review Date

Because each product has many pages of reviews and each product takes quite some time, I split the links.csv file to smaller files. Each file has about 35 links in there, and I will need other team members to work separately to reduce running time. https://phoenixnap.com/kb/linux-split#:~:text=The%20Linux%20split%20command%20breaks,Linux%20split%20command%20with%20examples.&text=Access%20to%20the%20terminal%20line.

In [4]:
my_review = Review_file_io()
my_review.get_review_link('./Dataset/Sample_link.csv')
my_review.get_product_reviews('./Dataset/Sample_link.csv', './Dataset/review.csv', './Dataset/empty_link.csv', total_page = 10)

You are on product B01M1EEPOB page 1
You are on product B01M1EEPOB page 2
You are on product B01M1EEPOB page 3
You are on product B01M1EEPOB page 4
You are on product B01M1EEPOB page 5
You are on product B01M1EEPOB page 6
You are on product B01M1EEPOB page 7
You are on product B01M1EEPOB page 8
You are on product B01M1EEPOB page 9
