# Load library

In [66]:
import requests # send request to website
from bs4 import BeautifulSoup as bs # convert the web content to bs object
from bs4 import Comment # search if we are caught by Amazon as a robot
from fake_useragent import UserAgent #create fake user agent from different browser
import re # regular expression
import pandas as pd # output dataframe
import numpy as np # fast data manipulation
import random # randomly use agent header for sending request
import time #If access is denied, sleep 5s and then request again
print(requests.__version__)

2.28.1


# How to create headers for request
1. Some Tutorials I used:
    - https://www.crummy.com/software/BeautifulSoup/bs4/doc/#comments-and-other-special-strings
    - https://www.blog.datahut.co/post/web-scraping-best-practices-tips
    - https://stackoverflow.com/questions/63305902/why-cant-i-scrape-amazon-products-by-beautifulsoup
    - https://www.digitalocean.com/community/tutorials/scrape-amazon-product-information-beautiful-soup
    - https://stackoverflow.com/questions/63615686/how-to-scrape-data-from-amazon-canada
    - https://stackoverflow.com/questions/33138937/how-to-find-all-comments-with-beautiful-soup
    - https://pypi.org/project/fake-useragent/
    - https://github.com/jhnwr/scrape-amazon-reviews/blob/main/review-scraper.py
    - https://www.fullstaxx.com/2021/05/23/multipage-scraping-amazon-python/
    - https://github.com/sergioteula/python-amazon-paapi
    
2. Depending on which Amazon marketplace you are scraping, you need to use different headers. The following are just 3 examples:

    - For Amazon Canada: you use:

    `headers = {
        'content-type': 'text/html;charset=UTF-8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'en-US,en;q=0.8',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    }`

    - For Amazon India, you use:

    `headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'}`

    - For Amazon UK, you use:
    
    `headers = {
    'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.8.0.8) Gecko/20061025 Firefox/1.5.0.8'}`
    
    
3. Here is a list of User-Agent strings for different browsers: https://www.useragentstring.com/pages/useragentstring.php
4. I will use fake-useragent (`pip3 install fake-useragent`) to generate a list of fake User-Agent strings.

# Create a list of fake User-Agent strings to disguise our client (note: this does not hide our IP address)

In [181]:
# There is a pretty useful third-party package called fake-useragent
# that provides a nice abstraction layer over user agents: https://pypi.org/project/fake-useragent/
#
# If you don't want to use the local data, you can use the external data source
# to retrieve the user-agents (set use_external_data to True).

# How many random draws to make; duplicates are discarded by the set below,
# so the number of unique strings ends up much smaller than this.
N_USER_AGENT_DRAWS = 100000

ua = UserAgent(browsers=["chrome", "edge", "internet explorer", "firefox", "safari", "opera"])

# Generate a set of unique fake User-Agent strings.
# A comprehension keeps the throw-away loop variable `_` out of the notebook
# namespace, where it would otherwise shadow IPython's "last output" variable.
user_agent_set = {ua.random for _ in range(N_USER_AGENT_DRAWS)}

# Total unique fake agent string
print(len(user_agent_set))

4688


# Fetch data from website and create bs object

In [182]:
# Global variable: the request headers currently in use, shared across get_soup calls.
# None means no header has been chosen yet (or the last one was rejected by the
# server), in which case get_soup builds a new one with a freshly picked User-Agent.
header = None

#Define a function to request data from the website
def _build_header(user_agent):
    """Return the request headers sent with every call, using the given User-Agent."""
    return {'content-type': 'text/html;charset=UTF-8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'en-US,en;q=0.8',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            "User-Agent": user_agent}


def _is_access_denied(soup):
    """Return True if an HTML comment in the page carries Amazon's automated-access notice.

    When access is denied the page contains a comment such as "To discuss
    automated access to Amazon data please contact api-services-support@amazon.com."
    """
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    return any("api-services-support@amazon.com" in comment for comment in comments)


def get_soup(url, user_agent_list, soup_attempts = 10, request_attempts = 10):
    """Fetch `url` and return its parsed content, rotating the User-Agent when blocked.

    The module-level `header` is reused for as long as it keeps working and is
    only replaced (with a User-Agent picked at random from `user_agent_list`)
    after the server has answered with its automated-access notice.

    Parameters
    ----------
    url : str
        Page to download.
    user_agent_list : iterable of str
        Pool of User-Agent strings to pick from. Any iterable is accepted
        (list, tuple, set, ...); it must not be empty when a new header is needed.
    soup_attempts : int
        Maximum number of different headers to try before giving up.
    request_attempts : int
        Maximum number of requests per header when the connection itself fails.

    Returns
    -------
    BeautifulSoup or str
        The parsed page, or the string 'No Data Returned' when every attempt failed.

    Raises
    ------
    ValueError
        If a new header is needed and `user_agent_list` is empty.
    """
    global header
    for _soup_attempt in range(soup_attempts):
        # We want to keep using the same header if that one particular header is working.
        # We only change it once it has been recognized and banned by the web server.
        if header is None:
            # Materialize the pool so that sets and other non-indexable iterables work
            # with random.choice (which requires a sequence).
            agents = list(user_agent_list)
            if not agents:
                raise ValueError("user_agent_list must contain at least one User-Agent string")
            header = _build_header(random.choice(agents))

        for _request_attempt in range(request_attempts):
            try:
                page = requests.get(url, headers=header, timeout=10)
            except requests.exceptions.RequestException:
                # Only network-level failures are retried; anything else (parser
                # errors, KeyboardInterrupt, ...) propagates to the caller.
                print("Connection refused by the server..")
                print("Let me sleep for 5 seconds")
                time.sleep(5)
                print("Was a nice sleep, now let me continue...")
                continue

            soup = bs(page.content, "lxml")
            if _is_access_denied(soup):
                # We are caught by the web server as a bot: drop this header so the
                # next outer iteration picks a new one.
                header = None
                break
            return soup
    return 'No Data Returned'

# Define a function to get the reviews of a product
def get_reviews(soup, review_sink=None):
    """Parse the customer reviews found on one Amazon review page.

    Parameters
    ----------
    soup : BeautifulSoup or str
        Parsed review page. A plain string (such as the 'No Data Returned'
        sentinel returned by get_soup on failure) yields no reviews.
    review_sink : list, optional
        List the parsed reviews are appended to. When omitted, the module-level
        `reviewlist` is used if it exists, which is what the original
        implementation appended to.

    Returns
    -------
    list of dict
        One dict per successfully parsed review with the keys 'product',
        'title', 'rating' (float) and 'body'. Reviews that miss one of the
        expected elements, or whose rating is not numeric, are skipped
        individually instead of aborting the whole page.
    """
    if isinstance(soup, str):
        return []
    if review_sink is None:
        # Backward compatibility with callers that rely on the global accumulator.
        review_sink = globals().get('reviewlist')

    # The product name is the same for every review on the page, so compute it once.
    product = ''
    if soup.title is not None:
        product = soup.title.text.replace('Amazon.co.uk:Customer reviews:', '').strip()

    parsed = []
    for item in soup.find_all('div', {'data-hook': 'review'}):
        try:
            rating_text = item.find('i', {'data-hook': 'review-star-rating'}).text
            review = {
                'product': product,
                'title': item.find('a', {'data-hook': 'review-title'}).text.strip(),
                'rating': float(rating_text.replace('out of 5 stars', '').strip()),
                'body': item.find('span', {'data-hook': 'review-body'}).text.strip(),
            }
        except (AttributeError, ValueError):
            # AttributeError: find() returned None for a missing element.
            # ValueError: the rating text could not be converted to float.
            continue
        parsed.append(review)

    if review_sink is not None:
        review_sink.extend(parsed)
    return parsed

# #Iterate through each page and get the review
# for x in range(1,999):
#     soup = get_soup(f'https://www.amazon.co.uk/product-reviews/B07WD58H6R/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber={x}',
#                    header_list, soup_attempts = 10, request_attempts = 10)
#     print(f'Getting page: {x}')
#     get_reviews(soup)
#     print(len(reviewlist))
#     if not soup.find('li', {'class': 'a-disabled a-last'}):
#         pass
#     else:
#         break
# #Generate the final dataframe
# df = pd.DataFrame(reviewlist)
# df.to_excel('sony-headphones.xlsx', index=False)

In [215]:
# Get the link for each product in the home page
item_link = []
root_url = "https://www.amazon.ca/s?k=headphones&i=electronics&page="
# Build the User-Agent pool once instead of converting the set on every page.
user_agent_pool = list(user_agent_set)
for page_number in range(1,400):
    print(page_number)
    home_soup = get_soup(root_url+str(page_number), user_agent_pool)
    if header is not None:
        print(header["User-Agent"])
    if home_soup == 'No Data Returned':
        # Every attempt for this page failed; log the header state and move on.
        print(header)
        continue
    for link in home_soup.select("h2 a.a-link-normal.s-underline-text.s-underline-link-text.s-link-style"):
        href = link.get('href')
        # Skip anchors without an href rather than aborting the whole crawl with a KeyError.
        if href is not None:
            item_link.append(href)

# Explicit encoding so the output does not depend on the platform default
# (links may contain non-ASCII characters).
with open ("partial ttems link CA1.txt", mode = "wt", encoding = "utf-8") as f:
    for link in item_link:
        f.write(link+"\n\n")

1
Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4325)
2
Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4325)
3
Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4325)
4
Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4325)
5
Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4325)
6
Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4325)
7
Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4325)
8
Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4325)
9
Opera/9.20(Windows NT 5.1; U; en)
10
Opera/9.20(Windows NT 5.1; U; en)
11
Opera/9.20(Windows NT 5.1; U; en)
12
Opera/9.20(Windows NT 5.1; U; en)
13
Opera/9.20(Windows NT 5.1; U; en)
14
Opera/9.20(Windows NT 5.1; U; en)
15
Opera/9.20(Windows NT 5.1; U; en)
16
Opera/9.20(Windows NT 5.1; U; en)
17
Opera/9.20(Windows NT 5.1; U; en)
18
Opera/9.20(Windows NT 5.1; U; en)
19
Opera/9.20(W

Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; Tablet PC 2.0; InfoPath.3; .NET4.0C; .NET4.0E)
75
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; Tablet PC 2.0; InfoPath.3; .NET4.0C; .NET4.0E)
76
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; Tablet PC 2.0; InfoPath.3; .NET4.0C; .NET4.0E)
77
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; Tablet PC 2.0; InfoPath.3; .NET4.0C; .NET4.0E)
78
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .

Mozilla/5.0 (X11; U; Linux ppc; en-US; rv:1.7.12) Gecko/20051222 Firefox/1.0.7
145
Mozilla/5.0 (X11; U; Linux ppc; en-US; rv:1.7.12) Gecko/20051222 Firefox/1.0.7
146
Mozilla/5.0 (X11; U; Linux ppc; en-US; rv:1.7.12) Gecko/20051222 Firefox/1.0.7
147
Mozilla/5.0 (X11; U; Linux ppc; en-US; rv:1.7.12) Gecko/20051222 Firefox/1.0.7
148
Mozilla/5.0 (X11; U; Linux ppc; en-US; rv:1.7.12) Gecko/20051222 Firefox/1.0.7
149
Mozilla/5.0 (X11; U; Linux ppc; en-US; rv:1.7.12) Gecko/20051222 Firefox/1.0.7
150
Mozilla/5.0 (X11; U; Linux ppc; en-US; rv:1.7.12) Gecko/20051222 Firefox/1.0.7
151
Mozilla/5.0 (X11; U; Linux ppc; en-US; rv:1.7.12) Gecko/20051222 Firefox/1.0.7
152
Mozilla/5.0 (X11; U; Linux ppc; en-US; rv:1.7.12) Gecko/20051222 Firefox/1.0.7
153
Mozilla/5.0 (X11; U; Linux ppc; en-US; rv:1.7.12) Gecko/20051222 Firefox/1.0.7
154
Mozilla/5.0 (X11; U; Linux ppc; en-US; rv:1.7.12) Gecko/20051222 Firefox/1.0.7
155
Mozilla/5.0 (X11; U; Linux ppc; en-US; rv:1.7.12) Gecko/20051222 Firefox/1.0.7
156
Mozi

Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.7) Gecko/20050414 Firefox/1.0.3
243
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.7) Gecko/20050414 Firefox/1.0.3
244
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.7) Gecko/20050414 Firefox/1.0.3
245
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.7) Gecko/20050414 Firefox/1.0.3
246
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.7) Gecko/20050414 Firefox/1.0.3
247
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.7) Gecko/20050414 Firefox/1.0.3
248
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.7) Gecko/20050414 Firefox/1.0.3
249
Mozilla/5.0 (Windows; U; Windows NT 5.1; fr-FR) AppleWebKit/525.19 (KHTML, like Gecko) Version/3.1.2 Safari/525.21
250
Mozilla/5.0 (Windows; U; Windows NT 5.1; fr-FR) AppleWebKit/525.19 (KHTML, like Gecko) Version/3.1.2 Safari/525.21
251
Mozilla/5.0 (Windows; U; Windows NT 5.1; fr-FR) AppleWebKit/525.19 (KHTML, like Gecko) Version/3.1.2 Safari/525.21
252
Mozill

Mozilla/5.0 (Windows NT 6.2; WOW64; rv:16.0.1) Gecko/20121011 Firefox/16.0.1
316
Mozilla/5.0 (Windows NT 6.2; WOW64; rv:16.0.1) Gecko/20121011 Firefox/16.0.1
317
Mozilla/5.0 (Windows NT 6.2; WOW64; rv:16.0.1) Gecko/20121011 Firefox/16.0.1
318
Opera/9.80 (X11; Linux i686; U; es-ES) Presto/2.6.30 Version/10.61
319
Opera/9.80 (X11; Linux i686; U; es-ES) Presto/2.6.30 Version/10.61
320
Opera/9.80 (X11; Linux i686; U; es-ES) Presto/2.6.30 Version/10.61
321
Opera/9.80 (X11; Linux i686; U; es-ES) Presto/2.6.30 Version/10.61
322
Opera/9.80 (X11; Linux i686; U; es-ES) Presto/2.6.30 Version/10.61
323
Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.210.0 Safari/532.0
324
Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.210.0 Safari/532.0
325
Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.210.0 Safari/532.0
326
Mozilla/5.0 (Ma

Mozilla/4.0 (compatible;MSIE 7.0;Windows NT 6.0)
383
Mozilla/4.0 (compatible;MSIE 7.0;Windows NT 6.0)
384
Mozilla/4.0 (compatible;MSIE 7.0;Windows NT 6.0)
385
Mozilla/4.0 (compatible;MSIE 7.0;Windows NT 6.0)
386
Mozilla/4.0 (compatible;MSIE 7.0;Windows NT 6.0)
387
Mozilla/4.0 (compatible;MSIE 7.0;Windows NT 6.0)
388
Mozilla/4.0 (compatible;MSIE 7.0;Windows NT 6.0)
389
Mozilla/4.0 (compatible;MSIE 7.0;Windows NT 6.0)
390
Mozilla/4.0 (compatible;MSIE 7.0;Windows NT 6.0)
391
Mozilla/4.0 (compatible;MSIE 7.0;Windows NT 6.0)
392
Mozilla/4.0 (compatible;MSIE 7.0;Windows NT 6.0)
393
Mozilla/4.0 (compatible;MSIE 7.0;Windows NT 6.0)
394
Mozilla/4.0 (compatible;MSIE 7.0;Windows NT 6.0)
395
Mozilla/4.0 (compatible;MSIE 7.0;Windows NT 6.0)
396
Mozilla/4.0 (compatible;MSIE 7.0;Windows NT 6.0)
397
Mozilla/4.0 (compatible;MSIE 7.0;Windows NT 6.0)
398
Mozilla/4.0 (compatible;MSIE 7.0;Windows NT 6.0)
399
Mozilla/4.0 (compatible;MSIE 7.0;Windows NT 6.0)


In [216]:
# Number of product links collected in item_link
len(item_link)

90