# Load library

In [1]:
import requests # send request to website
from bs4 import BeautifulSoup as bs # convert the web content to bs object
from bs4 import Comment # search if we are caught by Amazon as a robot
from fake_useragent import UserAgent #create fake user agent from different browser
import re # regular expression
import pandas as pd # output dataframe
import numpy as np # fast data manipulation
import random # randomly use agent header for sending request
import time #If access is denied, sleep 5s and then request again
print(requests.__version__)
import os
import csv
from string import punctuation



2.27.1


# How to create headers for request
1. Some Tutorials I used:
    - https://www.crummy.com/software/BeautifulSoup/bs4/doc/#comments-and-other-special-strings
    - https://www.blog.datahut.co/post/web-scraping-best-practices-tips
    - https://stackoverflow.com/questions/63305902/why-cant-i-scrape-amazon-products-by-beautifulsoup
    - https://www.digitalocean.com/community/tutorials/scrape-amazon-product-information-beautiful-soup
    - https://stackoverflow.com/questions/63615686/how-to-scrape-data-from-amazon-canada
    - https://stackoverflow.com/questions/33138937/how-to-find-all-comments-with-beautiful-soup
    - https://pypi.org/project/fake-useragent/
    - https://github.com/jhnwr/scrape-amazon-reviews/blob/main/review-scraper.py
    - https://www.fullstaxx.com/2021/05/23/multipage-scraping-amazon-python/
    - https://github.com/sergioteula/python-amazon-paapi
    
2. Depending on which Amazon marketplace you are scraping, you need to use different headers. The following are three examples:

    - For Amazon Canada: you use:

    `headers = {
        'content-type': 'text/html;charset=UTF-8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'en-US,en;q=0.8',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    }`

    - For Amazon India, you use:

    `headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'}`

    - For Amazon UK, you use:
    
    `headers = {
    'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.8.0.8) Gecko/20061025 Firefox/1.5.0.8'}`
    
    
3. Here is a list of User-Agent strings for different browsers: https://www.useragentstring.com/pages/useragentstring.php
4. I will use fake-useragent (`pip3 install fake-useragent`) to generate a list of fake user agents.

# Fetch data from an individual web page using a list of fake User-Agent strings to disguise our scraper as a regular browser

In [2]:
class get_soup:
    """Fetch web pages as BeautifulSoup objects while rotating fake User-Agent headers.

    A pool of fake User-Agent strings is generated once, when the class is
    instantiated, using the third-party ``fake-useragent`` package
    (https://pypi.org/project/fake-useragent/).

    The request header currently in use is stored on the class (``get_soup.header``)
    so that a header that works keeps being reused across calls and instances.
    It is reset to ``None`` whenever it stops working, and a new one is then
    built from a randomly chosen User-Agent on the next attempt.
    """

    # Header shared by all instances; None means "pick a new one on the next request".
    header = None
    # Hard cap on the size of the User-Agent pool.
    MAX_USER_AGENTS = 4500
    # Stop growing the pool after this many consecutive draws that add nothing new,
    # so a small upstream pool cannot make __init__ spin forever.
    MAX_STALE_DRAWS = 5000
    # Seconds to wait after a failed request before trying again.
    RETRY_SLEEP_SECONDS = 5
    # Per-request timeout, in seconds, passed to requests.get.
    REQUEST_TIMEOUT_SECONDS = 10
    # Text found in an HTML comment of the page Amazon serves to detected robots.
    BLOCK_MARKER = "api-services-support@amazon.com"

    def __init__(self, total_user_agent = 1000):
        """Build the pool of fake User-Agent strings.

        Args:
            total_user_agent: Desired number of distinct User-Agent strings.
                The pool never exceeds ``MAX_USER_AGENTS`` and may end up
                smaller if the generator runs out of new strings.
        """
        ua = UserAgent(browsers=["chrome", "edge", "internet explorer", "firefox", "safari", "opera"])
        target_size = min(total_user_agent, self.MAX_USER_AGENTS)
        self.user_agent_set = set()
        stale_draws = 0
        while len(self.user_agent_set) < target_size and stale_draws < self.MAX_STALE_DRAWS:
            size_before = len(self.user_agent_set)
            self.user_agent_set.add(ua.random)
            if len(self.user_agent_set) > size_before:
                stale_draws = 0
            else:
                stale_draws += 1

    @staticmethod
    def _build_header(user_agent):
        """Return the request header dict for the given User-Agent string."""
        return {'content-type': 'text/html;charset=UTF-8',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'Accept-Language': 'en-US,en;q=0.8',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                "User-Agent": user_agent}

    @classmethod
    def _has_block_marker(cls, comments):
        """Return True if any of the given comment strings contains the block marker."""
        return any(cls.BLOCK_MARKER in comment for comment in comments)

    def _request_page(self, url, request_attempts):
        """Request ``url`` with the current header, retrying on transport errors.

        Returns the response of the first request that succeeds, or None when
        all ``request_attempts`` requests raised a requests exception.
        """
        for _ in range(request_attempts):
            try:
                return requests.get(url, headers=get_soup.header,
                                    timeout=self.REQUEST_TIMEOUT_SECONDS)
            except requests.exceptions.RequestException:
                # Only network/HTTP-layer failures are retried; KeyboardInterrupt
                # and programming errors propagate to the caller.
                print("Connection refused by the server..")
                print(f"Let me sleep for {self.RETRY_SLEEP_SECONDS} seconds")
                time.sleep(self.RETRY_SLEEP_SECONDS)
        return None

    def get_individual_soup(self, url, header_attempts = 10, request_attempts = 10):
        """Fetch ``url`` and return its parsed content.

        The same header is reused until it stops working. For each header, the
        page is requested up to ``request_attempts`` times when the request
        itself fails (timeout, connection error, ...). If every request fails,
        or the returned page is the robot-detection page, the header is
        discarded and a new one is tried, up to ``header_attempts`` headers.

        Note that in the worst case this performs
        ``header_attempts * request_attempts`` requests, sleeping
        ``RETRY_SLEEP_SECONDS`` after each failed one.

        Args:
            url: Address of the page to fetch.
            header_attempts: Maximum number of headers to try.
            request_attempts: Maximum number of requests per header.

        Returns:
            A BeautifulSoup object for the page, or the string
            ``'No Data Returned'`` if no attempt produced usable content.
        """
        soup = 'No Data Returned'
        for _ in range(header_attempts):
            # Keep using the current header while it works; pick a new one only
            # after the previous one was discarded.
            if get_soup.header is None:
                user_agent = random.choice(list(self.user_agent_set))
                get_soup.header = self._build_header(user_agent)

            page = self._request_page(url, request_attempts)
            if page is None:
                # The server never answered with this header; rotate it.
                get_soup.header = None
                print("Now I will use a different header to request data...")
                continue

            soup = bs(page.content, "lxml")
            # A blocked request comes back with an HTML comment that says
            # "To discuss automated access to Amazon data please contact
            # api-services-support@amazon.com".
            comments = soup.find_all(string=lambda text: isinstance(text, Comment))
            if not self._has_block_marker(comments):
                return soup

            # Caught as a bot: drop this header and try again with a new one.
            get_soup.header = None
            soup = 'No Data Returned'
        return soup

# Example: how to iterate through each results page to get the item links

In [3]:
# Collect the product links found on a range of search-result pages.
mySoup = get_soup()
# Every product link found on the visited pages ends up in this list.
item_link = []
# Alternative marketplaces:
# root_url = "https://www.amazon.ca/s?k=headphones&i=electronics&page="
# root_url = "https://www.amazon.in/s?k=headphones&page="
root_url = "https://www.amazon.co.uk/s?k=headphones&i=electronics&s=review-rank&page="

for page_number in range(200,205):
    print(f"You are on page {page_number}")
    page_url = root_url+str(page_number)
    home_soup = mySoup.get_individual_soup(page_url, header_attempts = 2, request_attempts = 1)

    # Nothing came back from the website: report it and move on to the next page.
    if home_soup == 'No Data Returned':
        print(f"No data returned. You are using `{mySoup.header}` to retrieve data")
        continue

    if mySoup.header is not None:
        print("You are using " + mySoup.header["User-Agent"] + " to retrieve data")

    product_anchors = home_soup.select("h2 a.a-link-normal.s-underline-text.s-underline-link-text.s-link-style")
    item_link.extend(anchor['href'] for anchor in product_anchors)

#with open ("./Dataset/partial items link CA6.txt", mode = "wt") as f:
  #  for link in item_link:
    #    f.write(link+"\n\n")

You are on page 200
You are using Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.7 (KHTML, like Gecko) Chrome/2.0.176.0 Safari/530.7 to retrieve data
You are on page 201
You are using Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.7 (KHTML, like Gecko) Chrome/2.0.176.0 Safari/530.7 to retrieve data
You are on page 202
You are using Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.7 (KHTML, like Gecko) Chrome/2.0.176.0 Safari/530.7 to retrieve data
You are on page 203
You are using Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.7 (KHTML, like Gecko) Chrome/2.0.176.0 Safari/530.7 to retrieve data
You are on page 204
You are using Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.7 (KHTML, like Gecko) Chrome/2.0.176.0 Safari/530.7 to retrieve data


# Generate a CSV of links to each of those items, the price, and the number of reviews (from Stu)

In [4]:
# Initiate a new soup object
mySoup = get_soup()

# Search keyword, defined once so that the request URL and the href filter
# below can never drift apart.
search_term = 'headphones'

# Raw product hrefs as found on the result pages (may contain repeats).
duplicates = []
# Result pages for which no content could be retrieved.
Skipped_pages = []
for page_index in range(2,10):
    soup = mySoup.get_individual_soup(f'https://www.amazon.co.uk/s?k={search_term}&page={page_index}',
                                      header_attempts = 2, request_attempts = 1)

    # If nothing is returned from the website, record the page and go to the next one
    if soup == 'No Data Returned':
        print(f"No data returned. You are using `{mySoup.header}` to retrieve data")
        Skipped_pages.append(page_index)
        continue

    for link in soup.find_all('a', href=True):
        href = link['href']
        # Keep links generated by this search, excluding the offer-listing
        # and customer-review anchors.
        if (f'keywords={search_term}' in href
                and 'offer-listing' not in href
                and '#customerReviews' not in href):
            duplicates.append(href)

# Strip the tracking suffix and the query string from each href.
duplicates = [href.split('/ref')[0] for href in duplicates]
duplicates = [href.split('?keywords')[0] for href in duplicates]

# Drop repeated links while keeping the order of first appearance.
linklist = list(dict.fromkeys(duplicates))

finalList = ['https://www.amazon.co.uk' + s for s in linklist]

In [5]:
# Price of each product in finalList ('' when it cannot be determined).
price = []
for product_url in finalList:
    soup = mySoup.get_individual_soup(product_url, header_attempts = 2, request_attempts = 1)
    # A failed fetch returns the sentinel string 'No Data Returned'; calling
    # .find(..., attrs=...) on a str raises TypeError, so treat it as "no price".
    if isinstance(soup, str):
        price_tag = None
    else:
        price_tag = soup.find('span', attrs = {'class' : 'a-price-whole'})

    if price_tag is None:
        price.append('')
    else:
        price.append(price_tag.text.strip(punctuation))

In [6]:
# Review-count text of each product in finalList ('' when it cannot be determined).
review = []
for product_url in finalList:
    soup = mySoup.get_individual_soup(product_url, header_attempts = 2, request_attempts = 1)
    # A failed fetch returns the sentinel string 'No Data Returned'; calling
    # .find(..., attrs=...) on a str raises TypeError, so treat it as "no reviews".
    if isinstance(soup, str):
        review_tag = None
    else:
        review_tag = soup.find('span',id ="acrCustomerReviewText", attrs = {'class' : 'a-size-base'})

    if review_tag is None:
        review.append('')
    else:
        review.append(review_tag.text.strip(punctuation))

finalList

['https://www.amazon.co.uk/Apple-EarPods-with-Lightning-Connector/dp/B01M1EEPOB',
 'https://www.amazon.co.uk/Sony-WH-1000XM3-Wireless-Cancelling-Headphones-Black/dp/B07GDR2LYK',
 'https://www.amazon.co.uk/Earphones-Blukar-Headphones-Sensitivity-Microphone-Silver/dp/B07QLWMDLC',
 'https://www.amazon.co.uk/JVC-HA-L50-B-E-Lightweight-Headphones-Black/dp/B000I2J4S4',
 'https://www.amazon.co.uk/Sony-MDR-ZX310AP-Foldable-Headphones-Smartphone-Metallic-Red/dp/B00I3LV3EU',
 'https://www.amazon.co.uk/EarFun-Wireless-Bluetooth-Detection-Headphones-Matte-Black/dp/B088H7GMHZ',
 'https://www.amazon.co.uk/Sony-MDR-EX15AP-Earphones-Smartphone-Control-Black/dp/B00I3LV1HE',
 'https://www.amazon.co.uk/Betron-AX5-Headphones-Microphone-Smartphones-Black-Gold/dp/B0786S43W4',
 'https://www.amazon.co.uk/JVC-Headphones-Earphones-Compatible-Samsung-Black/dp/B00ZAT03S0',
 'https://www.amazon.co.uk/Isolating-Headphones-Microphone-Lightweight-Earphones/dp/B083J88QRS',
 'https://www.amazon.co.uk/OneOdio-Bluetooth-

In [7]:
# Column names of the output CSV.
headers = ['URLs', 'Price', '#Ratings']

# Turn each product URL into the URL of its "all reviews" page.
# The suffix is added only when missing so that re-running this cell does not
# append it a second time.
review_suffix = '/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews'
finalList = [s if s.endswith(review_suffix) else s + review_suffix for s in finalList]
finalList = [s.replace("/dp/", "/product-reviews/") for s in finalList]

# Mode 'w' truncates the file, so it is always empty at this point and the
# header row is always written.
with open('links.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(headers)
    writer.writerows(zip(finalList, price, review))