# Load library

In [9]:
# Standard library
import csv
import os
import random # randomly use agent header for sending request
import re # regular expression
import time # pause between retries after a failed request
from string import punctuation

# Third-party
import numpy as np # fast data manipulation
import pandas as pd # output dataframe
import requests # send request to website
from bs4 import BeautifulSoup as bs # convert the web content to bs object
from bs4 import Comment # search if we are caught by Amazon as a robot
from fake_useragent import UserAgent #create fake user agent from different browser

print(requests.__version__)


2.27.1


# How to create headers for request
1. Some Tutorials I used:
    - https://www.crummy.com/software/BeautifulSoup/bs4/doc/#comments-and-other-special-strings
    - https://www.blog.datahut.co/post/web-scraping-best-practices-tips
    - https://stackoverflow.com/questions/63305902/why-cant-i-scrape-amazon-products-by-beautifulsoup
    - https://www.digitalocean.com/community/tutorials/scrape-amazon-product-information-beautiful-soup
    - https://stackoverflow.com/questions/63615686/how-to-scrape-data-from-amazon-canada
    - https://stackoverflow.com/questions/33138937/how-to-find-all-comments-with-beautiful-soup
    - https://pypi.org/project/fake-useragent/
    - https://github.com/jhnwr/scrape-amazon-reviews/blob/main/review-scraper.py
    - https://www.fullstaxx.com/2021/05/23/multipage-scraping-amazon-python/
    - https://github.com/sergioteula/python-amazon-paapi
    
2. Depending on which Amazon marketplace you are scraping, you need to use different headers. The following are just three examples:

    - For Amazon Canada: you use:

    `headers = {
        'content-type': 'text/html;charset=UTF-8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'en-US,en;q=0.8',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    }`

    - For Amazon India, you use:

    `headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'}`

    - For Amazon UK, you use:
    
    `headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'}`
3. Here is a list of User-Agent strings for different browsers: https://www.useragentstring.com/pages/useragentstring.php
4. I will use fake-useragent (`pip3 install fake-useragent`) to generate a list of fake user agents.

# Create a list of fake User-Agent strings to disguise our requests

In [10]:
# fake-useragent is a third-party package that provides an abstraction layer
# over user agents: https://pypi.org/project/fake-useragent/
# The package description mentions an external data source
# (use_external_data=True) as an alternative to its local data; the local
# data is what is used here.
ua = UserAgent(browsers=["chrome", "edge", "internet explorer", "firefox", "safari", "opera"])

# Draw many random user-agent strings; the set keeps only the unique ones.
user_agent_set = set()
for _ in range(100000):
    user_agent_set.add(ua.random)

# Header fields shared by every request; only the User-Agent varies.
header = {
    'content-type': 'text/html;charset=UTF-8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
}

# Build ONE NEW dict per user agent. Appending the same `header` object on
# every iteration would make all list entries alias a single dict, so every
# entry would end up carrying whichever User-Agent was assigned last.
# sorted() gives the list a stable order for a given set of user agents.
header_list = [
    {**header, "User-Agent": user_agent}
    for user_agent in sorted(user_agent_set)
]

#Total unique fake agent string
print(len(header_list))

4688


# Fetch data from website and create bs object

In [11]:
#Define a function to request data from the website
import numpy as np
# Module-level list; none of the visible cells read from or append to it.
# NOTE(review): looks like a leftover from a review-scraping step — confirm before removing.
reviewlist = []
def get_soup(url, header_list, soup_attempts, request_attempts):
    """Download ``url`` and return it parsed as a BeautifulSoup object.

    The function runs up to ``soup_attempts`` rounds. Each round picks a random
    header from ``header_list`` and tries to download the page. A round is
    abandoned, and the next one starts with a freshly chosen header, when

    * the download keeps failing with a ``requests`` error, or
    * the page contains an HTML comment mentioning
      ``api-services-support@amazon.com``, which this notebook treats as the
      sign that the request was denied as automated access.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    header_list : sequence of dict
        Candidate request headers. Must be non-empty when ``soup_attempts`` is
        positive, because ``random.choice`` raises ``IndexError`` on an empty
        sequence.
    soup_attempts : int
        Maximum number of rounds (one header per round).
    request_attempts : int
        Number of retries allowed after the first failed request in a round,
        i.e. a round sends at most ``request_attempts + 1`` requests.

    Returns
    -------
    bs4.BeautifulSoup or str
        The parsed page, or the sentinel string ``'No Data Returned'`` when no
        round produced a usable page. Callers must check for the sentinel
        before calling BeautifulSoup methods on the result.
    """
    no_data = 'No Data Returned'

    for _ in range(soup_attempts):
        # Change the header on every round.
        header = random.choice(header_list)

        page = None
        for _ in range(request_attempts + 1):
            try:
                page = requests.get(url, headers=header, timeout=10)
                break
            except requests.exceptions.RequestException:
                # Only network-level failures are retried; anything else
                # (including KeyboardInterrupt) propagates to the caller.
                print("Connection refused by the server..")
                print("Let me sleep for 5 seconds")
                time.sleep(5)
                print("Was a nice sleep, now let me continue...")

        if page is None:
            # Every request in this round failed; try again with a new header.
            continue

        soup = bs(page.content, "lxml")

        # A denied page carries a comment telling us to contact
        # api-services-support@amazon.com about automated access.
        comments = soup.find_all(string=lambda text: isinstance(text, Comment))
        if any("api-services-support@amazon.com" in comment for comment in comments):
            print('denied')
            continue

        return soup

    return no_data

#Define a function to get the price of a product
def get_amazon_price(soup):
    """Return the whole-number part of the product price found in ``soup``.

    The price is read from the first ``<span class="a-price-whole">`` element,
    the same element the price-collection cell of this notebook looks up.
    Every character that is not an ASCII digit (thousands separators, the
    trailing decimal point, whitespace) is discarded before conversion.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed product page.

    Returns
    -------
    int or None
        The price as an integer, or ``None`` when no price element with digits
        is present or ``soup`` is not a parsed page.
    """
    try:
        # find() matches by tag name + attributes. The previous
        # find_all('span.a-price') searched for a tag literally named
        # "span.a-price" and int() of its result always raised.
        price_tag = soup.find('span', attrs={'class': 'a-price-whole'})
        digits = '' if price_tag is None else re.sub(r'[^0-9]', '', price_tag.text)
        if not digits:
            print('didnt work')
            return None
        price = int(digits)
        print(price)
        return price
    except (AttributeError, TypeError, ValueError):
        # e.g. soup is the 'No Data Returned' sentinel string rather than a page
        print('didnt work')
        return None
 
# Collect product links from the search result pages.
duplicates = []
for x in range(2,3):
    soup = get_soup(f'https://www.amazon.co.uk/s?k=heaphones&page={x}',
                    header_list, soup_attempts = 20, request_attempts = 50)

    # get_soup returns a plain string when every attempt failed; a string has
    # no find_all(), so skip the page instead of crashing.
    if isinstance(soup, str):
        print(f'No data returned for search page {x}, skipping it')
        continue

    for link in soup.find_all('a', href=True):
        href = link['href']
        # Keep product links for this search only; drop offer listings and
        # links that jump to the review section.
        if ('keywords=heaphones' in href
                and 'offer-listing' not in href
                and '#customerReviews' not in href):
            duplicates.append(href)

# Cut off the tracking suffix so the same product always yields the same path.
duplicates = [href.split('/ref')[0] for href in duplicates]

# dict.fromkeys removes duplicates while keeping first-seen order.
linklist = list(dict.fromkeys(duplicates))

finalList = ['https://www.amazon.co.uk' + s for s in linklist]


In [12]:
# Fetch the price of every product page. Links without a price are dropped.
# The surviving links are collected in a new list: removing items from
# finalList while iterating over it would skip the element that follows each
# removed one, leaving links in finalList that have no entry in `price`.
price = []
links_with_price = []
for x in finalList:
    soup = get_soup(x,header_list, soup_attempts = 20, request_attempts = 50)
    # get_soup returns a plain string when the page could not be fetched.
    spans = None if isinstance(soup, str) else soup.find('span', attrs = {'class' : 'a-price-whole'})
    if spans is None:
        continue
    links_with_price.append(x)
    price.append(spans.text.strip(punctuation))

# price[i] now belongs to finalList[i].
finalList = links_with_price

In [13]:
# Fetch the rating count of every product page. Links without a rating count
# are dropped together with their price so that finalList, price and review
# stay aligned index by index.
# Assumes price[i] belongs to finalList[i] on entry; zip stops at the shorter
# of the two if they differ in length.
review = []
links_with_review = []
prices_with_review = []
for x, item_price in zip(finalList, price):
    soup = get_soup(x,header_list, soup_attempts = 20, request_attempts = 50)
    # get_soup returns a plain string when the page could not be fetched.
    spans = None if isinstance(soup, str) else soup.find(
        'span', id ="acrCustomerReviewText", attrs = {'class' : 'a-size-base'})
    if spans is None:
        continue
    links_with_review.append(x)
    prices_with_review.append(item_price)
    review.append(spans.text.strip(punctuation))

# Rebind instead of calling finalList.remove() inside the loop: removing while
# iterating skips elements and left stale entries in `price`.
finalList = links_with_review
price = prices_with_review

In [14]:
headers = ['URLs', 'Price', '#Ratings']

# zip() stops at the shortest input, so unequal lengths would silently drop
# rows or pair values with the wrong link. Warn instead of hiding it.
if not (len(finalList) == len(price) == len(review)):
    print(f'Warning: column lengths differ '
          f'(URLs={len(finalList)}, Price={len(price)}, #Ratings={len(review)}); '
          f'rows may be misaligned or truncated')

# Mode 'w' truncates links.csv on open, so the file is always empty at this
# point and the header row is always written (no os.stat check needed).
# newline='' is what the csv module expects; utf-8 keeps the output
# independent of the platform default encoding.
with open('links.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(headers)
    writer.writerows(zip(finalList, price, review))
