# Amazon Web Scraper 

In [1]:
import csv
from bs4 import BeautifulSoup

In [None]:
# for firefox and chrome

from selenium import webdriver

In [2]:
# for edge

from msedge.selenium_tools import Edge, EdgeOptions

## Startup the webdriver 

In [None]:
# for firefox and chrome

driver = webdriver.Firefox()
driver = webdriver.chrome()

In [7]:
#edge

options = EdgeOptions()
options.use_chromium = True
driver = Edge(options=options)

In [8]:
url = 'https://www.amazon.com'
driver.get(url)

In [9]:
def get_url(search_term):
    """Generate a url from search term"""
    template = 'https://www.amazon.com/s?k={}&crid=A2M1F9AUHSO7&sprefix=ultrawide%2Caps%2C389&ref=nb_sb_ss_ts-doa-p_1_9'
    search_term = search_term.replace(' ','+')
    return template.format(search_term)

In [11]:
url = get_url('ultrawide monitor')
print(url)

https://www.amazon.com/s?k=ultrawide+monitor&crid=A2M1F9AUHSO7&sprefix=ultrawide%2Caps%2C389&ref=nb_sb_ss_ts-doa-p_1_9


In [12]:
driver.get(url)

##  Extract the collection

In [13]:
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [14]:
results = soup.find_all('div',{'data-component-type':'s-search-result'})

In [15]:
len(results)

22

### Prototype the record

In [16]:
item = results[0]

In [17]:
atag = item.h2.a

In [37]:
description = atag.text.strip()

In [21]:
url = 'https://www.amazone.com' + atag.get('href')

In [22]:
price_parent = item.find('span', 'a-price')

In [36]:
price = price_parent.find('span','a-offscreen').text

In [27]:
rating = item.i.text

In [35]:
review_count = item.find('span', {'class': 'a-size-base'}).text

### Genralize the pattern

In [38]:
def extract_record(item):
    """Extract and return data from a single record"""
    
    
    #description and url
    atag = item.h2.a
    description = atag.text.strip()
    url = 'https://www.amazone.com' + atag.get('href')
    
    # price 
    price_parent = item.find('span', 'a-price')
    price = price_parent.find('span','a-offscreen').text
    
    #rating
    rating = item.i.text
    review_count = item.find('span', {'class': 'a-size-base'}).text
    
    result = (description, price, rating, review_count, url)
    
    return result

In [39]:
record = []
results = soup.find_all('div',{'data-component-type':'s-search-result'})

for item in results:
    record.append(extract_record(item))

AttributeError: 'NoneType' object has no attribute 'find'


###  Error handling

In [43]:
def extract_record(item):
    """Extract and return data from a single record"""
    
    
    #description and url
    atag = item.h2.a
    description = atag.text.strip()
    url = 'https://www.amazone.com' + atag.get('href')
    
    try:
        # price 
        price_parent = item.find('span', 'a-price')
        price = price_parent.find('span','a-offscreen').text
    except AttributeError:
        return
    
    try:
        # rank and rating
        rating = item.i.text
        review_count = item.find('span', {'class': 'a-size-base'}).text
    except AttributeError:
        rating = ''
        review_count = ''
    
    result = (description, price, rating, review_count, url)
    
    return result

In [45]:
records = []
results = soup.find_all('div',{'data-component-type':'s-search-result'})

for item in results:
    record = extract_record(item)
    if record:
        records.append(record)

In [46]:
records[0]

('Z-Edge U29IA 29" Ultrawide Gaming Monitor 2560x1080 WFHD 21:9 Aspect Ratio 100Hz Refresh Rate 4ms MPRT IPS Monitor, HDMIx2+DP with Wireless Keyword and Mouse (Black)',
 '$199.99',
 '4.3 out of 5 stars',
 '43',
 'https://www.amazone.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A05266702RRBEX3RRYY6&url=%2FZ-Edge-Ultrawide-Gaming-Monitor-2560x1080%2Fdp%2FB08PZBCV9F%2Fref%3Dsr_1_1_sspa%3Fcrid%3DA2M1F9AUHSO7%26dchild%3D1%26keywords%3Dultrawide%2Bmonitor%26qid%3D1625647887%26sprefix%3Dultrawide%252Caps%252C389%26sr%3D8-1-spons%26psc%3D1&qualifier=1625647887&id=8379526897264068&widgetName=sp_atf')

In [47]:
for row in records:
    print(row[1])

$199.99
$549.99
$196.99
$430.61
$549.99
$593.51
$196.99
$409.99
$239.99
$372.00
$799.99
$229.00
$199.99
$319.99
$429.99
$430.61
$289.99


###  Getting to the next page

In [48]:
def get_url(search_term):
    """Generate a url from search term"""
    template = 'https://www.amazon.com/s?k={}&crid=A2M1F9AUHSO7&sprefix=ultrawide%2Caps%2C389&ref=nb_sb_ss_ts-doa-p_1_9'
    search_term = search_term.replace(' ','+')
    
    # add term query to url
    url = template.format(search_term)
    
    # add page quey placeholder
    url += '&page{}'
    
    return url

#### putting it all together 

In [59]:
import csv
from bs4 import BeautifulSoup
from msedge.selenium_tools import Edge, EdgeOptions

def get_url(search_term):
    """Generate a url from search term"""
    template = 'https://www.amazon.com/s?k={}&crid=A2M1F9AUHSO7&sprefix=ultrawide%2Caps%2C389&ref=nb_sb_ss_ts-doa-p_1_9'
    search_term = search_term.replace(' ','+')
    
    # add term query to url
    url = template.format(search_term)
    
    # add page quey placeholder
    url += '&page{}'
    
    return url

def extract_record(item):
    """Extract and return data from a single record"""
    
    
    #description and url
    atag = item.h2.a
    description = atag.text.strip()
    url = 'https://www.amazone.com' + atag.get('href')
    
    try:
        # price 
        price_parent = item.find('span', 'a-price')
        price = price_parent.find('span','a-offscreen').text
    except AttributeError:
        return
    
    try:
        # rank and rating
        rating = item.i.text
        review_count = item.find('span', {'class': 'a-size-base'}).text
    except AttributeError:
        rating = ''
        review_count = ''
    
    result = (description, price, rating, review_count, url)
    
    return result

def main(search_term):
    """Run main program routine"""
    # startup the webdriver
    options = EdgeOptions()
    options.use_chromium = True
    driver = Edge(options=options)
    
    record = []
    url = get_url(search_term)
    
    for page in range(1, 21):
        driver.get(url.format(page))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results = soup.find_all('div',{'data-component-type':'s-search-result'})
        
        for item in results:
            record = extract_record(item)
            if record:
                records.append(record)
        
    driver.close()
    
    # save the data to csv file
    with open('result.csv','w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Description', 'Price', 'Rating', 'ReviewCount', 'Url'])
        writer.writerows(records)

In [60]:
main('ultrawide monitor')