In [1]:
import csv

In [2]:
from bs4 import BeautifulSoup

In [3]:
from selenium import webdriver

In [4]:
# Start a Chrome browser session driven by Selenium.
driver = webdriver.Chrome()

In [5]:
# Site landing page, used for the first navigation.
url = 'https://www.amazon.com'

In [6]:
# Load `url` in the Selenium-controlled browser.
driver.get(url)

In [7]:
def get_url(search_text):
    """Build the Amazon search-results URL for a search phrase.

    Args:
        search_text: The phrase to search for, e.g. ``'ultrawide monitor'``.

    Returns:
        The search URL as a string, with the phrase URL-encoded into the
        ``k`` query parameter.
    """
    # Imported locally so this cell does not depend on a separate import cell.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss_1'
    # quote_plus turns spaces into '+' (same as before) and also
    # percent-encodes characters such as '&', '#' or '+' that would
    # otherwise be read as query-string syntax and corrupt the search term.
    search_term = quote_plus(search_text)

    return template.format(search_term)
    

In [8]:
# Build the search URL for a sample query and print it to check the format.
# Note: this rebinds the global `url` that previously held the home page.
url = get_url('ultrawide monitor')
print(url)

https://www.amazon.com/s?k=ultrawide+monitor&ref=nb_sb_noss_1


In [9]:
# Load the search-results URL in the browser.
driver.get(url)

## Extract the collection

In [10]:
# Parse the HTML of the page currently loaded in the browser,
# using Python's built-in 'html.parser' backend.
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [11]:
# Collect every <div> whose data-component-type attribute is 's-search-result'.
results = soup.find_all('div', {'data-component-type' : 's-search-result'})

In [12]:
# Number of result elements found on this page.
len(results)

22

## Prototype the record

In [13]:
# Take the first result as a sample to prototype the extraction on.
item = results[0]

In [14]:
# The first <a> inside the result's first <h2>.
atag = item.h2.a

In [15]:
# Link text with surrounding whitespace removed.
description = atag.text.strip()

In [16]:
# Prefix the link's href with the site domain.
# Note: this rebinds the global `url` used earlier for navigation.
url = 'https://www.amazon.com' + atag.get('href')

In [17]:
# First <span> with CSS class 'a-price'; find() returns None when there is no match.
price_parent = item.find('span', 'a-price')

In [18]:
# Text of the 'a-offscreen' span nested inside the price span.
price = price_parent.find('span', 'a-offscreen').text

In [19]:
# Text of the first <i> tag in the result, used here as the rating.
rating = item.i.text

In [20]:
# Text of the <span> whose class attribute is 'a-size-base s-underline-text'.
review_count = item.find('span', {'class': 'a-size-base s-underline-text'}).text

## Generalise the pattern


In [22]:
def extract_record(item):
    """Extract the fields of interest from a single search-result element.

    Args:
        item: One parsed search-result element.

    Returns:
        A tuple ``(description, price, rating, review_count, url)``.

    Raises:
        AttributeError: If any of the looked-up elements is missing; this
            version performs no error handling.
    """
    # Title link: its text is the description, its href is prefixed with the domain.
    link = item.h2.a
    title = link.text.strip()
    product_url = 'https://www.amazon.com' + link.get('href')

    # Price text lives in an 'a-offscreen' span nested under the 'a-price' span.
    price_text = item.find('span', 'a-price').find('span', 'a-offscreen').text

    # Rating and review count.
    stars = item.i.text
    reviews = item.find('span', {'class': 'a-size-base s-underline-text'}).text

    return (title, price_text, stars, reviews, product_url)

In [23]:
# Run extract_record over every search result on the page.
# NOTE: extract_record as defined at this point has no handling for missing
# elements, so this loop stops with the AttributeError shown in the output.
records = []
results = soup.find_all('div', {'data-component-type': 's-search-result'})

for item in results:
    records.append(extract_record(item))

AttributeError: 'NoneType' object has no attribute 'find'

## Error Handling

In [24]:
def extract_record(item):
    """Extract the fields of interest from a single search-result element.

    Args:
        item: One parsed search-result element.

    Returns:
        A tuple ``(description, price, rating, review_count, url)``, or
        ``None`` when the result has no title link or no price and should
        be skipped. ``rating`` and ``review_count`` are each ``''`` when the
        corresponding element is missing.
    """
    # Description and url: a result without a usable title link cannot be
    # recorded, so skip it instead of letting one odd result abort the loop.
    heading = item.h2
    atag = heading.a if heading is not None else None
    if atag is None:
        return None
    href = atag.get('href')
    if not href:
        return None
    description = atag.text.strip()
    url = 'https://www.amazon.com' + href

    # Price: results without a price are skipped.
    price_parent = item.find('span', 'a-price')
    price_tag = price_parent.find('span', 'a-offscreen') if price_parent is not None else None
    if price_tag is None:
        return None
    price = price_tag.text

    # Rating and review count are optional and looked up independently, so
    # a missing review count no longer discards a rating that is present.
    rating_tag = item.i
    rating = rating_tag.text if rating_tag is not None else ''
    review_tag = item.find('span', {'class': 'a-size-base s-underline-text'})
    review_count = review_tag.text if review_tag is not None else ''

    return (description, price, rating, review_count, url)

In [25]:
# Rebuild the record list from the parsed page, keeping only results for
# which extract_record returned a record (it returns None for skipped ones).
records = []
results = soup.find_all('div', {'data-component-type': 's-search-result'})

for item in results:
    record = extract_record(item)
    if record:
        records.append(record)

In [26]:
# Inspect the first extracted record.
records[0]

('Z-Edge UG27 27-inch Curved Gaming Monitor 16:9 1920x1080 200/144Hz 1ms Frameless LED Gaming Monitor, AMD Freesync Premium Display Port HDMI Build-in Speakers',
 '$199.99',
 '4.5 out of 5 stars',
 '265',
 'https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A04532402EOL1VJG532E0&url=%2FZ-Edge-27-inch-Curved-Gaming-Monitor%2Fdp%2FB08L3BHN3P%2Fref%3Dsr_1_1_sspa%3Fkeywords%3Dultrawide%2Bmonitor%26qid%3D1658855776%26sr%3D8-1-spons%26psc%3D1&qualifier=1658855776&id=8453180501261447&widgetName=sp_atf')

In [27]:
# Print the price of each record (index 1 of the record tuple).
for row in records:
    print(row[1])

$199.99
$242.97
$474.99
$379.99
$429.97
$539.99
$679.99
$346.99
$239.99
$196.99
$179.00
$546.03
$239.99
$299.99
$329.99
$132.59


## Getting the next page

In [28]:
def get_url(search_text):
    """Build a paginated Amazon search URL template for a search phrase.

    Args:
        search_text: The phrase to search for, e.g. ``'ultrawide monitor'``.

    Returns:
        A URL string with the phrase URL-encoded into the ``k`` query
        parameter and a trailing ``&page={}`` placeholder. Call
        ``.format(page_number)`` on the result to get the URL of one page.
    """
    # Imported locally so this cell does not depend on a separate import cell.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss_1'
    # quote_plus turns spaces into '+' (same as before) and percent-encodes
    # everything else unsafe, including '&' and literal braces; the latter
    # would otherwise break the caller's later .format(page) call.
    search_term = quote_plus(search_text)

    # add term query to the url
    url = template.format(search_term)

    # add page query; the '=' is required for a valid 'page=<n>' parameter
    url += '&page={}'

    return url
    

## Putting it all together