In [1]:
import requests
import smtplib
import time
import datetime
from bs4 import BeautifulSoup
import csv
import pandas as pd
from selenium import webdriver
from msedge.selenium_tools import Edge, EdgeOptions

In [2]:
# Start a Firefox browser session driven by Selenium; later cells reuse `driver`.
driver = webdriver.Firefox()

In [3]:
# Open the Amazon home page in the Selenium-driven browser.
url = 'https://www.amazon.com'
driver.get(url)

In [4]:
#Generate a url from search term

def get_url(search_term):
    """Build the Amazon search URL for a free-text search term.

    Args:
        search_term: The query as a user would type it, e.g. ``'gaming chair'``.

    Returns:
        The search URL as a string, with the term URL-encoded into the ``k``
        query parameter (spaces become ``+``).
    """
    # Local import keeps this cell runnable on its own.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss'
    # quote_plus also escapes characters such as '&', '#' and '=' that would
    # otherwise be read as part of the URL structure instead of the query.
    return template.format(quote_plus(search_term))

In [5]:
url = get_url('gaming chair')
print(url)

https://www.amazon.com/s?k=gaming+chair&ref=nb_sb_noss


In [6]:
driver.get(url)

### Extract the collection

In [9]:
# Parse the HTML of the page currently loaded in the browser, using the 'html.parser' backend.
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [10]:
# Collect every <div> whose data-component-type attribute is 's-search-result' (one per listed product).
results = soup.find_all('div', {'data-component-type': 's-search-result'})

In [11]:
len(results)

60

### Prototype the record

In [12]:
item = results[0]

In [13]:
atag = item.h2.a

In [16]:
description = atag.text.strip()

In [17]:
url = 'https://www.amazon.com' + atag.get('href')

In [18]:
# First <span> with class 'a-price' inside the result; find() gives None when there is no such span.
price_parent = item.find('span','a-price')

In [19]:
price_parent.find('span','a-offscreen').text

'$215.24'

In [21]:
# Text of the first <i> tag inside the result.
# NOTE(review): assumes that first <i> is the star-rating icon — confirm against the page markup.
rating = item.i.text

In [30]:
# Text of the first <span> carrying the classes 'a-size-base s-underline-text'.
# NOTE(review): presumably this span holds the number of reviews — confirm against the page markup.
review_count = item.find('span', 'a-size-base s-underline-text').text

### Generalize the pattern

In [31]:
def extract_record(item):
    """Build a record tuple from a single search-result element.

    Returns ``(description, price, rating, review_count, url)``.

    No missing-element handling is done here: when a looked-up element (for
    example the 'a-price' span) is absent, the attribute access on ``None``
    raises ``AttributeError``.
    """
    # Title link: its text is the description, its href is appended to the site root.
    link = item.h2.a
    description = link.text.strip()
    url = 'https://www.amazon.com' + link.get('href')

    # The price text sits in the 'a-offscreen' span nested in the 'a-price' span.
    price = item.find('span', 'a-price').find('span', 'a-offscreen').text

    # Rating text comes from the first <i> tag, the review count from its own span.
    rating = item.i.text
    review_count = item.find('span', 'a-size-base s-underline-text').text

    return (description, price, rating, review_count, url)

In [33]:
# Run the extractor over every search result on the page.
# extract_record() has no handling for missing elements at this point, so a
# result without an 'a-price' span stops the loop with AttributeError.
records = []
results = soup.find_all('div', {'data-component-type': 's-search-result'})

for item in results:
    records.append(extract_record(item))

AttributeError: 'NoneType' object has no attribute 'find'

### Error handling

In [45]:
def extract_record(item):
    """Extract one product record from a search-result element.

    Args:
        item: A parsed search-result element (a BeautifulSoup tag in this
            notebook) exposing ``.h2``, ``.i`` and ``.find(name, class_)``.

    Returns:
        A ``(description, price, rating, review_count, url)`` tuple of
        strings, or ``None`` when the result has no title link or no price
        and should be skipped. ``rating`` and ``review_count`` each fall back
        to ``''`` independently when their own element is missing.
    """
    # description and url
    heading = item.h2
    atag = heading.a if heading is not None else None
    href = atag.get('href') if atag is not None else None
    if href is None:
        # Without a title link there is nothing useful to record.
        return None
    description = atag.text.strip()
    url = 'https://www.amazon.com' + href

    # price: results without one are skipped
    price_parent = item.find('span', 'a-price')
    price_tag = None
    if price_parent is not None:
        price_tag = price_parent.find('span', 'a-offscreen')
    if price_tag is None:
        return None
    price = price_tag.text

    # rating and review count are looked up separately so that a missing
    # review count does not throw away a rating that was found
    rating_tag = item.i
    rating = rating_tag.text if rating_tag is not None else ''
    count_tag = item.find('span', 'a-size-base s-underline-text')
    review_count = count_tag.text if count_tag is not None else ''

    return (description, price, rating, review_count, url)

In [46]:
# Re-collect the search results and keep only the items that produced a record
# (extract_record returns nothing for items it skips).
results = soup.find_all('div', attrs={'data-component-type': 's-search-result'})
records = []

for item in results:
    record = extract_record(item)
    if not record:
        continue
    records.append(record)

In [47]:
print(records[0])

('Amazon Basics Ergonomic Gaming Chair with Bluetooth Speakers and Built-in Mic, Push-Button Height Control - Grey', '$215.24', '4.3 out of 5 stars', '28', 'https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A10218423IUEO2KM08TEI&url=%2FAmazon-Basics-Ergonomic-Microphone-Push-Button%2Fdp%2FB08DF13CF7%2Fref%3Dsr_1_1_sspa%3Fkeywords%3Dgaming%2Bchair%26qid%3D1658100384%26sr%3D8-1-spons%26psc%3D1&qualifier=1658100384&id=2058959122136027&widgetName=sp_atf')


In [49]:
# Print the price field (index 1 of each record tuple) for every collected record.
for row in records:
    print(row[1])

$215.24
$164.99
$199.99
$105.00
$279.99
$280.00
$149.90
$399.99
$169.90
$199.99
$379.00
$326.92
$199.99
$148.64
$89.99
$180.35
$179.00
$152.91
$249.99
$164.98
$409.99
$99.99
$289.00
$149.99
$239.99
$229.99
$269.99
$343.98
$199.00
$179.99
$99.99
$209.99
$119.99
$239.99
$153.52
$239.31
$134.00
$128.99
$179.99
$239.99
$139.99
$149.99
$389.99
$232.99
$219.00
$81.20
$149.90
$129.99
$482.19
$344.70
$154.30
$92.99
$189.99
$222.34
$215.24


### Getting to next page

In [51]:
def get_url(search_term):
    """Build a paginated Amazon search URL template for a search term.

    Args:
        search_term: The query as a user would type it, e.g. ``'gaming chair'``.

    Returns:
        A string ending in ``&page={}``; call ``.format(page_number)`` on it
        to get the URL of a specific results page.
    """
    # Local import keeps this cell runnable on its own.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss'
    # quote_plus turns spaces into '+' and escapes '&', '#', '{', '}' etc.
    # Escaping the braces matters because the result is formatted a second
    # time by the caller to fill in the page number.
    url = template.format(quote_plus(search_term))
    url += '&page={}'
    return url

### Putting it all together

In [54]:
from bs4 import BeautifulSoup
import csv
from selenium import webdriver

def get_url(search_term):
    """Return the search URL template for ``search_term`` with a page slot.

    Args:
        search_term: Free-text query, e.g. ``'gaming chair'``.

    Returns:
        The search URL with the encoded term in the ``k`` parameter and a
        trailing ``&page={}`` placeholder to be filled with ``.format(page)``.
    """
    # Imported here so the function does not depend on another cell's imports.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss'
    # Encode the term: spaces become '+', and characters that would break the
    # query string ('&', '#') or the later .format(page) call ('{', '}') are
    # percent-escaped.
    encoded_term = quote_plus(search_term)

    url = template.format(encoded_term)
    url += '&page={}'
    return url

def extract_record(item):
    """Extract one product record from a search-result element.

    Args:
        item: A parsed search-result element (a BeautifulSoup tag in this
            notebook) exposing ``.h2``, ``.i`` and ``.find(name, class_)``.

    Returns:
        A ``(description, price, rating, review_count, url)`` tuple of
        strings, or ``None`` when the result has no title link or no price
        and should be skipped. ``rating`` and ``review_count`` each fall back
        to ``''`` independently when their own element is missing.
    """
    # description and url
    heading = item.h2
    atag = heading.a if heading is not None else None
    href = atag.get('href') if atag is not None else None
    if href is None:
        # Without a title link there is nothing useful to record.
        return None
    description = atag.text.strip()
    url = 'https://www.amazon.com' + href

    # price: results without one are skipped
    price_parent = item.find('span', 'a-price')
    price_tag = None
    if price_parent is not None:
        price_tag = price_parent.find('span', 'a-offscreen')
    if price_tag is None:
        return None
    price = price_tag.text

    # rating and review count are looked up separately so that a missing
    # review count does not throw away a rating that was found
    rating_tag = item.i
    rating = rating_tag.text if rating_tag is not None else ''
    count_tag = item.find('span', 'a-size-base s-underline-text')
    review_count = count_tag.text if count_tag is not None else ''

    return (description, price, rating, review_count, url)

def main(search_term, max_pages=20, output_path='results.csv'):
    """Scrape Amazon search results for ``search_term`` and write them to CSV.

    Visits result pages 1..``max_pages`` in a Firefox session, extracts one
    record per search result via ``extract_record`` (results it skips are
    dropped), then writes a header row plus all records to ``output_path``.

    Args:
        search_term: Free-text query passed to ``get_url``.
        max_pages: Number of result pages to visit, starting at page 1.
        output_path: Path of the CSV file to write (opened in 'w' mode, so an
            existing file is overwritten).
    """
    # Local accumulator: never rely on a `records` list left over in the
    # kernel from an earlier cell.
    records = []
    url = get_url(search_term)

    driver = webdriver.Firefox()
    try:
        for page in range(1, max_pages + 1):
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type': 's-search-result'})

            for item in results:
                record = extract_record(item)
                if record:
                    records.append(record)
    finally:
        # Always release the browser, even when a page load or parse fails.
        # quit() ends the whole WebDriver session rather than just closing
        # the current window.
        driver.quit()

    with open(output_path, 'w', newline='', encoding='UTF8') as f:
        writer = csv.writer(f)
        writer.writerow(['Description', 'Price', 'Rating', 'Review_count', 'Url'])
        writer.writerows(records)

In [55]:
main('Gaming Chair')