In [1]:
import csv
from urllib.parse import quote_plus

import pandas as pd
import requests
import selenium
from bs4 import BeautifulSoup
from selenium import webdriver

In [2]:
def get_url(search_term):
    """Build the Amazon search-results URL for a search term.

    Parameters
    ----------
    search_term : str
        Free-text query. It is URL-encoded before being placed in the ``k``
        query parameter, so spaces and reserved characters such as ``&`` or
        ``#`` cannot break the query string.

    Returns
    -------
    str
        The search URL for the first results page.
    """
    # str() keeps non-string terms (e.g. a numeric model number) working,
    # as they did when the value was interpolated directly.
    encoded_term = quote_plus(str(search_term))
    # The remaining query parameters are kept exactly as originally written.
    return f'https://www.amazon.com/s?k={encoded_term}&crid=2LAC2KLUYYL46&sprefix=laptop%2Caps%2C257&ref=nb_sb_noss_2'

In [3]:
def extract_sku_data(item):
    """Extract description, price, rating and product link from one result tag.

    Parameters
    ----------
    item : object with a BeautifulSoup-style ``find(name, class_=...)`` method
        A single search-result container.

    Returns
    -------
    dict
        Keys ``description``, ``price``, ``rating`` and ``sku_link``. Any field
        whose element (or ``href`` attribute) cannot be found is set to the
        string ``'empty'``.
    """
    # Errors that mean "the element or attribute is not there": ``find``
    # returned None (AttributeError on ``.text``, TypeError on ``['href']``)
    # or the tag has no ``href`` (KeyError). Anything else, including
    # KeyboardInterrupt, is allowed to propagate instead of being swallowed.
    missing_errors = (AttributeError, KeyError, TypeError)

    def text_of(tag_name, css_class):
        # Text of the first matching tag, or the 'empty' placeholder.
        try:
            return item.find(tag_name, class_=css_class).text
        except missing_errors:
            return 'empty'

    description = text_of('span', 'a-size-medium a-color-base a-text-normal')
    price = text_of('span', 'a-offscreen')
    # NOTE(review): the empty class filter is a very loose selector; the
    # numeric rating is later pulled out of this text with a regex. Confirm
    # it targets the intended span.
    rating = text_of('span', '')

    try:
        sku_link = item.find(
            "a",
            class_="a-size-base a-link-normal s-no-hover s-underline-text s-underline-link-text s-link-style a-text-normal",
        )['href']
    except missing_errors:
        sku_link = 'empty'

    return {
        "description": description,
        "price": price,
        "rating": rating,
        "sku_link": sku_link,
    }


def scrape_page(driver, url):
    """Load one search-results page and extract its listings.

    Parameters
    ----------
    driver : object with ``get(url)`` and a ``page_source`` attribute
        Browser driver used to fetch and render the page.
    url : str
        Address of the results page to load.

    Returns
    -------
    tuple
        ``(next_page, frame)`` where ``next_page`` is the absolute URL of the
        following results page, or the sentinel string ``"last-page"`` when no
        usable next link is found, and ``frame`` is a ``pandas.DataFrame``
        with one row per extracted listing.
    """
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Result containers are the divs carrying both of these attributes.
    items = soup.find_all('div', {'data-asin': True, 'data-component-type': True})
    data = [extract_sku_data(item) for item in items]

    # Locate the "next" pagination control.
    next_link = soup.find(class_="s-pagination-item s-pagination-next s-pagination-button s-pagination-separator")

    # Use .get so a matched node without an href ends pagination cleanly
    # instead of raising KeyError.
    href = next_link.get('href') if next_link is not None else None
    next_page = "https://www.amazon.com" + href if href else "last-page"

    return next_page, pd.DataFrame(data)

In [4]:
# Function to clean up rating column
def clean_rating(rating):
    """Coerce a raw rating to a float within the 0-5 range.

    Parameters
    ----------
    rating : object
        Value to convert; anything ``float()`` accepts is considered.

    Returns
    -------
    float or None
        The rating as a float when it lies in ``[0, 5]``; ``None`` when the
        value is missing, not numeric, NaN, or out of range.
    """
    try:
        rating_value = float(rating)
    except (TypeError, ValueError):
        # TypeError covers None and other non-convertible objects;
        # ValueError covers non-numeric strings.
        return None
    # NaN fails both comparisons, so it is rejected here as well.
    return rating_value if 0 <= rating_value <= 5 else None

In [5]:
# Start a Firefox session through Selenium; this driver is later passed to scrape_page.
driver = webdriver.Firefox()

# Search keyword; get_url embeds it in the search URL used as the crawl's starting point.
search_term = 'laptop'
url = get_url(search_term)
url

'https://www.amazon.com/s?k=laptop&crid=2LAC2KLUYYL46&sprefix=laptop%2Caps%2C257&ref=nb_sb_noss_2'

In [6]:
# Collect one DataFrame per results page, following the "next" link until
# scrape_page returns the "last-page" sentinel.
data = []
try:
    while url != "last-page":
        print(url)
        url, page_data = scrape_page(driver, url)
        data.append(page_data)
finally:
    # Always release the browser, even if a page fails to load or parse, so
    # no Firefox process is left running. Re-run the driver/URL setup cell
    # before running this cell again.
    driver.quit()

https://www.amazon.com/s?k=laptop&crid=2LAC2KLUYYL46&sprefix=laptop%2Caps%2C257&ref=nb_sb_noss_2
https://www.amazon.com/s?k=laptop&page=2&crid=2LAC2KLUYYL46&qid=1692980273&sprefix=laptop%2Caps%2C257&ref=sr_pg_1
https://www.amazon.com/s?k=laptop&page=3&crid=2LAC2KLUYYL46&qid=1692980276&sprefix=laptop%2Caps%2C257&ref=sr_pg_2
https://www.amazon.com/s?k=laptop&page=4&crid=2LAC2KLUYYL46&qid=1692980279&sprefix=laptop%2Caps%2C257&ref=sr_pg_3
https://www.amazon.com/s?k=laptop&page=5&crid=2LAC2KLUYYL46&qid=1692980283&sprefix=laptop%2Caps%2C257&ref=sr_pg_4
https://www.amazon.com/s?k=laptop&page=6&crid=2LAC2KLUYYL46&qid=1692980285&sprefix=laptop%2Caps%2C257&ref=sr_pg_5
https://www.amazon.com/s?k=laptop&page=7&crid=2LAC2KLUYYL46&qid=1692980288&sprefix=laptop%2Caps%2C257&ref=sr_pg_6
https://www.amazon.com/s?k=laptop&page=8&crid=2LAC2KLUYYL46&qid=1692980290&sprefix=laptop%2Caps%2C257&ref=sr_pg_7
https://www.amazon.com/s?k=laptop&page=9&crid=2LAC2KLUYYL46&qid=1692980293&sprefix=laptop%2Caps%2C257&ref

In [7]:
# Stack the per-page frames. ignore_index gives every row a unique label
# instead of repeating each page's own 0..n-1 index.
df = pd.concat(data, ignore_index=True)
# Keep only the first "digits.digit" number from the raw rating text.
# expand=False returns a Series rather than a one-column DataFrame.
df['rating'] = df['rating'].str.extract(r'(\d+\.\d)', expand=False)

In [8]:
# Convert the extracted rating strings into a numeric column.
df['rating'] = df.rating.pipe(pd.to_numeric)

In [9]:
# Run every rating through clean_rating, which nulls out values that are
# not numeric or fall outside the 0-5 range.
df['rating'] = df.rating.map(clean_rating)

In [12]:
# Preview the first rows of the cleaned listings table.
df.head()

Unnamed: 0,description,price,rating,sku_link
0,2023 Newest Upgraded IdeaPad 1i Laptops for St...,$329.99,,/sspa/click?ie=UTF8&spc=MTo0MjYwODg4ODE0ODAwNj...
1,"Acer Aspire 1 A115-32-C96U Slim Laptop | 15.6""...",$229.99,,/sspa/click?ie=UTF8&spc=MTo0MjYwODg4ODE0ODAwNj...
2,Lenovo 2023 High Performance 15'' FHD IPS Lapt...,$269.99,4.3,/Lenovo-Performance-15-Laptop-Super-Fast/dp/B0...
3,"Acer Aspire 5 A515-56-347N Slim Laptop - 15.6""...",$299.99,4.3,/Acer-Aspire-A515-56-347N-Slim-Laptop/dp/B0BL8...
4,Lenovo IdeaPad 3 – (2023) - Everyday Notebook ...,$280.98,4.5,/Lenovo-IdeaPad-Everyday-Notebook-i3-1115G/dp/...


In [11]:
# Write the table to amazon_laptop.csv; index=False leaves the row index out of the file.
df.to_csv('amazon_laptop.csv', index=False)