In [1]:
from requests_html import HTML
import re
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [2]:
# Start one shared Chrome WebDriver session in headless mode; the scraping
# functions in this notebook read the module-level ``driver`` directly.
# NOTE(review): no ``driver.quit()`` appears in the visible cells, so the
# browser process stays alive until the kernel exits -- confirm and close it.
options = Options()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)

In [1]:
# Best-seller category pages to crawl: each entry carries a short name and
# the category URL on amazon.in.
categories = [
    {'name': 'electronics', 'url': 'https://www.amazon.in/gp/bestsellers/electronics/'},
    {'name': 'baby', 'url': 'https://www.amazon.in/gp/bestsellers/baby/'},
    {'name': 'apparel', 'url': 'https://www.amazon.in/gp/bestsellers/apparel/'}
]
# CSS selectors for the title and price elements of a product page.
# scrape_product_page repeats the same two values as its parameter defaults.
title_lookup = "#productTitle"
price_lookup = "#priceblock_ourprice"

In [None]:
# URL shapes recognised as product pages. Every pattern is matched from the
# start of the URL and should expose a ``product_id`` named group. The dots
# in the host are escaped so only the literal ``www.amazon.in`` is accepted
# (an unescaped ``.`` would match any character).
regex_options = [
    r'https://www\.amazon\.in/gp/product/(?P<product_id>[\w-]+)/',
    r'https://www\.amazon\.in/dp/(?P<product_id>[\w-]+)/',
    r'https://www\.amazon\.in/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/',
]

def extract_product_id_from_url(url):
    """Return the product id embedded in a product URL, or ``None``.

    The URL is tried against every pattern in ``regex_options``. When more
    than one pattern matches, the id from the last matching pattern is
    returned.

    Args:
        url: Absolute URL string. Any non-string value (e.g. ``None``)
            yields ``None`` instead of raising.

    Returns:
        str | None: the captured ``product_id`` group, or ``None`` when no
        pattern matches.
    """
    if not isinstance(url, str):
        return None
    product_id = None
    for regex_str in regex_options:
        # ``re`` caches compiled patterns, so passing the string is cheap.
        match = re.match(regex_str, url)
        if match is None:
            continue
        try:
            product_id = match['product_id']
        except IndexError:
            # A pattern without a ``product_id`` group: ignore it rather
            # than abort the lookup.
            continue
    return product_id

In [None]:
def scrace_category_product_links(categories=[]):
    """Collect the site-relative links found on each category page.

    For every category the page is loaded in the shared ``driver``, the
    rendered ``<body>`` HTML is parsed, and each link starting with ``/`` is
    turned into an absolute ``https://www.amazon.in`` URL.

    Args:
        categories: Iterable of dicts with a ``'url'`` key.

    Returns:
        list[str]: absolute URLs gathered across all categories, without
        duplicates; links from one page are added in sorted order. Empty
        when ``categories`` is empty.
    """
    all_links = []
    seen = set()
    for categorie in categories:
        time.sleep(1.5)  # fixed pause before each category request
        url = categorie.get('url')
        driver.get(url)
        # ``find_element(by, value)`` replaces ``find_element_by_css_selector``,
        # which Selenium 4.3 removed; 'css selector' is the By.CSS_SELECTOR value.
        body_el = driver.find_element('css selector', 'body')
        html_str = body_el.get_attribute('innerHTML')
        html_obj = HTML(html=html_str)
        # ``links`` is a set; sort it so the result order is reproducible.
        for path in sorted(html_obj.links):
            if not path.startswith('/'):
                continue
            link = f'https://www.amazon.in{path}'
            if link not in seen:
                seen.add(link)
                all_links.append(link)
    return all_links

In [6]:
def scrape_product_page(url, title_lookup = "#productTitle", price_lookup = "#priceblock_ourprice"):
    """Load a product page in the shared ``driver`` and read title and price.

    Args:
        url: Product page URL to open.
        title_lookup: CSS selector of the title element.
        price_lookup: CSS selector of the price element.

    Returns:
        tuple: ``(product_title, product_price)`` as text. An item is
        ``None`` when its selector matches nothing on the page, so a missing
        price no longer discards a title that was found (and vice versa).
    """
    driver.get(url)
    # Fixed pause after navigation (presumably to let the page finish rendering).
    time.sleep(1.2)
    # ``find_element(by, value)`` replaces ``find_element_by_css_selector``,
    # which Selenium 4.3 removed; 'css selector' is the By.CSS_SELECTOR value.
    body_el = driver.find_element('css selector', 'body')
    html_str = body_el.get_attribute('innerHTML')
    html_obj = HTML(html=html_str)
    # ``find(..., first=True)`` yields None when nothing matches.
    title_el = html_obj.find(title_lookup, first=True)
    price_el = html_obj.find(price_lookup, first=True)
    product_title = title_el.text if title_el is not None else None
    product_price = price_el.text if price_el is not None else None
    return product_title, product_price

In [8]:
def clean_page_links(page_links=[]):
    """Keep only the links from which a product id can be extracted.

    Args:
        page_links: Iterable of URL strings.

    Returns:
        list[dict]: one ``{'url': ..., 'product_id': ...}`` entry per link
        for which ``extract_product_id_from_url`` returned an id, in input
        order.
    """
    # Pair each link with its extracted id lazily, then drop the misses.
    candidates = ((link, extract_product_id_from_url(link)) for link in page_links)
    return [
        {'url': link, 'product_id': found_id}
        for link, found_id in candidates
        if found_id is not None
    ]

# ``page_links`` exists only as a local inside scrace_category_product_links,
# so build it here instead of relying on leftover kernel state (a fresh
# Restart & Run All would otherwise hit a NameError). ``or []`` keeps the
# cell runnable if the helper hands back nothing.
page_links = scrace_category_product_links(categories) or []
cleaned_links = clean_page_links(page_links)

In [9]:
def perform_scrape(cleaned_item=[]):
    """Scrape title and price for every cleaned product link.

    Each link is scraped on a best-effort basis: if ``scrape_product_page``
    raises, the record is still emitted with ``title`` and ``price`` set to
    ``None``. Links for which both values were found are printed as progress.

    Args:
        cleaned_item: Iterable of dicts with ``'url'`` and ``'product_id'``
            keys.

    Returns:
        list[dict]: one record per input item with keys ``url``,
        ``product_id``, ``title`` and ``price``.
    """
    data_extracted = []
    for obj in cleaned_item:
        link = obj['url']
        product_id = obj['product_id']
        title, price = None, None
        try:
            title, price = scrape_product_page(link)
        except Exception:
            # Best-effort: one failing page must not stop the run. Catching
            # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
            # SystemExit through, so a long scrape can still be interrupted.
            pass
        if title is not None and price is not None:
            print(link, title, price)
        data_extracted.append({
            'url': link,
            'product_id': product_id,
            'title': title,
            'price': price
        })
    return data_extracted

In [None]:
# Scrape every cleaned link; perform_scrape prints each product for which
# both a title and a price were found, and returns all records.
x = perform_scrape(cleaned_links)

https://www.amazon.in/Dazzling-Storage-Additional-Exchange-Offers/dp/B07SDPJ4XJ/ref=zg_bs_electronics_19/257-0070257-0737949?_encoding=UTF8&psc=1&refRID=7A6G930A6DZTG1EJHAMP OPPO A5 2020 (Dazzling White, 4GB RAM, 64GB Storage) with No Cost EMI/Additional Exchange Offers ₹ 11,990.00
https://www.amazon.in/JBL-C50HI-Ear-Headphones-Black/dp/B07JQKQ91F/ref=zg_bs_electronics_43/257-0070257-0737949?_encoding=UTF8&psc=1&refRID=7A6G930A6DZTG1EJHAMP JBL C50HI in-Ear Headphones with Mic (Black) ₹ 499.00
https://www.amazon.in/Logitech-B170-Wireless-Mouse-Black/dp/B01J0XWYKQ/ref=zg_bs_electronics_46/257-0070257-0737949?_encoding=UTF8&psc=1&refRID=7A6G930A6DZTG1EJHAMP Logitech B170 Wireless Mouse, 2.4 GHz with USB Nano Receiver, Optical Tracking, 12-Months Battery Life, Ambidextrous, PC/Mac/Laptop - Black ₹ 645.00
https://www.amazon.in/Samsung-Galaxy-Storage-Additional-Exchange/dp/B086KCDGTQ/ref=zg_bs_electronics_16/257-0070257-0737949?_encoding=UTF8&psc=1&refRID=7A6G930A6DZTG1EJHAMP Samsung Galaxy 

In [None]:
# Dump the full list of scraped records (including those with None fields).
print(x)