In [1]:
import os
from requests_html import HTML
import requests
import time
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import re
import pandas as pd
import datetime

In [2]:
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options= options) # so that browser does not get open everytime we run the code.

In [3]:
BASE_DIR = Path.cwd()
data_dir = os.path.join(BASE_DIR , "data")
os.makedirs(data_dir, exist_ok = True)
product_category_links_output = os.path.join(data_dir , "category-products.csv")
products_output = os.path.join(data_dir , "products.csv")

In [4]:
categories = [
    {"url" : "https://www.amazon.com/Best-Sellers-Toys-Games/zgbs/toys-and-games/", "category_name":"toys-and-games"},
    {"url" : "https://www.amazon.in/gp/bestsellers/electronics/ref=zg_bs_nav_0","category_name":"electronics"},
    {"url":"https://www.amazon.in/gp/bestsellers/garden/ref=zg_bs_nav_0", "category_name":"garden"}
]

In [5]:
# 1
def scrape_category_product_links(categories = []): 
    all_product_links = []
    for category in categories:
        url = category['url']
#         category_name = category['category_name']
        driver.get(url)
        body_el = driver.find_element_by_css_selector("body")
        html_str = body_el.get_attribute("innerHTML")
        html_obj = HTML(html = html_str)
        page_links = [f"https://www.amazon.com{x}" for x in html_obj.links if x.startswith("/")]
        cleaned_page_links = clean_page_links(page_links, category=category)
        all_product_links += cleaned_page_links
    return all_product_links

In [6]:
# 5
def scrape_product_page(url, title_lookup="#productTitle", price_lookup="#priceblock_ourprice"):
    driver.get(url)
#     print(url)
    time.sleep(2)
    body_el = driver.find_element_by_css_selector("body")
    html_str = body_el.get_attribute("innerHTML")
    html_obj = HTML(html = html_str)
    title = html_obj.find(title_lookup, first=True).text # first= True as find returns a list of elements
#     print(html_obj.find(title_lookup, first=True))
    price = html_obj.find(price_lookup, first=True).text
    return title, price

# scrape_product_page("https://www.amazon.com/Munchkin-17040-Fishin-Bath-Toy/dp/B01N52DUNK/ref=zg_bs_toys-and-games_32?_encoding=UTF8&psc=1&refRID=6GGEZWST2GSQY9R15K77")

("Munchkin Fishin' Bath Toy", '$7.73')

In [7]:
regex_options = [
    r"https://www.amazon.com/(?P<slug>[\w-]+)/dp/(?P<product_id>[\w-]+)/",
    r"https://www.amazon.com/dp/(?P<product_id>[/w-]+)",
    r"https://www.amazon.com/gp/product/(?P<product_id>[/w-]+)"
]

# 3
def extract_product_id_from_url(url):
    product_id = None
    for regex_str in regex_options:
        regex = re.compile(regex_str)
        match = regex.match(url)
        if match != None:
            try:
                product_id = match['product_id']
            except:
                pass
        return product_id

In [8]:
# 2
def clean_page_links(page_links=[], category=None):
    final_page_links = []
    for url in page_links:
        product_id = extract_product_id_from_url(url)
        if product_id != None:
            final_page_links.append({"url" :url, "product_id":product_id , "category":category})
    return final_page_links

In [9]:
def perform_scrape(cleaned_items = []):
    extracted_data = []
#     print(len(cleaned_items))
    for link in cleaned_items:
        url = link["url"]
        product_id = link["product_id"]
        product_title, product_price = None , None
        try:
            product_title, product_price = scrape_product_page(url)
         
        except:
            pass
        
        if product_title != None and product_price != None:
            product_data = {
                "url":url,
                "product_id": product_id ,
                "product_title" : product_title,
                "product_price": product_price
            }
            
            extracted_data.append(product_data)
            
    return extracted_data

In [10]:
# cleaned_page_links = scrape_category_product_links(categories)
# # print(cleaned_page_links)
# data_extracted = perform_scrape(cleaned_items = cleaned_page_links)
# print(data_extracted)

In [11]:
# 4
def extract_categories_and_save(categories=[]):
    cleaned_page_links = scrape_category_product_links(categories)
    df = pd.DataFrame(cleaned_page_links)
    df.to_csv(product_category_links_output, index=False)

In [12]:
extract_categories_and_save(categories)

In [13]:
df = pd.read_csv(product_category_links_output)
df.head(n=50)

Unnamed: 0,url,product_id,category
0,https://www.amazon.com/6102278-Suplemento-base...,B00NHQF65S,{'url': 'https://www.amazon.com/Best-Sellers-T...
1,https://www.amazon.com/Little-Tikes-EasyScore-...,B01C5A2WJO,{'url': 'https://www.amazon.com/Best-Sellers-T...
2,https://www.amazon.com/Play-Doh-Pack-Case-Blue...,B07BC152DC,{'url': 'https://www.amazon.com/Best-Sellers-T...
3,https://www.amazon.com/Baby-Einstein-Creative-...,B083XNLSN3,{'url': 'https://www.amazon.com/Best-Sellers-T...
4,https://www.amazon.com/First-Princess-Make-Kit...,B07FPWYY79,{'url': 'https://www.amazon.com/Best-Sellers-T...
5,https://www.amazon.com/First-Years-Stack-Up-Cu...,B00005C5H4,{'url': 'https://www.amazon.com/Best-Sellers-T...
6,https://www.amazon.com/SplashEZ-Splash-Sprinkl...,B0836WRWFY,{'url': 'https://www.amazon.com/Best-Sellers-T...
7,https://www.amazon.com/Intex-River-Lounge-Infl...,B000PEOMC8,{'url': 'https://www.amazon.com/Best-Sellers-T...
8,https://www.amazon.com/Crayola-Shark-Coloring-...,B07PMLL5L7,{'url': 'https://www.amazon.com/Best-Sellers-T...
9,https://www.amazon.com/Crayola-12-Ultra-Clean-...,B003HGGPLW,{'url': 'https://www.amazon.com/Best-Sellers-T...


In [14]:
df.shape # to get rows and columns in dataframe

(147, 3)

In [15]:
df_sub = df.copy() #to copy dataframe
# df_sub = df.head(n=10)

In [16]:
def row_scrape_event(row):
    link = row['url']
    scraped = 0
    try:
        scraped = row['scraped']
    except:
        pass
    if scraped == 1 or scraped =="1":
        return row
    product_id = row['product_id']
    title, price = (None, None)
    try:
        title,price = scrape_product_page(link)
    except:
        pass
    
    row['title'] = title
    row['price'] = price
    row['scraped'] = 1
    row['timestamp'] = datetime.datetime.now().timestamp()
#     print(title, price)
    return row

In [22]:
df_sub = df_sub.apply(row_scrape_event , axis=1)
df_sub.head(n=60)

Unnamed: 0,url,product_id,category,title,price,scraped,timestamp
0,https://www.amazon.com/6102278-Suplemento-base...,B00NHQF65S,{'url': 'https://www.amazon.com/Best-Sellers-T...,,,1,1591820000.0
1,https://www.amazon.com/Little-Tikes-EasyScore-...,B01C5A2WJO,{'url': 'https://www.amazon.com/Best-Sellers-T...,,,1,1591820000.0
2,https://www.amazon.com/Play-Doh-Pack-Case-Blue...,B07BC152DC,{'url': 'https://www.amazon.com/Best-Sellers-T...,,,1,1591820000.0
3,https://www.amazon.com/Baby-Einstein-Creative-...,B083XNLSN3,{'url': 'https://www.amazon.com/Best-Sellers-T...,,,1,1591820000.0
4,https://www.amazon.com/First-Princess-Make-Kit...,B07FPWYY79,{'url': 'https://www.amazon.com/Best-Sellers-T...,,,1,1591820000.0
5,https://www.amazon.com/First-Years-Stack-Up-Cu...,B00005C5H4,{'url': 'https://www.amazon.com/Best-Sellers-T...,,,1,1591820000.0
6,https://www.amazon.com/SplashEZ-Splash-Sprinkl...,B0836WRWFY,{'url': 'https://www.amazon.com/Best-Sellers-T...,,,1,1591820000.0
7,https://www.amazon.com/Intex-River-Lounge-Infl...,B000PEOMC8,{'url': 'https://www.amazon.com/Best-Sellers-T...,,,1,1591820000.0
8,https://www.amazon.com/Crayola-Shark-Coloring-...,B07PMLL5L7,{'url': 'https://www.amazon.com/Best-Sellers-T...,,,1,1591820000.0
9,https://www.amazon.com/Crayola-12-Ultra-Clean-...,B003HGGPLW,{'url': 'https://www.amazon.com/Best-Sellers-T...,,,1,1591820000.0


In [18]:
final_df = pd.concat([df, df_sub])
final_df.to_csv(products_output, index=False)
final_df.head()

Unnamed: 0,url,product_id,category,title,price,scraped,timestamp
0,https://www.amazon.com/6102278-Suplemento-base...,B00NHQF65S,{'url': 'https://www.amazon.com/Best-Sellers-T...,,,,
1,https://www.amazon.com/Little-Tikes-EasyScore-...,B01C5A2WJO,{'url': 'https://www.amazon.com/Best-Sellers-T...,,,,
2,https://www.amazon.com/Play-Doh-Pack-Case-Blue...,B07BC152DC,{'url': 'https://www.amazon.com/Best-Sellers-T...,,,,
3,https://www.amazon.com/Baby-Einstein-Creative-...,B083XNLSN3,{'url': 'https://www.amazon.com/Best-Sellers-T...,,,,
4,https://www.amazon.com/First-Princess-Make-Kit...,B07FPWYY79,{'url': 'https://www.amazon.com/Best-Sellers-T...,,,,
