In [1]:
import pandas as pd
from pyppeteer import launch
from bs4 import BeautifulSoup
import re
import html2markdown
from IPython.display import display, Markdown, HTML
from urllib.parse import urlparse
import stringcase

In [2]:
df = pd.read_csv("data.csv")

In [3]:
df

Unnamed: 0,id,url
0,kelloggs-froot_loops,https://www.walmart.com/ip/Kellogg-s-Froot-Loo...
1,post-sout_patch_kids,https://www.walmart.com/ip/Post-Sour-Patch-Kid...
2,general_mills-cookie_crisp,https://www.walmart.com/ip/Cookie-Crisp-Cereal...
3,general_mills-cocoa_puffs,https://www.walmart.com/ip/Cocoa-Puffs-Chocola...
4,kelloggs-pop_tarts_cereal,https://www.walmart.com/ip/Kellogg-s-Pop-Tarts...
5,kelloggs-krave_chocolate,https://www.walmart.com/ip/Kellogg-s-Krave-Cho...
6,post-fruity_pebbles,https://www.walmart.com/ip/Post-Fruity-Pebbles...
7,kelloggs-froot_loops_marshmallows,https://www.walmart.com/ip/Kellogg-s-Froot-Loo...
8,post-cocoa_pebbles,https://www.walmart.com/ip/Post-Cocoa-Pebbles-...
9,post-oreo_os,https://www.walmart.com/ip/Post-Oreo-O-s-Cerea...


In [4]:
def make_key(key: str) -> str:
    return stringcase.snakecase(key).replace("__", "_")

In [7]:
reg = re.compile('url\("(.*)"\)')
def get_url_from_div(div_text: str) -> str:
#     div_text = div_text.replace("&quot;",'"')
    s = BeautifulSoup(div_text)
    divs = s.select("div")
    imgs = s.select("img")
    if len(divs) > 0:
        try:
            m = reg.search(s.div['style'])
            u = m.group(1)
        except:
            print(div_text)
    elif len(imgs) > 0:
        u = imgs[0]['src']
    
    up = urlparse(u)
    up = up._replace(query = '')
    return up.geturl()

In [8]:
async def get_data_from_url(u: str) -> str:
    print(u)
    browser = await launch()
    page = await browser.newPage()
    await page.goto(u)
    # Get the page title
    cereal_title = await page.J("h1.prod-ProductTitle>div")
    cereal_title = await page.evaluate('(element) => element.innerHTML', cereal_title)
    # Get the about section of the product
    cereal_about = await page.J("div#product-about>div.about-desc")
    cereal_about = await page.evaluate('(element) => element.innerHTML', cereal_about)
    # Get the ingredients of the product
    cereal_ingredients = await page.J("p.Ingredients")
    cereal_ingredients = await page.evaluate('(element) => element.innerHTML', cereal_ingredients)
    # Click on the specifications tab
    await page.click('li[data-automation-id="ProductPage-item-0"]')
    # Get the specifications of the product
    cereal_specifications = await page.J("div#specifications>table>tbody")
    cereal_specifications = await page.evaluate('(element) => element.innerHTML', cereal_specifications)
    # Click on the nutrition tab
    await page.click('li[data-automation-id="ProductPage-item-1"]')
    # Get the nutritions of the product
    cereal_nutrition = await page.J("div#nutritionFacts>div.nutrition-facts")
    cereal_nutrition = await page.evaluate('(element) => element.innerHTML', cereal_nutrition)
    # Get the link to images
    thumb_list = await page.JJ(".prod-alt-image-wrapper .slider-slide")
    image_dump = []
    for thumb in thumb_list:
        await thumb.click()
        try:
            image_div = await page.J(".hover-zoom-large-img")
            image_div = await page.evaluate('(element) => element.outerHTML', image_div)
        except:
            image_div = await page.J('img[data-tl-id="ProductPage-primary-image"]')
            image_div = await page.evaluate('(element) => element.outerHTML', image_div)
        image_dump.append(image_div)
    await browser.close()
    
    data = {}
    
    treg = re.compile('([a-zA-Z\'\s]*)(\d+(\.\d*)?)')
    m = treg.search(cereal_title)
    title, quantity, unit = m.group(1), m.group(2), "oz"; title, quantity, unit
    
    # Basic Info about the product
    data["title"] = title.strip()
    data["quantity"] = quantity
    data["uom"] = unit
    
    s = BeautifulSoup(cereal_specifications)
    for tr in s.find_all("tr"):
        k, v = tr.find_all("td")
        k, v = k.get_text(), v.get_text()
        data[make_key(k)] = v
    
    # About Text
    data["about"] = html2markdown.convert(cereal_about)
    
    # Ingredients
    s = BeautifulSoup(cereal_ingredients)
    ireg = re.compile('Ingredients:\s?(.*)\.$')
    m = ireg.search(s.get_text())
    data["ingredients"] = stringcase.titlecase(m.group(1).lower())
    
    # Nutrition facts
    data["nutrition"] = {}
    nut = data["nutrition"]
    
    s = BeautifulSoup(cereal_nutrition)
    
    serving_tag = s.select("div.nutrition-facts-all-facts-servingSize")[0]
    
    for row in serving_tag.select("div"):
        k, v = row.select("span")
        k, v = k.get_text(), v.get_text()
        nut[make_key(k)] = v
    
    per_serving_tag = s.select("div.nutrition-facts-all-facts-calorie-info")[0]
    
    for row in per_serving_tag.select("div"):
        spans = row.select("span")
        if len(spans) > 1:
            k, v = row.select("span")
            k, v = k.get_text(), v.get_text()
            nut[make_key(k) + "_per_serving"] = v
    
    nutrient_value_tag = s.select("div.nutrition-facts-all-facts-nutrient-info")[0]
    
    for row in nutrient_value_tag.select("div"):
        spans = list(row.children)
        if len(spans) >= 3:
            t, w, p = spans[:3]
            t, w, p = t.get_text(), w.get_text(), p.get_text()
            nut[make_key(t)] = {"weight": w, "percentage": p}

        if len(spans) == 2:
            t, w = spans
            t, w = t.get_text(), w.get_text()
            nut[make_key(t)] = {"weight": w} 
    
    minerals_tag = s.select("div.nutrition-facts-all-facts-vitamins-minerals-info")[0]
    
    for row in minerals_tag.select("div"):
        spans = list(row.children)
        if len(spans) >= 3:
            t, w, p = spans[:3]
            t, w, p = t.get_text(), w.get_text(), p.get_text()
            nut[make_key(t)] = {"weight": w, "percentage": p}

        if len(spans) == 2:
            t, w = spans
            t, w = t.get_text(), w.get_text()
            nut[make_key(t)] = {"percentage": w} 
        
    image_links = list(map(get_url_from_div, image_dump))
    
    data["images"] = image_links
    
    return data

In [9]:
await get_data_from_url(df.iloc[2]["url"])

https://www.walmart.com/ip/Post-Sour-Patch-Kids-Breakfast-Cereal-Sour-Then-Sweet-18oz/352239677


{'title': '',
 'quantity': '18',
 'uom': 'oz',
 'brand': 'MONDELEZ',
 'model': '31430',
 'container_type': 'Box',
 'about': 'A SOUR THEN SWEET. TALE Sour Patch Kids',
 'ingredients': 'Sugar, Corn Flour, Wheat Flour, Whole Grain Oat Flour, Hydrogenated Vegetable Oil (coconut And Palm Kernel Oils), Corn Syrup, Salt, Citric Acid, Sodium Citrate, Calcium Carbonate, Canola Oil, Natural And Artificial Flavor, Red 40, Yellow 5, Yellow 6, Blue 1, Bht Added To Preserve Freshness  Vitamins And Minerals: Ferric Orthophosphate (source Of Iron), Niacinamide (vitamin B3), Zinc Oxide, Thiamin Mononitrate (vitamin B1), Calcium Pantothenate (vitamin B5), Pyridoxine Hydrochloride (vitamin B6), Folic Acid',
 'nutrition': {'serving_size': '1 Cup (32g)',
  'servings_per_container': '16',
  'calories_per_serving': '170.0 Cal',
  'calories_from_fat_per_serving': '30',
  'total_fat': {'weight': '3', 'percentage': '5%'},
  'saturated_fat': {'weight': '3.0 g', 'percentage': '15.0'},
  'trans_fat': {'weight': '0