In [None]:
from selenium import webdriver
import urllib
import time
import os
import pandas as pd
import math

# Read/write dataframe to file
def io_dataframe(action,start_index,items_list,file_name):
    """Write a list of records to a csv file, or read a csv file back.

    Parameters
    ----------
    action : str
        "write" to save ``items_list``, "read" to load ``file_name``.
        Any other value leaves everything untouched.
    start_index : int
        Only used when writing. 0 (re)creates the file with a header row;
        any other value appends the rows without a header.
    items_list : list
        Records handed to ``pandas.DataFrame`` when writing. Ignored when
        reading.
    file_name : str
        Path of the csv file.

    Returns
    -------
    When reading: the DataFrame produced by ``pandas.read_csv``, or the
    dictionary ``{"Last item id": -1}`` when the file cannot be read.
    Otherwise ``items_list`` is returned unchanged.
    """
    if action == "write":
        frame = pd.DataFrame(items_list)
        if start_index == 0:
            frame.to_csv(file_name, encoding='utf-8', index=False)
        else:
            frame.to_csv(file_name, mode='a', header=False,
                         encoding='utf-8', index=False)

    elif action == "read":
        try:
            items_list = pd.read_csv(file_name)
        except (IOError, OSError, ValueError):
            # Missing/unreadable file (IOError/OSError) or empty/malformed
            # content (pandas parse errors derive from ValueError).
            # Deliberately narrower than a bare except so that Ctrl-C and
            # SystemExit are no longer swallowed.
            print("No csv file to read!")
            items_list = {"Last item id": -1}

    return items_list

# Folder creation
def create_folder(path):
    """Create directory ``path`` (with any missing parents).

    An already existing directory is accepted silently; any other
    ``OSError`` raised by ``os.makedirs`` is propagated to the caller.
    """
    try:
        os.makedirs(path)
    except OSError:
        # The failure is harmless only when the directory is already there
        if os.path.isdir(path):
            return
        raise

# Remember to configure your browser to disable image loading
# to speed up page retrieval           
def launch_browser(browser_id):
    """Start a Selenium-driven browser and return its driver.

    Parameters
    ----------
    browser_id : str
        "chrome" or "firefox".

    Returns
    -------
    The ``webdriver.Chrome()`` or ``webdriver.Firefox()`` instance.

    Raises
    ------
    ValueError
        If ``browser_id`` is not one of the supported names (previously
        this surfaced as an UnboundLocalError on the return statement).
    """
    if browser_id == "chrome":
        return webdriver.Chrome()
    if browser_id == "firefox":
        return webdriver.Firefox()

    raise ValueError("Unsupported browser: " + repr(browser_id))

# Javascript command assembler
def assemble_JS_command(access_point,path,index,attribute):
    """Build the javascript expression that reaches an html element.

    Parameters
    ----------
    access_point : str
        "query" (document.querySelectorAll), "class"
        (document.getElementsByClassName) or "id" (document.getElementById).
    path : str
        Selector, class name or id, inserted between single quotes.
    index : int or None
        When not None, "[index]" is appended to pick one element.
    attribute : str or None
        When not None, appended verbatim (e.g. ".href", ".length").

    Returns
    -------
    str
        The assembled javascript expression.

    Raises
    ------
    ValueError
        If ``access_point`` is not one of the supported values (previously
        this surfaced as an UnboundLocalError).
    """
    lookups = {
        "query": "document.querySelectorAll",
        "class": "document.getElementsByClassName",
        "id": "document.getElementById",
    }
    if access_point not in lookups:
        raise ValueError("Unknown access point: " + repr(access_point))

    js_command = lookups[access_point] + "('" + path + "')"

    if index is not None:
        js_command += "[" + str(index) + "]"

    if attribute is not None:
        js_command += attribute

    return js_command

# Checking an html object presence
def verify_element_present(driver,script,pause=1,max_steps=100):
    """Poll the page until ``script`` returns something other than None.

    Parameters
    ----------
    driver
        Object exposing ``execute_script(script)``.
    script : str
        Javascript snippet whose return value is inspected.
    pause : number, optional
        Seconds slept after each unsuccessful attempt (default 1).
    max_steps : int, optional
        Maximum number of attempts (default 100).

    Returns
    -------
    None, as soon as one attempt yields a non-None value.

    Raises
    ------
    SystemExit
        When every attempt returned None; two diagnostic lines are printed
        first.
    """
    for _ in range(max_steps):
        if driver.execute_script(script) is not None:
            return
        time.sleep(pause)

    print("Element not present!")
    print("Aborting executing: ", script)
    # Same effect as the interactive-only exit() helper, without depending
    # on the site module having installed it.
    raise SystemExit

# Act on element via javascript
def act_on_element_JS(driver,access_point,path,index,attribute):
    """Evaluate a javascript expression on the page and return its value.

    The expression is built by ``assemble_JS_command`` from the four
    locating arguments. ``verify_element_present`` is called on the script
    first, then the script is executed once more to fetch the result.
    """
    expression = assemble_JS_command(access_point,path,index,attribute)
    script = "var output = {0};return output;".format(expression)

    # Wait for the element before reading it
    verify_element_present(driver,script)

    return driver.execute_script(script)

# Number of items per page
def get_items_per_page(driver):
    """Return how many '.centered-img-wrap img' elements the page holds."""
    thumbnails_selector = ".centered-img-wrap img"
    return act_on_element_JS(driver,"query",thumbnails_selector,None,".length")

# Gather urls of all products
def collect_products_urls(driver,category_url):
    """Gather the product urls listed under a category.

    The total item count is read from the first 'total-num-recs' element
    and divided by the number of items found on the page to get the number
    of pages to walk. On every page each '.centered-img a' link is read,
    then the element with id 'loadmore' (when present) is clicked and the
    page refreshed.

    Parameters
    ----------
    driver
        Browser driver, as returned by ``launch_browser``.
    category_url : str
        Address of the category listing.

    Returns
    -------
    list of dict
        One ``{'Product Url': url}`` entry per collected link; an empty
        list when the page lists no item.
    """
    driver.get(category_url)

    # Get total number of items
    N_items_total = \
    act_on_element_JS(driver,"class","total-num-recs",0,".textContent")
    print("Total number of items is: " + N_items_total)

    # Get items per page
    N_items_page = get_items_per_page(driver)
    if N_items_page == 0:
        # An empty listing would make the page count below divide by zero
        print("No items found on the page!")
        return []

    page_max = int(math.ceil(float(N_items_total)/N_items_page))
    print("Total number of pages is: " + str(page_max))

    # The "next page" scripts do not depend on the page: build them once
    load_more = assemble_JS_command("id","loadmore",None,None)
    probe_script = "var next_page = " + load_more + ";return next_page;"
    click_script = load_more + ".click();"

    Url_list = []
    for page_number in range(1, page_max + 1):

        for item in range(N_items_page):
            # Product url
            prod_url = \
            act_on_element_JS(driver,"query",".centered-img a",item,".href")
            Url_list.append({'Product Url':prod_url})

        # Next page click, only when the button exists
        if driver.execute_script(probe_script) is not None:
            driver.execute_script(click_script)
            driver.refresh()

        print("PAGE NUMBER:" + str(page_number))
        # The item count may differ on the page now displayed
        N_items_page = get_items_per_page(driver)

    return Url_list

# Category list per topic
def gather_main_categories(driver,base_url):
    """Collect name, url and image of every category tile of a page.

    For each '.tile article' element a folder named after the category is
    created and the tile background image is downloaded into
    'Category_images/'.

    Parameters
    ----------
    driver
        Browser driver, as returned by ``launch_browser``.
    base_url : str
        Address of the page listing the categories.

    Returns
    -------
    list of dict
        One dictionary per category with the keys "Category", "Url" and
        "Image url".
    """
    # urlretrieve lives in urllib.request on Python 3 and directly in
    # urllib on Python 2; support both.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    driver.get(base_url)

    base_class = ".tile"
    N_categories = \
    act_on_element_JS(driver,"query",base_class + " article", \
                     None,".length")

    # Same folder for every category: create it once, outside the loop
    Images_dir = "Category_images/"
    create_folder(Images_dir)

    Category_list = []
    for icategory in range(N_categories):

        Category_dic = {}
        Name = \
        act_on_element_JS(driver,"query",base_class + " article",icategory,".title")
        Category_dic["Category"] = Name
        # Creating category folder
        create_folder(Name)

        Url = \
        act_on_element_JS(driver,"query",base_class + " a",icategory,".href")
        Category_dic["Url"] = Url

        Image = \
        act_on_element_JS(driver,"query",base_class + " article",\
                                      icategory,".style.backgroundImage")

        # Drop the first five and last two characters of the style value
        # (presumably the css 'url("' / '")' wrapper - verify against the
        # site) and prepend the scheme to get a complete image url
        Image_Url = "https:" + Image[5:-2]
        Category_dic["Image url"] = Image_Url
        Category_list.append(Category_dic)
        urlretrieve(Image_Url,Images_dir + Name)

    return Category_list

# Properties to collect per item
def gather_item_data(driver,item_url):
    """Scrape the properties of one product page and download its image.

    Parameters
    ----------
    driver
        Browser driver, as returned by ``launch_browser``.
    item_url : str
        Address of the product page.

    Returns
    -------
    dict
        Keys: "Product Url", "Product name", "Packaging info", "Price",
        "Image url", "Description", "RollUpId", "SKUId" and
        "Unique Cartridge index".

    Side effects: prints the price and saves the product image as
    'Product_images/<title>_<sku id>'.
    """
    # urlretrieve lives in urllib.request on Python 3 and directly in
    # urllib on Python 2; support both.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    driver.get(item_url)

    Item_dic = {}

    # Get product url
    Item_dic["Product Url"] = item_url

    # Get product title
    Title = \
    act_on_element_JS(driver,"query","#product-desc h1",0,".textContent")
    Item_dic["Product name"] = Title

    # Get packaging info
    Packaging = \
    act_on_element_JS(driver,"query","#product-desc .description",0,".textContent")
    Item_dic["Packaging info"] = Packaging

    # Get price, with all whitespace removed on the javascript side.
    # The backslash is escaped so that Python passes "\s" through without
    # an invalid-escape warning; the script sent is unchanged.
    Price = \
    act_on_element_JS(driver,"query",".price-current",0,".textContent.replace(/\\s/g,'')")
    print("Price: " + Price)
    Item_dic["Price"] = Price

    # Get image link
    Image_url = \
    act_on_element_JS(driver,"query","#product-images img",0,".src")
    Item_dic["Image url"] = Image_url

    # Get product description (second '.description' element of the page)
    Description = \
    act_on_element_JS(driver,"query",".description",1,".textContent")
    Item_dic["Description"] = Description

    # Get rollup id
    RollupID = \
    act_on_element_JS(driver,"query","#product-purchase-cartridge",0,\
                                  ".getAttribute('data-rollup-id')")
    Item_dic["RollUpId"] = RollupID

    # Get sku id
    SKUID = \
    act_on_element_JS(driver,"query","#product-purchase-cartridge",0,\
                                  ".getAttribute('data-sku-id')")
    Item_dic["SKUId"] = SKUID

    # Get unique cartridge index
    Cartridge_index = \
    act_on_element_JS(driver,"query","#product-purchase-cartridge",0,\
                                  ".getAttribute('data-uniquecartridgeindex')")
    Item_dic["Unique Cartridge index"] = Cartridge_index

    # Save image into proper local folder
    Images_dir = "Product_images/"
    create_folder(Images_dir)
    urlretrieve(Image_url,Images_dir + Title + "_" + SKUID)

    return Item_dic


def assemble_items_catalog(driver,products_urls):
    """Scrape every product of ``products_urls`` into 'Product_catalog.csv'.

    The id of the last scraped item is saved in 'Scrape_state.csv' after
    each product, so that a later call resumes right after it.

    Parameters
    ----------
    driver
        Browser driver, as returned by ``launch_browser``.
    products_urls : list of dict
        Entries holding a "Product Url" key, as produced by
        ``collect_products_urls``.
    """
    state = io_dataframe("read",0,[0],"Scrape_state.csv")
    last_item_id = state["Last item id"]
    if hasattr(last_item_id, "iloc"):
        # The state was read from file: io_dataframe returned a DataFrame,
        # so this is a pandas column and not a plain number. Take its last
        # value (or -1 when the file only holds the header), otherwise
        # range() below fails and the scrape can never resume.
        last_item_id = last_item_id.iloc[-1] if len(last_item_id) else -1
    start_index = int(last_item_id) + 1

    print("Start index: " + str(start_index))

    for item_id in range(start_index,len(products_urls)):

        item_url = products_urls[item_id]["Product Url"]
        print("Retrieving item " + str(item_id) + \
              " from location: " + str(item_url))

        Item_dic = gather_item_data(driver,item_url)
        # item_id 0 (re)creates the catalog with a header, others append
        io_dataframe("write",item_id,[Item_dic],"Product_catalog.csv")

        # Overwrite the state file with the id just completed
        dic = {}
        dic["Last item id"] = item_id
        io_dataframe("write",0,[dic],"Scrape_state.csv")

In [None]:
# To launch the browser; the resulting driver is passed to every
# scraping call in the cells below
browser_name = "chrome"
driver = launch_browser(browser_name)

In [None]:
# To create the category tree structure: collect the categories listed
# at base_url (requires the driver launched above)
base_url = "https://www.walmart.ca/en/grocery/N-117"
category_list = gather_main_categories(driver,base_url)

In [None]:
# To gather all items urls for a given category; the result is a list of
# {'Product Url': ...} dictionaries
category_url = "https://www.walmart.ca/en/grocery/fruits-vegetables/N-3799"
items_urls = collect_products_urls(driver,category_url)

In [None]:
# To assemble the items catalog for a given category, using the urls
# gathered in the previous cell
assemble_items_catalog(driver,items_urls)