In [1]:
from bs4 import BeautifulSoup as BS
import pandas as pd, numpy as np
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
import time, re


In [2]:
#Selector configuration for every retail site the notebook knows about.
#Each site entry is a dict with the following sections:
#  product_data   - {field name: XPath}; the matched element's text is stored on the product record
#  product_expand - {label: XPath} of collapsed sections that are clicked open before reading product_data
#  navigation     - {step: XPath} of the controls used to reach (and fully expand) the reviews
#  review_summary - [(field name, XPath)] read once per product (overall rating, vote count)
#  review_list    - XPath matching every block that holds one review (only 'target' defines it so far)
#  review_data    - {field name: (tag, attrs)}; arguments for BeautifulSoup's find() inside one review block
#Only the 'target' entry is exercised by the scraping loop; 'bestbuy' is partial and 'amazon' is a placeholder.
site_scrapers={
    'target':{
        #specific top-level fields for a product from Target
        'product_data':
            {'product_title':"//h1[@data-test='product-title']",
            #'product_cost':"",
            'product_details':"//div[@data-test='productDetailTabs-itemDetailsTab']",
            'product_specs':"//div[@data-test='item-details-specifications']"
            },

        #target has dynamic content; this allows us to expand the product details and specifications page
        'product_expand':{
            'product_details':"//div[@data-test='@web/site-top-of-funnel/ProductDetailCollapsible-Details']",
            'product_specs':"//div[@data-test='@web/site-top-of-funnel/ProductDetailCollapsible-Specifications']"
        },
        #this allows us to navigate within a product page to the reviews and customer feedback
        'navigation':{
            'feedback':"//a[@data-test='ratingCountLink']",
            'reviews':"//button[@data-test='totalReviewLink']",
            #generated class names: likely to change whenever the site is rebuilt
            'expand':"//button[@class='styles__StyledBaseButtonInternal-sc-ysboml-0 styles__ButtonSecondary-sc-125aivg-0 brTHah bxLMor']"
        },
        #these are the items we extract at the top level (e.g. for all reviews summary)
        'review_summary':[
            ('total_star_rating',"//div[@data-test='rating-value']"),
            ('total_star_votes',"//div[@data-test='rating-count']")
        ],
        #this is the class and style of every block that contains a review
        'review_list':"//div[@class='styles__StyledRow-sc-wmoju4-0 jJAWfL styles__ReviewRow-sc-4u2mi2-1 hmboNn']",
        #this is the data we extract from each review.
        'review_data':{
            'review_header':('h4',{'data-test':'review-card--title'}),
            'reviewer_name':('span',{'data-test':'review-card--username'}),
            'review_content':('div',{'data-test':'review-card--text'}),
            'review_star_rating':('span',{'class':'utils__ScreenReaderOnly-sc-1b93ups-0 hMtWwx'}),
            'review_helpful_votes':('div',{'class':'h-text-sm h-margin-v-tight h-text-grayDark'}),
            #attrs must be a dict; keyed by data-test like the other review-card fields
            #NOTE(review): assumes the badge carries data-test='review-card--verified-purchaser' - confirm against the live page
            'verified_purchaser':('span',{'data-test':'review-card--verified-purchaser'})
        }
    },
    'bestbuy':{
        'product_data':{
            #XPath attribute tests use '=' (not ':') and need a quoted value
            'product_title':"//h1[@class='heading-5 v-fw-regular']",
            'product_details':"//div[@class='overflow-scroll-wrapper']",
            'product_specs':"//div[@class='overflow-scroll-wrapper']"
        },
        'product_expand':{
            'product_desc':"//button[@class='c-button-unstyled features-drawer-btn w-full flex justify-content-between align-items-center py-200']",
            'product_specs':"//button[@class='c-button c-button-outline c-button-md show-full-specs-btn col-xs-6']"
        },
        'navigation':{
            'reviews':"//a[@data-track='See All Customer Reviews']",
            'review_links':"//ul[@class='pagination ugc body-copy-lg']", #navigate to other pages.
        },
        'review_summary':[

        ],
        'review_data':{
            'review_header':('h4',{'class':'c-section-title review-title heading-5 v-fw-medium'}),
            'reviewer_name':('div',{'class':'ugc-author v-fw-medium body-copy-lg'}),
            'review_content':('div',{'class':'ugc-review-body'}),
            'review_star_rating':('p',{'class':'visually-hidden'}),
            'review_helpful_votes':('button',{'data-track':'Helpful'}),
            'verified_purchaser':('div',{'class':'verified-purchaser-mv-wrapper'}),
            'promo_consideration?':('div',{'class':'body-copy-sm pt-50'})
        }
    },
    #placeholder: no selectors have been written for Amazon yet
    'amazon':{
        'product_data':{

        },
        'product_expand':{

        },
        'navigation':{

        },
        'review_summary':[

        ],
        'review_data':{

        }
    }
}

#Target product pages to scrape, in the order they will be visited.
#Each entry is the full URL of one product page.
target_product_urls = [
    'https://www.target.com/p/doritos-nacho-cheese-flavored-tortilla-chips-14-5oz/-/A-13319564?ref=tgt_adv_xsp&AFID=google&fndsrc=tgtao&DFA=71700000108139139&CPNG=PLA_Snacks%2BCandy%2BShopping_Local%7CSnacks%2BCandy_Ecomm_Food_Bev&adgroup=SC_Snacks%2BCandy&LID=700000001170770pgs&LNM=PRODUCT_GROUP&network=g&device=c&location=9028882&targetid=pla-566872943420&gad_source=1&gclid=CjwKCAiA_tuuBhAUEiwAvxkgTrVUgEoQxO7gDRv43EXIBS0LKEJIZzesWGUMd9JSJq7OybsxjUxfSxoCYmYQAvD_BwE&gclsrc=aw.ds',
    'https://www.target.com/p/adjustable-storage-desk-black-room-essentials-8482/-/A-54364953',
]

#open a web browser: start an Edge session through Selenium.
#this single `driver` object is what the scraping cell below navigates and queries.
#NOTE(review): no cell shown here closes the session - call driver.quit() when finished.
driver = webdriver.Edge()

In [3]:
#how long (seconds) to keep polling for a dynamically loaded element before giving up
ELEMENT_WAIT_SECONDS = 10


def _wait_for_element(driver, xpath, timeout=ELEMENT_WAIT_SECONDS, poll=0.1):
    """Return the first element matching `xpath`, polling until it appears.

    find_element() raises NoSuchElementException when nothing matches (it never
    returns None), so the lookup is retried every `poll` seconds. Once `timeout`
    seconds have elapsed the last NoSuchElementException is re-raised.
    """
    deadline = time.monotonic() + timeout
    while True:
        try:
            return driver.find_element(By.XPATH, value=xpath)
        except NoSuchElementException:
            if time.monotonic() >= deadline:
                raise
            time.sleep(poll)


def _parse_review(review_html, url, fields):
    """Build one review record (a dict) from the innerHTML of a review block.

    `fields` maps a field name to the (tag, attrs) pair handed to BeautifulSoup's
    find(). The star rating and helpful-vote count are pulled out of the raw HTML
    with regular expressions instead; their selectors are not used.
    Text fields that cannot be found are stored as None, a missing star rating as
    None, and a missing helpful-vote count as 0.
    """
    record = {'url': url}
    #name the parser explicitly so the result does not depend on which parsers are installed
    soup = BS(review_html, 'html.parser')
    for name, selector in fields.items():
        if name == 'review_star_rating':
            #the single character in front of "out of 5 stars"
            stars = re.findall('(.) ?(?=out of 5 stars)', review_html)
            record[name] = stars[0] if stars else None
        elif name == 'review_helpful_votes':
            #\d+ rather than [1-9]+ so that counts containing a zero (10, 20, 105...) are captured whole
            votes = re.findall(r'(\d+) ?(?=guests found)', review_html)
            #sometimes reviews aren't voted as useful.  If they're not, mark them as zero.
            record[name] = int(votes[0]) if votes else 0
        else:
            node = soup.find(selector[0], selector[1])
            record[name] = node.getText() if node else None
    return record


#selectors for the site being scraped in this cell
target_scraper = site_scrapers['target']

#lists to collect records/data for each link
product_records = []
review_records = []

#iterate through the target list of products
for curr_product in target_product_urls:
    #dictionary to store data on the current product
    product_record = {'url': curr_product}
    #navigate to the page
    driver.get(curr_product)
    #wait for page to load
    time.sleep(3)

    #select dynamic fields and expand them for the product
    for section_xpath in target_scraper['product_expand'].values():
        time.sleep(0.5)
        section = _wait_for_element(driver, section_xpath)
        time.sleep(0.1)
        section.click()

    #iterate through the fields we want to pull for product data
    for field, field_xpath in target_scraper['product_data'].items():
        product_record[field] = driver.find_element(by=By.XPATH, value=field_xpath).text

    #navigate to the top of the page to find the button we're looking for
    driver.find_element(By.TAG_NAME, value='body').send_keys(Keys.CONTROL + Keys.HOME)

    #navigate to the reviews
    for step, step_xpath in target_scraper['navigation'].items():
        time.sleep(1)
        if step != 'expand':
            link = driver.find_element(By.XPATH, value=step_xpath)
            time.sleep(1)
            link.click()
            continue

        #get the summarized review data (x.x/5 stars, number of votes)
        for field, summary_xpath in target_scraper['review_summary']:
            product_record[field] = driver.find_element(By.XPATH, value=summary_xpath).text

        #expand out to all reviews - keep expanding until the button is no longer on the page.
        #a page that never shows the button simply skips this loop.
        while True:
            try:
                more_button = driver.find_element(By.XPATH, value=step_xpath)
            except NoSuchElementException:
                break
            more_button.click()
            time.sleep(0.25)

    #append the product record to the list
    product_records.append(product_record)

    #get every block that contains a review (same XPath as the configured review_list)
    review_elements = driver.find_elements(By.XPATH, target_scraper['review_list'])

    for review in review_elements:
        #beautiful soup seems to work better than selenium for parsing here.
        #fetch the HTML once; every field is extracted from this one snapshot
        review_html = review.get_attribute('innerHTML')
        review_records.append(_parse_review(review_html, curr_product, target_scraper['review_data']))


In [4]:
#turn the two record lists into tables: one row per dict, one column per key
products, reviews = [pd.DataFrame(rows) for rows in (product_records, review_records)]
#optional: persist the tables to disk (left disabled)
#products.to_csv('../data/target_products.csv',index=False)
#reviews.to_csv('../data/target_reviews.csv',index=False)

In [5]:
#preview the first rows of the products table
products.head()

Unnamed: 0,url,product_title,product_details,product_specs,total_star_rating,total_star_votes
0,https://www.target.com/p/doritos-nacho-cheese-...,Doritos Nacho Cheese Flavored Tortilla Chips -...,Highlights\n15.0oz. bag of DORITOS Nacho Chees...,Contains: Milk\nState of Readiness: Ready to E...,4.8,2610 star ratings
1,https://www.target.com/p/adjustable-storage-de...,Adjustable Storage Desk Black - Room Essentials™,Highlights\nRectangular storage desk with open...,Dimensions (Overall): 30 Inches (H) x 42.67 In...,4.5,490 star ratings


In [10]:
#number of rows in the reviews table
len(reviews)

522

In [7]:
#number of rows in the reviews table
len(reviews)
#disabled: presumably reloads previously saved reviews from disk instead of re-scraping - confirm before enabling
#reviews = pd.read_csv('../data/target_reviews.csv')

522