In [None]:
# imports needed to run this app

from bs4 import BeautifulSoup
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
from datetime import time, datetime
import pandas as pd

In [None]:
# deprecated way of opening ChromeDriver

# !which chromedriver

In [None]:
# open ChromeDriver
# ChromeDriverManager().install() supplies the chromedriver path handed to splinter;
# headless=False is passed so the browser is not run headless.

driver_path = ChromeDriverManager().install()
executable_path = {'executable_path': driver_path}
browser = Browser('chrome', headless=False, **executable_path)

In [None]:
# go to the desired url (the search-results page for "room air purifier")

base_url = 'https://www.walmart.com'
search_url = '/search/?query=room%20air%20purifier'
start_url = f"{base_url}{search_url}"
browser.visit(start_url)

In [None]:
# take the source of the page the browser is currently on and parse it
# with the 'html.parser' parser

html = browser.html
soup = BeautifulSoup(html, features='html.parser')

In [None]:
# Do the product scrape
#
# Walks every page of the search results and collects one entry per product
# tile into the parallel lists below, then assembles them into `product_df`.
# Uses `browser`, `soup` and `base_url` from the cells above.

# Create empty lists for scraped data to be stored in.
# The lists are parallel: index n of every list describes the same product.
productURL_list = []
productImage_list = []
productTitle_list = []
starReview_list = []
reviewAmount_list = []
currentPrice_list = []
reviewURL_list = []
freepickup_list = []

# Find the number of pages for this site: the fourth word of the first
# paginator link's aria-label is used as the page count.
listOfPages = soup.find('ul', class_='paginator-list').find_all('li')
numberOfPages = listOfPages[0].find('a')['aria-label'].split()[3]
print(f"the number of pages to loop through is {numberOfPages}")

# datetime object containing current date and time
now = datetime.now()

for i in range(1, int(numberOfPages) + 1):
# for i in range(1,2):

    # click on the next page number and scrape the html
    browser.links.find_by_text(str(i)).click()
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # loop through all of the products on that page
    products = soup.find_all('div', class_='search-result-gridview-item')
    print(f"quering page {i}...............")

    for product in products:

        # Get the link to the product page.
        href = product.find('a')['href']
        product_url = base_url + href
        productURL_list.append(product_url)

        # Get the product image link and the product title; both are read from
        # the first <img> of the tile (src and alt), so look it up once.
        image_tag = product.find('img')
        productImage_list.append(image_tag['src'])
        productTitle_list.append(image_tag['alt'])

        # Get the number of reviews, there may not be any.
        review_count_tag = product.find('span', class_='seo-review-count visuallyhidden')
        review_amount = review_count_tag.text if review_count_tag is not None else 0
        reviewAmount_list.append(review_amount)

        # In the case where there is at least one review...
        if int(review_amount) > 0:
            # Get the average number of stars
            stars_review = product.find('span', class_='visuallyhidden seo-avg-rating').text
            starReview_list.append(stars_review)

            # Get the URL to the reviews section for that product.
            review_url = product.find('div', class_='stars').find('a')['href']
            reviewURL_list.append(base_url + review_url)

        # Otherwise, use default 0 or NaN values for these entries.
        else:
            stars_review = 0
            starReview_list.append(stars_review)

            review_url = "NaN"
            reviewURL_list.append(review_url)

        # Get the price of the product.
        # Try the main price block first, then the generic product-price span
        # (some products don't have a specific price, or the price is only
        # shown in cart).  If neither yields a value, store "NaN" -- the same
        # placeholder used for a missing reviews URL -- instead of raising and
        # aborting the whole scrape with the lists left half-filled.
        current_price = "NaN"
        for price_class in ('price-main-block', 'search-result-productprice'):
            price = product.find('span', class_=price_class)
            hidden_price = price.find('span', class_='visuallyhidden') if price is not None else None
            if hidden_price is not None:
                current_price = hidden_price.text
                break
        currentPrice_list.append(current_price)

        # Check to see if the product has 'Free pickup' AKA in store.
        # Look at the div under the price div and test every delivery option
        # on its own, so one oddly shaped option cannot hide a 'Free pickup'
        # entry next to it.  No shipping div at all means "not in store".
        shipping_details = product.find('div', class_='search-result-product-shipping-details')
        if shipping_details is None:
            instore = False
        else:
            instore = any(
                option.get_text().strip() == 'Free pickup'
                for option in shipping_details.find_all(True, recursive=False)
            )
        freepickup_list.append(instore)

# Close the browser window
# browser.quit()

# Create a dictionary with the lists of the scraped data.  This is done once,
# after every page has been visited, rather than on each page.
data = {
    "Title": productTitle_list,
    "URL": productURL_list,
    "Image": productImage_list,
    "AverageStars": starReview_list,
    "NumberofReviews": reviewAmount_list,
    "ReviewsURL": reviewURL_list,
    "Price": currentPrice_list,
    "Free Pickup": freepickup_list
}

# Create a Pandas DataFrame with that dictionary
product_df = pd.DataFrame.from_dict(data)

In [None]:
# casting columns as necessary and checking the data types
# (review counts become integers and star averages become floats)

column_types = {"AverageStars": 'float', "NumberofReviews": 'int'}
product_df = product_df.astype(column_types)

product_df.dtypes

In [None]:
# ordering dataframe before saving as csv:
# both keys descending, so free-pickup rows come first and, within each group,
# the products with the most reviews come first

sort_columns = ['Free Pickup', 'NumberofReviews']
ordered_df = product_df.sort_values(by=sort_columns, ascending=False)
ordered_df.head()
# shortlist_df = product_df.sort_values(by='NumberofReviews', ascending=False)[product_df['Free Pickup']==True]
# shortlist_df.head()

In [None]:
# checking the length of the dataframe and other stats
# (shown for the sorted frame; the commented lines are alternative views of
# the unsorted frame)

# product_df.describe()
# product_df.head(6)
ordered_df.describe()

In [None]:
# adding datetime of scrape to csv name
# timestamp layout: dd_mm_YYYY__HH_MM_SS, taken from `now`

dt_string = now.strftime("%d_%m_%Y__%H_%M_%S")
csv_name = f"WalmartRAPScrape_{dt_string}.csv"
ordered_df.to_csv(csv_name)

# End Scrape Here

# Start Testing Comments Scrape Here

In [None]:
# spot-check: product URL of row 0 and reviews URL of row 5, with a divider between them
print(product_df.loc[0, 'URL'])
print('----------------------------------------------------------------------------------------------------')
print(product_df.loc[5, 'ReviewsURL'])

In [None]:
# LEFT OFF HERE
# WANT TO CLICK BY TEXT LIKE IN ABOVE QUERY, DIDN'T WORK THOUGH...
# THIS WILL LET ME GO TO EACH PAGE OF THE REVIEWS,
# AND I THINK IT WILL LET ME LOAD ALL OF THAT PAGE'S REVIEWS

# Get reviews (work in progress)
#
# For each selected product with at least one review: open its reviews link,
# follow the "see all reviews" button, then walk the review pages and print
# every review title.  The per-review lists are filled here; turning them into
# a DataFrame is still commented out further down in this cell.

review_df = pd.DataFrame()

# loop through all of the products in the product dataframe
# for i in range(len(product_df["ReviewsURL"])):
for i in range(4,6):
    if product_df['NumberofReviews'][i] > 0:

        productTitleList = []
        reviewURLList = []
        reviewTitleList = []
        reviewRatingList = []
        reviewCommentList = []

#         try:
        browser.visit(product_df["ReviewsURL"][i])
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        allReviewsURL = soup.find('a', class_="button ReviewBtn-container ReviewsHeader-seeAll button--primary")['href']
        browser.visit(base_url+allReviewsURL)
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')

        # loop through all of the pages in this search
        reviewPages = soup.find('ul', class_='paginator-list').find_all('li')
        pagesOfReviews = reviewPages[-1].find('button').text
        print(f"the number of pages to loop through is {pagesOfReviews}")
        for j in range(1, int(pagesOfReviews)+1):

            # The page numbers here are read from <button> elements (see
            # pagesOfReviews above), while browser.links only searches <a>
            # tags, so click the paginator button directly.
            # NOTE(review): assumes a button exists for every page number --
            # confirm against the live paginator markup.
            page_button_xpath = f'//ul[contains(@class, "paginator-list")]//button[normalize-space()="{j}"]'
            browser.find_by_xpath(page_button_xpath).first.click()
            html = browser.html
            soup = BeautifulSoup(html, 'html.parser')
            reviews = soup.find_all('div', class_='ReviewList-content')

            for review in reviews:

                productTitleList.append(product_df['Title'][i])
                reviewURLList.append(product_df['ReviewsURL'][i])

                # A review without a title element is recorded as the string
                # 'None'; checked explicitly rather than with a bare except so
                # unrelated errors are not swallowed.
                title_tag = review.find('h3', class_='review-title')
                title = title_tag.text if title_tag is not None else 'None'
                reviewTitleList.append(title)
                print(title)
#                         pass

#                 # adding try only because the browser needs to scroll
#                 # once this is added, there wont need to be a try
#                 # each review has to have a star value
#                 try:
#                     reviewRating = review.find('span', class_='seo-avg-rating').text
#                     reviewRatingList.append(reviewRating)
#                     print(reviewRating)
#                 except:
#                     pass

#                 # adding try and except for the review body for same reason as stars
#                 try:
#                     reviewComment = review.find('p').text
#                     reviewCommentList.append(reviewComment)
# #                         print(reviewComment)
#                 except:
#                     pass


#             # only until I fix the scroll
#             reviewTitleList.pop()
#             productTitleList.pop()
#             reviewURLList.pop()

#             data = {
#                 'ProductTitle': productTitleList,
#                 'ReviewURL': reviewURLList,
#                 'ReviewTitle': reviewTitleList,
#                 'ReviewStarRating': reviewRatingList,
#                 'ReviewComment': reviewCommentList
#             }
#             print('stored data')
#             productReviews_df = pd.DataFrame.from_dict(data)
#             productReviews_df.head()
#             print('into dataframe')

#         except:
#             pass
    
#         print('add to existing dataframe')
#         review_df = review_df.append(productReviews_df, ignore_index=True)
# print('done')
# #                 browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")

In [None]:
# len(review_df)
# review_df.tail()
# NOTE(review): the step that fills review_df in the cell above is commented
# out, so review_df is still the empty frame created there; pandas is expected
# to refuse describe() on a frame without columns -- confirm before relying on this.
review_df.describe()

In [None]:
# Pull the headline, star rating and body text out of the first
# 'review-highlight' block of the current soup and show them one per line.
reviews = soup.find('div', class_='review-highlight')
positive = reviews.find('div', class_='font-bold highlight-title').text
stars = reviews.find('span', class_='seo-avg-rating').text
# starts = reviews.find('span', class_='seo-average-rating')
body = reviews.find('div', class_='collapsable-content-container').text
print(positive, stars, body, sep='\n')

In [None]:
# all <div class="ReviewList-content"> elements in the current soup
reviews = soup.find_all('div', class_='ReviewList-content')

In [None]:
# Get reviews
#
# For each selected product that has reviews: open its reviews link, follow the
# "see all reviews" button and collect title / stars / comment of every review
# found on the page that loads, one row per review, into `review_df`.
# NOTE: this cell calls scrapeReviews(), whose definition sits in a later cell
# of this notebook -- that cell has to be run first.

review_df = pd.DataFrame()
product_frames = []  # one DataFrame per product, concatenated once at the end

# for i in range(len(product_df["ReviewsURL"])):
for i in range(4,5):
    if product_df['NumberofReviews'][i] > 0:

        productTitleList = []
        reviewURLList = []
        reviewTitleList = []
        reviewRatingList = []
        reviewCommentList = []

        try:
            # go to the product url link
            browser.visit(product_df["ReviewsURL"][i])
            html = browser.html
            soup = BeautifulSoup(html, 'html.parser')

            # find the 'see all reviews' button and go there
            allReviewsURL = soup.find('a', class_="button ReviewBtn-container ReviewsHeader-seeAll button--primary")['href']
            browser.visit(base_url+allReviewsURL)

            # Scroll to the bottom before reading the page.  The notes in the
            # earlier review cell say the browser has to scroll before the
            # rating / comment elements can be read -- presumably lazy loading.
            # Scrolling is a browser action; the parsed soup cannot do it.
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            # re-read the source: `html` still holds the previous page otherwise
            html = browser.html
            soup = BeautifulSoup(html, 'html.parser')

            # find the list of reviews
            reviews = soup.find_all('div', class_='ReviewList-content')

            for review in reviews:
                # scrape first, append afterwards, so the five lists always
                # stay the same length even if a review fails to parse
                title, stars, comments = scrapeReviews(review)
                productTitleList.append(product_df['Title'][i])
                reviewURLList.append(product_df['ReviewsURL'][i])
                reviewTitleList.append(title)
                reviewRatingList.append(stars)
                reviewCommentList.append(comments)

        except (AttributeError, TypeError) as error:
            # An expected element was missing from the page.  Keep whatever was
            # collected for this product, but say so instead of failing silently.
            print(f"could not finish scraping reviews for product {i}: {error!r}")

        data = {
            'ProductTitle': productTitleList,
            'ReviewURL': reviewURLList,
            'ReviewTitle': reviewTitleList,
            'ReviewStarRating': reviewRatingList,
            'ReviewComment': reviewCommentList
        }
        print('stored data')
        productReviews_df = pd.DataFrame.from_dict(data)
        print('into dataframe')

        print('add to existing dataframe')
        product_frames.append(productReviews_df)

# DataFrame.append is gone from current pandas; concatenate the per-product
# frames once instead of growing review_df inside the loop.
if product_frames:
    review_df = pd.concat(product_frames, ignore_index=True)
print('done')

In [None]:
# len(review_df)
# review_df.tail()
# summary of the review rows gathered by the cell above
review_df.describe()

In [None]:
def scrapeReviews(review):
    """Extract the title, star rating and comment text from one review element.

    Parameters
    ----------
    review
        A parsed review element exposing ``find(name, class_=...)`` whose
        results carry a ``.text`` attribute (e.g. a BeautifulSoup tag).

    Returns
    -------
    tuple
        ``(title, rating, comment)`` as text.  ``title`` is the string
        ``'None'`` when the review has no ``h3.review-title`` element.

    Raises
    ------
    AttributeError
        If the rating ``span.seo-avg-rating`` or the ``p`` holding the comment
        is missing from the review.
    """

    # get the title of the review; a missing title is not an error
    try:
        title = review.find('h3', class_='review-title').text
    except AttributeError:
        title = 'None'

    # get the number of stars in this review
    reviewRating = review.find('span', class_='seo-avg-rating').text

    # get the content of the review
    reviewComment = review.find('p').text

    # return these items as a tuple (the values computed above, not the
    # undefined / leaked names `stars` and `comments`)
    return title, reviewRating, reviewComment

In [None]:
# close the browser window now that scraping is finished
browser.quit()

In [None]:
# Scroll the live page to the bottom.  JavaScript has to be run by the browser;
# `soup` is only parsed HTML and cannot execute scripts.
# NOTE: needs an open browser session, so run this before the browser.quit() cell.
browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")