# Actionable Insights from Lululemon Reviews - Data Acquisition

Amanda Cheney  
Metis Project 4  
Part 1 of 4   
November 13, 2020  

**Objective**  

A natural language processing and unsupervised learning exploration of customer reviews of Lululemon’s best-selling sports bras. The goals are to derive actionable insights for the product development and management teams, and to develop a recommender system that provides a curated collection of reviews tailored to each customer's product needs.

**Data Sources**   
9,000+ reviews of all 13 of Lululemon's bestselling sports bras, collected using Selenium.  

**This Notebook**  
Scrapes all reviews for all 13 of Lululemon's bestselling sports bras. 

## Imports

In [1]:
import time, os

from selenium import webdriver

from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

import re
import pickle

In [2]:
# Path to the chromedriver executable.
chromedriver = "/Applications/chromedriver"
# Record the same path in the process environment.
os.environ.update({"webdriver.chrome.driver": chromedriver})

In [3]:
# Open the sports-bra listing page in a new Chrome session.
listing_url = 'https://shop.lululemon.com/c/women-sports-bras/_/N-7vlZ1z12l0t'
driver = webdriver.Chrome(chromedriver)
driver.get(listing_url)

# Scroll to the bottom so that all products on the page get a chance to load,
# then pause before reading the page.
scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
driver.execute_script(scroll_to_bottom)
time.sleep(5)
print("Scrolled to bottom - moving to product pages.")

Scrolled to bottom - moving to product pages.


In [4]:
# Collect the product-page link from every product tile on the listing page.
# find_elements(By.XPATH, ...) is used instead of find_elements_by_xpath,
# which newer Selenium releases no longer provide.
product_links = driver.find_elements(
    By.XPATH, './/h3[@class="product-tile__product-name lll-text-body-1"]//a'
)
product_urls = [link.get_attribute('href') for link in product_links]
product_index = 1

print('To scrape: {} products.'.format(len(product_urls)))

# The listing-page browser is not needed after the URLs have been read
# (my_scraper starts its own browser for each product), so shut it down
# rather than leaving a Chrome process running.
driver.quit()

To scrape: 13 products.


In [5]:
def _safe_read(node, xpath, attribute=None):
    """Return the text (or an attribute) of the first element matching *xpath*.

    *node* is a driver or an element to search under. When *attribute* is
    None the element's visible text is returned; otherwise the value of that
    attribute. Any Selenium failure (for example, no matching element)
    yields an empty string so that one missing field does not abort the
    whole scrape.
    """
    # Imported here so the helper is self-contained; WebDriverException is the
    # base class of Selenium's own errors, including NoSuchElementException.
    from selenium.common.exceptions import WebDriverException

    try:
        element = node.find_element(By.XPATH, xpath)
        if attribute is None:
            return element.text
        return element.get_attribute(attribute)
    except WebDriverException:
        return ""


def my_scraper(product_url):
    """
    Scrape one product page and return one dictionary per customer review.

    Opens a new Chrome session for product_url, scrolls to the bottom of the
    page, and waits 15 seconds. It then reads the product-level fields
    (name, list price, average rating, total number of ratings), clicks the
    "load more" button (waiting 10 seconds after each click) until the button
    is no longer found, and finally extracts the title, content, rating,
    reviewer name and date of every review on the page.

    Each returned dictionary has the keys: 'product_name', 'product_url',
    'product_list_price', 'product_avg_rating', 'title', 'content', 'rating',
    'name', 'date', 'review counter' (1-based position of the review on the
    page) and 'num_total_ratings'. Fields that cannot be read are stored as
    empty strings.

    The product title lookup is not guarded, so a page without a title
    element raises NoSuchElementException. The browser session is always
    shut down before the function returns or raises.
    """
    driver = webdriver.Chrome(chromedriver)
    try:
        driver.get(product_url)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(15)

        # Product information shared by every review of this product.
        product_name = driver.find_element(
            By.XPATH, './/h1[@class="pdp-title"]/div'
        ).text.replace("\n", " ")
        product_list_price = _safe_read(driver, './/span[@class="price-1SDQy price"]')
        product_avg_rating = _safe_read(driver, './/span[@class="bv-rating"]')
        num_total_ratings = _safe_read(driver, './/span[@class="reviews-link__count"]')

        # Load all of the reviews: keep clicking "load more" until it is gone.
        while True:
            try:
                button = driver.find_element(
                    By.XPATH,
                    './/button[@class="bv-content-btn bv-content-btn-pages bv-content-btn-pages-load-more bv-focusable"]',
                )
            except NoSuchElementException:
                print("No more reviews to load ")
                break
            # Click through JavaScript rather than through the element itself.
            driver.execute_script("arguments[0].click();", button)
            time.sleep(10)

        # Extract the details of every review now present on the page.
        my_list = []
        reviews = driver.find_elements(By.XPATH, '//li[@itemprop="review"]')
        for counter, review in enumerate(reviews, start=1):
            my_list.append({
                'product_name': product_name,
                'product_url': product_url,
                'product_list_price': product_list_price,
                'product_avg_rating': product_avg_rating,
                'title': _safe_read(review, './/div[@class="bv-content-title-container"]'),
                'content': _safe_read(review, './/div[@class="bv-content-summary-body-text"]'),
                'rating': _safe_read(review, './/meta[@itemprop="ratingValue"]', "content"),
                'name': _safe_read(review, './/div[@class="bv-author"]'),
                'date': _safe_read(review, './/meta[@itemprop="datePublished"]', "content"),
                'review counter': counter,
                'num_total_ratings': num_total_ratings,
            })
        return my_list
    finally:
        # Always end the browser session, on success or failure; previously the
        # function returned before reaching its close() call, leaving one
        # Chrome window open per scraped product.
        driver.quit()

The above code works: it correctly returns all 89 reviews for the first product URL.

In [6]:
# Product 0: scrape every review, print how many were found, save to disk.
zero = my_scraper(product_urls[0])
print(len(zero))
with open('zero.pickle', 'wb') as pickle_file:
    pickle.dump(zero, pickle_file)

No more reviews to load 
89


In [7]:
# Product 1: scrape every review, print how many were found, save to disk.
one = my_scraper(product_urls[1])
print(len(one))
with open('one.pickle', 'wb') as pickle_file:
    pickle.dump(one, pickle_file)

No more reviews to load 
3036


In [8]:
# Product 2: scrape every review, print how many were found, save to disk.
two = my_scraper(product_urls[2])
print(len(two))
with open('two.pickle', 'wb') as pickle_file:
    pickle.dump(two, pickle_file)

No more reviews to load 
1534


In [9]:
# Product 3: scrape every review, print how many were found, save to disk.
three = my_scraper(product_urls[3])
print(len(three))
with open('three.pickle', 'wb') as pickle_file:
    pickle.dump(three, pickle_file)

No more reviews to load 
53


In [10]:
# Product 4: scrape every review, print how many were found, save to disk.
four = my_scraper(product_urls[4])
print(len(four))
with open('four.pickle', 'wb') as pickle_file:
    pickle.dump(four, pickle_file)

No more reviews to load 
848


In [11]:
# Product 5: scrape every review, print how many were found, save to disk.
five = my_scraper(product_urls[5])
print(len(five))
with open('five.pickle', 'wb') as pickle_file:
    pickle.dump(five, pickle_file)

No more reviews to load 
123


In [12]:
# Product 6: scrape every review, print how many were found, save to disk.
six = my_scraper(product_urls[6])
print(len(six))
with open('six.pickle', 'wb') as pickle_file:
    pickle.dump(six, pickle_file)

No more reviews to load 
691


In [13]:
# Product 7: scrape every review, print how many were found, save to disk.
seven = my_scraper(product_urls[7])
print(len(seven))
with open('seven.pickle', 'wb') as pickle_file:
    pickle.dump(seven, pickle_file)

No more reviews to load 
860


In [14]:
# Product 8: scrape every review, print how many were found, save to disk.
eight = my_scraper(product_urls[8])
print(len(eight))
with open('eight.pickle', 'wb') as pickle_file:
    pickle.dump(eight, pickle_file)

No more reviews to load 
308


In [15]:
# Product 9: scrape every review, print how many were found, save to disk.
nine = my_scraper(product_urls[9])
print(len(nine))
with open('nine.pickle', 'wb') as pickle_file:
    pickle.dump(nine, pickle_file)

No more reviews to load 
540


In [16]:
# Product 10: scrape every review, print how many were found, save to disk.
ten = my_scraper(product_urls[10])
print(len(ten))
with open('ten.pickle', 'wb') as pickle_file:
    pickle.dump(ten, pickle_file)

No more reviews to load 
334


In [17]:
# Product 11: scrape every review, print how many were found, save to disk.
eleven = my_scraper(product_urls[11])
print(len(eleven))
with open('eleven.pickle', 'wb') as pickle_file:
    pickle.dump(eleven, pickle_file)

No more reviews to load 
225


In [18]:
# Product 12: scrape every review, print how many were found, save to disk.
twelve = my_scraper(product_urls[12])
print(len(twelve))
with open('twelve.pickle', 'wb') as pickle_file:
    pickle.dump(twelve, pickle_file)

No more reviews to load 
486
