In [None]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
import requests
import time, os
from mongoengine import *
import pickle

# Load the list of product URL paths to scrape.
# NOTE(security): pickle.load can execute arbitrary code while unpickling.
# Only load a 'sephora_list.pkl' that was produced locally by a trusted run.
with open('sephora_list.pkl', 'rb') as f:
    sephora_prod_list = pickle.load(f)

# --- MongoDB setup -----------------------------------------------------------
# Registers the default mongoengine connection for the Document classes below.
# No host/port is given, so mongoengine's own defaults apply.
connect("mongodb_sephora_reviews")

# --- Selenium setup ----------------------------------------------------------
chrome_options = Options()
# Point Chrome at a dedicated profile directory named "selenium"
# (presumably so cookies/session state survive between runs -- confirm).
chrome_options.add_argument("user-data-dir=selenium")
chromedriver = "/Applications/chromedriver"  # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver
# The options object must be handed to the driver, otherwise the argument
# configured above is silently ignored.
# NOTE(review): `options=` needs Selenium >= 3.8; older releases call the same
# keyword `chrome_options=`.
driver = webdriver.Chrome(chromedriver, options=chrome_options)






## Classes

In [None]:
class Product(Document):
    """
    MongoDB document holding the details scraped from one product page.

    One instance is built and saved per product by
    ``create_product_document``; ``Reviews`` documents refer back to it.
    Every field except ``product_name`` is optional and may be stored as None
    when the corresponding piece of the page is not found.
    """
    brand_name = StringField()
    # Required and declared unique: the product name is the de-facto key of
    # this collection, so saving a second product with the same name is
    # expected to be rejected by mongoengine.
    product_name = StringField(unique=True, required=True)
    # Numeric price; the currency symbol is removed before the value is stored.
    price = FloatField()
    size = StringField()
    # Text following the "What it is:" label on the page.
    description = StringField()
    # Text following the "Skin Type:" label on the page.
    skin_type = StringField()
    # Text following the "Skincare Concerns:" label on the page.
    skin_concerns = StringField()
    # Text following the "Formulation:" label on the page.
    formulation = StringField()
    # Text following the "Ingredient Callouts:" label on the page.
    ingredient_highlights = StringField()
    # URL the browser was on when the product was scraped.
    product_url = StringField()
    # ``src`` of the first product image found on the page, if any.
    product_img_url = StringField()

    
class Reviews(Document):
    """
    MongoDB document holding a single customer review.

    Each review points at the ``Product`` it belongs to through a reference
    field, so reviews can be grouped per product. Instances are created and
    saved by ``create_reviews_documet``.
    """
    # The product this review was scraped from.
    product = ReferenceField(Product)
    username = StringField()
    # Skin type reported by the reviewer (None when the page omits it).
    user_skin_type = StringField()
    review_title = StringField()
    review_text = StringField()
    # Kept as text, not a number: the scraper stores a slice of the rating
    # element's HTML rather than a parsed score.
    review_rating = StringField()
    # Posting date is not collected at the moment; field left disabled.
    #date_posted = StringField()


## Definitions

In [None]:

def soup_get():
    """
    Snapshot the page currently shown by the shared Selenium ``driver``.

    Returns:
        BeautifulSoup: the driver's page source parsed with Python's
        built-in 'html.parser'.
    """
    return BeautifulSoup(driver.page_source, 'html.parser')


def sephora_url_generator(baseurl):
    """
    Prefix a site-relative path with the Sephora host name.

    Args:
        baseurl (str): path portion of a URL, e.g. '/product/some-item'.

    Returns:
        str: 'sephora.com' immediately followed by ``baseurl``. No scheme
        ('https://') and no separator are added, so the result is only a
        complete URL if the caller supplies those parts.
    """
    host = 'sephora.com'
    return host + baseurl


def check_exists_by_xpath(xpath):
    """
    Report whether the current page contains an element matching ``xpath``.

    Args:
        xpath (str): XPath expression evaluated by the shared ``driver``.

    Returns:
        bool: True when the lookup succeeds, False when Selenium raises
        NoSuchElementException. Any other exception propagates.
    """
    try:
        driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        found = False
    else:
        found = True
    return found


def _first_image_url(page_soup):
    """Return the ``src`` of the first image inside a 'css-1xhaj19' list item, or None."""
    for item in page_soup.find_all('li', class_='css-1xhaj19'):
        if item.img is not None:
            return item.img.get('src')
    return None


def _class_text(page_soup, css_class, tag_name=None):
    """Return the text of the first element with ``css_class`` (optionally of ``tag_name``), or None."""
    node = page_soup.find(tag_name, class_=css_class)
    return None if node is None else node.text


def _labelled_text(page_soup, label):
    """Return the stripped text that follows ``label`` on the page, or None.

    The label is looked up as-is and with one or two trailing spaces, because
    the page markup is inconsistent about that whitespace.
    """
    for variant in (label, label + " ", label + "  "):
        node = page_soup.find(text=variant)
        if node is None:
            continue
        following = node.next
        # The element after the label can be a tag instead of a text node;
        # only text nodes carry the value we want.
        if isinstance(following, str):
            value = following.strip()
            if value:
                return value
    return None


def _strip_prefix(text, prefix):
    """Remove ``prefix`` from the start of ``text`` (if present) and trim whitespace."""
    text = text.strip()
    if text.startswith(prefix):
        text = text[len(prefix):]
    return text.strip()


def _parse_price(text):
    """Return the first number found in a price string as a float, or None.

    '$' signs and thousands separators are ignored; for a range such as
    '$25.00 - $30.00' the first (lower) amount is returned.
    """
    for token in text.replace("$", " ").replace(",", "").split():
        try:
            return float(token)
        except ValueError:
            continue
    return None


def create_product_document(page_soup=None):
    """
    Build and save a ``Product`` document from a parsed product page.

    Args:
        page_soup: BeautifulSoup tree of the product page. When omitted, the
            notebook-level global ``soup`` is used, which keeps existing
            zero-argument calls working.

    Returns:
        The saved ``Product`` document.

    Optional details (image, size, description, skin type, concerns,
    formulation, ingredient call-outs, price, brand) are stored as None when
    they cannot be found instead of raising. ``product_name`` is required by
    the ``Product`` schema, so a page without it is expected to fail
    validation on save. The product URL is read from the shared ``driver``.
    """
    if page_soup is None:
        page_soup = soup  # parsed page published by the scraping loop

    size_text = _class_text(page_soup, 'css-15ro776')
    # str.strip("Size: ") would strip a *set of characters* from both ends
    # (turning "Size: 1 oz" into "1 o"); remove the literal prefix instead.
    size_ = None if size_text is None else _strip_prefix(size_text, "Size:")

    # The price is rendered with one of two class names depending on the page.
    price_text = _class_text(page_soup, 'css-0', 'b')
    if price_text is None:
        price_text = _class_text(page_soup, 'css-5fq4jh', 'b')
    price_ = None if price_text is None else _parse_price(price_text)

    product_page = Product(
        brand_name=_class_text(page_soup, 'css-nc375s'),
        product_name=_class_text(page_soup, 'css-1pgnl76'),
        price=price_,
        size=size_,
        description=_labelled_text(page_soup, "What it is:"),
        skin_type=_labelled_text(page_soup, "Skin Type:"),
        skin_concerns=_labelled_text(page_soup, "Skincare Concerns:"),
        formulation=_labelled_text(page_soup, "Formulation:"),
        ingredient_highlights=_labelled_text(page_soup, "Ingredient Callouts:"),
        product_url=driver.current_url,
        product_img_url=_first_image_url(page_soup),
    ).save()

    return product_page


def _review_part_text(review, *args, **kwargs):
    """Return the text of the first element matched by ``review.find(...)``, or None."""
    node = review.find(*args, **kwargs)
    return None if node is None else node.text


def create_reviews_documet(beaut_soup_reviews_html, product_page):
    """
    Save one ``Reviews`` document per review block found on a product page.

    Args:
        beaut_soup_reviews_html: iterable of BeautifulSoup elements, one per
            review (the scraping loop passes
            ``soup.select('div:is(.css-13o7eu2)')``).
        product_page: the ``Product`` document the reviews belong to.

    Returns:
        list: the saved ``Reviews`` documents, in page order (empty when no
        review blocks were given).

    Every piece of a review (user name, skin type, title, body) is optional:
    a missing element is stored as None rather than aborting the whole page.
    """
    saved_reviews = []

    for review in beaut_soup_reviews_html:
        # The rating is taken as a fixed slice of the rating element's HTML
        # with surrounding double quotes removed.
        # NOTE(review): this depends on the exact markup layout (tag name and
        # attribute order/length) -- confirm against the live page.
        rating_markup = str(review.find(class_='css-4qxrld'))

        saved_reviews.append(
            Reviews(
                product=product_page,
                username=_review_part_text(review, 'strong'),
                user_skin_type=_review_part_text(review, 'span', class_='css-t72irq'),
                review_title=_review_part_text(review, class_='css-m9drnf'),
                review_text=_review_part_text(review, class_='css-1x44x6f'),
                review_rating=rating_markup[16:25].strip('"'),
            ).save()
        )

    return saved_reviews



## Web Scraping Script

In [None]:
# Vertical positions (px) visited one after another so that lazily loaded
# sections of the product page get rendered before it is parsed.
SCROLL_OFFSETS = (300, 600, 900, 1200, 1500, 1650, 1700, 2000)
SCROLL_PAUSE_SECONDS = 5      # wait between two scroll steps
PAGINATION_PAUSE_SECONDS = 1  # wait around a click on the review pager
REVIEW_SELECTOR = 'div:is(.css-13o7eu2)'
# The pager's "next" button sits in a different <li> depending on how many
# page links are shown; positions 9 down to 2 are probed in that order.
PAGER_BUTTON_XPATH = "//*[@id='ratings-reviews-container']/div[2]/ul/li[{}]/button"

for product_path in sephora_prod_list:
    driver.get('https://www.sephora.com/' + product_path)

    # window.scrollTo takes (x, y): keep x at 0 and only move down the page.
    for step, offset in enumerate(SCROLL_OFFSETS):
        if step:
            time.sleep(SCROLL_PAUSE_SECONDS)
        driver.execute_script("window.scrollTo(0, arguments[0])", offset)

    # `soup` stays a notebook-level name because create_product_document()
    # reads it when called without arguments.
    soup = soup_get()
    review_blocks = soup.select(REVIEW_SELECTOR)

    # select() returns a list (never None); an empty list means the page has
    # no reviews, in which case the product is skipped.
    if not review_blocks:
        continue

    # Create the MongoDB document in the product collection.
    product_page = create_product_document()

    # Create MongoDB documents in the reviews collection for the first page.
    create_reviews_documet(review_blocks, product_page)

    # Walk through the remaining review pages.
    while True:
        try:
            pager_enabled = driver.find_element_by_class_name('css-2anst8').is_enabled()
        except NoSuchElementException:
            # No pager on the page at all: nothing more to collect.
            break
        if not pager_enabled:
            break

        driver.execute_script("window.scrollTo(0, 2500)")
        time.sleep(PAGINATION_PAUSE_SECONDS)

        for position in range(9, 1, -1):
            button_xpath = PAGER_BUTTON_XPATH.format(position)
            if check_exists_by_xpath(button_xpath):
                driver.find_element_by_xpath(button_xpath).click()
                break
        else:
            # None of the candidate buttons exists: last page reached.
            break

        # Give the next page of reviews a moment to render before parsing;
        # reading the page source right after the click risks re-reading the
        # previous page. NOTE(review): tune the pause if pages load slowly.
        time.sleep(PAGINATION_PAUSE_SECONDS)
        soup = soup_get()
        create_reviews_documet(soup.select(REVIEW_SELECTOR), product_page)

    print(f"Scrapping Completed for {product_path}")

        



In [None]:
# Selenium's WebDriver has no end() method; quit() closes every browser
# window and shuts down the chromedriver process.
driver.quit()