In [209]:
import time
import numpy as np
import urllib.request as urllib


from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from csv import DictWriter


# Sephora listing pages that are scraped for product links.
# wurl: women's fragrances listing
wurl = "https://www.sephora.com/shop/fragrances-for-women"
# murl: men's fragrances listing
murl = "https://www.sephora.com/shop/fragrances-for-men"
# burl: best-selling perfume page
burl = "https://www.sephora.com/best-selling-perfume"


def auto_pagedowns(browser, no_of_pagedowns):
    """
    Scroll the current page with PAGE_DOWN key presses and return its html.

    input:
        -selenium browser connection
        -number of pg downs (zero, None or a negative value sends no key presses)
    output: pg html (browser.page_source read after the last key press)
    """
    # Key presses are sent to the first <a> element found on the page.
    elem = browser.find_element_by_tag_name("a")

    # Bounded loop: the earlier `while no_of_pagedowns:` countdown never
    # reached zero (and so never stopped) when given a negative count.
    presses = max(0, int(no_of_pagedowns or 0))
    for _ in range(presses):
        elem.send_keys(Keys.PAGE_DOWN)
        # short pause between key presses
        time.sleep(0.1)

    return browser.page_source

def close_popup(browser):
    """
    Dismiss the pop-up on the current page, if one can be found.

    The 'Close' button is tried first; if that lookup or click fails, the
    bestseller modal's close button is tried instead.

    input: selenium browser connection
    output: True if a close button was found and clicked, False otherwise
    """
    close_button_xpaths = (
        "//*/button[@aria-label='Close']",
        #close bestseller pop-up
        "//*/button[@class='Modal-close ng-scope']",
    )
    for xpath in close_button_xpaths:
        try:
            browser.find_element_by_xpath(xpath).click()
        except Exception:
            # Button missing or not clickable: try the next candidate.
            # (`Exception` rather than a bare except, so KeyboardInterrupt
            # and SystemExit still propagate.)
            continue
        return True

    # No pop-up could be closed; callers can carry on with the page as it is
    # instead of the whole scrape aborting.
    return False
    
    

def run_automations(url, pages=20, no_of_pagedowns=6):
    """
    input: url from sephora.com/shop/fragrances...
           pages: number of listing pages to walk through (0 = single page, bestsellers layout)
           no_of_pagedowns: PAGE_DOWN presses sent on each page before its html is read
    runs automation to close pop-up, run page down, and navigate to next pg actions
    output: list of urls for each fragrance by page (one inner list per page)

    The Chrome session is always quit before returning, including when an
    error is raised part-way through.
    """
    url_list = []

    browser = webdriver.Chrome()
    try:
        browser.get(url)
        time.sleep(.5)

        close_popup(browser)
        time.sleep(.2)

        # NOTE(review): find_element(s)_by_class_name are legacy Selenium
        # locator helpers - confirm they exist in the installed version.
        for page in range(pages):
            #click on 'next button' to navigate to next pg
            if page == 1:
                # first element carrying the button class
                browser.find_element_by_class_name('css-1be47h1').click()
            elif page != 0:
                # from the third page on, the second element with that class is clicked
                # (presumably the first one is a 'previous' button - TODO confirm)
                browser.find_elements_by_class_name('css-1be47h1')[1].click()
            #page down
            html = auto_pagedowns(browser, no_of_pagedowns)
            url_list.append(get_detail_url(html))

        #handles case where there are no other pages to navigate to
        if pages == 0:
            html = auto_pagedowns(browser, no_of_pagedowns)
            url_list.append(get_detail_url(html, True))
    finally:
        # Previously the driver was never closed, leaving a Chrome process
        # behind for every call.
        browser.quit()

    return url_list



In [115]:

def get_detail_url(html, page_zero=False):
    """
    input: html from sephora.com/shop/fragrances...
           page_zero: pass True for the bestsellers html format, where the
                      product entries are <a class="u-size1of4"> elements
    output: list with the href for the detail page of each fragrance

    For every product entry the href of its first nested <a> is used; if
    there is none, the entry's own href is used. Entries with no href at all
    are skipped.
    """
    soup = BeautifulSoup(html, 'html.parser')

    if page_zero:
        #handles bestsellers html format
        perfume_nodes = soup.find_all('a', attrs={'class': 'u-size1of4'})
    else:
        perfume_nodes = soup.find_all('div', attrs={'class': 'css-12egk0t'})

    urls = []
    for node in perfume_nodes:
        nested_link = node.find('a')
        href = nested_link.get('href') if nested_link is not None else None
        if href is None:
            # no usable nested link: fall back to the entry's own href
            href = node.get('href')
        if href is not None:
            urls.append(href)
    return urls


In [116]:
"""ONLY RUN ONCE"""
"""script to write csv files for fragrance urls"""

# Scrape the product links of each listing; every result is a list holding
# one list of hrefs per visited page.
wurls_list = run_automations(wurl)
murls_list = run_automations(murl, pages=6)
burl_list = run_automations(burl, pages=0, no_of_pagedowns=22)

# Flatten the per-page lists, keeping only the part of each href that comes
# before its first space.
wurls = [href.split(' ')[0] for page_hrefs in wurls_list for href in page_hrefs]
murls = [href.split(' ')[0] for page_hrefs in murls_list for href in page_hrefs]
burls = [href.split(' ')[0] for page_hrefs in burl_list for href in page_hrefs]

import csv

def url_csv(urls, csvfile):
    """
    write urls to csv, one url per row (any existing file is overwritten)
    inputs:
    list of urls
    file path
    """
    # newline="" hands line-ending control to the csv module, as its
    # documentation requires; without it extra blank rows can appear on
    # platforms that translate "\n" on write.
    with open(csvfile, "w", newline="") as f:
        writer = csv.writer(f, delimiter=",")
        writer.writerows([url] for url in urls)

# Output file names (relative paths) for the three url lists.
women_csvfile = "women_fragrance_urls.csv"
men_csvfile = "men_fragrance_urls.csv"
bestsellers_csvfile = "bestsellers_urls.csv"

# Write each flattened url list to its csv file; url_csv opens the file in
# "w" mode, so an existing file of the same name is overwritten.
url_csv(murls, men_csvfile )
url_csv(wurls, women_csvfile)
url_csv(burls, bestsellers_csvfile )


In [21]:
def build_opener():
    """Install a global urllib opener that sends a 'Mozilla/5.0' User-agent header.

    output: None (the opener is installed as urllib.request's default opener)
    """
    browser_like_opener = urllib.build_opener()
    # replace the opener's header list with a single User-agent entry
    browser_like_opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    urllib.install_opener(browser_like_opener)
    

def write_image(url, filepath):
    """
    Download the resource at `url` and save it as `filepath`.

    input:
        -url to fetch
        -file path the downloaded content is written to
    output: None
    """
    # install the opener carrying the custom User-agent before the request
    build_opener()
    urllib.urlretrieve(url, filename=filepath)

# NOTE(review): `url` is not assigned at module level in any cell visible
# here, so this call appears to rely on a value left over in the kernel -
# confirm where it comes from before re-running this cell.
write_image(url, 'images/image2.jpg')
       

In [198]:
def get_brand_perfume(soup):
    """
    input: parsed product page (BeautifulSoup)
    output: first content item of every <span> inside the
            <h1 class="css-g397qt"> header, or None if the header is missing
    """
    header = soup.find('h1', attrs={'class': 'css-g397qt'})
    # find() signals "not found" by returning None rather than raising, so
    # the missing-header case has to be checked explicitly.
    if header is None:
        return None
    return [inner.contents[0] for inner in header.find_all('span')]

def get_item_no(soup):
    """
    input: parsed product page (BeautifulSoup)
    output: list of the strings inside the <div class="css-altys"> element,
            or None if the page has no such element (matching the other
            get_* helpers)
    """
    item = soup.find('div', attrs={'class': 'css-altys'})
    if item is None:
        # previously this case raised AttributeError on `None.strings`
        return None
    return list(item.strings)

def get_user_feedback(soup):
    """
    input: parsed product page (BeautifulSoup)
    output: string lists of the first two <span> elements inside the
            <div class="css-12ua0v8"> block, or None if the block is missing
    """
    feedback = soup.find('div', attrs={'class': 'css-12ua0v8'})
    # find() returns None when nothing matches (it does not raise), so guard
    # before calling find_all on the result.
    if feedback is None:
        return None
    spans = feedback.find_all('span')
    return [list(inner.strings) for inner in spans][:2]

def get_price(soup):
    """
    input: parsed product page (BeautifulSoup)
    output: `.contents` of the <div class="css-n8yjg7"> element, or None if
            the page has no such element
    """
    price_div = soup.find('div', attrs={'class': 'css-n8yjg7'})
    # explicit check instead of a bare `except:` that hid every error
    if price_div is None:
        return None
    return price_div.contents

def get_oz(soup):
    """
    input: parsed product page (BeautifulSoup)
    output: `.contents` of the <span class="css-fp7pgu"> element, or None if
            the page has no such element
    """
    size_span = soup.find('span', attrs={'class': 'css-fp7pgu'})
    # explicit check instead of a bare `except:` that hid every error
    if size_span is None:
        return None
    return size_span.contents

def get_options(soup):
    """
    input: parsed product page (BeautifulSoup)
    output: list of the strings inside the <div class="css-1h02kfs"> element,
            or None if the page has no such element
    """
    options_div = soup.find('div', attrs={'class': 'css-1h02kfs'})
    # explicit check instead of a bare `except:` that hid every error
    if options_div is None:
        return None
    return list(options_div.strings)

def get_fragrance_info(soup):
    """
    input: parsed product page (BeautifulSoup)
    output: list of the strings inside the <div class="css-1vwy1pm"> element,
            or None if the page has no such element
    """
    info_div = soup.find('div', attrs={'class': 'css-1vwy1pm'})
    # explicit check instead of a bare `except:` that hid every error
    if info_div is None:
        return None
    return list(info_div.strings)

def get_rating(soup):
    """
    input: parsed product page (BeautifulSoup)
    output: one list of strings per item obtained by iterating the
            <table class="css-960eb6"> element, or None if the table is
            missing or one of its items exposes no `.strings`
    """
    rating_table = soup.find('table', attrs={'class': 'css-960eb6'})
    if rating_table is None:
        return None
    try:
        return [list(entry.strings) for entry in rating_table]
    except AttributeError:
        # an item without `.strings`: keep the earlier "give up and return
        # None" outcome, without a bare except that swallows everything
        return None


def get_image_url(soup):
    """
    input: parsed product page (BeautifulSoup)
    output: value of the 'xlink:href' attribute of the first <image> element,
            or None if there is no such element or it lacks that attribute
    """
    image_tag = soup.find('image')
    # explicit check instead of a bare `except:` that hid every error
    if image_tag is None:
        return None
    # .get() returns None when the attribute is absent
    return image_tag.get('xlink:href')
    


def get_all_details(url):
    """
    Open a product page in Chrome and collect its details.

    input: url of a product detail page
    output: dict with the keys brand_info, item_no_or_size, ratings, feedback,
            price, oz_at_price, options, fragrance_info, image_url and url;
            each value comes from the matching get_* helper (url is the
            input url)

    The Chrome session is always quit once the html has been read, including
    when an error is raised while loading the page.
    """
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        time.sleep(.5)
        close_popup(browser)
        html = auto_pagedowns(browser, no_of_pagedowns=6)
        # NOTE(review): this pause comes after the html was captured, so it
        # does not change `html`; kept to preserve the original pacing.
        time.sleep(.5)
    finally:
        # This runs once per product url; without quit() every call left a
        # Chrome process behind.
        browser.quit()

    soup = BeautifulSoup(html, 'html.parser')

    details_dict = {
        'brand_info': get_brand_perfume(soup),
        'item_no_or_size': get_item_no(soup),
        'ratings': get_rating(soup),
        'feedback': get_user_feedback(soup),
        'price': get_price(soup),
        'oz_at_price': get_oz(soup),
        'options': get_options(soup),
        'fragrance_info': get_fragrance_info(soup),
        'image_url': get_image_url(soup),
        'url': url,
    }

    return details_dict



#sample_dict = get_all_details('https://www.sephora.com/product/replica-by-fireplace-P404758?icid2=products')

In [234]:
def write_dict_csv(urls, filename, dict_fields):
    """
    Scrape every url with get_all_details and append one csv row per product.

    input:
        -list of product urls
        -csv file path (opened in append mode, so earlier rows are kept)
        -list of field names used as the csv columns
    output: None

    The header row is written only when the file is new or empty, so calling
    this again on the same file does not insert a second header in the
    middle of the data.
    """
    # newline="" hands line-ending control to the csv module, as its
    # documentation requires.
    with open(filename, "a", newline="") as f:
        writer = csv.DictWriter(f, dict_fields, delimiter=",")
        # In append mode the stream starts at the end of the file, so a
        # position of 0 means there is nothing in the file yet.
        if f.tell() == 0:
            writer.writeheader()
        for url in urls:
            writer.writerow(get_all_details(url))
            