In [1]:
import re
import time
from collections import defaultdict

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

## Crawling with requests

In [33]:
#scrape html data
url = 'https://www.paulaschoice.com/ingredient-dictionary'
#timeout so a stalled connection cannot hang the notebook forever
r = requests.get(url, timeout=30)
#stop here on an HTTP error instead of silently parsing an error page
r.raise_for_status()
#name the parser explicitly so the parse tree does not depend on which
#optional parsers (lxml, html5lib) happen to be installed
soup = BeautifulSoup(r.content, 'html.parser')

In [35]:
#list of html data for each ingredient
results = soup.find_all('tr', class_="ingredient-result")
#initialize lists for column data
names = []
descs = []
rats = []
cats = []
#class strings used to locate the rating cell of a row
rating_classes = ['col-rating ingredient-rating rating-good',
                  'col-rating ingredient-rating rating-poor',
                  'col-rating ingredient-rating rating-best',
                  'col-rating ingredient-rating rating-average']
#parse data for each ingredient
#the loop variable is named row (not r) so the response object r is not overwritten
for row in results:
    #ingredient name
    name = row.find('a').string

    #ingredient description (empty string when the row has none)
    desc_tag = row.find('p', class_='description ingredient-description')
    desc = '' if desc_tag is None else desc_tag.string

    #ingredient categories
    #start from an empty list for every ingredient so that a row without
    #a categories div yields '' instead of the previous row's categories
    to_concat = []
    for div in row.find_all('div', class_='categories ingredient-categories'):
        for link in div.find_all('a'):
            to_concat.append(link.string)
    concat = ', '.join(to_concat)

    #ingredient rating
    rating = row.find('td', {'class': rating_classes}).string

    names.append(name)
    descs.append(desc)
    cats.append(concat)
    rats.append(rating)
    

In [36]:
#assemble the scraped lists into one table, one column per field
df = pd.DataFrame(dict(ingredient=names,
                       category=cats,
                       description=descs,
                       rating=rats))

In [37]:
#clean dataframe
#descriptions: remove leading and trailing whitespace
df['description'] = df['description'].str.strip()
#ingredients and categories: convert to lowercase
for text_column in ('ingredient', 'category'):
    df[text_column] = df[text_column].str.lower()

In [12]:
#write the ingredient dictionary to a csv file (the index is written as the first column)
df.to_csv(path_or_buf='skincare ingredient dictionary.csv')

## Crawling with Selenium

In [5]:
#initialize chrome web driver
#no positional driver path: that argument was removed in Selenium 4.10, and
#older releases already default to a 'chromedriver' executable on PATH
driver = webdriver.Chrome()
#urls
base = 'https://www.sephora.com/shop/'
pg2 = '&currentPage=2'
webpages = [base+'face-wash-facial-cleanser?pageSize=300',
              base+'face-serum?pageSize=300',
              base+'face-serum?pageSize=300'+pg2,
              base+'moisturizer-skincare?pageSize=300',
              base+'moisturizer-skincare?pageSize=300'+pg2,
              base+'face-sunscreen?pageSize=300']

urls = []
cats = []
#establish product categories (one entry per webpage, in the same order)
categories = ['cleanser', 'serum', 'serum','moisturizer','moisturizer', 'sunscreen']

try:
    #loop through webpages, pairing each one with its category
    for url, category in zip(webpages, categories):
        #open webpage
        driver.get(url)
        #wait for page to load
        time.sleep(1)
        #find body element
        #find_element(By...) replaces find_element_by_*, which Selenium 4.3 removed
        body = driver.find_element(By.TAG_NAME, "body")

        #scroll down to load entire webpage
        for _ in range(60):
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.2)

        #find product urls
        product_info = driver.find_elements(By.CLASS_NAME, "css-ix8km1")
        for a in product_info:
            urls.append(a.get_attribute('href'))
            #record product category
            cats.append(category)
finally:
    #always close the browser, even when a page fails to load
    driver.quit()

In [42]:
#create url dataframe
#each list is built as a labelled row, then transposed so the labels become the column names
df = pd.DataFrame([urls, cats], index=['url', 'category']).T

In [43]:
#number of product urls collected per category
df['category'].value_counts()

moisturizer    505
serum          443
cleanser       270
sunscreen      157
Name: category, dtype: int64

In [45]:
#write the product urls to a csv file (the index is written as the first column)
df.to_csv(path_or_buf='product urls.csv')

In [46]:
#append empty columns that the product crawl fills in
info_columns = ['brand', 'name', 'rating', 'reviews', 'price',
                'descriptions', 'ingredients', 'productsize']
df_info = pd.DataFrame(columns=info_columns)
#column-wise concat keeps every url row and adds the new, still empty, columns
df = pd.concat([df, df_info], axis=1)

In [63]:
#initialize chrome web driver
#no positional driver path: that argument was removed in Selenium 4.10, and
#older releases already default to a 'chromedriver' executable on PATH
driver = webdriver.Chrome()

#values are written with df.at[row, column]; chained assignment such as
#df.brand[i] = ... raises SettingWithCopyWarning and does nothing under
#pandas Copy-on-Write
#note: the 'reviews' column is never written by this loop
try:
    #loop through each url
    for i in range(len(df)):
        #open webpage
        driver.get(df.at[i, 'url'])
        #wait for data to load
        time.sleep(2)

        #product brand and name
        item = driver.find_element(By.CLASS_NAME, "css-1wd4e6l").text.split('\n')
        df.at[i, 'brand'] = item[0]
        df.at[i, 'name'] = item[1]

        #price
        df.at[i, 'price'] = driver.find_element(By.CLASS_NAME, 'css-1865ad6').text

        #descriptions
        df.at[i, 'descriptions'] = driver.find_element(By.CLASS_NAME, 'css-pz80c5').text

        #one page down
        body = driver.find_element(By.TAG_NAME, "body")
        body.send_keys(Keys.PAGE_DOWN)
        time.sleep(1)

        #ingredients
        #click ingredient tab get ingredient info
        driver.find_element(By.ID, 'tab2').click()
        df.at[i, 'ingredients'] = driver.find_element(By.ID, 'tabpanel2').text

        #scroll down to rating and reviews
        for _ in range(3):
            body = driver.find_element(By.TAG_NAME, "body")
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(1)

        #rating
        try:
            ra = driver.find_element(By.CLASS_NAME, 'css-1kkx19h.eanm77i0').text
            #grab numerator only
            df.at[i, 'rating'] = float(ra.split(' / ')[0])
        #some product don't have ratings
        except NoSuchElementException:
            df.at[i, 'rating'] = np.nan

        #productsize
        try:
            s = driver.find_element(By.CLASS_NAME, 'css-128n72s').text
            #grab number only
            s = s.split('/')[0].split('SIZE ')[1]
            df.at[i, 'productsize'] = s

        #if element does not appear in first location, check second location
        except (NoSuchElementException, IndexError):
            try:
                s = driver.find_element(By.CLASS_NAME, 'css-ta42ek').text
                s = s.split(': ')[1].split('/')[0]
                df.at[i, 'productsize'] = s
            #if element does not appear in either location, product size is not listed
            except (NoSuchElementException, IndexError):
                df.at[i, 'productsize'] = 'not listed'
        time.sleep(0.5)
        #track progress
        print(i, time.ctime())
finally:
    #always close the browser, even when a product page fails
    driver.quit()
df.to_csv('sephora_crawl.csv')

1220 Wed Dec 16 21:35:01 2020
1221 Wed Dec 16 21:35:12 2020
1222 Wed Dec 16 21:35:23 2020
1223 Wed Dec 16 21:35:35 2020
1224 Wed Dec 16 21:35:48 2020
1225 Wed Dec 16 21:36:01 2020
1226 Wed Dec 16 21:36:13 2020
1227 Wed Dec 16 21:36:24 2020
1228 Wed Dec 16 21:36:37 2020
1229 Wed Dec 16 21:36:48 2020
1230 Wed Dec 16 21:37:00 2020
1231 Wed Dec 16 21:37:10 2020
1232 Wed Dec 16 21:37:21 2020
1233 Wed Dec 16 21:37:33 2020
1234 Wed Dec 16 21:37:44 2020
1235 Wed Dec 16 21:37:57 2020
1236 Wed Dec 16 21:38:08 2020
1237 Wed Dec 16 21:38:19 2020
1238 Wed Dec 16 21:38:30 2020
1239 Wed Dec 16 21:38:42 2020
1240 Wed Dec 16 21:38:53 2020
1241 Wed Dec 16 21:39:04 2020
1242 Wed Dec 16 21:39:16 2020
1243 Wed Dec 16 21:39:30 2020
1244 Wed Dec 16 21:39:42 2020
1245 Wed Dec 16 21:39:53 2020
1246 Wed Dec 16 21:40:04 2020
1247 Wed Dec 16 21:40:15 2020
1248 Wed Dec 16 21:40:26 2020
1249 Wed Dec 16 21:40:36 2020
1250 Wed Dec 16 21:40:48 2020
1251 Wed Dec 16 21:40:59 2020
1252 Wed Dec 16 21:41:10 2020
1253 Wed D

In [87]:
#clean up ingredient info and delete text not part of ingredient list
#boilerplate sentences removed from the scraped text; they are matched as
#literal text (as a regex, the parentheses in the second one would be groups
#and the sentence would never match)
_INGREDIENT_DISCLAIMER = 'Please be aware that ingredient lists may change or vary from time to time. Please refer to the ingredient list on the product package you receive for the most up to date list of ingredients.'
_CLEAN_AT_SEPHORA_BLURB = 'Clean at Sephora is formulated without a list of over 50 ingredients, including sulfates (SLS and SLES), parabens, phthalates, and more. For the full list, check out the Ingredients tab.'


def _extract_ingredient_list(text):
    """Return the ingredient-list section of one product's scraped ingredient text.

    The boilerplate sentences are removed and the text is split on blank
    lines. If more than one non-empty section remains, the second one is
    returned; otherwise the only section is returned. An empty text gives ''.
    Non-string values (e.g. NaN for a missing ingredient text) are returned
    unchanged.
    """
    if not isinstance(text, str):
        return text
    text = text.replace(_INGREDIENT_DISCLAIMER, '')
    text = text.replace(_CLEAN_AT_SEPHORA_BLURB, '')
    sections = [section for section in text.split('\n\n') if section != '']
    if not sections:
        return ''
    if len(sections) != 1:
        return sections[1]
    return sections[0]


cleaned = df.ingredients.map(_extract_ingredient_list)

df['ingredients_cleaned'] = cleaned

In [88]:
#display the dataframe to inspect the ingredients_cleaned column
df

Unnamed: 0,url,category,brand,name,rating,price,descriptions,ingredients,productsize,ingredients_cleaned
0,https://www.sephora.com/product/kale-spinach-g...,cleanser,Youth To The People,Superfood Antioxidant Cleanser,4.3,$36.00,What it is: An award-winning face wash with co...,"Water, Cocamidopropyl Hydroxysultaine, Sodium ...",8 oz,"Water, Cocamidopropyl Hydroxysultaine, Sodium ..."
1,https://www.sephora.com/product/soy-face-clean...,cleanser,Fresh,Soy Makeup Removing Face Wash,4.4,$38.00,What it is: A bestselling three-in-one face wa...,-Amino Acid-rich Soy Proteins: Help maintain s...,5.1 oz,"Water, Coco-Glucoside, Glycerin, Butylene Glyc..."
2,https://www.sephora.com/product/green-clean-ma...,cleanser,Farmacy,Green Clean Makeup Removing Cleansing Balm,4.5,$34.00,What it is: A makeup remover and face cleanser...,-Sunflower and Ginger Root Oils: Melt even stu...,3.4 oz,"Cetyl Ethylhexanoate, Caprylic/Capric Triglyce..."
3,https://www.sephora.com/product/the-deep-clean...,cleanser,Tatcha,The Deep Cleanse Gentle Exfoliating Cleanser,4.2,$38.00,Which skin type is it good for?\n✔ Normal\n✔ O...,"-Japanese Luffa fruit Exfoliant: Hydrates, cla...",5 oz,"Water, Sodium Cocoyl Glutamate, Propanediol, G..."
4,https://www.sephora.com/product/tatcha-the-ric...,cleanser,Tatcha,The Rice Wash Skin-Softening Cleanser,4.6,$35.00,"What it is: A PH-neutral, daily cream cleanser...",-Blend of Japanese Algae: Leaves skin feeling ...,4.0 oz,"Aqua/Water/Eau, Microcrystalline Cellulose, Pr..."
...,...,...,...,...,...,...,...,...,...,...
1370,https://www.sephora.com/product/bienfait-teint...,sunscreen,Lancôme,Bienfait Teinté Beauty Balm Sunscreen Broad Sp...,4.2,$47.00,Skin type:\n✔ Normal\n✔ Dry\n✔ Combination\n✔ ...,"-Antioxidants, Vitamin E, Vitamin B5, Vitamin ...",1.7 oz•ITEM 1509348,"Water, Cyclopentasiloxane, Cyclohexasiloxane, ..."
1371,https://www.sephora.com/product/city-block-she...,sunscreen,CLINIQUE,City Block Sheer Oil-Free Daily Face Protector...,4.1,$28.00,What it is:\nA lightweight daily sunscreen tha...,"Titanium Dioxide 7.30% , Zinc Oxide 6.90%Water...",1.4 oz,"Titanium Dioxide 7.30% , Zinc Oxide 6.90%Water..."
1372,https://www.sephora.com/product/sugar-sport-tr...,sunscreen,Fresh,Sugar Sport Treatment Sunscreen SPF 30,4,$25.00,"What it is:\nA durable, water-resistant treatm...",-Avobenzone 2%: Sunscreen\n-Octinoxate 7.49%: ...,0.2 oz•ITEM 1788363,"Cera Alba (Beeswax), Ricinus Communis (Castor)..."
1373,https://www.sephora.com/product/advanced-time-...,sunscreen,Estée Lauder,Advanced Time Zone SPF 15- Normal/Combination ...,3.7,$82.00,What it is: A face cream that helps boost hyal...,"Active Ingredients: Avobenzone 3.00%, Octinoxa...",1.7 oz,"Water, Butylene Glycol, Glycerin, Octyldodecyl..."


In [89]:
#remove the raw ingredients text in place; ingredients_cleaned is kept
df.drop(columns=['ingredients'], inplace=True)

In [90]:
#clean up price column
#a well-formed price is a dollar sign, up to three digits, a dot and two decimals
#raw string so that \$ and \d are regex escapes, not invalid string escapes
price_pattern = r'^\$\d{0,3}\.\d{2}$'
#na=False: a missing price counts as malformed
well_formed = df['price'].str.contains(price_pattern, regex=True, na=False)
#index labels of the rows whose price text has to be trimmed
wrong_indices = df.index[~well_formed].tolist()
#keep only the first space-separated token of the first line;
#assigned through .loc instead of chained df.price[i] = ... assignment
df.loc[wrong_indices, 'price'] = (
    df.loc[wrong_indices, 'price']
    .str.split('\n').str[0]
    .str.split(' ').str[0]
)

In [91]:
#save dataframe
df.to_csv('cleaned sephora crawl.csv')