In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import sqlite3
from PIL import Image
import regex as re
import requests
import shutil

## Create Database

In [2]:
def store_database(query, query_data):
    """Run one parameterized statement against ``ProductDatabase.db`` and commit it.

    Every call opens its own connection, creates the ``Data_Overview``,
    ``Product_Ingredient`` and ``Ingredient_Data`` tables when they do not
    exist yet, executes ``query`` with ``query_data`` bound to its ``?``
    placeholders, and commits.

    Args:
        query: SQL text using ``?`` placeholders.
        query_data: Sequence of values bound to the placeholders.

    Returns:
        None.

    Raises:
        Whatever ``sqlite3`` raises while executing or committing; the
        connection is closed before the exception reaches the caller.
    """
    database = sqlite3.connect('ProductDatabase.db')
    try:
        cur = database.cursor()
        cur.execute('''
        CREATE TABLE IF NOT EXISTS Data_Overview
        (Product_Name TEXT PRIMARY KEY, Category TEXT, Rating INTEGER, Image BLOB)
        ''')
        cur.execute('''
        CREATE TABLE IF NOT EXISTS Product_Ingredient (Product_Name TEXT PRIMARY KEY, Ingredients TEXT)
        ''')
        cur.execute('''
        CREATE TABLE IF NOT EXISTS Ingredient_Data (Ingredient_Name TEXT PRIMARY KEY, Rating INTEGER, Concern TEXT)
        ''')
        cur.execute(query, query_data)
        database.commit()
    finally:
        # Close on the failure path too, so a bad statement does not leave
        # the connection (and its file handle) open.
        database.close()

## Store All Collected Data

In [3]:
def store_all_data():
    """Write every entry of the three module-level result dicts to the database.

    Rows are written with ``INSERT OR REPLACE`` through ``store_database``,
    one call per row, in this order: ``Data_Overview``, then
    ``Product_Ingredient``, then ``Ingredient_Data``.
    """
    for product, overview in Data_Overview.items():
        row = (product, overview['Category'], overview['Rating'], overview['Image'])
        statement = '''
        INSERT OR REPLACE INTO Data_Overview VALUES (?, ?, ?, ?)
        '''
        store_database(statement, row)

    for product, details in Product_Ingredient.items():
        row = (product, details['Ingredient List'])
        statement = '''
        INSERT OR REPLACE INTO Product_Ingredient VALUES (?, ?)
        '''
        store_database(statement, row)

    for ingredient, facts in Ingredient_Data.items():
        row = (ingredient, facts['Rating'], facts['Concern'])
        statement = '''
        INSERT OR REPLACE INTO Ingredient_Data VALUES (?, ?, ?)
        '''
        store_database(statement, row)



## Function to Close any Pop-Ups

In [4]:
def close_pop_ups():
    """Click away the first of two known lightbox overlays that can be clicked.

    The two hard-coded lightbox XPaths are tried in order. As soon as one is
    found and clicked the function stops; any exception from the lookup or
    the click moves on to the next XPath. Failing on both is silently
    ignored. Uses the module-level ``driver``.
    """
    lightbox_xpaths = (
        '//*[@id="lightbox-38369f2a-9d8e-4235-889b-c3f626e7be07-1584378520088"]/div',
        '//*[@id="lightbox-c3d6f1b0-00f4-4afb-9adf-2babd8bee6e4-1584379938908"]/div',
    )
    for xpath in lightbox_xpaths:
        try:
            driver.find_element_by_xpath(xpath).click()
        except Exception:
            continue
        return

## Function to Download Image and Store Object File

In [5]:
def get_product_image(product_image):
    """Replace each image URL in ``product_image`` with the downloaded bytes, in place.

    Args:
        product_image: Mutable list of image URLs. After the call, position
            ``i`` holds the response body as ``bytes`` when the download
            returned HTTP 200, or ``None`` when it did not.

    Returns:
        None; the list is modified in place.

    Notes:
        The bytes are taken straight from the response instead of being
        round-tripped through an ``image.jpg`` scratch file. Reading that
        file back after a failed download handed the product the previous
        product's image (or raised ``FileNotFoundError`` on the first one).
        Exceptions raised by ``requests.get`` itself still propagate.
    """
    for position, url in enumerate(product_image):
        response = requests.get(url)

        if response.status_code == 200:
            product_image[position] = response.content
        else:
            # No usable image: store None rather than another product's bytes.
            product_image[position] = None

        

## Function to Get Page Details for a Category

In [6]:
def get_page_data(category_name):
    """Collect every product tile on the currently loaded listing page.

    For each ``product-tile`` element the product link, a name derived from
    that link (characters 45 up to the last one, underscores replaced by
    spaces), the image URL and a rating are gathered. The rating is 0 when
    the score image's class contains ``verified``, the integer parsed from
    characters 52-53 of its ``src`` when the class contains ``squircle``,
    and -1 otherwise or whenever reading the score fails.

    The image URLs are then turned into bytes by ``get_product_image`` and
    the product pages are visited by ``get_product_details``. Finally each
    product not yet present in the module-level ``Data_Overview`` is added.

    Args:
        category_name: Value stored under ``"Category"`` for new products.
    """
    links, names, images, ratings = [], [], [], []

    for tile in driver.find_elements_by_class_name('product-tile'):
        href = tile.find_element_by_tag_name('a').get_attribute('href')
        links.append(href)
        names.append(href[45:-1].replace("_", " "))
        images.append(tile.find_element_by_class_name('product-image').get_attribute('src'))

        # Start from the "unknown" rating and overwrite it only on success.
        rating = -1
        try:
            badge = tile.find_element_by_class_name('product-score-img')
            badge_classes = badge.get_attribute('class')
            if 'verified' in badge_classes:
                rating = 0
            elif 'squircle' in badge_classes:
                rating = int(badge.get_attribute('src')[52 : 54])
        except Exception:
            pass
        ratings.append(rating)

    get_product_image(images)
    get_product_details(links, names)

    for name, rating, image in zip(names, ratings, images):
        if name in Data_Overview:
            continue
        Data_Overview[name] = {
            "Category" : category_name,
            "Rating" : rating,
            "Image" : image
        }

## Function to Get the Ingredient Table Data of Each Product

In [7]:
def get_product_details(product_link, product_name):
    """Visit each product page and record its ingredient table.

    For every link the module-level ``driver`` loads the page and reads the
    first ``<table>``: ingredient scores (integer parsed from characters
    52-53 of each score image's ``src``), ingredient names and concerns.
    Ingredients not yet in the module-level ``Ingredient_Data`` are added
    with their rating and concern; the product is added to
    ``Product_Ingredient`` (if absent) with its ingredients joined by
    ``" || "``.

    Args:
        product_link: List of product page URLs.
        product_name: List of product names, parallel to ``product_link``
            (the i-th name belongs to the i-th link).

    Returns:
        None; results are written to the module-level dictionaries.
    """
    # Pair links and names by position. Looking the name up with
    # ``product_link.index(link)`` was quadratic and always resolved a
    # repeated link to the first occurrence's name.
    for position, link in enumerate(product_link):
        driver.get(link)

        table = driver.find_element_by_tag_name('table')

        score = [
            int(image.get_attribute('src')[52 : 54])
            for image in table.find_elements_by_class_name('ingredient-score')
        ]
        ingredients = [
            cell.text.replace('\n', ' | ')
            for cell in table.find_elements_by_class_name('td-ingredient-interior')
        ]
        concern = [
            cell.text.replace('• ', '').replace('\n', ' | ')
            for cell in table.find_elements_by_class_name('td-concern-interior')
        ]

        # Index-based on purpose: a table with fewer scores/concerns than new
        # ingredients still raises IndexError instead of being truncated silently.
        for i, ingredient in enumerate(ingredients):
            if ingredient not in Ingredient_Data:
                Ingredient_Data[ingredient] = {
                    "Rating" : score[i],
                    "Concern" : concern[i]
                }

        name = product_name[position]
        if name not in Product_Ingredient:
            Product_Ingredient[name] = {
                "Ingredient List" : " || ".join(ingredients)
            }

## Main Function that drives the Code

In [8]:
def _next_page_url(xpaths):
    """Return the ``href`` of the first pagination link located via ``xpaths``, else None."""
    for xpath in xpaths:
        try:
            return driver.find_element_by_xpath(xpath).get_attribute("href")
        except Exception:
            continue
    return None


def scrape(start_index=8):
    """Walk the category links of the site menu and scrape every listing page.

    The category links are read from the fifth menu item (two alternative
    XPaths are tried), with the last two links dropped. For each category
    from ``start_index`` onward, every listing page is handed to
    ``get_page_data`` and pagination is followed until no next-page link is
    found; ``store_all_data`` is called after each category.

    Args:
        start_index: Index of the first category to scrape. Defaults to 8,
            the offset that used to be hard-coded (presumably left over from
            resuming an interrupted run — confirm); pass 0 to scrape every
            category.

    Returns:
        None.
    """
    try:
        makeup = driver.find_element_by_xpath('/html/body/div[2]/header/nav[1]/ul[1]/li[5]')
    except Exception:
        makeup = driver.find_element_by_xpath('/html/body/div[2]/header/nav[2]/ul/li[5]/div')

    categories = [
        anchor.get_attribute("href")
        for anchor in makeup.find_elements_by_tag_name('a')
    ][:-2]

    # Candidate locations of the "next page" link. The first page of a
    # category and all later pages use different anchor positions.
    first_page_xpaths = ('/html/body/div[2]/div/main/section[4]/div[1]/a',
                         '/html/body/div[2]/div/main/section[3]/div[2]/a[7]')
    later_page_xpaths = ('/html/body/div[2]/div/main/section[4]/div[1]/a[2]',
                         '/html/body/div[2]/div/main/section[3]/div[2]/a[8]')

    for category in categories[start_index:]:
        category_name = category.split('/')[-2]
        next_page = category
        candidates = first_page_xpaths
        while next_page:
            driver.get(next_page)
            get_page_data(category_name)
            # get_page_data leaves the browser on a product page, so reload
            # the listing before looking for its pagination link.
            driver.get(next_page)
            next_page = _next_page_url(candidates)
            candidates = later_page_xpaths
        # Checkpoint after every category.
        store_all_data()



## Start the Crawler Engine

In [10]:
# Module-level state shared by the scraping functions.
driver = webdriver.Chrome()
driver.implicitly_wait(10)
URL = 'https://www.ewg.org/skindeep/'
Data_Overview = dict()
Product_Ingredient = dict()
Ingredient_Data = dict()

try:
    driver.get(URL)
    scrape()
    store_all_data()
finally:
    # Shut the browser down even when scraping fails. The stray module-level
    # `return` that used to follow was a SyntaxError and has been removed.
    driver.quit()