# TDI Capstone Project : Allergen Finder

In [1]:
import json
import pickle
import requests
from bs4 import BeautifulSoup
import re
import os, sys
from collections import namedtuple, defaultdict, Counter
import time
import random
import contextlib
from operator import itemgetter
from tabulate import tabulate
from pprint import pprint
import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys as KEYS
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.expected_conditions import staleness_of

In [2]:
TARGET_PREFIX = 'https://www.target.com'
RESULTS_URL = f'{TARGET_PREFIX}/s?searchTerm=Sunscreen'
CHROME_DRIVER = "/Users/alexarmstrong/local/bin/chromedriver"
PRODUCT_BLACKLIST = [
    'Babyganics Sun & Bug Duo Set', # Duo set, isolated sunscreen already included
]

In [10]:
ProductLink = namedtuple('ProductLink', ['title', 'url'])

fields = ('name', 'rank', 'percent', 'alt_names')
Ingredient = namedtuple('Ingredient', fields, defaults=(None,)*len(fields))

In [6]:
# URL to sunscreen search result pages
result_urls = json.load(open('data/sunscreen_result_urls.json','r'))

# (name, url) for products in all search result pages
tmp = json.load(open('data/sunscreen_products.json','r'))
product_links = [[ProductLink(x[0],x[1]) for x in p] for p in tmp]

# HTML source for each product page
product_pages = pickle.load(open('data/product_pages.pkl', 'rb'))

# {'product name' : {key : val}}
product_details = pickle.load(open('data/product_details.pkl','rb'))
#json.dump(product_details, open('data/product_details.json', 'w'))

# {'brand_name' : [product names]} for all products
ordered_prod_by_brand = json.load(open('data/products_by_brand.json','r'))

Project Requirements
- [x] Access search results page
- [x] Extract links from a single page
- [x] Extract links from all results pages
- [x] Extract product links from all results pages
- [x] Extract drug facts from a single product page
- [X] Extract drug facts from all product pages
- [x] Clean data enough for pitch demo
- [x] Write function for scoring ingredients
- [x] Write function for scoring products
- [ ] Add a visual (e.g. 2D plot of products; with similarity to irritating and non-irritating products on the two axes)
- [x] Build webapp
- [ ] Deploy webapp

Improvements
- Clean Data
    - [ ] Find and correct typos (e.g. octorcylene instead of octocrylene)
    - [ ] Map synonyms to same word (e.g. water, aqua, and eau)
    - [ ] Remove irrelevant descriptors (e.g. vitis vinifera, vitis vinifera seed extract, and vitis vinifera fruit extract)
    - [ ] Parse product titles (brand, name, spf, fluid oz, other)
- Augment Data
    - [ ] Extract ingredients from products without drug facts tab
    - [ ] Add sunscreen products from sites other than target
    - [ ] Build database of ingredients that includes irritation risk and name variants (e.g. scrape EWG site)
    - [ ] Get info on relative amount of ingredients within a product
    - [ ] Add non-sunscreen products
- Improve ingredient score
    - [ ] Incorporate document frequency of ingredients over entire product database
- Improve product score
    - [ ] cosine similarity or Word2Vec?
- [ ] Normalize data and store in SQLite3 database

Above and Beyond
- [ ] Allow inputs to include brands in addition to exact product names
- [ ] Include images of products

## Application Code

In [7]:
#INGREDIENTS = json.load('product_ingredients.json')

# def find_potential_allergens(product_names):
#     product_ingredients = [get_ingredients(p) for p in product_names]
#     allergens = determine_allergens(product_ingredients)
#     return allergens

# def get_ingredients(product_name):
#     return INGREDIENTS.get(product_name, [])

# def determine_allergens(product_ingredients):
#     all_ingredients = {ingredient for prod in product_ingredients for ingredient in prod}
#     prod_freq = get_document_frequency(all_ingredients, product_ingredients)
#     return []

Data Wrangling Functions

In [8]:
def initialize_driver(headless=True):
    opts = Options()
    if headless:
        opts.add_argument("--headless")
    #opts.binary_location = ""
    driver = webdriver.Chrome(options=opts, executable_path=CHROME_DRIVER)
    return driver

@contextlib.contextmanager
def chrome_driver():
    driver = initialize_driver()
    yield driver
    driver.close()

@contextlib.contextmanager
def wait_for_page_load(driver, timeout=30, poll_freq=2):
    # See https://stackoverflow.com/questions/5868439/wait-for-page-load-in-selenium
    old_page = driver.find_element_by_tag_name('html')
    
    yield
    
    wait = WebDriverWait(driver, timeout, poll_freq)
    wait.until(staleness_of(old_page)) 

def get_page_source(url, driver=None):
    if not driver:
        with chrome_driver() as driver:
            return get_page_source(url, driver)
    
    with wait_for_page_load(driver):
        driver.get(url)
        html = driver.find_element_by_tag_name('html')
        html.send_keys(KEYS.END)
    return driver.page_source

def click_if_one(eles):
    if len(eles) == 1:
        eles[0].click()   

def get_about_page_source(url, driver=None):
    if not driver:
        with chrome_driver() as driver:
            return get_about_page_source(url, driver)
    
    with wait_for_page_load(driver):
        driver.get(url)
    
    click_if_one(driver.find_elements_by_xpath("//button[text()='Show more']"))
    click_if_one(driver.find_elements_by_id('tab-Drugfacts'))
    click_if_one(driver.find_elements_by_id('tab-Details'))
    
    return driver.page_source
    
def get_product_links(page_source):
    soup = BeautifulSoup(page_source) 
    product_cards = soup.find_all('li', attrs={'data-test': 'list-entry-product-card'})
    products = []
    for card in product_cards:
        title = card.find('a', attrs={'data-test': 'product-title'})
        if not title:
            products.append("<Not Found>")
        elif title in PRODUCT_BLACKLIST:
            continue
        else:
            url = f"{TARGET_PREFIX}{title['href']}"
            products.append(ProductLink(title.string, url))
    return products

Utilities

In [9]:
from datetime import datetime
class LoadingBar():
    def __init__(self, total, mod=None):
        self.start = None
        self.count = 0
        self.total = total
        self.mod   = mod if mod else 1 if total < 100 else total//100
        
    def begin(self):
        self.start = datetime.now()
        return self
        
    def update(self):
        if not self.start:
            self.begin()
        self.count += 1
        
        time_elapsed = (datetime.now()-self.start).seconds
        minutes, sec = divmod(time_elapsed, 60)
        time_str = f'{minutes: >2}min {sec:02}s'
        
        rate = self.count/time_elapsed if time_elapsed > 0 else 1
        remaining_time = ((self.total-self.count)/rate)/60
        
        if (self.count % self.mod) and (self.count < self.total):
            return
        
        print(f'{self.count/self.total:4.0%} Done; ({self.count:>4}/{self.total}) {time_str} elapsed [~{round(remaining_time)}min remaining]    \r', end='')
        sys.stdout.flush()
        
        if self.count == self.total:
            print()

## Getting Page Sources

Getting the first search result page

In [11]:
test_page_source = get_page_source(RESULTS_URL)

Getting links to other search result pages

In [12]:
soup = BeautifulSoup(test_page_source)
print(len(soup.prettify().split('\n')), 'lines of HTML')

3335 lines of HTML


In [13]:
next_page_button = soup.find_all('a',attrs={'data-test':"next"})
print(len(next_page_button) == 1)
next_page_button = next_page_button[0]

next_page_url = f'{TARGET_PREFIX}{next_page_button["href"]}'
print(next_page_url)

n_pages = soup.find_all('button',attrs={'data-test':"select"})
print(len(n_pages) == 1)
n_pages = n_pages[0]
print(n_pages.text)

True
https://www.target.com/s?searchTerm=Sunscreen&Nao=24
True
page 1 of 14


In [553]:
# Get urls to all result pages (currently scraping product links as well)
#%%time
# product_links = []
# result_urls = []
# n_pages = 14
# url = RESULTS_URL
# for page_num in range(1,n_pages+1):
#     print(f"\nLoading Page {page_num} :", url)
#     page_source = get_page_source(url, driver)
#     soup = BeautifulSoup(page_source)
    
#     # Get results page number
#     page_n = soup.find_all('button',attrs={'data-test':"select"})
#     if len(page_n) != 1:
#         print("BUG 1")
#     else:
#         print(f'INFO ::', page_n[0].text)
    
#     # Check
#     if driver.current_url != url:
#         print("DEBUG :: URL Changes")
#         print("DEBUG :: Before:", url)
#         print("DEBUG :: After:", driver.current_url)
#         break
#     result_urls.append(driver.current_url)
    
#     # Parse
#     products = get_product_links(page_source)
#     product_links.append(products)
#     print(f'INFO :: {len(products)} products extracted')
    
#     if page_num < n_pages:  
#         next_page = soup.find_all('a',attrs={'data-test':"next"})
#         if len(next_page) != 1:
#             print("DEBUG :: Can't find next page button")
#             break

#         next_page = next_page[0]
#         url = f'{TARGET_PREFIX}{next_page["href"]}'

In [15]:
for i, products in enumerate(product_links, 1):
    print(f'Page {i:>2} : {len(products)} products :', result_urls[i-1])

Page  1 : 28 products : https://www.target.com/s?searchTerm=Sunscreen
Page  2 : 28 products : https://www.target.com/s?searchTerm=Sunscreen&Nao=24
Page  3 : 28 products : https://www.target.com/s?searchTerm=Sunscreen&Nao=48
Page  4 : 26 products : https://www.target.com/s?searchTerm=Sunscreen&Nao=72
Page  5 : 24 products : https://www.target.com/s?searchTerm=Sunscreen&Nao=96
Page  6 : 24 products : https://www.target.com/s?searchTerm=Sunscreen&Nao=120
Page  7 : 24 products : https://www.target.com/s?searchTerm=Sunscreen&Nao=144
Page  8 : 24 products : https://www.target.com/s?searchTerm=Sunscreen&Nao=168
Page  9 : 24 products : https://www.target.com/s?searchTerm=Sunscreen&Nao=192
Page 10 : 24 products : https://www.target.com/s?searchTerm=Sunscreen&Nao=216
Page 11 : 24 products : https://www.target.com/s?searchTerm=Sunscreen&Nao=240
Page 12 : 24 products : https://www.target.com/s?searchTerm=Sunscreen&Nao=264
Page 13 : 24 products : https://www.target.com/s?searchTerm=Sunscreen&Nao=28

In [549]:
# json.dump(product_links, open('sunscreen_products.json','w'))
# json.dump(result_urls, open('sunscreen_result_urls.json','w'))

**Getting a single product page source**

In [244]:
all_products = [t for p in product_links for t in p]
all_titles = [p.title for p in all_products]
all_urls   = [f'{TARGET_PREFIX}{p.url}' for p in all_products]
print(len(set(all_titles)))

323


In [17]:
test_url = f'{TARGET_PREFIX}{all_products[5].url}'.replace('#lnk=sametab','')
print(test_url)
test_page_source = get_about_page_source(test_url)
print(len(test_page_source), 'characters of HTML')

https://www.target.com/p/cerave-am-facial-moisturizing-lotion-with-sunscreen-spf-30/-/A-81616314?preselect=76545851
620690 characters of HTML


In [18]:
soup = BeautifulSoup(test_page_source)
print(len(soup.prettify().split('\n')), 'lines of HTML')

5660 lines of HTML


In [19]:
item_details_soup = soup.find('div', attrs={'id': 'tabContent-tab-Details'})
drug_facts_soup = soup.find('div', attrs={'id': 'tabContent-tab-Drugfacts'})
print(bool(drug_facts_soup), bool(item_details_soup))

False True


**Getting page source for all products**

In [1065]:
indices = sorted(set([
    #1,2,3,
    #*list(range(0,10)),
    *list(range(0,len(all_urls))),
]))
print('Indices:', indices)

selector = itemgetter(*indices)
urls = selector(all_urls)
print('\n',len(urls), 'URLs')

Indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 

In [839]:
# product_pages = ['']*len(all_urls)
# load_bar = LoadingBar(len(urls), mod=1).begin()
# for i, url in enumerate(all_urls):
#     load_bar.update()
#     if i not in indices:
#         continue
#     if all_titles[i] in PRODUCT_BLACKLIST:
#         continue
#     try:
#         test_page_source = get_about_page_source(url)
#     except:
#         print()
#         print('ERROR :: url   =', url)
#         print(sys.exc_info()[0])
#         test_page_source = ''
#     product_pages[i] = test_page_source
# print()

100% Done; (  56/56)  7min 54s elapsed [~0min remaining]    



In [20]:
print(len(product_pages),'pages scraped')
print(sum([1 for p in product_pages if not p]), 'pages failed to be scraped')

338 pages scraped
0 pages failed to be scraped


Diff check before overwriting

In [22]:
old_product_pages = pickle.load(open('product_pages.pkl', 'rb'))

In [23]:
for i, source in enumerate(old_product_pages):
    if source != product_pages[i]:
        print(i, end=', ')
print()
print('DONE')


DONE


In [849]:
#pickle.dump(product_pages, open('product_pages.pkl', 'wb'))

## Extracting Data

**Extracting Product Links** from results pages

In [33]:
test_url = RESULTS_URL
print(test_url)
test_page_source = get_page_source(test_url)

https://www.target.com/s?searchTerm=Sunscreen


In [34]:
# Extract Manually
soup = BeautifulSoup(test_page_source)
body = soup.body
root = body.find('div', id='root')
main = root.find('div', id='mainContainer')
grid = main.find('div', attrs={'data-test': 'product-grid'})
product_list = grid.find('div', attrs={'data-test': 'product-list-container'})
product_cards = product_list.find_all('li', attrs={'data-test': 'list-entry-product-card'})
print(len(product_cards), 'products scraped')

test_card = product_cards[-1]
card_body = test_card.find('div', attrs={'data-test': 'productCardBody'})
product_title = card_body.find('a', attrs={'data-test': 'product-title'})
print(product_title.string)

28 products scraped
Neutrogena Beach Defense Lotion - SPF 50 - 6.7 fl oz


In [36]:
# Function call
products = get_product_links(test_page_source)
for i, p in enumerate(products):
    print(f'{i:>2} : {p.title}')

 0 : Continuous Sport Sunscreen Spray - up & up™
 1 : Sport Sunscreen Spray - SPF 50 - 2pk/11oz - up & up™
 2 : CeraVe Hydrating Mineral Face Sunscreen Lotion with Zinc Oxide – SPF 30 - 2.5oz
 3 : Neutrogena Kids Oil Free Water Resistant Sunscreen Spray Pack - SPF 70 - 5oz
 4 : Coppertone Sport Sunscreen Spray
 5 : CeraVe Ultra-Light Face Moisturizer with Sunscreen - SPF 30 – 1.7oz
 6 : Banana Boat Ultra Sport Clear Sunscreen Spray
 7 : Sun Bum Original Sunscreen Spray - 6 oz
 8 : CeraVe Hydrating Mineral Face Sunscreen Lotion with Zinc Oxide – SPF 50 – 2.5oz
 9 : Sun Bum Original Sunscreen Lotion
10 : Sport Sunscreen Lotion - 10.4 fl oz - up & up™
11 : CeraVe AM Facial Moisturizing Lotion with Sunscreen - SPF 30
12 : Neutrogena Ultra Sheer Dry-Touch Sunscreen Lotion
13 : Kids Sunscreen Spray - SPF 50 - up & up™
14 : Coppertone Pure & Simple Sunscreen Lotion - SPF 50 - 6 fl oz
15 : Sun Bum Sunscreen Face Stick - SPF 30 - 0.45oz
16 : Coppertone Kids Sunscreen Spray - SPF 50 - 11oz - Twi

**Extracting Product Details** from product page
- [ ] Scrape ingredients from ingredient tab
- [ ] Scrape ingredients from details tab when there is no ingredients tab
- [ ] Scrape details from details tab

In [45]:
drug_facts_headers = [
    # Ingredients
    'Active ingredients',
    'Inactive ingredients',
    
    # Warnings and restrictions
    'Children under 6 months',
    'Do not use',
    'Stop use and ask a doctor if',
    'Keep out of reach',
    'Warnings', 

    # Usage
    'Uses',
    'For use',
    'When using',
    'Purpose',
    'Directions',
    
    # Contact info
    'Question or comments?',
    
    # Other
    'Other information',
    
]

header_translate = {
    'Active ingredient' : 'Active ingredients',
    'doNotUse': 'Do not use',
    'forUse' : 'For use',
    'outOfReach' : 'Keep out of reach',
    'whenUsing': 'When using',
    'When using this product' : 'When using',
    'stopUse': 'Stop use and ask a doctor if',
    'Other Information': 'Other information',
    'Other warnings' : 'Warnings',
    'Children Under 6 Months' : 'Children under 6 months',
    'Children Under 6 Months of Age' : 'Children under 6 months',
    'Children under 6 months of age' : 'Children under 6 months',
    'For Use on Children Less Than 6 Months of Age' : 'Children under 6 months',
}

Test Parsing Strings

In [46]:
check_phrases = [
    #'other information protect this product from excessive heat and direct sun',
    #'protect this product from excessive heat and direct sun',
    ' this ',
    ' see ',
    ' for ',
    #'certified organic',
    #'+certified organic ingredient',
    #'+certified organic ingredients unless otherwise specified',
    #'†certified organic ingredient',
    #'active ingredients',
    # ?
    #'xanthan gum plant-derived scent constituent',    
]

In [90]:
fields = ('name', 'rank', 'percent', 'alt_names')
Ingredient = namedtuple('Ingredient', fields, defaults=(None,)*len(fields))

remove_phrase = re.compile(r'''
    (in)?active\ ingredients(:)?
    |(?<!^)[\+†\*]?certified\ organic.*
    |^certified\ organic\ to\ usda
    |(?<!^)[\+†\*]naturally\ derived.*
    |(?<!^)[\+†\*]natural\ fragrance.*
    |other\ information.*
    |[\+†\*]
    |[\.]+(?=[\s$])
    |,\s*$
    |^\s*,
''', re.VERBOSE)
# Extract active ingredients
# 'avobenzone (2.0%)' -> ['avobenzone', '2.0']
# 'zinc oxide (non-nano) 3.5%' -> ['zinc oxide', '3.5', '(non-nano)']
# 'zinc oxide 3.5% (non-nano)' -> ['zinc oxide', '3.5', '(non-nano)']
patt = re.compile(r'''
    (?<!\()(\w[\w\s]+)
    \s+
    (\([A-Za-z-\s]+\)\s+)?
    (\(?(\d+(\.\d+)?)\s*%\)?)?
    (\s+\([A-Za-z-\s]+\)\s*)?
''', re.VERBOSE) 
def parse_active_ingredients(text):
    ingredients = []    
    text = remove_phrase.sub("",text)
    
    if '%' not in text:
        for name in re.split('  ',text):
            name = name.strip()
            if name:
                ingredients.append(Ingredient(name=name.strip()))
        return ingredients
    
    percents = sorted([float(m.group(4)) for m in patt.finditer(text) if m.group(4)], reverse=True)
    for m in patt.finditer(text):
        a = m.group(2) if m.group(2) else m.group(6) if m.group(6) else None
        if a is not None:
            a = a.strip()[1:-1].strip()
        p = float(m.group(4)) if m.group(4) else None
        r = percents.index(p) if m.group(4) else None
        item = Ingredient(name=m.group(1).strip(), alt_names=a, percent=p, rank=r)
        if len(item.name) <= 2 or item.name in ['sunscreen']:
            print(f'DEBUG active : {repr(item.name)} from {text}')
        ingredients.append(item)
    return ingredients

# TEST
#s = 'certified organic to usda avobenzone (2.0%) (2 %), ensulizole (5%) octisalate (5 %), zinc oxide 7.25% (non nano), zinc oxide (non-nano) 7.25%.'
s = 'active ingredients: avobenzone  zinc oxide  octisalate  octocrylene'
print('Input:',repr(s),'\nOutput:')
out = parse_active_ingredients(s)
for item in out:
    print('-',item)
print()

def parse_inactive_ingredients(text):
    '''
    NOTES:
    - Inferring rank: Some lists are ordered by rank while others are semi-alphabetical making it difficult to verify rank ordering is used
    '''
    ingredients = []
    text = re.sub(',{2,}',',',text)
    text = re.sub('(?<=[a-z]),(?=[a-z])',', ', text)
    text = remove_phrase.sub("",text)
    if ',' not in text:
        # Usually means no ingredients listed and instead there is a description
        # e.g. 'protect this product from excessive heat'
        return []
    
    if any(x in text for x in check_phrases):
        print("DEBUG phrase :", text)
    for t in re.split(r',\s+', text):
        if not t: 
            continue
        # RegEx: tinctorius (safflower) oil (slimy) -> ['(safflower)', '(slimy)']
        m = re.findall(r'\([^)]+\)',t)
        if m:
            name = t
            for paren in m:
                name = name.replace(paren,'')
            alt_names = ', '.join([paren[1:-1].strip() for paren in m])
        else:
            name = t
            alt_names = None
        name = re.sub(r'\*|\.$'," ",name)
        name = re.sub(r'\s+', ' ', name).strip() # remove extra spaces, periods, and astericks
        if len(name) <= 2 or name.startswith(',') or re.search('[a-z][\.,][a-z]',name) or ':' in name:
            print(f'DEBUG inactive : {repr(name)} from {text}')
        item = Ingredient(name=name, alt_names=alt_names)
        ingredients.append(item)
    
    return ingredients

# TEST
s = 'glycerin,stearic acid, peg-100 stearate, water (aqua), carthamus (thing) tinctorius (safflower) seed (organic) oil*, flower extract. *certified organic ingredients.,'
#s = 'protect this product from excessive heat and sun.'
print('Input:',repr(s),'\nOutput:')
out = parse_inactive_ingredients(s)
for item in out:
    print('-',item)

Input: 'active ingredients: avobenzone  zinc oxide  octisalate  octocrylene' 
Output:
- Ingredient(name='avobenzone', rank=None, percent=None, alt_names=None)
- Ingredient(name='zinc oxide', rank=None, percent=None, alt_names=None)
- Ingredient(name='octisalate', rank=None, percent=None, alt_names=None)
- Ingredient(name='octocrylene', rank=None, percent=None, alt_names=None)

Input: 'glycerin,stearic acid, peg-100 stearate, water (aqua), carthamus (thing) tinctorius (safflower) seed (organic) oil*, flower extract. *certified organic ingredients.,' 
Output:
- Ingredient(name='glycerin', rank=None, percent=None, alt_names=None)
- Ingredient(name='stearic acid', rank=None, percent=None, alt_names=None)
- Ingredient(name='peg-100 stearate', rank=None, percent=None, alt_names=None)
- Ingredient(name='water', rank=None, percent=None, alt_names='aqua')
- Ingredient(name='carthamus tinctorius seed oil', rank=None, percent=None, alt_names='thing, safflower, organic')
- Ingredient(name='flower 

Test Parsing Page

In [165]:
#index = all_titles.index('Thinksport Mineral Kids Sunscreen Lotion - SPF 50 - 3oz')
index = 66 #random.Random(10).randint(0,len(product_pages))
test_product_page_source = product_pages[index]
print(f'{index})',all_titles[index])
print(all_urls[index])

66) Babyganics Sun & Bug Duo Set
https://www.target.com/p/babyganics-sun-bug-duo-set/-/A-51265294#lnk=sametab


In [157]:
soup = BeautifulSoup(test_product_page_source)
print(len(soup.prettify().split('\n')), 'lines of HTML')

4333 lines of HTML


In [158]:
item_details_soup = soup.find('div', attrs={'id': 'tabContent-tab-Details'})
drug_facts_soup = soup.find('div', attrs={'id': 'tabContent-tab-Drugfacts'})
print(bool(drug_facts_soup), bool(item_details_soup))

True True


In [159]:
#print(drug_facts_soup.prettify())

In [160]:
def extract_details_from_drug_facts(drug_facts_soup, raw=None):
    h4_headers = [h.string for h in drug_facts_soup.find_all('h4')]
    drug_facts = {}
    header = ''
    for idx, text in enumerate(drug_facts_soup.stripped_strings):
        # stripped_strings usually returns [header, text, header, text, ...]
        # Some headers have no text -> [header, header, text, header, text, ...]
        if text in h4_headers:
            # Track previous text as the header
            text = text.replace(':','')
            header = header_translate.get(text, text)
            if header not in drug_facts_headers:
                print('New Header:', repr(header))
            continue
            
        # Check if active and inactive stored in inactive
        active_in_inactive = False
        if header == 'Inactive ingredients' and re.search(r'\bactive ingredients', text):
            m = re.search('active ingredients:(.*). inactive ingredients:(.*)', text)
            if m:
                active_text = m.group(1)
                inactive_text = m.group(2)
                active_in_inactive = True
            else:
                print("DEBUG active in inactive :", text)

        
        if raw is not None: raw[header] = text
        if header == 'Active ingredients':
            drug_facts[header] = parse_active_ingredients(text)
        elif active_in_inactive:
            if raw is not None: raw['Active ingredients'] = active_text
            drug_facts['Active ingredients'] = parse_active_ingredients(active_text)
            drug_facts['Inactive ingredients'] = parse_inactive_ingredients(inactive_text)
        elif header == 'Inactive ingredients':
            drug_facts[header] = parse_inactive_ingredients(text)
        elif header in drug_facts_headers:
            # No useful info expected
            continue
        else:
            drug_facts[header] = text
    
    return drug_facts

raw = {}
for k,v in extract_details_from_drug_facts(drug_facts_soup, raw).items():
    print('>>>',k,'<<<')
    print('RAW :', repr(raw[k]))
    print()
    table = [[x for x in ing] for ing in v]
    print(tabulate(table, headers=Ingredient._fields, tablefmt='github'))
    #pprint(v)
    print('-'*80)

DEBUG active in inactive : natural insect repellent: active ingredients: 95.25% soybean oil (certified organic ingredient), 1.50% rosemary oil, 0.95% citronella oil, 0.75% geranium oil, 0.70% cedarwood oil, 0.60% peppermint oil and 0.25% lemongrass oil, mineral-based sunscreen spray: water, caprylic/capric triglyceride, glycerin, aloe barbadensis leaf juice (certified organic ingredient), vp/hexadecene copolymer, glyceryl stearate, hexaglyceryl polyricinoleate, polysorbate 80, phenethyl alcohol, glyceryl caprylate, sodium magnesium silicate, xanthan gum, hydroxyethyl acrylate/sodium acryloyldimethyltaurate copolymer, citric acid, squalane, solanum lycopersicum (tomato) seed oil, helianthus annuus (sunflower) seed oil (certified organic ingredient), lecithin (certified organic ingredient), polysorbate 60, vaccinium macrocarpon (cranberry) seed oil, rubus idaeus (red raspberry) seed oil, nigella sativa (black cumin) seed oil (certified organic ingredient), sorbitan isostearate.
>>> Activ

**Extract Product Data** from all product pages

In [96]:
def extract_product_details(page_source):
    soup = BeautifulSoup(page_source)
    drug_facts = soup.find('div', attrs={'id': 'tabContent-tab-Drugfacts'})
    if not drug_facts:
        prod_details = soup.find('div', attrs={'id': 'tabContent-tab-Details'})
        details = extract_details_from_prod_details(prod_details)
    else:
        details = extract_details_from_drug_facts(drug_facts)
    return details

def extract_details_from_prod_details(prod_details):
    return {'STUB' : 'STUB'}

In [171]:
indices = sorted(set([
    #23,64,84
    #*list(range(60,70)),
    #*list(range(0,len(all_urls))),
]))
print('Indices:', indices)

Indices: []


In [172]:
%%time
tmp_product_details = {}
load = LoadingBar(len(product_pages), mod=1).begin()
for i, page_source in enumerate(product_pages):
    load.update()
    if indices and i not in indices:
        continue
    product = all_titles[i]
    if product in PRODUCT_BLACKLIST:
        tmp_product_details[product] = {'Skip' : None}
        continue
    #print(i, end=', ')
    tmp_product_details[product] = extract_product_details(page_source)
    if not tmp_product_details[product]:
        print(f'No details for URL[{i}]:', all_urls[i])
print()

100% Done; ( 338/338)  0min 44s elapsed [~0min remaining]    

CPU times: user 41.6 s, sys: 1.33 s, total: 43 s
Wall time: 44.4 s


In [173]:
product_details = tmp_product_details

In [174]:
title = all_titles[index]
print(title)
pprint(product_details[title])

Babyganics Sun & Bug Duo Set
{'Skip': None}


In [175]:
old_product_details = pickle.load(open('product_details.pkl','rb'))

In [65]:
old_product_details == product_details

False

In [None]:
#pickle.dump(product_details, open('product_details.pkl','wb'))
product_details = old_product_details

## Cleaning Data

- [ ] Map ingredients of same type to same name 
    - [ ] Name varients (e.g. `water, water/eau, purified water` -> `water`)
    - [ ] Typos or bugs (e.g. `water vitamin e` -> `['water', 'vitamin e']`)
- [ ] Remove duplicate products
- [ ] Parse titles (brand, spf, fluid oz, etc.)
- [ ] Combine data into table

In [66]:
#all_products = [t for p in product_links for t in p]
#all_titles = [p.title for p in all_products]
#all_urls   = [p.url   for p in all_products]
#print(len(all_titles),len(all_urls), sep=', ')
#print(len(set(all_titles)),len(set(all_urls)), sep=', ')

In [194]:
n_active = [0]*len(product_details)
all_active_ingredients = []
n_inactive = [0]*len(product_details)
all_inactive_ingredients = []
for i, (prod, details) in enumerate(product_details.items()):
    if 'Active ingredients' in details:
        n_active[i] = len(details['Active ingredients'])
        for ingred in details['Active ingredients']:
            #if ingred.name in ['sunscreen']:
            #    print(f'DEBUG : {i}) {ingred}; {prod}')
            all_active_ingredients.append(ingred.name)
    if 'Inactive ingredients' in details:
        n_inactive[i] = len(details['Inactive ingredients'])
        for ingred in details['Inactive ingredients']:
            all_inactive_ingredients.append(ingred.name)

all_active_ctr = Counter(all_active_ingredients)
all_active_set = sorted(all_active_ctr)

all_inactive_ctr = Counter(all_inactive_ingredients)
all_inactive_set = sorted(all_inactive_ctr)

print(f'{sum(n_active)/len(n_active):.1f} avg active ingredients')
print(f'{len(all_active_ctr)} unique active ingredients')
print(f'{sum(n_inactive)/len(n_inactive):.1f} avg inactive ingredients')
print(f'{len(all_inactive_ctr)} unique inactive ingredients')

2.2 avg active ingredients
16 unique active ingredients
18.3 avg inactive ingredients
809 unique inactive ingredients


In [177]:
pprint(all_active_ctr.most_common())

[('zinc oxide', 150),
 ('octocrylene', 117),
 ('octisalate', 115),
 ('avobenzone', 114),
 ('homosalate', 108),
 ('titanium dioxide', 60),
 ('oxybenzone', 16),
 ('octinoxate', 10),
 ('octyl salicylate', 4),
 ('homosolate', 2),
 ('to deliver zinc oxide', 1),
 ('ecamsule', 1),
 ('uncoated zinc oxide', 1),
 ('octorcylene', 1),
 ('sunscreen', 1),
 ('ensulizole', 1)]


In [249]:
pprint(all_inactive_ctr.most_common()[-10:])

[('lonicera japonica extract', 1),
 ('populus tremuloides bark extract', 1),
 ('theobroma cacao butter)', 1),
 ('lavandula angustifolia flower/leaf/stem extract nasturtium', 1),
 ('officinale flower/leaf extract', 1),
 ('ceramide 3', 1),
 ('myristyl alcohol', 1),
 ('glycol stearate', 1),
 ('potassium stearate', 1),
 ('stearamide amp', 1)]


In [251]:
#x = sorted(all_inactive_set, key=len, reverse=False)
# x = sorted(all_inactive_set, reverse=True)
# for g in x:
#     if 'eau' in g:
#         print(f'- {g}')
print('- '+'\n- '.join(x[0:50]))

- zingiber officinale root extract
- zinc oxide
- zinc gluconate
- zemea propanediol
- yellow 6 lake {ci 15985}
- yellow 5 lake {ci 19140}
- yellow 5
- yellow 11
- yellow 10 lake {ci 47005}
- xanthan gum plant-derived scent constituent
- xanthan gum as the sodium salt
- xanthan gum
- water/aqua/eau
- water vitamin e
- water organic
- water c12-15 alkyl benzoate
- water / eau
- water
- vp/hexadecene copolymer
- vp/eicosene copolymer
- vp/acrylates/lauryl methacrylate copolymer
- vp hexadecene copolymer
- vitis vinifera seed oil
- vitis vinifera seed extract
- vitis vinifera fruit extract
- vitis vinifera
- vitamin e
- violet 2
- vinyl dimethicone/ methicone silsesquioxane crosspolymer
- vegan beeswax
- vanillin plant-derived scent constituent
- vanillin
- vanilla planifolia fruit extract
- vaccinium macrocarpon seed oil
- vaccinium angustifolium fruit extract
- va/butyl maleate/lsobornyl acrylate copolymer
- va/butyl maleate/isobornyl acrylate copolymer
- undecylenoyl glycine
- undecane

In [181]:
print('Duplicate products')
c = 0
for title, count in Counter(all_titles).items():
    if count > 1:
        c += 1
        urls = set([p.url for p in all_products if p.title == title])
        print(f'\t- ({count}) : {title}')
        if len(urls) > 1:
            print(f'\t Different URLs')
            print(f'\t  {urls} ')
print(f'{c} duplicate titles')

# print()
# c = 0
# for url, count in Counter(all_urls).items():
#     if count > 1:
#         c += 1
#         title = all_titles[all_urls.index(url)]
#         print(f'\t- ({count}) : {url}')
#         #print(f'\t  {title} ')
# print(f'{c} duplicate URLs')

Duplicate products
	- (2) : CeraVe Ultra-Light Face Moisturizer with Sunscreen - SPF 30 – 1.7oz
	- (2) : CeraVe AM Facial Moisturizing Lotion with Sunscreen - SPF 30
	- (2) : CeraVe Hydrating Mineral Face Sunscreen Lotion with Zinc Oxide – SPF 50 – 2.5oz
	- (2) : CeraVe Hydrating Mineral Face Sunscreen Lotion with Zinc Oxide – SPF 30 - 2.5oz
	- (2) : Pipette Mineral Sunscreen Broad Spectrum SPF 50 - 4 fl oz
	- (2) : Raw Elements Tinted Daily Face Mineral Sunscreen Aluminum Tube - SPF 30 - 1.8 fl oz
	- (2) : La Roche-Posay Anthelios Clear Skin Dry Touch Face Sunscreen for Acne Prone Skin - SPF 60 - 1.7oz
	 Different URLs
	  {'/p/la-roche-posay-anthelios-clear-skin-dry-touch-face-sunscreen-for-acne-prone-skin-spf-60-1-7oz/-/A-75571510#lnk=sametab', '/p/la-roche-posay-anthelios-clear-skin-dry-touch-face-sunscreen-for-acne-prone-skin-spf-60-1-7oz/-/A-49124120#lnk=sametab'} 
	- (2) : Vichy LiftActiv Peptide-C Anti-Aging Face Sunscreen Moisturizer with Vitamin C - SPF 30 - 1.69 fl oz
	- (2) 

In [187]:
tmp = [p.split(' - ') for p in all_titles if "SPF" not in p]
print(len(tmp))
tmp

27


[['Continuous Sport Sunscreen Spray', 'up & up™'],
 ['Coppertone Sport Sunscreen Spray'],
 ['Sun Bum Original Sunscreen Spray', '6 oz'],
 ['Sun Bum Original Sunscreen Lotion'],
 ['Sport Sunscreen Lotion', '10.4 fl oz', 'up & up™'],
 ['Banana Boat Ultra Sport Clear Sunscreen Spray'],
 ['Neutrogena Ultra Sheer Dry-Touch Sunscreen Lotion'],
 ['Coppertone Sport Sunscreen Lotion', '7 fl oz'],
 ['Neutrogena Clear Face Liquid Sunscreen Lotion', '3 fl oz'],
 ['Banana Boat Ultra Sport Sunscreen Lotion', '8 fl oz'],
 ['Neutrogena Beach Defense Sunscreen Spray', '6.5 oz'],
 ['Sun Bum Face Lotion', '3 fl oz'],
 ['Babyganics Sun & Bug Duo Set'],
 ['bliss Block Star Invisible Daily Sunscreen', '1.4 fl oz'],
 ['Sun Bum Mineral Spray Sunscreen', '6 oz'],
 ['Sun Bum Summer of Love Sunscreen Spray', '6oz'],
 ['Sun Bum Summer of Love Sunscreen Lotion', '6 fl oz'],
 ['Neutrogena Invisible Daily Defense Sunscreen Lotion', '3 fl oz'],
 ['Sun Bum Mineral Sunscreen Lotion', '3 fl oz'],
 ['Mineral Sunscreen Lo

In [245]:
len(known_brands)

50

In [188]:
common_prefixes = []
prefix = ''
count = 0
known_brands = [
    'up & up™', 
    'Goddess Garden', 
    'Alba Botanica', 
    'All Good', 
    'CeraVe', 
    'Hawaiian Tropic', 
    'Raw Elements', 
    'Babo Botanicals', 
    'Blue Lizard', 
    'Australian Gold', 
    'Babyganics', 
    'La Roche-Posay', 
    'Olay', 
    'Aveeno', 
    'Bare Republic', 
    'Sun Bum', 'Baby Bum',
    'Thinksport', 'Thinkbaby', 
    'Neutrogena', 
    'Coppertone', 
    'Banana Boat',
    'DERMA E',
    'Coola',
    'Pacifica',
    'Mustela',
    'Cetaphil Sheer Mineral',
    'Vichy',
    'Project Sunscreen',
    "C'est Moi",
    'Black Girl Sunscreen',
    'Babo',
    'Sukin Suncare',
    'No7',
    'Garnier',
    'COOLA',
    'Badger',
    'Pipette',
    'Everyday Humans',
    'Dove',
    'bliss',
    'Unsun Cosmetics',
    'SheaMoisture',
    "Uncle Bud's",
    "Two Peas Organics",
    'SLMD Skincare',
    'RoC',
    'Mele',
    'Differin',
    'Bravo Sierra',
]
for title in sorted(set(all_titles)):
    if any(b in title for b in known_brands):
        continue
    if not prefix:
        prefix = title
    tmp_prefix = os.path.commonprefix([prefix, title])
    if len(tmp_prefix) < 4: # and not tmp_prefix[-1] == ' ':
        common_prefixes.append(f'{count:0>2} : {prefix.strip()}')
        prefix = title
        count = 1
    else:
        count += 1
        prefix = tmp_prefix
print(len(common_prefixes))
sorted(common_prefixes, reverse=True)

1


['01 : MELE Dew The Most Sheer Facial Moisturizer with SPF 30 Sunscreen for Melanin Rich Skin - 1 fl oz']

In [189]:
tmp = [p for p in all_titles if p.startswith("Adult Sport Sunscreen Stick")]
print(len(tmp))
tmp

1


['Adult Sport Sunscreen Stick - SPF 55 - up & up™']

In [190]:
prod_by_brand = defaultdict(list)
for title in all_titles:
    flag = True
    for brand in known_brands:
        if brand in title:
            flag = False
            prod_by_brand[brand].append(title)
    if flag:
        prod_by_brand['No Brand'].append(title)

In [191]:
ordered_prod_by_brand = {}
buff = max([len(b) for b in known_brands])+1
for idx, (brand, titles) in enumerate(sorted(prod_by_brand.items(), key=lambda x : (len(x[1]),x[0])),1):
    ordered_prod_by_brand[brand] = titles
    print('-'*80)
    print(f'{idx:>2}) {brand:.<{buff}} ({len(titles)} products)')
    for title in titles:
        print(f'\t- {title}')

--------------------------------------------------------------------------------
 1) Bravo Sierra........... (1 products)
	- Bravo Sierra Face Sunscreen SPF 30 - 3 fl oz
--------------------------------------------------------------------------------
 2) Differin............... (1 products)
	- Differin Oil Absorbing Moisturizer with Sunscreen, Broad-Spectrum UVA/UVB SPF 30 - 4oz
--------------------------------------------------------------------------------
 3) Dove................... (1 products)
	- Dove Men+Care Hydrate + SPF 15 Sunscreen Face Lotion - 1.69oz
--------------------------------------------------------------------------------
 4) Everyday Humans........ (1 products)
	- Everyday Humans Resting Beach Face Sunscreen Serum - SPF 30
--------------------------------------------------------------------------------
 5) Mele................... (1 products)
	- Mele No Shade Sunscreen Oil Broad Spectrum for Melanin Rich Skin - SPF 30 - 1 fl oz
-------------------------------------

In [508]:
json.dump(ordered_prod_by_brand, open('products_by_brand.json','w'), indent=4)

## Structure (Normalize?) Data

In [198]:
# Product Table
# id, name, SPF, fl_oz
products = pd.DataFrame(data, )

# Brand Table
# product_id, brand
brands = pd.DataFrame()

# Ingredients Table
# id, name
ingredients = pd.DataFrame()

# Ingredient name alternatives
# ingredient_id, alt_name
ingredient_alt_names = pd.DataFrame()

# Product ingredients table
# product_id, ingredient_id, is_active, rank, perc
product_ingredients = pd.DataFrame()

## EDA

## Application Demo

In [12]:
from functools import reduce
from statistics import median
from random import random

0.8759791297420325

In [129]:
def get_ingredients(product_name):
    global product_details
    prod = product_details.get(product_name, None)
    if prod is not None:
        ing = set()
        ing.update({x.name for x in prod.get('Inactive ingredients', {})})
        ing.update({x.name for x in prod.get('Active ingredients', {})})
        return ing
    return {}

def find_products(ingredient, includes=True):
    global product_details
    products = []
    
    for prod in product_details:
        in_product = ingredient in get_ingredients(prod)
        if (includes and in_product) or (not includes and not in_product):
            products.append(prod)

    return products

def get_doc_freq(ingredient_sets):
    '''
    Args:
    ingredient_sets : list[set]
    
    Returns:
    dict[word] = freq
    '''
    freq = {}
    increment = 1/len(ingredient_sets)
    
    for product in ingredient_sets:
        for ingr in product:
            freq[ingr] = freq.get(ingr, 0) + increment
    
    return freq

class Scorer():
    def __init__(self, impute_val=None):
        self.impute_val = impute_val
    
    def fit(self, bad_products, good_products=None):
        if good_products is None:
            good_products = []
        bad_ingredients  = [get_ingredients(p) for p in bad_products]
        good_ingredients = [get_ingredients(p) for p in good_products]
        
        all_ingredients = reduce(lambda x,y : x|y, bad_ingredients + good_ingredients)        
        
        
        bad_df  = get_doc_freq(bad_ingredients)
        good_df = get_doc_freq(good_ingredients)
        
        
        scores = {}
        for ingr in all_ingredients:
            scores[ingr] = bad_df.get(ingr,0) - good_df.get(ingr,0)

        self.ingredient_scores_ = dict(sorted(scores.items(), key=lambda kv : kv[1], reverse=True))
        
        # Rank all unique ingredient scores
        self.ranked_scores_ = sorted(set(self.ingredient_scores_.values()), reverse=True)
        
        # Impute mean score to any unseen ingredients
        if self.impute_val is None:
            self.impute_val = sum(self.ingredient_scores_.values())/len(all_ingredients)
        #self.impute_val = median(self.scores_.values())
        
        return self
    
    def get_ingredient_score_rank(self, ingredient):
        rank = self.ranked_scores_.index(ingredient)
        return rank+1 if rank != -1 else -1
    
    def score_ingredients(self, ingredients):
        scores = []
        for ingr in ingredients: 
            scores.append(self.ingredient_scores_.get(ingr, self.impute_val))
        return scores
    
    def score_products(self, products):
        scores = []
        for prod in products:
            ingr = get_ingredients(prod)
            if not ingr:
                scores.append(self.impute_val)
            else:
                ingr_scores = self.score_ingredients(ingr)
                score = -sum(ingr_scores)/len(ingr_scores)
                scores.append(score)
        return scores
        
    def filtered_scores(self, min_score = 0, max_n_scores=10):
        scores_filt = {}
        prev_rank = 0
        for ingr, score in self.ingredient_scores_.items():
            rank = self.get_ingredient_score_rank(score)
            
            score_pass = score >= min_score 
            rank_pass  = (rank == 1) or (rank == prev_rank)
            len_pass   = len(scores_filt) <= max_n_scores
            if score_pass and (rank_pass or len_pass):
                scores_filt[ingr] = (rank, score)
                prev_rank = rank
                
        # Trim off last rank if too many entries
        if len(scores_filt) > max_n_scores:
            scores_filt = {k:v for k,v in scores_filt.items() if v[0] < prev_rank}
        
        # Remove rank
        return {k:v[1] for k,v in scores_filt.items()}
 
# def find_potential_allergens(bad_products, good_products=None):    
#     scorer = IngredientScorer().fit(bad_products, good_products)
#     all_ingredients = reduce(lambda x,y : x|y, bad_ingredients + good_ingredients)
#     scores = scorer.transform(all_ingredients)
#     #scores = score_ingredients(bad_ingredients, good_ingredients)
#     return scores

# def score_ingredients(bad_ingredients, good_ingredients):
#     bad_df = get_doc_freq(bad_ingredients)
#     good_df = get_doc_freq(good_ingredients)
#     all_ingredients = reduce(lambda x,y : x|y, bad_ingredients + good_ingredients)
    
#     scores = {ingr : bad_df[ingr] - good_df[ingr] for ingr in all_ingredients}
    
#     # Sort by score and group scores to assign rank
#     rank = sorted(set(scores.values()), reverse=True)
#     scores2 = {}
#     for ingr, score in sorted(scores.items(), key=lambda kv : -kv[1]):
#         scores2[ingr] = (rank.index(score)+1, score)
#     return scores2

In [130]:
bad_products  = [
    'Banana Boat Ultra Sport Sunscreen Lotion - 8 fl oz',
    'Coppertone Pure & Simple Sunscreen Lotion - SPF 50 - 6 fl oz',
    #'Sport Sunscreen Lotion - SPF 30 - 3oz - up & up™',
    #'Cetaphil Sheer Mineral Sunscreen - SPF 50 - 3 fl oz'
]
good_products = [
    'Neutrogena Sensitive Skin Sunscreen Broad Spectrum - SPF 60+ - 3 fl oz',
    'La Roche-Posay Anthelios Melt in Milk Sunscreen Lotion - SPF 100 - 3.0 fl oz',
]
consider_products = [
    'Sun Bum Original Sunscreen Lotion',
    'CeraVe Hydrating Sunscreen Body Lotion - SPF 50 - 5 fl oz',
]
scorer = Scorer().fit(bad_products, good_products)

In [123]:
ingr_scores = scorer.ingredient_scores_
ingr_scores_filt = scorer.filtered_scores(min_score=0, max_n_scores=20)
prod_scores = scorer.score_products(consider_products)

In [124]:
scorer.score_products([bad_products[1]])

[-0.38]

In [127]:
print(f'{len(ingr_scores)} total ingredients scored')
print(f'{len(ingr_scores_filt)} ingredients to show')
print('-'*80)
flag = True
for ingr, score in ingr_scores.items():
    if score > 0:
        if flag and ingr not in ingr_scores_filt:
            flag = False
            print('-'*30,'Filtered','-'*30)
        rank = scorer.get_ingredient_score_rank(score)
        print(f'Rank {rank}) {score:.2f} : {ingr}')
print('-'*80)
print(f'{len(prod_scores)} total products scored')
for prod, score in zip(consider_products, prod_scores):
    print(f'{score:.2f} : {prod}')

print('-'*80)
print(f'Top Scoring Products')
all_prods = list(product_details.keys())
excl_prods = bad_products + good_products + consider_products
rec_prods = [p for p in all_prods if p not in excl_prods]
rec_prod_scores = scorer.score_products(rec_prods)
rec_prods_sort = sorted(zip(rec_prods,rec_prod_scores), key=lambda x:x[1], reverse=True)
for prod, score in rec_prods_sort[:10]:
    print(f'{score:.2f} : {prod}')

90 total ingredients scored
1 ingredients to show
--------------------------------------------------------------------------------
Rank 1) 1.00 : propylene glycol
------------------------------ Filtered ------------------------------
Rank 2) 0.50 : polyester-27
Rank 2) 0.50 : p0023 caprylyl glycol
Rank 2) 0.50 : 1
Rank 2) 0.50 : cetyl peg/ppg-10/1 dimethicone
Rank 2) 0.50 : bis-octyldodecyl dimer dilinoleate/propanediol copolymer
Rank 2) 0.50 : fragrance
Rank 2) 0.50 : aloe barbadensis leaf juice
Rank 2) 0.50 : octisalate
Rank 2) 0.50 : avobenzone
Rank 2) 0.50 : hydroxyacetophenone
Rank 2) 0.50 : sodium chloride
Rank 2) 0.50 : sacred lotus extract
Rank 2) 0.50 : octocrylene
Rank 2) 0.50 : isopropyl palmitate
Rank 2) 0.50 : ethylhexyl methoxycrylene
Rank 2) 0.50 : vp/eicosene copolymer
Rank 2) 0.50 : peg-12 dimethicone crosspolymer
Rank 2) 0.50 : cetyl alcohol
Rank 2) 0.50 : c12-15 alkyl benzoate
Rank 2) 0.50 : 2-hexanediol
Rank 2) 0.50 : giant kelp extract
Rank 2) 0.50 : tea leaf extra

In [17]:
import json
json.dump(product_details, open('data/product_details.json', 'w'))

In [20]:
print('\n'.join(sorted(find_products('propylene glycol'))))

Banana Boat Sport Performance Sunscreen Lotion - SPF 30 - 3oz
Banana Boat Ultra Sport Sunscreen Lotion - 8 fl oz
Black Girl Sunscreen Broad Spectrum - SPF 30 - 3 fl oz
Black Girl Sunscreen Kids Broad Spectrum - SPF 50 - 3 fl oz
Cetaphil Sheer Mineral Sunscreen - SPF 50 - 3 fl oz
Coppertone Kids Pure and Simple Botanicals Sunscreen Lotion- SPF 50 - 6oz
Coppertone Kids Tear Free Mineral Sunscreen Lotion - SPF 50 - 8oz
Coppertone Pure & Simple Baby Mineral Sunscreen - SPF 50 - 6 fl oz
Coppertone Pure & Simple Baby Sunscreen Spray - SPF 50 - 5oz
Coppertone Pure & Simple Kid's Sunscreen Spray - SPF 50 - 5oz
Coppertone Pure & Simple Sunscreen Lotion - SPF 50 - 6 fl oz
Coppertone Pure & Simple Sunscreen Spray - SPF 50 - 5oz
Coppertone Pure and Simple Botanicals Faces Sunscreen Lotion- SPF 50 - 2oz
Coppertone Sport Mineral Sunscreen Face Lotion - SPF 50 - 2.5 fl oz
Coppertone Sport Mineral Sunscreen Spray - SPF 50 - 5oz
La Roche-Posay Anthelios AOX Daily Antioxidant Face Serum with Sunscreen –

In [23]:
print('\n'.join(sorted(find_products('propylene glycol', includes=False))[:]))

Adult Sport Sunscreen Stick - SPF 55 - up & up™
Alba Botanica Emollient Sunscreen Sport Lotion - SPF 45 - 4oz
Alba Botanica Fragrance Free Sunscreen Lotion - SPF 30 - 4oz
Alba Botanica Hawaiian Coconut Sunscreen Spray - SPF 50 - 6 fl oz
Alba Botanica Hawaiian Green Tea Sunscreen Lotion - SPF 45 - 4oz
Alba Botanica Sweet Pea Sheer Shield Sunscreen - SPF 45 - 3oz
Alba Botanica Very Emollient Active Kids Clear Sunscreen Spray - SPF 50 - 6oz
Alba Botanica Very Emollient Kids Sunscreen Lotion - SPF 45 - 4oz
All Good Coconut Sunstick - SPF 30 - 0.6oz
All Good Kids Sunscreen Butter Stick Water Resistant - SPF 50+ - 2oz
All Good Kids Sunscreen Lotion Water Resistant - SPF 30 - 3oz
All Good Kids Sunscreen Spray Water Resistant - SPF 30 - 6oz
All Good Sport Sunscreen Butter Stick Water Resistant - SPF 50+ - 2oz
All Good Sport Sunscreen Lotion Water Resistant - SPF 30 - 3oz
All Good Sport Sunscreen Spray Water Resistant - SPF30 - 6oz
All Good Tinted Sunscreen Lotion - SPF 30 - 3 fl oz
All Good Zi

In [242]:
product_details['La Roche-Posay Anthelios Cooling Water-Lotion Face and Body Sunscreen SPF 60 - 5.0 fl oz']

{'Active ingredients': [Ingredient(name='avobenzone', rank=None, percent=None, alt_names=None),
  Ingredient(name='homosalate', rank=None, percent=None, alt_names=None),
  Ingredient(name='octisalate', rank=None, percent=None, alt_names=None),
  Ingredient(name='octocrylene', rank=None, percent=None, alt_names=None),
  Ingredient(name='oxybenzone', rank=None, percent=None, alt_names=None)],
 'Inactive ingredients': [Ingredient(name='water', rank=None, percent=None, alt_names=None),
  Ingredient(name='dimethicone', rank=None, percent=None, alt_names=None),
  Ingredient(name='alcohol denat', rank=None, percent=None, alt_names=None),
  Ingredient(name='styrene/acrylates copolymer', rank=None, percent=None, alt_names=None),
  Ingredient(name='acrylates/dimethicone copolymer', rank=None, percent=None, alt_names=None),
  Ingredient(name='phenoxyethanol', rank=None, percent=None, alt_names=None),
  Ingredient(name='propylene glycol', rank=None, percent=None, alt_names=None),
  Ingredient(name

In [243]:
product_details['Coppertone Pure & Simple Sunscreen Lotion - SPF 50 - 6 fl oz']

{'Active ingredients': [Ingredient(name='zinc oxide', rank=0, percent=24.08, alt_names=None)],
 'Inactive ingredients': [Ingredient(name='water', rank=None, percent=None, alt_names=None),
  Ingredient(name='c12-15 alkyl benzoate', rank=None, percent=None, alt_names=None),
  Ingredient(name='isopropyl palmitate', rank=None, percent=None, alt_names=None),
  Ingredient(name='butyloctyl salicylate', rank=None, percent=None, alt_names=None),
  Ingredient(name='ethylhexyl isononanoate', rank=None, percent=None, alt_names=None),
  Ingredient(name='cetyl peg/ppg-10/1 dimethicone', rank=None, percent=None, alt_names=None),
  Ingredient(name='propylene glycol', rank=None, percent=None, alt_names=None),
  Ingredient(name='cyclopentasiloxane', rank=None, percent=None, alt_names=None),
  Ingredient(name='bis-octyldodecyl dimer dilinoleate/propanediol copolymer', rank=None, percent=None, alt_names=None),
  Ingredient(name='dimethicone', rank=None, percent=None, alt_names=None),
  Ingredient(name='et