-
Notifications
You must be signed in to change notification settings - Fork 21
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
7afebb4
commit 491aa0d
Showing
1 changed file
with
157 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,157 @@ | ||
#!/usr/bin/python | ||
|
||
from selenium import webdriver | ||
from functools import reduce | ||
|
||
# Substrings that mark a list as social-media / account chrome rather than a
# product attribute list
_EXCLUDED_WORDS = ('instagram', 'youtube', 'twitter', 'facebook', 'login',
                   'signup', 'share')


def check_if_excluded_words(texts, excluded_words=_EXCLUDED_WORDS):
    """Return True if any string in ``texts`` contains an excluded word.

    Matching is a plain, case-sensitive substring test, so callers are
    expected to lowercase ``texts`` before calling.

    Args:
        texts: iterable of strings (any iterable, consumed at most once).
        excluded_words: substrings to look for; defaults to the social media,
            login, signup and share markers.

    Returns:
        True as soon as one text contains one of the words, otherwise False
        (including when ``texts`` is empty).
    """
    return any(word in text for text in texts for word in excluded_words)
|
||
def _is_border_style_set(style):
    """Return True if ``style`` names a border style other than 'none'."""
    # An empty or missing value means no style was reported at all, which
    # must not be mistaken for a visible border.
    return bool(style) and style.strip().lower() != 'none'


# Checks if the <li> elements have a border
# Since some websites use pseudo-selectors to create borders, we perform that
# check too
def check_if_border(driver, li_elements):
    """Return True if any <li>, or any descendant of one, has a left border.

    For every <li> and each of its descendants the computed
    ``border-left-style`` is inspected on the element itself and then on its
    ``:before`` and ``:after`` pseudo-elements (read through JavaScript).

    Args:
        driver: object exposing ``execute_script(script, element)``.
        li_elements: list elements exposing ``find_elements(by, value)`` and
            ``value_of_css_property(name)``.

    Returns:
        True on the first border style that is set and not 'none', else False.
    """
    pseudo_scripts = tuple(
        "return window.getComputedStyle(arguments[0],'%s')"
        ".getPropertyValue('border-left-style')" % pseudo
        for pseudo in (':before', ':after'))

    for li in li_elements:
        # 'css selector' is the locator strategy string; find_elements(by,
        # value) is used because the find_elements_by_* helpers are not
        # available in current Selenium releases.
        children = li.find_elements('css selector', '*')

        for child in children + [li]:
            if _is_border_style_set(
                    child.value_of_css_property('border-left-style')):
                return True

            for script in pseudo_scripts:
                if _is_border_style_set(driver.execute_script(script, child)):
                    return True

    return False
|
||
def _uniform_css_value(elements, css_property):
    """Return the CSS value shared by every element, or None if they differ."""
    values = {el.value_of_css_property(css_property) for el in elements}
    return values.pop() if len(values) == 1 else None


# Checks whether the <li> elements is a product attribute
def is_list_product_attribute(driver, li_elements):
    """Heuristically decide whether a list's <li> items are product attributes.

    The checks, in order:
      1. Link/button content must not point at social media, login, signup
         or share targets.
      2. All items must share one computed height, given in px, greater than
         10 and at most 80.
      3. All items must be laid out horizontally (``float: left`` or
         ``display: inline-block``).
      4. At least one item (or a descendant) must have a border.

    Args:
        driver: WebDriver used by ``check_if_border`` for pseudo-elements.
        li_elements: sequence of <li> WebElements from a single list.

    Returns:
        True if every check passes, otherwise False. An empty ``li_elements``
        yields False.
    """
    if not li_elements:
        return False

    # Extract the <a> and <button> elements from within each <li>. Real lists
    # (not lazy map objects) are required because each is traversed twice.
    a_links = [li.find_elements('tag name', 'a') for li in li_elements]
    buttons = [li.find_elements('tag name', 'button') for li in li_elements]

    # NOTE(review): the original comment says more than one <a> or <button> in
    # any <li> should reject the list, but the control flow only inspects
    # buttons when some <li> has several links. That behaviour is preserved
    # here; confirm which one is intended.
    if all(len(links) <= 1 for links in a_links):
        hrefs = []
        for links in a_links:
            href = links[0].get_attribute('href') if links else None
            hrefs.append(href.lower() if href else '')
        if check_if_excluded_words(hrefs):
            return False
    elif all(len(found) <= 1 for found in buttons):
        texts = [found[0].text.lower() if found else '' for found in buttons]
        if check_if_excluded_words(texts):
            return False
    else:
        return False

    # Do all the elements have the same height? If not, probably False
    height = _uniform_css_value(li_elements, 'height')
    if height is None:
        return False

    # If the height is set automatically, is not a px length, is greater than
    # 80, or is 10 or less, then probably False
    if height == 'auto' or not height.endswith('px'):
        return False
    try:
        height_px = float(height[:-2])
    except ValueError:
        return False
    if height_px > 80 or height_px <= 10:
        return False

    # Are all the elements floating left or inline-block (in-line and
    # horizontal)? If not, probably False
    is_horizontal = (
        _uniform_css_value(li_elements, 'float') == 'left' or
        _uniform_css_value(li_elements, 'display') == 'inline-block')
    if not is_horizontal:
        return False

    # The elements need to have a border. This is checked last because it
    # issues several browser calls per descendant element.
    return check_if_border(driver, li_elements)
|
||
|
||
def parse_through_html_lists(driver, ul_ol_elements):
    """Return the list elements that look like product attribute lists.

    A candidate is kept only if it is displayed, contains no nested <ul> or
    <ol>, has at least one <li>, and ``is_list_product_attribute`` accepts
    its <li> elements.

    Args:
        driver: WebDriver passed through to ``is_list_product_attribute``.
        ul_ol_elements: iterable of <ul>/<ol> WebElements to inspect.

    Returns:
        List of the accepted elements, in input order.
    """
    product_attributes = []

    for element in ul_ol_elements:
        # Only look for visible list elements
        if not element.is_displayed():
            continue

        # Ignore those <ul> and <ol> elements that have children <ul> and
        # <ol> elements. find_elements(by, value) is used because the
        # find_elements_by_* helpers are not available in current Selenium
        # releases.
        if element.find_elements('css selector', 'ul, ol'):
            continue

        # Extract all <li> elements
        li_elements = element.find_elements('tag name', 'li')

        if li_elements and is_list_product_attribute(driver, li_elements):
            product_attributes.append(element)

    return product_attributes
|
||
|
||
def get_product_attribute_elements(url):
    """Open ``url`` in Firefox and return its product attribute lists.

    Args:
        url: address of the product page to load.

    Returns:
        The <ul>/<ol> elements accepted by ``parse_through_html_lists``.
        NOTE(review): the browser session is shut down before returning, so
        the returned WebElements presumably cannot be queried any further by
        the caller - confirm whether callers need more than the count.
    """
    driver = webdriver.Firefox()

    try:
        driver.get(url)

        # Start by looking for ol, ul lists (all <ul> first, then all <ol>)
        ul_elements = driver.find_elements('tag name', 'ul')
        ol_elements = driver.find_elements('tag name', 'ol')

        return parse_through_html_lists(driver, ul_elements + ol_elements)
    finally:
        # Always end the session, even if loading or parsing raised; quit()
        # shuts the driver down rather than only closing the current window.
        driver.quit()
|
||
|
||
if __name__ == '__main__':
    # Manual smoke test: run the detector against one live product page.
    test_url = 'https://www.tedbaker.com/us/Mens/Clothing/Shorts/PROSHOR-Cotton-chino-shorts-Dark-Blue/p/142793-DK-BLUE'
    get_product_attribute_elements(test_url)

    # Other product pages used for manual testing (pass one of these to
    # get_product_attribute_elements instead of test_url to try it):
    #   https://usa.tommy.com/en/men/men-shirts/lewis-hamilton-logo-shirt-mw08299
    #   https://www.calvinklein.us/en/mens-clothing/mens-featured-shops-calvin-klein-jeans/slim-fit-archive-western-shirt-22705235
    #   https://www.amazon.com/Linksys-Tri-Band-Intelligent-bedrooms-Multi-Story/dp/B01N2NLNEH?ref_=Oct_DLandingS_PC_NA_NA&smid=ATVPDKIKX0DER
    #   https://shop4reebok.com/#!product/CN8042_temposlipon
    #   https://us.boohoo.com/high-shine-v-hem-bandeau/DZZ09839.html
    #   https://www.prettylittlething.com/mustard-rib-button-detail-midi-skirt.html
    #   https://www.jcpenney.com/p/the-foundry-big-tall-supply-co-quick-dry-short-sleeve-knit-polo-shirt-big-and-tall/ppr5007145724?pTmplType=regular&catId=cat100240025&deptId=dept20000014&urlState=/g/mens-shirts/N-bwo3yD1nohp5&productGridView=medium&selectedSKUId=58130901099&badge=fewleft
    #   https://www.forever21.com/us/shop/Catalog/Product/F21/outerwear_coats-and-jackets/2000288425
    #   target.com/p/boys-short-sleeve-t-shirt-cat-jack-153/-/A-53411710?preselect=53364641#lnk=sametab
    #   http://www2.hm.com/en_us/productpage.0476583002.html