In [1]:
# importing all the required libraries

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.support.ui import Select

import xlsxwriter
import time

In [2]:
# Chrome launch configuration used when the browser is started.

chromeOptions = Options()
# '--kiosk' requests kiosk mode; the blink setting switches image loading off.
for chrome_flag in ('--kiosk', 'blink-settings=imagesEnabled=false'):
    chromeOptions.add_argument(chrome_flag)
chromeOptions.page_load_strategy = 'normal'

In [3]:
# Start Chrome through the chromedriver binary located relative to the working directory.

driver = webdriver.Chrome(
    executable_path='../chromedriver/chromedriver',
    options=chromeOptions,
)

In [4]:
# Load the Product Hunt home page in the browser controlled by `driver`.
driver.get('https://www.producthunt.com')

In [23]:
# Close the current browser window.
# NOTE(review): the execution count (In [23]) suggests this cell was run after the
# scraping cells; in a top-to-bottom run it would close the window before the later
# cells use `driver` -- run it only once scraping is finished.
driver.close()

# Extract Product Hunt listings
### Steps:
1. Scroll down the page
2. Click all the "show more" buttons (to be able to scrape all the data)
3. Scrape the dates (for reference: to see which timeframe was scraped)
4. Get all the cards
5. Loop over the cards and scrape all the necessary data (then add it to a list):
    - Product Name
    - Description
    - Tags
    - Votes
    - href (to get the URL since it doesn't exist in the home page)  
6. Loop over the list, go to each href, then scrape that page to get the URL.
7. Save the lists with complete information to an excel file.

# Step 1: Scroll down

In [None]:
# Jump to the bottom of the page a fixed number of times, logging the page height
# measured just before each jump.
for scroll_pass in range(300):
    page_height = driver.execute_script("return document.body.scrollHeight")
    print(scroll_pass, page_height)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1)  # pause so the browser has time to load before the next pass

0 3528
1 3921
2 3921
3 6137
4 7465
5 8793
6 10121


In [30]:
# Show the address of the page the browser is currently on.
print(driver.current_url)

https://www.producthunt.com/


# Step 2: Click all show more buttons

In [None]:
# Click every "show more" button found on the page (located by its CSS class name).
# find_elements(By.CLASS_NAME, ...) is used because the find_elements_by_* helpers
# were removed in Selenium 4.3; this form works on older releases too.
show_more = driver.find_elements(By.CLASS_NAME, 'showHiddenPosts_fbc1f')
print(len(show_more))
for show in show_more:
    show.click()

# Step 3: Take all dates and print them

In [None]:
# Print the text of every date header inside <main>, as a record of which
# time span the scrape covered.
# Uses the By-based locator API; find_element(s)_by_* were removed in Selenium 4.3.
get_all_date = driver.find_element(By.TAG_NAME, 'main').find_elements(By.CLASS_NAME, 'header_fb2c3')
print(len(get_all_date))
for date in get_all_date:
    print(date.text)

# Step 4: Find all cards

In [None]:
# Collect every product card on the page by its CSS class name.
# Uses the By-based locator API; find_elements_by_* was removed in Selenium 4.3.
cards = driver.find_elements(By.CLASS_NAME, 'item_54fdd')
print(len(cards))
products = []  # one dict per successfully parsed card

# Step 5: Loop cards and extract data

In [None]:
# Parse each card into a dict and collect the dicts in `products`.
# The list is reset first so re-running this cell does not append duplicates.
products = []
for i, card in enumerate(cards):
    try:
        title = card.find_element(By.TAG_NAME, 'h3').text  # title from the card's <h3>
        description = card.find_element(By.TAG_NAME, 'p').text  # description from the card's <p>
        vote = (
            card.find_element(By.CLASS_NAME, 'voteButtonWrap_4c515')
            .find_element(By.TAG_NAME, 'span')
            .text
        )  # vote count text from the <span> inside the vote button wrapper
        href = card.find_element(By.TAG_NAME, 'a').get_attribute('href')  # first anchor's link
    except Exception as exc:
        # Skip cards that cannot be parsed, but say why; `Exception` (not a bare
        # except) keeps Ctrl-C / kernel interrupt working during the loop.
        print('error', i, type(exc).__name__)
        continue
    # 'tag' and 'url' are placeholders here; they are filled in from the product page.
    products.append({'product': title, 'description': description, 'tag': [], 'vote': vote, 'url': None, 'href': href})

In [None]:
# Number of cards that were parsed into `products`.
print(len(products))
# Uncomment to dump every parsed product:
# for i in products:
#     print(i)

# Step 6: Go to each website and get the URL and tags

In [None]:
# Visit each product's Product Hunt page to collect its tags, then follow the
# outbound link and record the address the browser ends up on.
for product in products:
    try:
        driver.get(product['href'])
        try:
            # Presumably expands a collapsed tag list -- TODO confirm against the page markup.
            driver.find_element(By.CLASS_NAME, 'more_2e78b').click()
        except Exception:
            print('No clickable tag')
        # Rebuild the tag list from scratch so re-running the cell does not duplicate tags.
        product['tag'] = [
            tag.text.capitalize()
            for tag in driver.find_elements(By.CLASS_NAME, 'button_53e93')
        ]
        driver.get(driver.find_element(By.CLASS_NAME, 'link_9bebc').get_attribute('href'))
        product['url'] = driver.current_url
    except Exception as exc:
        # Best effort: report the failure and keep whatever address the browser is on.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
        print('error', type(exc).__name__)
        product['url'] = driver.current_url
    print(product)

In [None]:
# List the products for which no URL has been recorded.
for product in products:
    if product['url'] is None:  # identity check is the idiomatic test for None
        print(product)

In [None]:
# Number of entries currently in `products`.
print(len(products))

In [None]:
# Trial run of the product-page scrape on the first five products only.
for product in products[:5]:
    try:
        driver.get(product['href'])
        try:
            driver.find_element(By.CLASS_NAME, 'more_2e78b').click()
        except Exception:
            print('No clickable tag')
        # Replace (rather than extend) the tag list so repeated runs do not duplicate tags.
        product['tag'] = [
            tag.text.capitalize()
            for tag in driver.find_elements(By.CLASS_NAME, 'button_53e93')
        ]
        driver.get(driver.find_element(By.CLASS_NAME, 'link_9bebc').get_attribute('href'))
    except Exception as exc:
        # The original `try` had no handler, which is a SyntaxError; report and move on.
        print('error', type(exc).__name__)
    print(product)

# Step 7: Put data into excel file

In [None]:
# Export the scraped products to product_hunt.xlsx: a header row, then one row per product.
workbook = xlsxwriter.Workbook('product_hunt.xlsx')
worksheet = workbook.add_worksheet()
worksheet.set_column('A:E', 25)

# Header row, written left to right starting at column 0.
header_titles = ('Product Name', 'Description', 'URL', 'ProductHunt URL', 'Tag', 'Votes')
for col, header_title in enumerate(header_titles):
    worksheet.write(0, col, header_title)

row = 1
for product in products:
    # Cell values in the same column order as the header row; tags are comma-joined.
    cell_values = (
        product['product'],
        product['description'],
        product['url'],
        product['href'],
        ','.join(product['tag']),
        product['vote'],
    )
    for col, cell_value in enumerate(cell_values):
        worksheet.write(row, col, cell_value)
    row += 1

workbook.close()