In [1]:
# importing all the required libraries

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.support.ui import Select

import xlsxwriter
import time

In [2]:
# Configure the Chrome session used for scraping.

chromeOptions = Options()
chromeOptions.add_argument('--kiosk')  # open the browser window in kiosk (full-screen) mode
# Chrome only treats command-line arguments that start with `--` as switches.
# Without the prefix this string is passed as a plain argument and images are
# still downloaded, which slows down every page load.
chromeOptions.add_argument('--blink-settings=imagesEnabled=false')
chromeOptions.page_load_strategy = 'normal'  # driver.get() waits for the full page load

In [3]:
# Start Chrome through the chromedriver binary kept next to this notebook's folder,
# applying the options configured above.

chromedriver_path = '../chromedriver/chromedriver'
driver = webdriver.Chrome(executable_path=chromedriver_path, options=chromeOptions)

In [4]:
# Load the BetaList home page in the browser session.
betalist_home_url = 'https://betalist.com/'
driver.get(betalist_home_url)

# Extract Betalist listings
### Steps:
1. Scroll down the page
2. Scrape the dates (for reference: to see which timeframe was scraped)
3. Get all the cards
4. Loop over the cards and scrape the necessary data from each one, adding it to a list:
    - Product Name
    - Description
    - Votes
    - href (used later to fetch the tags and the website URL, since they are not shown on the home page)  
5. Loop over the list, open each href, and scrape the listing page to get the tags and the website URL.
6. Save the list with the complete information to an Excel file.

# Step 1: Scroll down

In [7]:
# Scroll to the bottom of the page ten times so that more listings get loaded.
# The page height is printed before each scroll to show whether the page is still growing.
scroll_to_bottom_js = "window.scrollTo(0, document.body.scrollHeight);"
for scroll_round in range(10):
    page_height = driver.execute_script("return document.body.scrollHeight")
    print(scroll_round, page_height)
    driver.execute_script(scroll_to_bottom_js)
    # Fixed pause to give the browser some time to load the next batch.
    time.sleep(1)

0 197998
1 200686
2 202599
3 204469
4 206747
5 208662


KeyboardInterrupt: 

In [8]:
# Report the URL the browser is on after scrolling.
current_page_url = driver.current_url
print(current_page_url)

https://betalist.com/?page=100


# Step 2: Get all the dates

In [9]:
# Collect every date header on the page, for reference: they show which time
# frame the scrolled page covers.
# `find_elements(By.CLASS_NAME, ...)` is used instead of `find_elements_by_class_name`,
# which was removed in Selenium 4.3.
get_all_date = driver.find_elements(By.CLASS_NAME, 'startupDeckHeader')
print(len(get_all_date))
for date in get_all_date:
    print(date.text)

304
🙌 Trending Startups
Today April 21st
Yesterday April 20th
Sunday April 19th
Saturday April 18th
Friday April 17th
Thursday April 16th
Wednesday April 15th
Tuesday April 14th
Monday April 13th
Sunday April 12th
Saturday April 11th
Friday April 10th
Thursday April 9th
Wednesday April 8th
Tuesday April 7th
Monday April 6th
Sunday April 5th
Saturday April 4th
Friday April 3rd
Thursday April 2nd
Wednesday April 1st
Tuesday March 31st
Monday March 30th
Sunday March 29th
Saturday March 28th
Friday March 27th
Thursday March 26th
Wednesday March 25th
Tuesday March 24th
Monday March 23rd
Sunday March 22nd
Saturday March 21st
Friday March 20th
Thursday March 19th
Wednesday March 18th
Tuesday March 17th
Monday March 16th
Sunday March 15th
Saturday March 14th
Friday March 13th
Thursday March 12th
Wednesday March 11th
Tuesday March 10th
Monday March 9th
Sunday March 8th
Saturday March 7th
Friday March 6th
Thursday March 5th
Wednesday March 4th
Tuesday March 3rd
Monday March 2nd
Sunday March 1st


# Step 3: Find all cards

In [10]:
# Grab every startup card currently rendered on the page.
# `find_elements(By.CLASS_NAME, ...)` is used instead of `find_elements_by_class_name`,
# which was removed in Selenium 4.3.
cards = driver.find_elements(By.CLASS_NAME, 'startupCard')
print(len(cards))
listings = []  # filled with one dict per card in the next step

1351


# Step 4: Loop cards and extract data

In [11]:
# Build one record per card: product name, pitch, vote count and the link to the
# BetaList detail page. 'tag' and 'url' are placeholders filled in by a later step.
# Cards that lack one of the expected elements are reported and skipped.
listings = []  # start from an empty list so re-running this cell does not duplicate rows
for i, card in enumerate(cards):
    try:
        name_element = card.find_element(By.CLASS_NAME, 'startupCard__details__name')
        title = name_element.text
        description = card.find_element(By.CLASS_NAME, 'startupCard__details__pitch').text
        # The score is nested inside the upvote button.
        upvote_button = card.find_element(By.CLASS_NAME, 'cuteButton--upvote')
        vote = upvote_button.find_element(By.CLASS_NAME, 'cuteButton__score').text
        # The first anchor inside the card supplies the href stored for this listing.
        href = card.find_element(By.TAG_NAME, 'a').get_attribute('href')
    except Exception as exc:
        # `Exception` (not a bare `except:`) so Ctrl-C / kernel interrupt still stops the loop.
        print('error', i, type(exc).__name__)
        continue
    listings.append({'product': title, 'description': description, 'tag': [], 'vote': vote, 'url': None, 'href': href})

error 3


In [12]:
# Sanity check: how many records were collected, followed by every record.
listing_count = len(listings)
print(listing_count)
for record in listings:
    print(record)

1350
{'product': 'Scrapbook', 'description': '250+ growth tactics for every step of the funnel', 'tag': [], 'vote': '18', 'url': None, 'href': 'https://betalist.com/startups/scrapbook'}
{'product': 'erxes', 'description': 'An open source growth marketing platform', 'tag': [], 'vote': '13', 'url': None, 'href': 'https://betalist.com/startups/erxes'}
{'product': 'Zen Mind Map', 'description': 'The simplest mind mapping tool', 'tag': [], 'vote': '5', 'url': None, 'href': 'https://betalist.com/startups/zen-mind-map'}
{'product': 'Heurio', 'description': 'The easiest way of digital product review on live websites', 'tag': [], 'vote': '1', 'url': None, 'href': 'https://betalist.com/startups/heurio'}
{'product': 'WooBloo SMASH', 'description': 'World’s Most Advanced Smart Projector', 'tag': [], 'vote': '', 'url': None, 'href': 'https://betalist.com/startups/woobloo-smash'}
{'product': 'Saylient', 'description': 'Transcribe, annotate, and share your recordings', 'tag': [], 'vote': '7', 'url': 

# Step 5: Getting website URL and tags

In [9]:
# Visit each listing's BetaList page to collect its tags, then follow the
# 'button2--contrast' link and record the URL the browser ends up on.
# A failure on one listing is reported and skipped, so a single bad page
# does not abort the whole crawl.
for listing in listings:
    listing['tag'] = []  # reset so re-running this cell does not append duplicate tags
    try:
        driver.get(listing['href'])
        for tag in driver.find_elements(By.CLASS_NAME, 'tag'):
            tag_text = tag.text
            # Compare the element's text, not the element itself, so blank tags are skipped.
            if tag_text != '':
                listing['tag'].append(tag_text.capitalize())
        site_link = driver.find_element(By.CLASS_NAME, 'button2--contrast').get_attribute('href')
        driver.get(site_link)
        # current_url is read after the page has loaded, i.e. after any redirect.
        listing['url'] = driver.current_url
    except Exception as exc:
        # 'url' keeps its previous value (None by default) when the site could not be reached.
        print('error', listing['href'], type(exc).__name__)

KeyboardInterrupt: 

In [15]:
# Convert the scraped vote counts from text to integers.
# An empty string (no score shown on the card) is stored as 0.
for listing in listings:
    raw_vote = listing['vote']
    listing['vote'] = 0 if raw_vote == '' else int(raw_vote)
    print(listing)

{'product': 'Scrapbook', 'description': '250+ growth tactics for every step of the funnel', 'tag': [], 'vote': 18, 'url': None, 'href': 'https://betalist.com/startups/scrapbook'}
{'product': 'erxes', 'description': 'An open source growth marketing platform', 'tag': [], 'vote': 13, 'url': None, 'href': 'https://betalist.com/startups/erxes'}
{'product': 'Zen Mind Map', 'description': 'The simplest mind mapping tool', 'tag': [], 'vote': 5, 'url': None, 'href': 'https://betalist.com/startups/zen-mind-map'}
{'product': 'Heurio', 'description': 'The easiest way of digital product review on live websites', 'tag': [], 'vote': 1, 'url': None, 'href': 'https://betalist.com/startups/heurio'}
{'product': 'WooBloo SMASH', 'description': 'World’s Most Advanced Smart Projector', 'tag': [], 'vote': 0, 'url': None, 'href': 'https://betalist.com/startups/woobloo-smash'}
{'product': 'Saylient', 'description': 'Transcribe, annotate, and share your recordings', 'tag': [], 'vote': 7, 'url': None, 'href': 'h

In [11]:
# quit() ends the whole WebDriver session: it closes every window and stops the
# chromedriver process. close() would only close the current window.
driver.quit()

# Step 6: Put the data into an Excel file

In [16]:
# Export the listings to betalist.xlsx: one header row, then one row per listing.
# The `with` block guarantees the workbook is closed even if a write raises.
with xlsxwriter.Workbook('betalist.xlsx') as workbook:
    worksheet = workbook.add_worksheet()
    worksheet.set_column('A:D', 25)
    worksheet.set_column('E:E', 70)  # column E holds the comma-joined tags, so it is wider

    worksheet.write_row(0, 0, ['Product Name', 'Description', 'URL', 'Betalist URL', 'Tag', 'Votes'])

    # Data starts on row 1, directly below the header row.
    for row, listing in enumerate(listings, start=1):
        worksheet.write_row(row, 0, [
            listing['product'],
            listing['description'],
            listing['url'],
            listing['href'],
            ','.join(listing['tag']),
            listing['vote'],
        ])