In [3]:
import random
import time
import csv

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
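# The data table is not present in the page's static HTML (it is presumably filled in by
# JavaScript from JSON data), so Selenium loads each page in a real browser and the rows
# are read from the rendered table.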

def scrape_sharktanklab_selenium(num_pages=43, output_csv="XXXXX/sharktanklab_data.csv", page_load_wait=3):
    """Scrape the paginated sharktanklab.com database table into a CSV file.

    Opens pages 1..num_pages of https://sharktanklab.com/database/ in a
    headless Chrome session, collects the text of every ``<td>`` cell of each
    table row, and writes all collected rows (preceded by a fixed header row)
    to ``output_csv``.

    Args:
        num_pages: Number of result pages to visit, starting at page 1.
        output_csv: Path of the CSV file to write. Its parent directory is
            created if it does not exist yet.
        page_load_wait: Seconds to sleep after requesting each page so the
            table has time to render before it is read.

    Returns:
        None. The result is the CSV file plus a printed row count.

    Raises:
        Exception: Errors raised while starting Chrome or navigating to a page
            propagate to the caller (the browser is still shut down). Errors
            raised while reading a page's table are printed and that page is
            skipped.
    """
    import os  # local import: keeps this cell independent of the imports cell

    # Make sure the destination exists *before* the slow scrape, so a bad path
    # cannot throw away minutes of scraped data at the very end.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Rotating User-Agent settings (one is picked at random per call)
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    ]
    random_ua = random.choice(user_agents)

    # Configure Chrome options
    chrome_options = Options()
    chrome_options.add_argument(f"--user-agent={random_ua}")
    chrome_options.add_argument("--headless")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # Accumulates one list of cell strings per table row, across all pages
    all_rows = []

    # try/finally guarantees the browser process is shut down even when
    # navigation fails part-way through the page loop.
    try:
        # Loop over the different webpages loading all pages of the table
        for page_num in range(1, num_pages + 1):
            url = f"https://sharktanklab.com/database/?current_page={page_num}"
            driver.get(url)
            time.sleep(page_load_wait)

            try:
                table = driver.find_element(By.CLASS_NAME, 'table')
                rows = table.find_elements(By.TAG_NAME, 'tr')
                for row in rows:
                    cells = row.find_elements(By.TAG_NAME, 'td')
                    cell_text = [cell.text.strip() for cell in cells]
                    # Rows without any <td> cell (e.g. a header row) are skipped
                    if cell_text:
                        all_rows.append(cell_text)

            except Exception as e:
                # Best effort: report the failing page and carry on with the next
                print(f"Could not find table on page {page_num}. Error: {e}")
    finally:
        driver.quit()

    # Column names copied by hand from the site's data table; they are assumed
    # to line up with the scraped cells and are not checked against them.
    headers = [
        "Product",
        "Location",
        "Category",
        "Season",
        "Deal Y/N",
        "Sharks",
        "Asking Amount",
        "% Equity (Asking)",
        "Invested Amount",
        "% Equity (Deal)",
        "Deal Type"
    ]

    # Create the csv file
    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(all_rows)

    # NOTE(review): rows are not de-duplicated, so this is the total number of
    # scraped rows rather than a count of unique products.
    count = len(all_rows)
    print("Total unique product rows found:", count)

# Entry point: run the scrape with its default arguments (43 pages, default
# CSV path) when this code is executed directly rather than imported.
if __name__ == "__main__":
    scrape_sharktanklab_selenium()

Total unique product rows found: 1268
