In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import time



In [None]:
# Define URL for product information
base_url = "https://sharktanklab.com/products/"

# Rotating User-Agent settings to avoid user blocking
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
]
headers = {
    "User-Agent": random.choice(user_agents),
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://google.com",
    "Connection": "keep-alive",
}

# Define the function to create the individiual product URL
def build_page_url(page):
    if page == 1:
        return base_url
    else:
        return f"{base_url}page/{page}/"

# Collect all unique product links
all_product_links = set()
page = 1

while True:
    url = build_page_url(page)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    page_links = set()
    
    # Loop through all pages to find the according product link
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        if "/products/" in href and "/page/" not in href:
            if not href.startswith("http"):
                href = "https://sharktanklab.com" + href
            page_links.add(href)
    
    # If no product links are found on this page, assume we've reached the end.
    if not page_links:
        print(f"No further products found, stopped on page {page}.")
        break
    
    all_product_links.update(page_links)
    
    page += 1
    time.sleep(random.uniform(1, 3)) 

count = len(all_product_links)
print("Total unique product links found:", count)


No further products found, stopped on page 107.
Total unique product links found: 1271


In [6]:
display(all_product_links)

{'https://sharktanklab.com/products/balm-chicky-balm-balm/',
 'https://sharktanklab.com/products/funkkoff-teethrefreshers/',
 'https://sharktanklab.com/products/morninghead/',
 'https://sharktanklab.com/products/nophone-fake-phone/',
 'https://sharktanklab.com/products/man-candles/',
 'https://sharktanklab.com/products/zach-zoe-honey/',
 'https://sharktanklab.com/products/somnifix-mouth-tape/',
 'https://sharktanklab.com/products/kit-lender-ski-and-snowboard-clothing/',
 'https://sharktanklab.com/products/caddy-swag/',
 'https://sharktanklab.com/products/the-sullivan-generator/',
 'https://sharktanklab.com/products/lil-advents-potty-training-game/',
 'https://sharktanklab.com/products/the-two-guys-bow-tie-co-wooden-bowtie/',
 'https://sharktanklab.com/products/lulu-bang-gourmet-sauces/',
 'https://sharktanklab.com/products/crio-bru-superfood-coffee-alternative/',
 'https://sharktanklab.com/products/kent-underwear/',
 'https://sharktanklab.com/products/napwell-alarm-clock-mask/',
 'http

In [7]:
def scrape_product(url, headers):
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to retrieve {url}")
        return None
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extract the product title
    title_element = soup.find('h1', class_='entry-title')
    product_title = title_element.get_text(strip=True) if title_element else "Title not found"
    
    # Extract the info about season & episode 
    season_episode_element = soup.find("div", class_="dm-product-season-label")
    if season_episode_element:
        spans = season_episode_element.find_all("span")
        season_episode = " ".join(span.get_text(strip=True) for span in spans)
    else:
        season_episode = "Season info not found"
    
    # Extract short description
    short_desc_element = soup.find('p', class_='product-summary-content')
    short_description = short_desc_element.get_text(strip=True) if short_desc_element else "Short description not found"
    
    # Extract long description
    long_desc_container = soup.find('div', class_='wp-block-media-text__content')
    if long_desc_container:
        first_paragraph = long_desc_container.find('p')  # Get only the first <p>
        long_description = first_paragraph.get_text(strip=True) if first_paragraph else "Long description not found"
    else:
        long_description = "Long description not found"

    return {
        "URL": url,
        "Product Title": product_title,
        "Season & Episode": season_episode,
        "Short Description": short_description,
        "Long Description": long_description
    }


all_product_data = []

# Loop over all prodcut links to gather the data
for url in all_product_links:
    data = scrape_product(url, headers)
    if data is not None:
        all_product_data.append(data)
    time.sleep(random.uniform(1, 3))

# Store data in a dataframe
SharkTank_df = pd.DataFrame(all_product_data)
display(SharkTank_df)

Unnamed: 0,URL,Product Title,Season & Episode,Short Description,Long Description
0,https://sharktanklab.com/products/balm-chicky-...,Balm Chicky Balm Balm,Season 6 Episode 17,PRODUCT SUMMARYBalm Chicky Balm Balm offers a ...,Balm Chicky Balm Balm introduces a revolutiona...
1,https://sharktanklab.com/products/funkkoff-tee...,FunkkOFF! – TeethRefreshers,Season 14 Episode 14,PRODUCT SUMMARYFunkkOFF! TeethRefreshers is a ...,FunkkOFF! TeethRefreshers is a unique tooth-re...
2,https://sharktanklab.com/products/morninghead/,Morninghead,Season 5 Episode 21,PRODUCT SUMMARYMorninghead is a reusable cap w...,Morninghead offers a convenient solution for i...
3,https://sharktanklab.com/products/nophone-fake...,NoPhone – Fake Phone,Season 7 Episode 23,PRODUCT SUMMARYThe NoPhone is a technology-fre...,Priced at $12 for the regular NoPhone and $18 ...
4,https://sharktanklab.com/products/man-candles/,Man Candles,Season 2 Episode 8,PRODUCT SUMMARYOriginal Man Candle offers a ra...,Original Man Candle offers a diverse range of ...
...,...,...,...,...,...
1266,https://sharktanklab.com/products/the-original...,The Original Profender,Season 3 Episode 8,PRODUCT SUMMARYThe Original Profender is a mob...,The Original Profender is a portable device eq...
1267,https://sharktanklab.com/products/no-limbits-a...,No Limbits Adaptive Clothing,Season 13 Episode 18,PRODUCT SUMMARYNo Limbits is an adaptive cloth...,No Limbits offers a range of specialized adapt...
1268,https://sharktanklab.com/products/invisiplug/,invisiPlug,Season 5 Episode 14,PRODUCT SUMMARYInvisiPlug offers a line of sur...,InvisiPlug revolutionizes the way surge protec...
1269,https://sharktanklab.com/products/kitty-kasas-...,Kitty Kasas Cat Houses,Season 10 Episode 11,"PRODUCT SUMMARYKitty Kasas are modular, stacka...",Kitty Kasas revolutionizes the world of cat fu...


In [None]:
# Store in files
SharkTank_df.to_excel("XXXX/sharktank_products.xlsx")