In [2]:
import requests

In [10]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_all_links(url, timeout=10):
    """Collect the same-site links found on the page at ``url``.

    The page is downloaded, every ``<a>`` tag that carries an ``href`` is
    resolved against ``url`` (so relative links become absolute), and the
    result is passed through ``get_clean_links`` and then
    ``filter_different_domain_links``.

    Args:
        url: Address of the page to scan; also used as the base when resolving
            relative hrefs and as the reference for the same-domain filter.
        timeout: Seconds to wait for the server before giving up. Forwarded to
            ``requests.get``; without it an unresponsive server would block the
            cell indefinitely.

    Returns:
        The list returned by ``filter_different_domain_links``. An empty list
        is returned when the request fails or times out; the error is printed
        rather than raised.
    """
    try:
        # Step 1: Fetch the HTML content
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
        html_content = response.text

        # Step 2: Parse the HTML
        soup = BeautifulSoup(html_content, 'html.parser')

        # Step 3: Extract the links, converting relative URLs to absolute ones
        links = [
            urljoin(url, a_tag['href'])
            for a_tag in soup.find_all('a', href=True)
        ]

        clean_links = get_clean_links(links)
        same_domain_links = filter_different_domain_links(clean_links, url)
        return same_domain_links

    except requests.RequestException as e:
        # Best-effort: report the failure and let the caller continue with no links.
        print(f"Error fetching the URL: {e}")
        return []
    
def get_clean_links(links: list) -> list:
    """Remove duplicate links and keep only absolute ``www`` web links.

    A link is kept when it starts with ``https://www.`` or ``http://www.``;
    anything else (other schemes, hosts without the ``www.`` prefix) is
    dropped.

    Args:
        links: Link strings, possibly containing duplicates.

    Returns:
        A new list with each kept link appearing once, in the order it was
        first seen in ``links``. The order is deterministic, so re-running the
        notebook on the same page yields the same listing.
    """
    allowed_prefixes = ('https://www.', 'http://www.')
    # dict.fromkeys de-duplicates while preserving insertion order; a set would
    # return the links in an arbitrary order that changes between kernel runs.
    unique_links = dict.fromkeys(links)
    return [link for link in unique_links if link.startswith(allowed_prefixes)]

def filter_different_domain_links(links: list, url) -> list:
    """Keep only the links that point at the same host as ``url``.

    Hosts are compared in full after removing a leading ``www.``; the scheme
    (``http`` vs ``https``), port and path are ignored. Comparing the whole
    host rather than a prefix of its first label means look-alike hosts such
    as ``therocketbrewery.com`` or ``therocketbrew.io`` are not mistaken for
    ``therocketbrew.com``.

    Args:
        links: Absolute link strings to filter.
        url: Reference address whose host defines "same domain".

    Returns:
        A new list with the links whose host matches the host of ``url``, in
        their original order. Empty when ``url`` has no parseable host.
    """
    # Local import so the function does not depend on another cell's imports.
    from urllib.parse import urlsplit

    def _bare_host(address):
        # Host of `address`, lower-cased, without a leading 'www.'; '' if none.
        try:
            host = urlsplit(address).hostname or ''
        except ValueError:
            # urlsplit rejects some malformed addresses (e.g. a bad IPv6 literal).
            return ''
        host = host.lower()
        return host[len('www.'):] if host.startswith('www.') else host

    target_host = _bare_host(url)
    if not target_host:
        # Without a reference host nothing can be called "same domain".
        return []

    return [link for link in links if _bare_host(link) == target_host]

# Usage: scan one seed page and list the links get_all_links returns for it.
url = 'https://www.therocketbrew.com/'  # seed page; `url` is read again by a later cell
links = get_all_links(url)
for link in links:
    print(link)

# The `=` specifier prints the expression text together with its value.
print(f"{len(links)=}")

http://www.therocketbrew.com/blogs/what-data-to-use-to-personalize-cold-outreach
https://www.therocketbrew.com/
https://www.therocketbrew.com/team
https://www.therocketbrew.com/startup
https://www.therocketbrew.com/features
http://www.therocketbrew.com/sales-and-crm-integrations
https://www.therocketbrew.com/checkout
http://www.therocketbrew.com/blogs
https://www.therocketbrew.com/pricing
http://www.therocketbrew.com/blogs/what-are-the-key-elements-of-a-good-value-proposition
https://www.therocketbrew.com/blogs
https://www.therocketbrew.com/enterprise
http://www.therocketbrew.com/security-and-trust
https://www.therocketbrew.com/book-a-demo
http://www.therocketbrew.com/email-101-how-to-set-up-dns-records-email-auth-for-gmail-using-rocketbrew
len(links)=15


In [7]:
# Scratch check of the domain-label extraction, using `url` from the usage cell:
# drop the scheme + 'www.' prefix, then keep the text before the first remaining dot.
domain = url.replace('https://www.', '').replace('http://www.', '').split('.', maxsplit=1)[0]
domain  # bare expression so the notebook displays the value

'therocketbrew'

## Decide what links are useful to scrape

In [11]:
import os
import openai
from dotenv import load_dotenv

# load_dotenv comes from the `dotenv` package; presumably it loads variables from a
# local .env file into the environment (confirm against the project setup).
# The trailing `;` keeps the notebook from echoing the call's return value.
load_dotenv();
# Read the key from the environment instead of hardcoding it; raises KeyError if unset.
openai.api_key = os.environ['OPENAI_API_KEY']

In [None]:
# Prompt text asking the model to choose 10 links worth scraping for product,
# benefit and competitive-advantage information, and to answer with JSON only:
# a single key "useful_links" whose value is the list of chosen links.
SYSTEM_PROMPT = """
I'm scraping website to learn more about the product it offers, what are the benefits of using it and what are the competitive advantages of this company or its products. 

Help me decide what 10 links out of all links should I scrape. Return the JSON with key "useful_links" and value is list of links.

My life depends on this. I will tip you generously if you follow the instructions and do a great job. Only return the JSON with key "useful_links" and value is list of links, no other text is needed.

"""