In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from typing import List

In [2]:
def process_one_link(url: str) -> set:
    try:
        # Step 1: Fetch the HTML content
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        html_content = response.text

        # Step 2: Parse the HTML
        soup = BeautifulSoup(html_content, 'html.parser')

        # Step 3: Extract the links
        links = list()
        for a_tag in soup.find_all('a', href=True):
            link = urljoin(url, a_tag['href'])  # Convert relative URLs to absolute
            links.append(link)

        clean_links = get_clean_links(links)
        same_domain_links = filter_different_domain_links(clean_links, url)
        return same_domain_links
    
    except requests.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return set()
    

def get_all_links(url: str, depth: int=1) -> List[str]:
    # depth = 0 - only visit the root link
    # depth = 1 - visit the root and all links form it
    visited_links = set()
    links_for_search = [url]

    for i in range(depth):
        new_links_current_depth_set = set()

        for link in links_for_search:
            new_links = process_one_link(link)
            new_links_current_depth_set.update(new_links)

        visited_links.update(links_for_search)
        links_for_search = new_links_current_depth_set

    visited_links.update(links_for_search)

    return list(visited_links)


    
def get_clean_links(links: set) -> set:
    links = set([link for link in links 
                 if link.startswith('https://www.') or link.startswith('http://www.')])
    
    formated_links = []

    for link in links:
        if link.endswith('/'):
            formated_links.append(link[:-1])
        else:
            formated_links.append(link)

    return formated_links


def filter_different_domain_links(links: set, url) -> set:
    domain = url.replace('https://www.', '').replace('http://www.', '').split('.', maxsplit=1)[0]

    links = set([link for link in links 
                 if link.replace('https://www.', '').replace('http://www.', '').startswith(domain)])
    return links

# Usage
url = 'https://www.therocketbrew.com'  

depth_1_links = get_all_links(url, depth=1)
depth_2_links = get_all_links(url, depth=2)


print(f"{len(depth_1_links)=}")
print(f"{len(depth_2_links)=}")

len(depth_1_links)=15
len(depth_2_links)=41


In [46]:
links = ['http://www.therocketbrew.com/',
         'http://www.therocketbrew.com/blogs/how-do-you-pitch-a-technical-product',
         'https://www.therocketbrew.com/enterprise',
         'http://www.therocketbrew.com/blogs/is-chatgpt-effective-for-outreach']

formated_links = []

for link in links:
    if link.endswith('/'):
        formated_links.append(link[:-1])
    else:
        formated_links.append(link)

formated_links

['http://www.therocketbrew.com',
 'http://www.therocketbrew.com/blogs/how-do-you-pitch-a-technical-product',
 'https://www.therocketbrew.com/enterprise',
 'http://www.therocketbrew.com/blogs/is-chatgpt-effective-for-outreach']

In [3]:
depth_2_links

['http://www.therocketbrew.com/blogs/how-consumption-based-products-are-better-for-outreach',
 'http://www.therocketbrew.com/startup',
 'http://www.therocketbrew.com/enterprise',
 'https://www.therocketbrew.com',
 'http://www.therocketbrew.com/blogs/how-do-you-pitch-a-technical-product',
 'https://www.therocketbrew.com/blogs',
 'http://www.therocketbrew.com/blogs/is-outbound-a-numbers-game',
 'http://www.therocketbrew.com/blogs/is-chatgpt-effective-for-outreach',
 'http://www.therocketbrew.com/blogs/how-outbound-personalization-has-changed-in-2024',
 'http://www.therocketbrew.com/features',
 'http://www.therocketbrew.com/blogs/what-should-a-cold-outreach-message-include',
 'http://www.therocketbrew.com/blogs/should-you-use-linkedin-as-an-outreach-channel',
 'http://www.therocketbrew.com',
 'http://www.therocketbrew.com/pricing',
 'https://www.therocketbrew.com/book-a-demo',
 'http://www.therocketbrew.com/blogs/how-to-respond-to-cold-outreach-when-its-not-a-yes',
 'https://www.therocket

In [4]:
domain = url.replace('https://www.', '').replace('http://www.', '').split('.', maxsplit=1)[0]
domain

'therocketbrew'

## Decide what links are useful to scrape for summary

In [14]:
import os
import openai
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv()) # read local .env file
openai.api_key = os.environ['OPENAI_API_KEY']

In [18]:
from langchain_openai import ChatOpenAI

In [19]:
llm_model = "gpt-3.5-turbo"

In [20]:
# To control the randomness and creativity of the generated
# text by an LLM, use temperature = 0.0
chat = ChatOpenAI(temperature=0.0, model=llm_model)
chat

ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x10ecc5270>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x10ecc6980>, temperature=0.0, openai_api_key=SecretStr('**********'), openai_proxy='')

In [21]:
SYSTEM_PROMPT = """
I'm scraping website to learn more about the product it offers, what are the benefits of using it and what are the competitive advantages of this company or its products. 

Help me decide what 10 links out of all links should I scrape. Return the JSON with key "useful_links" and value is list of links.

My life depends on this. I will tip you generously if you follow the instructions and do a great job. Only return the JSON with key "useful_links" and value is list of links, no other text is needed.

"""

In [9]:
summary_links = [
    "http://www.therocketbrew.com/features",
    "http://www.therocketbrew.com/pricing",
    "https://www.therocketbrew.com/startup",
    "https://www.therocketbrew.com/enterprise",
    "https://www.therocketbrew.com/book-a-demo",
    "http://www.therocketbrew.com/security-and-trust",
    "http://www.therocketbrew.com/blogs/how-consumption-based-products-are-better-for-outreach",
    "http://www.therocketbrew.com/blogs/how-outbound-personalization-has-changed-in-2024",
    "http://www.therocketbrew.com/blogs/what-are-the-key-elements-of-a-good-value-proposition",
    "http://www.therocketbrew.com/blogs/what-data-to-use-to-personalize-cold-outreach"
]


## Extract text from these links

In [7]:
import requests
from bs4 import BeautifulSoup
from typing import List


def fetch_html_content(url: str) -> str:
    """Fetch the HTML content from a given URL.

    Args:
        url (str): The URL of the website to fetch HTML content from.

    Returns:
        str: The raw HTML content of the page.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful

         # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        
        return soup.get_text()
    except requests.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return ""

 
html_content = fetch_html_content(url)

html_content


'Rocketbrew | AI for personalized outbound\n\n\n\n\n\nSolutionsFor EnterpriseFor sales teams who want their entire outbound process to be automated with AI.For StartupsFor sellers just getting started with outreach and want an end-to-end solution to take care of everything.For TeamsFor outbound teams who want the highest performing tools for their SDRs.FeaturesPricingBlogsHow it worksPricingBlogsHow it worksPricingLoginGet startedYour Cart$Â\xa00.00Â\xa0USD: RemoveSubtotalPay with browser.Continue to CheckoutNo items found.Product is not available in this quantity.Personalized sales outbound,on autopilot.Automate your entire LinkedIn and email outreach motion including initial message, follow ups and scheduling a meeting - without a template.Get StartedSee how this workstrusted by 100+ companiestrusted by 100+ companiesOur generative AIÂ\xa0model books more calls directly into your calendar.No more boring templates, wondering what to say, or endless follow ups. A completely automated e

In [33]:
depth_2_links

{'http://www.therocketbrew.com/',
 'http://www.therocketbrew.com/blogs',
 'http://www.therocketbrew.com/blogs/how-consumption-based-products-are-better-for-outreach',
 'http://www.therocketbrew.com/blogs/how-do-you-grab-the-attention-of-a-prospect',
 'http://www.therocketbrew.com/blogs/how-do-you-pitch-a-technical-product',
 'http://www.therocketbrew.com/blogs/how-outbound-personalization-has-changed-in-2024',
 'http://www.therocketbrew.com/blogs/how-personalized-should-an-outreach-message-be',
 'http://www.therocketbrew.com/blogs/how-to-make-cold-outreach-follow-ups-effective',
 'http://www.therocketbrew.com/blogs/how-to-respond-to-cold-outreach-when-its-not-a-yes',
 'http://www.therocketbrew.com/blogs/how-to-write-relevant-outbound-messages',
 'http://www.therocketbrew.com/blogs/is-chatgpt-effective-for-outreach',
 'http://www.therocketbrew.com/blogs/is-outbound-a-numbers-game',
 'http://www.therocketbrew.com/blogs/is-personalization-important-in-cold-outreach',
 'http://www.therocke

In [37]:
len(html_content)

4931

In [10]:
website_info = ''

info_for_summary = ''

for link in depth_2_links:

    html_content = fetch_html_content(link)
    website_info += html_content

    if link in summary_links:
        info_for_summary += html_content
        
    print(link, len(website_info), len(info_for_summary))


http://www.therocketbrew.com/blogs/how-consumption-based-products-are-better-for-outreach 1393 1393
http://www.therocketbrew.com/startup 3588 1393
http://www.therocketbrew.com/enterprise 5953 1393
https://www.therocketbrew.com 10884 1393
http://www.therocketbrew.com/blogs/how-do-you-pitch-a-technical-product 12439 1393
https://www.therocketbrew.com/blogs 15334 1393
http://www.therocketbrew.com/blogs/is-outbound-a-numbers-game 16816 1393
http://www.therocketbrew.com/blogs/is-chatgpt-effective-for-outreach 18744 1393
http://www.therocketbrew.com/blogs/how-outbound-personalization-has-changed-in-2024 20684 3333
http://www.therocketbrew.com/features 23159 5808
http://www.therocketbrew.com/blogs/what-should-a-cold-outreach-message-include 25122 5808
http://www.therocketbrew.com/blogs/should-you-use-linkedin-as-an-outreach-channel 27260 5808
http://www.therocketbrew.com 32191 5808
http://www.therocketbrew.com/pricing 37488 11105
https://www.therocketbrew.com/book-a-demo 39618 13235
http://ww

In [11]:
with open('website_info_depth_2.txt', 'w') as file:
    file.write(website_info)

with open('website_info_for_summary.txt', 'w') as file:
    file.write(info_for_summary)