In [1]:
import requests
import json
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from typing import List
import re
import os

In [2]:
def process_one_link(url: str) -> set:
    try:
        # Step 1: Fetch the HTML content
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        html_content = response.text

        # Step 2: Parse the HTML
        soup = BeautifulSoup(html_content, 'html.parser')

        # Step 3: Extract the links
        links = list()
        for a_tag in soup.find_all('a', href=True):
            link = urljoin(url, a_tag['href'])  # Convert relative URLs to absolute
            links.append(link)

        clean_links = get_clean_links(links)
        same_domain_links = filter_different_domain_links(clean_links, url)
        return same_domain_links
    
    except requests.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return set()
    

def get_all_links(url: str, depth: int=1) -> List[str]:
    # depth = 0 - only visit the root link
    # depth = 1 - visit the root and all links form it
    visited_links = set()
    links_for_search = [url]

    for i in range(depth):
        new_links_current_depth_set = set()

        for link in links_for_search:
            new_links = process_one_link(link)
            new_links_current_depth_set.update(new_links)

        visited_links.update(links_for_search)
        links_for_search = new_links_current_depth_set

    visited_links.update(links_for_search)

    return list(visited_links)


    
def get_clean_links(links: set) -> set:
    links = set([link for link in links 
                 if link.startswith('https://www.') or link.startswith('http://www.')])
    
    formated_links = []

    for link in links:
        if link.endswith('/'):
            formated_links.append(link[:-1])
        else:
            formated_links.append(link)

    return formated_links


def filter_different_domain_links(links: set, url) -> set:
    domain = url.replace('https://www.', '').replace('http://www.', '').split('.', maxsplit=1)[0]

    links = set([link for link in links 
                 if link.replace('https://www.', '').replace('http://www.', '').startswith(domain)])
    return links

# Usage
url = 'https://www.therocketbrew.com'  

depth_1_links = get_all_links(url, depth=1)
depth_2_links = get_all_links(url, depth=2)


print(f"{len(depth_1_links)=}")
print(f"{len(depth_2_links)=}")

len(depth_1_links)=15
len(depth_2_links)=41


In [3]:
depth_2_links

['http://www.therocketbrew.com/blogs/power-of-personalization',
 'http://www.therocketbrew.com/book-a-demo',
 'http://www.therocketbrew.com/blogs/should-you-send-empty-connection-requests-on-linkedin-for-cold-outreach',
 'https://www.therocketbrew.com/book-a-demo',
 'http://www.therocketbrew.com/team',
 'http://www.therocketbrew.com/blogs/what-data-to-use-to-personalize-cold-outreach',
 'http://www.therocketbrew.com/blogs/whats-the-future-of-outbound-in-2024',
 'http://www.therocketbrew.com/blogs/how-to-make-cold-outreach-follow-ups-effective',
 'http://www.therocketbrew.com/blogs/what-is-the-difference-between-personalized-and-templated-outreach',
 'http://www.therocketbrew.com/blogs/is-personalization-important-in-cold-outreach',
 'http://www.therocketbrew.com/blogs',
 'http://www.therocketbrew.com/blogs/should-you-use-linkedin-as-an-outreach-channel',
 'http://www.therocketbrew.com/blogs/how-to-respond-to-cold-outreach-when-its-not-a-yes',
 'http://www.therocketbrew.com/blogs/how-to

In [4]:
domain_list = url.replace('https://www.', '').replace('http://www.', '').split('.', maxsplit=2)[:2]
domain_folder_name = '.'.join(domain_list)

# Check if the folder exists, if not, create it
if not os.path.exists(domain_folder_name):
    os.makedirs(domain_folder_name)

domain_folder_name

'therocketbrew.com'

In [5]:
links_str_to_save = json.dumps({"all_links": depth_2_links})

In [6]:
all_links_filename = 'depth_2_links.json'

file_path_to_save = os.path.join(domain_folder_name, all_links_filename)

# Write the JSON object to a file
with open(file_path_to_save, 'w') as json_file:
    json.dump(links_str_to_save, json_file, indent=4)

In [7]:
depth_2_links

['http://www.therocketbrew.com/blogs/power-of-personalization',
 'http://www.therocketbrew.com/book-a-demo',
 'http://www.therocketbrew.com/blogs/should-you-send-empty-connection-requests-on-linkedin-for-cold-outreach',
 'https://www.therocketbrew.com/book-a-demo',
 'http://www.therocketbrew.com/team',
 'http://www.therocketbrew.com/blogs/what-data-to-use-to-personalize-cold-outreach',
 'http://www.therocketbrew.com/blogs/whats-the-future-of-outbound-in-2024',
 'http://www.therocketbrew.com/blogs/how-to-make-cold-outreach-follow-ups-effective',
 'http://www.therocketbrew.com/blogs/what-is-the-difference-between-personalized-and-templated-outreach',
 'http://www.therocketbrew.com/blogs/is-personalization-important-in-cold-outreach',
 'http://www.therocketbrew.com/blogs',
 'http://www.therocketbrew.com/blogs/should-you-use-linkedin-as-an-outreach-channel',
 'http://www.therocketbrew.com/blogs/how-to-respond-to-cold-outreach-when-its-not-a-yes',
 'http://www.therocketbrew.com/blogs/how-to

## Decide what links are useful to scrape for summary

In [8]:
file_path_to_save = 

SyntaxError: invalid syntax (3028910667.py, line 1)

In [9]:
# Read the JSON file
with open(file_path_to_save, 'r') as json_file:
    data = json.load(json_file)

data

'{"all_links": ["http://www.therocketbrew.com/blogs/power-of-personalization", "http://www.therocketbrew.com/book-a-demo", "http://www.therocketbrew.com/blogs/should-you-send-empty-connection-requests-on-linkedin-for-cold-outreach", "https://www.therocketbrew.com/book-a-demo", "http://www.therocketbrew.com/team", "http://www.therocketbrew.com/blogs/what-data-to-use-to-personalize-cold-outreach", "http://www.therocketbrew.com/blogs/whats-the-future-of-outbound-in-2024", "http://www.therocketbrew.com/blogs/how-to-make-cold-outreach-follow-ups-effective", "http://www.therocketbrew.com/blogs/what-is-the-difference-between-personalized-and-templated-outreach", "http://www.therocketbrew.com/blogs/is-personalization-important-in-cold-outreach", "http://www.therocketbrew.com/blogs", "http://www.therocketbrew.com/blogs/should-you-use-linkedin-as-an-outreach-channel", "http://www.therocketbrew.com/blogs/how-to-respond-to-cold-outreach-when-its-not-a-yes", "http://www.therocketbrew.com/blogs/how-

In [10]:
depth_2_links = json.loads(data)['all_links']
depth_2_links

['http://www.therocketbrew.com/blogs/power-of-personalization',
 'http://www.therocketbrew.com/book-a-demo',
 'http://www.therocketbrew.com/blogs/should-you-send-empty-connection-requests-on-linkedin-for-cold-outreach',
 'https://www.therocketbrew.com/book-a-demo',
 'http://www.therocketbrew.com/team',
 'http://www.therocketbrew.com/blogs/what-data-to-use-to-personalize-cold-outreach',
 'http://www.therocketbrew.com/blogs/whats-the-future-of-outbound-in-2024',
 'http://www.therocketbrew.com/blogs/how-to-make-cold-outreach-follow-ups-effective',
 'http://www.therocketbrew.com/blogs/what-is-the-difference-between-personalized-and-templated-outreach',
 'http://www.therocketbrew.com/blogs/is-personalization-important-in-cold-outreach',
 'http://www.therocketbrew.com/blogs',
 'http://www.therocketbrew.com/blogs/should-you-use-linkedin-as-an-outreach-channel',
 'http://www.therocketbrew.com/blogs/how-to-respond-to-cold-outreach-when-its-not-a-yes',
 'http://www.therocketbrew.com/blogs/how-to

In [11]:
import os
import openai
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv()) # read local .env file
openai.api_key = os.environ['OPENAI_API_KEY']

In [12]:
from langchain_openai import ChatOpenAI

In [13]:
llm_model = "gpt-3.5-turbo"

In [14]:
from langchain.schema import (
    HumanMessage,
    SystemMessage
)


In [15]:
SYSTEM_PROMPT = """
I'm scraping website to learn more about the product it offers, what are the benefits of using it and what are the competitive advantages of this company or its products. 

Help me decide what 10 links out of all links should I scrape, put the most useful links first, I'll send you links. Return the JSON with key "useful_links" and value is list of links.

My life depends on this. I will tip you generously if you follow the instructions and do a great job. Only return the JSON with key "useful_links" and value is list of links, no other text is needed.

"""

In [18]:
ChatOpenAI??

[0;31mInit signature:[0m
[0mChatOpenAI[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mname[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcache[0m[0;34m:[0m [0mForwardRef[0m[0;34m([0m[0;34m'Union[BaseCache, bool, None]'[0m[0;34m)[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverbose[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcallbacks[0m[0;34m:[0m [0mForwardRef[0m[0;34m([0m[0;34m'Callbacks'[0m[0;34m)[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtags[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmetadata[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mDict[0m[0;34m[[0m[0mstr[0m[

In [16]:
chat = ChatOpenAI(temperature=0, model=llm_model)


In [17]:
result = chat.invoke([SystemMessage(content=SYSTEM_PROMPT),
                      HumanMessage(content=json.dumps(depth_2_links))])

result.__dict__

{'content': '{\n    "useful_links": [\n        "http://www.therocketbrew.com/book-a-demo",\n        "http://www.therocketbrew.com/team",\n        "http://www.therocketbrew.com/blogs/power-of-personalization",\n        "http://www.therocketbrew.com/blogs/should-you-send-empty-connection-requests-on-linkedin-for-cold-outreach",\n        "http://www.therocketbrew.com/blogs/what-data-to-use-to-personalize-cold-outreach",\n        "http://www.therocketbrew.com/blogs/whats-the-future-of-outbound-in-2024",\n        "http://www.therocketbrew.com/blogs/how-to-make-cold-outreach-follow-ups-effective",\n        "http://www.therocketbrew.com/blogs/what-is-the-difference-between-personalized-and-templated-outreach",\n        "http://www.therocketbrew.com/blogs/is-personalization-important-in-cold-outreach",\n        "http://www.therocketbrew.com/blogs/how-to-respond-to-cold-outreach-when-its-not-a-yes"\n    ]\n}',
 'additional_kwargs': {},
 'response_metadata': {'token_usage': {'completion_tokens':

In [20]:
clean_output = clean_llm_output(result.content)

llm_links = json.loads(clean_output)
summary_links = llm_links['useful_links']

In [21]:
summary_links

['http://www.therocketbrew.com/pricing',
 'http://www.therocketbrew.com/blogs/is-personalization-important-in-cold-outreach',
 'http://www.therocketbrew.com/blogs/is-chatgpt-effective-for-outreach',
 'https://www.therocketbrew.com/book-a-demo',
 'http://www.therocketbrew.com/security-and-trust',
 'http://www.therocketbrew.com/sales-and-crm-integrations',
 'http://www.therocketbrew.com/features',
 'http://www.therocketbrew.com/blogs/how-to-make-cold-outreach-follow-ups-effective',
 'http://www.therocketbrew.com/blogs/how-do-you-pitch-a-technical-product',
 'http://www.therocketbrew.com/blogs/how-consumption-based-products-are-better-for-outreach']

In [9]:
summary_links = [
    "http://www.therocketbrew.com/features",
    "http://www.therocketbrew.com/pricing",
    "https://www.therocketbrew.com/startup",
    "https://www.therocketbrew.com/enterprise",
    "https://www.therocketbrew.com/book-a-demo",
    "http://www.therocketbrew.com/security-and-trust",
    "http://www.therocketbrew.com/blogs/how-consumption-based-products-are-better-for-outreach",
    "http://www.therocketbrew.com/blogs/how-outbound-personalization-has-changed-in-2024",
    "http://www.therocketbrew.com/blogs/what-are-the-key-elements-of-a-good-value-proposition",
    "http://www.therocketbrew.com/blogs/what-data-to-use-to-personalize-cold-outreach"
]


In [40]:
summary_links_str_to_save = json.dumps({"summary_links": summary_links})

In [41]:
summary_links_filename = 'summary_links.json'

file_path_to_save = os.path.join(domain_folder_name, summary_links_filename)

# Write the JSON object to a file
with open(file_path_to_save, 'w') as json_file:
    json.dump(summary_links_str_to_save, json_file, indent=4)

## Extract text from these links

In [42]:
import requests
from bs4 import BeautifulSoup
from typing import List


def fetch_html_content(url: str) -> str:
    """Fetch the HTML content from a given URL.

    Args:
        url (str): The URL of the website to fetch HTML content from.

    Returns:
        str: The raw HTML content of the page.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful

         # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        
        return soup.get_text()
    except requests.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return ""

 
html_content = fetch_html_content(url)

html_content


'Rocketbrew | AI for personalized outbound\n\n\n\n\n\nSolutionsFor EnterpriseFor sales teams who want their entire outbound process to be automated with AI.For StartupsFor sellers just getting started with outreach and want an end-to-end solution to take care of everything.For TeamsFor outbound teams who want the highest performing tools for their SDRs.FeaturesPricingBlogsHow it worksPricingBlogsHow it worksPricingLoginGet startedYour Cart$Â\xa00.00Â\xa0USD: RemoveSubtotalPay with browser.Continue to CheckoutNo items found.Product is not available in this quantity.Personalized sales outbound,on autopilot.Automate your entire LinkedIn and email outreach motion including initial message, follow ups and scheduling a meeting - without a template.Get StartedSee how this workstrusted by 100+ companiestrusted by 100+ companiesOur generative AIÂ\xa0model books more calls directly into your calendar.No more boring templates, wondering what to say, or endless follow ups. A completely automated e

In [43]:
depth_2_links

['http://www.therocketbrew.com/pricing',
 'http://www.therocketbrew.com/blogs/is-personalization-important-in-cold-outreach',
 'http://www.therocketbrew.com/blogs/is-chatgpt-effective-for-outreach',
 'https://www.therocketbrew.com/book-a-demo',
 'http://www.therocketbrew.com/security-and-trust',
 'http://www.therocketbrew.com/blogs/what-b2b-sales-can-learn-from-b2c-sales',
 'http://www.therocketbrew.com/sales-and-crm-integrations',
 'http://www.therocketbrew.com/book-a-demo',
 'http://www.therocketbrew.com/blogs/how-to-write-relevant-outbound-messages',
 'http://www.therocketbrew.com/blogs/what-data-to-use-to-personalize-cold-outreach',
 'http://www.therocketbrew.com/blogs/what-are-the-key-elements-of-a-good-value-proposition',
 'https://www.therocketbrew.com/startup',
 'http://www.therocketbrew.com/features',
 'http://www.therocketbrew.com/blogs/how-to-make-cold-outreach-follow-ups-effective',
 'http://www.therocketbrew.com/blogs/how-do-you-pitch-a-technical-product',
 'http://www.the

In [44]:
website_info = ''

info_for_summary = ''

for link in depth_2_links:

    html_content = fetch_html_content(link)
    website_info += html_content

    if link in summary_links:
        info_for_summary += html_content
        
    print(link, len(website_info), len(info_for_summary))


http://www.therocketbrew.com/pricing 5297 5297
http://www.therocketbrew.com/blogs/is-personalization-important-in-cold-outreach 7102 7102
http://www.therocketbrew.com/blogs/is-chatgpt-effective-for-outreach 9030 9030
https://www.therocketbrew.com/book-a-demo 11160 11160
http://www.therocketbrew.com/security-and-trust 14301 14301
http://www.therocketbrew.com/blogs/what-b2b-sales-can-learn-from-b2c-sales 16251 14301
http://www.therocketbrew.com/sales-and-crm-integrations 22441 20491
http://www.therocketbrew.com/book-a-demo 24571 20491
http://www.therocketbrew.com/blogs/how-to-write-relevant-outbound-messages 26588 20491
http://www.therocketbrew.com/blogs/what-data-to-use-to-personalize-cold-outreach 28322 20491
http://www.therocketbrew.com/blogs/what-are-the-key-elements-of-a-good-value-proposition 30499 20491
https://www.therocketbrew.com/startup 32694 20491
http://www.therocketbrew.com/features 35169 22966
http://www.therocketbrew.com/blogs/how-to-make-cold-outreach-follow-ups-effectiv

In [48]:
with open(os.path.join(domain_folder_name, 'website_info_depth_2.txt'), 'w') as file:
    file.write(website_info)

with open(os.path.join(domain_folder_name, 'website_info_for_summary.txt'), 'w') as file:
    file.write(info_for_summary)