In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [10]:


def get_all_links(url):
    try:
        # Step 1: Fetch the HTML content
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        html_content = response.text

        # Step 2: Parse the HTML
        soup = BeautifulSoup(html_content, 'html.parser')

        # Step 3: Extract the links
        links = []
        for a_tag in soup.find_all('a', href=True):
            link = urljoin(url, a_tag['href'])  # Convert relative URLs to absolute
            links.append(link)

        clean_links = get_clean_links(links)
        same_domain_links = filter_different_domain_links(clean_links, url)
        return same_domain_links
    
    except requests.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return []
    
def get_clean_links(links: list) -> list:
    links = list(set(links))
    links = [link for link in links if link.startswith('https://www.') or link.startswith('http://www.')]

    return links

def filter_different_domain_links(links: list, url) -> list:
    domain = url.replace('https://www.', '').replace('http://www.', '').split('.', maxsplit=1)[0]

    links = [link for link in links 
             if link.replace('https://www.', '').replace('http://www.', '').startswith(domain)]
    return links

# Usage
url = 'https://www.therocketbrew.com/'  
links = get_all_links(url)
for link in links:
    print(link)

print(f"{len(links)=}")

http://www.therocketbrew.com/blogs/what-data-to-use-to-personalize-cold-outreach
https://www.therocketbrew.com/
https://www.therocketbrew.com/team
https://www.therocketbrew.com/startup
https://www.therocketbrew.com/features
http://www.therocketbrew.com/sales-and-crm-integrations
https://www.therocketbrew.com/checkout
http://www.therocketbrew.com/blogs
https://www.therocketbrew.com/pricing
http://www.therocketbrew.com/blogs/what-are-the-key-elements-of-a-good-value-proposition
https://www.therocketbrew.com/blogs
https://www.therocketbrew.com/enterprise
http://www.therocketbrew.com/security-and-trust
https://www.therocketbrew.com/book-a-demo
http://www.therocketbrew.com/email-101-how-to-set-up-dns-records-email-auth-for-gmail-using-rocketbrew
len(links)=15


In [7]:
domain = url.replace('https://www.', '').replace('http://www.', '').split('.', maxsplit=1)[0]
domain

'therocketbrew'

## Decide what links are useful to scrape

In [14]:
import os
import openai
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv()) # read local .env file
openai.api_key = os.environ['OPENAI_API_KEY']

In [18]:
from langchain_openai import ChatOpenAI

In [19]:
llm_model = "gpt-3.5-turbo"

In [20]:
# To control the randomness and creativity of the generated
# text by an LLM, use temperature = 0.0
chat = ChatOpenAI(temperature=0.0, model=llm_model)
chat

ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x10ecc5270>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x10ecc6980>, temperature=0.0, openai_api_key=SecretStr('**********'), openai_proxy='')

In [21]:
SYSTEM_PROMPT = """
I'm scraping website to learn more about the product it offers, what are the benefits of using it and what are the competitive advantages of this company or its products. 

Help me decide what 10 links out of all links should I scrape. Return the JSON with key "useful_links" and value is list of links.

My life depends on this. I will tip you generously if you follow the instructions and do a great job. Only return the JSON with key "useful_links" and value is list of links, no other text is needed.

"""

In [22]:
template_string = """Translate the text \
that is delimited by triple backticks \
into a style that is {style}. \
text: ```{text}```
"""

In [23]:
from langchain.prompts import ChatPromptTemplate

prompt_template = ChatPromptTemplate.from_template(template_string)

In [24]:
prompt_template.messages[0].prompt.input_variables

['style', 'text']

In [25]:
customer_email = """
Arrr, I be fuming that me blender lid \
flew off and splattered me kitchen walls \
with smoothie! And to make matters worse, \
the warranty don't cover the cost of \
cleaning up me kitchen. I need yer help \
right now, matey!
"""

In [26]:
customer_style = """American English \
in a calm and respectful tone
"""

In [27]:
customer_messages = prompt_template.format_messages(
                    style=customer_style,
                    text=customer_email)

In [28]:
print(customer_messages[0])

content="Translate the text that is delimited by triple backticks into a style that is American English in a calm and respectful tone\n. text: ```\nArrr, I be fuming that me blender lid flew off and splattered me kitchen walls with smoothie! And to make matters worse, the warranty don't cover the cost of cleaning up me kitchen. I need yer help right now, matey!\n```\n"


In [30]:
# Call the LLM to translate to the style of the customer message
customer_response = chat.invoke(customer_messages)

In [31]:
print(customer_response.content)

Oh man, I'm really frustrated that my blender lid flew off and made a mess of my kitchen walls with smoothie! And on top of that, the warranty doesn't cover the cost of cleaning up my kitchen. I could really use your help right now, buddy.


## Extract text from these links

In [32]:
links

['http://www.therocketbrew.com/blogs/what-data-to-use-to-personalize-cold-outreach',
 'https://www.therocketbrew.com/',
 'https://www.therocketbrew.com/team',
 'https://www.therocketbrew.com/startup',
 'https://www.therocketbrew.com/features',
 'http://www.therocketbrew.com/sales-and-crm-integrations',
 'https://www.therocketbrew.com/checkout',
 'http://www.therocketbrew.com/blogs',
 'https://www.therocketbrew.com/pricing',
 'http://www.therocketbrew.com/blogs/what-are-the-key-elements-of-a-good-value-proposition',
 'https://www.therocketbrew.com/blogs',
 'https://www.therocketbrew.com/enterprise',
 'http://www.therocketbrew.com/security-and-trust',
 'https://www.therocketbrew.com/book-a-demo',
 'http://www.therocketbrew.com/email-101-how-to-set-up-dns-records-email-auth-for-gmail-using-rocketbrew']

In [40]:
selected_links = [
    "https://www.therocketbrew.com/",
    "https://www.therocketbrew.com/features",
    "http://www.therocketbrew.com/sales-and-crm-integrations",
    "https://www.therocketbrew.com/enterprise",
    "http://www.therocketbrew.com/security-and-trust",
    "http://www.therocketbrew.com/blogs/what-data-to-use-to-personalize-cold-outreach",
    "http://www.therocketbrew.com/blogs/what-are-the-key-elements-of-a-good-value-proposition",
    "https://www.therocketbrew.com/startup"
]

In [39]:
link = links[0]

link

'http://www.therocketbrew.com/blogs/what-data-to-use-to-personalize-cold-outreach'

In [35]:
import requests
from bs4 import BeautifulSoup
from typing import List


def fetch_html_content(url: str) -> str:
    """Fetch the HTML content from a given URL.

    Args:
        url (str): The URL of the website to fetch HTML content from.

    Returns:
        str: The raw HTML content of the page.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful

         # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        
        return soup.get_text()
    except requests.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return ""


url = link 
html_content = fetch_html_content(url)

html_content


'Rocketbrew | AI for personalized outbound\n\n\n\n\n\nSolutionsFor EnterpriseFor sales teams who want their entire outbound process to be automated with AI.For StartupsFor sellers just getting started with outreach and want an end-to-end solution to take care of everything.For TeamsFor outbound teams who want the highest performing tools for their SDRs.FeaturesPricingBlogsHow it worksPricingBlogsHow it worksPricingLoginGet startedYour Cart$Â\xa00.00Â\xa0USD: RemoveSubtotalPay with browser.Continue to CheckoutNo items found.Product is not available in this quantity.Personalized sales outbound,on autopilot.Automate your entire LinkedIn and email outreach motion including initial message, follow ups and scheduling a meeting - without a template.Get StartedSee how this workstrusted by 100+ companiestrusted by 100+ companiesOur generative AIÂ\xa0model books more calls directly into your calendar.No more boring templates, wondering what to say, or endless follow ups. A completely automated e

In [37]:
len(html_content)

4931

In [41]:
website_info = ''

for link in selected_links:
    html_content = fetch_html_content(url)
    website_info += html_content
    print(link, len(website_info))


https://www.therocketbrew.com/ 4931
https://www.therocketbrew.com/features 9862
http://www.therocketbrew.com/sales-and-crm-integrations 14793
https://www.therocketbrew.com/enterprise 19724
http://www.therocketbrew.com/security-and-trust 24655
http://www.therocketbrew.com/blogs/what-data-to-use-to-personalize-cold-outreach 29586
http://www.therocketbrew.com/blogs/what-are-the-key-elements-of-a-good-value-proposition 34517
https://www.therocketbrew.com/startup 39448


In [44]:
with open('website_info.txt', 'w') as file:
    file.write(website_info)

In [43]:
print(website_info)

Rocketbrew | AI for personalized outbound





SolutionsFor EnterpriseFor sales teams who want their entire outbound process to be automated with AI.For StartupsFor sellers just getting started with outreach and want an end-to-end solution to take care of everything.For TeamsFor outbound teams who want the highest performing tools for their SDRs.FeaturesPricingBlogsHow it worksPricingBlogsHow it worksPricingLoginGet startedYour Cart$Â 0.00Â USD: RemoveSubtotalPay with browser.Continue to CheckoutNo items found.Product is not available in this quantity.Personalized sales outbound,on autopilot.Automate your entire LinkedIn and email outreach motion including initial message, follow ups and scheduling a meeting - without a template.Get StartedSee how this workstrusted by 100+ companiestrusted by 100+ companiesOur generative AIÂ model books more calls directly into your calendar.No more boring templates, wondering what to say, or endless follow ups. A completely automated end-to-end soluti