## Builds a brochure for any given website with important highlights of the company


In [6]:
import requests
from bs4 import BeautifulSoup

In [15]:
class Website:
    """Fetch one web page and keep its title, visible text, and link targets.

    Attributes set by ``__init__``:
        url:   the URL that was requested.
        body:  ``response.content`` of the HTTP response.
        title: the string inside ``<title>``, or ``"No title found"``.
        text:  text of ``<body>`` with script/style/img/input tags removed,
               or ``""`` when the page has no ``<body>``.
        links: every non-empty ``href`` found on an ``<a>`` tag, exactly as
               written in the page (relative paths are not resolved).
    """

    def __init__(self, url, timeout=10):
        """Download ``url`` and parse it.

        Args:
            url: address of the page to fetch.
            timeout: seconds passed to ``requests.get``; without a timeout a
                stalled server would block the notebook indefinitely.

        Raises:
            requests.exceptions.RequestException: propagated unchanged from
                ``requests.get`` (connection failure, timeout, ...).
        """
        self.url = url
        response = requests.get(url, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # ``.string`` is None when <title> is empty or wraps nested tags, so the
        # fallback has to cover that case as well as a missing <title>.
        title = soup.title.string if soup.title else None
        self.title = title if title is not None else "No title found"

        if soup.body:
            # Drop elements that carry no readable prose before extracting text.
            for irrelevant in soup.body(['script', 'style', 'img', 'input']):
                irrelevant.decompose()
            self.text = soup.body.getText(separator="\n", strip=True)
        else:
            self.text = ""

        # Anchors without an href (or with an empty one) are skipped.
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        """Return the title and page text formatted as one prompt-ready string."""
        return f"Website Title: {self.title}\nWebpage Contents:\n{self.text}\n\n"


In [16]:
# System prompt for the link-selection request. Each fragment ends with an explicit
# space or newline: adjacent string literals are joined with nothing in between, so
# omitting them fuses sentences together ("webpage.You", "company,such").
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links will be most relevant to include in a briefer about the company, "
    "such as links to an About page, or a Company page, or Careers/jobs pages.\n"
    "You should respond in JSON as in this example:"
)

# The example is the template the model is asked to imitate, so it must itself be
# valid JSON: no trailing comma after the last array element.
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://company-url/about"},
        {"type": "careers page", "url": "https://company-full-url/jobs"}
    ]
}
"""


In [17]:
def get_links_user_prompt(website):
    """Build the user prompt that asks the model to pick company-relevant links.

    Args:
        website: object exposing ``url`` and ``links`` attributes
            (for example a ``Website`` instance).

    Returns:
        str: the instructions followed by the ``repr`` of ``website.links``.
    """
    # Adjacent literals rather than a backslash continuation inside the string:
    # the continuation carried the next source line's indentation into the prompt,
    # leaving a run of spaces between the two sentences.
    user_prompt = (
        f"Here is the list of links on the website {website.url} - "
        "please decide which of these are relevant web links for a company briefer, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms or privacy, service or email links."
    )

    user_prompt += f"\nLinks (some might be relevant links): \n{website.links}"
    return user_prompt

In [19]:
# Preview the link-selection prompt built from a live fetch of the Hugging Face home page.
print(
    get_links_user_prompt(
        Website("https://huggingface.co")
    )
)

Here is the list of links on the website https://huggingface.co - please decide which of these are relevant web links for a company briefer, respond with the full https URL in JSON format.    Do not include Terms or privacy, service or email links.
Links (some might be relevant links): 
['/', '/models', '/datasets', '/spaces', '/docs', '/enterprise', '/pricing', '/login', '/join', '/spaces', '/models', '/tencent/SRPO', '/Qwen/Qwen3-Next-80B-A3B-Instruct', '/baidu/ERNIE-4.5-21B-A3B-Thinking', '/Qwen/Qwen3-Next-80B-A3B-Thinking', '/google/vaultgemma-1b', '/models', '/spaces/enzostvs/deepsite', '/spaces/zerogpu-aoti/wan2-2-fp8da-aoti-faster', '/spaces/multimodalart/wan-2-2-first-last-frame', '/spaces/IndexTeam/IndexTTS-2-Demo', '/spaces/tencent/HunyuanImage-2.1', '/spaces', '/datasets/HuggingFaceFW/finepdfs', '/datasets/HuggingFaceM4/FineVision', '/datasets/LucasFang/FLUX-Reason-6M', '/datasets/fka/awesome-chatgpt-prompts', '/datasets/InternRobotics/OmniWorld', '/datasets', '/join', '/pri