In [3]:
# scrape the web and build a brochure
# imports

import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI


In [4]:
# Initialize and constants

load_dotenv(override=True)
api_key = os.getenv("OPENAI_API_KEY")

# Shape check only (non-empty, 'sk-proj-' prefix, longer than 10 characters);
# nothing here contacts the API.
if not api_key or not api_key.startswith("sk-proj-") or len(api_key) <= 10:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
else:
    print("API key looks good so far")

# Model name and shared client used by the cells below.
MODEL = "gpt-4o-mini"
openai = OpenAI()

API key looks good so far


In [5]:
# A class to represent a Webpage

# Some sites refuse requests that lack a browser-like User-Agent, so every fetch sends this one.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    )
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links

    Attributes:
        url: the address that was fetched
        body: the raw response bytes
        title: text of the <title> tag, or "No title found" when it is absent or empty
        text: text of <body> with script/style/img/input elements removed ("" if no body)
        links: every non-empty href found on an <a> tag, in document order
    """

    def __init__(self, url, timeout=10):
        """
        Fetch `url` and parse its title, text and links.

        Args:
            url: address of the page to download.
            timeout: seconds passed to requests.get; without one a stalled
                server would block the notebook indefinitely.

        Raises:
            Whatever requests.get raises (including a timeout error) is propagated.
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        # .string is None when <title> is empty or contains nested markup, so
        # fall back in that case too rather than storing None as the title.
        title_text = soup.title.string if soup.title else None
        self.title = title_text or "No title found"
        if soup.body:
            # Drop elements that contribute no readable text before extracting it.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        # Anchors without an href (or with an empty one) are discarded.
        self.links = [link for link in links if link]

    def get_contents(self):
        """Return the title and text formatted as one block for inclusion in a prompt."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [6]:
# System prompt for the link-selection call. The example below must itself be
# valid JSON, since it is the format the model is asked to imitate.
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
)
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [7]:
# user prompt

def get_links_user_prompt(website):
    """
    Build the user prompt that asks the model to pick brochure-relevant links.

    Args:
        website: object exposing `url` (str) and `links` (iterable of href strings).

    Returns:
        The prompt text: instructions followed by one link per line. Repeated
        hrefs are listed once, in first-seen order, so navigation links that
        appear many times on a page do not inflate the prompt.
    """
    # dict.fromkeys drops repeats while preserving order; website.links is not modified.
    unique_links = list(dict.fromkeys(website.links))
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += (
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n"
    )
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(unique_links)
    return user_prompt

In [8]:
def get_links(url):
    """
    Scrape `url` and ask the model which of its links belong in a brochure.

    Returns:
        The model's reply parsed from JSON (the request asks for a JSON object).
    """
    page = Website(url)
    conversation = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(page)},
    ]
    completion = openai.chat.completions.create(
        model=MODEL,
        messages=conversation,
        response_format={"type": "json_object"},
    )
    return json.loads(completion.choices[0].message.content)

In [9]:
# Anthropic has made their site harder to scrape, so I'm using HuggingFace..
# Fetch the landing page and inspect the raw hrefs it exposes (relative paths and repeats included).

huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/black-forest-labs/FLUX.1-Kontext-dev',
 '/THUDM/GLM-4.1V-9B-Thinking',
 '/kyutai/tts-1.6b-en_fr',
 '/google/gemma-3n-E4B-it',
 '/apple/DiffuCoder-7B-cpGRPO',
 '/models',
 '/spaces/enzostvs/deepsite',
 '/spaces/black-forest-labs/FLUX.1-Kontext-Dev',
 '/spaces/ilcve21/Sparc3D',
 '/spaces/multimodalart/wan2-1-fast',
 '/spaces/Kwai-Kolors/Kolors-Virtual-Try-On',
 '/spaces',
 '/datasets/fka/awesome-chatgpt-prompts',
 '/datasets/HuggingFaceFW/fineweb-2',
 '/datasets/facebook/seamless-interaction',
 '/datasets/marcelbinz/Psych-101',
 '/datasets/black-forest-labs/kontext-bench',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Writ

In [11]:
# check for important links; reduces down the list
# get_links sends the scraped hrefs to the model and returns its JSON reply parsed into a dict.

get_links("https://huggingface.co")

{'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'},
  {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'},
  {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'},
  {'type': 'docs page', 'url': 'https://huggingface.co/docs'},
  {'type': 'blog page', 'url': 'https://huggingface.co/blog'},
  {'type': 'community page', 'url': 'https://discuss.huggingface.co'},
  {'type': 'GitHub page', 'url': 'https://github.com/huggingface'},
  {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'},
  {'type': 'LinkedIn page',
   'url': 'https://www.linkedin.com/company/huggingface/'}]}

# Make the Brochure

In [12]:
def get_all_details(url):
    """
    Collect the text of the landing page plus every page the model selected.

    The link URLs come from the model, so they may be unreachable, malformed
    or missing. Such entries are skipped (with a printed notice for fetch
    failures) instead of aborting the whole brochure.

    Args:
        url: landing page of the company website.

    Returns:
        One string: the landing page contents followed by a titled section
        for each selected page that could be fetched.

    Raises:
        Errors fetching the landing page itself are propagated.
    """
    sections = ["Landing page:\n", Website(url).get_contents()]
    links = get_links(url)
    print("Found links:", links)
    # Tolerate a reply without a "links" key (or with a null value).
    for link in links.get("links") or []:
        link_url = link.get("url") if isinstance(link, dict) else None
        if not link_url:
            continue
        try:
            page_contents = Website(link_url).get_contents()
        except requests.RequestException as err:
            # Covers connection failures, timeouts and invalid/relative URLs.
            print(f"Skipping {link_url}: {err}")
            continue
        sections.append(f"\n\n{link.get('type', 'page')}\n")
        sections.append(page_contents)
    return "".join(sections)

In [13]:
# Print the combined text of the landing page and of each page the model selected.
print(get_all_details("https://huggingface.co"))

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'company page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}]}
Landing page:
Webpage Title:
Hugging Face – The AI community building the future.
Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Community
Docs
Enterprise
Pricing
Log In
Sign Up
The AI community building the future.
The platform where the machine learning community collaborates on models, datasets, and applications.
Explore AI Apps
or
Browse 1M+ models
Trending on
this week
Models
black-forest-labs/FLUX.1-Kontext-dev
Updated
10 days ago
•
171k
•
1.41k
THUDM/GLM-4.1V-9B-Thinking
Updated
5 days ago
•
10.1k
•
261
kyutai/tts-1.6b-en_fr
Updated
4 days ago
•
12.2

In [14]:
# some variations on system prompts

# System prompt for the brochure-writing call: sets the audience, the output
# format (markdown) and the details to include.
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have the information."
)

# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':

# system_prompt = (
#     "You are an assistant that analyzes the contents of several relevant pages from a company website "
#     "and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown. "
#     "Include details of company culture, customers and careers/jobs if you have the information."
# )

In [15]:
def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """
    Build the user prompt for the brochure request.

    Args:
        company_name: name to present the company under.
        url: landing page; its contents and those of the selected links are appended.
        max_chars: maximum length of the returned prompt in characters
            (a non-negative int). Pass None to disable truncation.

    Returns:
        The instructions followed by the scraped contents, cut to `max_chars`.
    """
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    # Slicing with None keeps the whole string, so max_chars=None means "no cap".
    return user_prompt[:max_chars]

In [16]:
# Preview the user prompt that will be sent to the model (cut off at 5,000 characters).
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")


Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'company page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


'You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title:\nHugging Face – The AI community building the future.\nWebpage Contents:\nHugging Face\nModels\nDatasets\nSpaces\nCommunity\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\nblack-forest-labs/FLUX.1-Kontext-dev\nUpdated\n10 days ago\n•\n171k\n•\n1.41k\nTHUDM/GLM-4.1V-9B-Thinking\nUpdated\n5 days ago\n•\n10.1k\n•\n261\nkyutai/tts-1.6b-en_fr\nUpdated\n4 days ago\n•\n12.2k\n•\n220\ngoogle/gemma-3n-E4B-it\nUpdated\n4 days ago\n•\n223k\n•\n502\napple/DiffuCoder-7B-cpGRPO\nUpdated\n3 days ago\n•\n599\n•\n189\nBrowse 1M+ models\nSpaces\nRunning\n9.41k\n9.41k\nDeepSit

In [18]:
# let's clean it all up

def create_brochure(company_name, url):
    """
    Ask the model for a brochure about `company_name` and render it as Markdown.

    The scraped contents of `url` and its selected links form the user prompt.
    Nothing is returned; the brochure is shown in the cell output.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    completion = openai.chat.completions.create(model=MODEL, messages=conversation)
    brochure_markdown = completion.choices[0].message.content
    display(Markdown(brochure_markdown))

In [19]:
# new format
# Generate the brochure in one (non-streamed) request and render it as Markdown.

create_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'company page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'documentation page', 'url': 'https://huggingface.co/docs'}]}


# Hugging Face Brochure

### Company Overview
**Hugging Face** is at the forefront of the AI revolution, fostering a vibrant community dedicated to building the future of machine learning. As a collaborative platform, Hugging Face serves as a hub where developers, researchers, and businesses can come together to create and share models, datasets, and applications. With over **1 million models** and **250,000+ datasets**, Hugging Face is a catalyst for innovation in the realm of artificial intelligence.

### Our Mission
At Hugging Face, our mission is simple: to democratize AI and make machine learning accessible to everyone. We believe in the power of collaboration, transparency, and open-source technology, which empowers users to create, explore, and innovate without barriers.

### Customers
Hugging Face is trusted by over **50,000 organizations** worldwide, including industry giants like **Google**, **Microsoft**, **Amazon**, and **Meta**. Our platform supports a diverse range of users, from individual developers to large enterprises, all seeking to harness the power of AI.

### Company Culture
We value a **community-driven approach** at Hugging Face. Our team is built on principles of openness, inclusivity, and collaboration, thriving in a dynamic environment where everyone’s contributions are valued. We encourage innovation and support continuous learning, ensuring that team members can grow alongside the rapidly evolving landscape of AI and machine learning.

### Careers
Join us in shaping the future of AI! At Hugging Face, we are constantly looking for talented individuals who share our passion for machine learning and open-source collaboration. Opportunities span a range of roles within our **development**, **research**, and **community engagement** teams. We offer a supportive environment where you can enhance your skill set and make a meaningful impact.

- **Open Positions:** Developers, Data Scientists, Community Managers, and more!
- **Perks:** Competitive salaries, flexible work arrangements, and a culture committed to your growth and well-being.

### Explore More
- **Models:** Explore over **1 million models** to find the perfect fit for your projects.
- **Datasets:** Access **250,000+ datasets** tailored for machine learning applications.
- **Spaces:** Discover and collaborate on engaging AI applications across various modalities including text, image, video, audio, and 3D.
- **Enterprise Solutions:** Elevate your organization with our enterprise-grade tools, security measures, and dedicated support.

### Connect with Us
Join our mission at Hugging Face. Whether you’re a potential customer, investor, or future team member, we invite you to explore the limitless possibilities within the world of AI. 

- [Visit Our Website](https://huggingface.co)
- [Follow Us on Twitter](https://twitter.com/huggingface)
- [Join Us on LinkedIn](https://www.linkedin.com/company/huggingface/)

Together, let’s build the future of AI!

In [20]:
# we can also stream it and build on the fly

def _strip_markdown_fence(text):
    """Return `text` without a wrapping ``` / ```markdown code fence, if it has one."""
    stripped = text.lstrip()
    if not stripped.startswith("```"):
        return text
    fence_line, newline, remainder = stripped.partition("\n")
    if not newline:
        # The opening fence line is still streaming in; nothing to show yet.
        return ""
    if fence_line[3:].strip().lower() not in ("", "markdown", "md"):
        # A genuine code block (e.g. ```python) opens the reply; leave it alone.
        return text
    body = remainder.rstrip()
    if body.endswith("```"):
        return body[:-3]
    return remainder


def stream_brochure(company_name, url):
    """
    Stream a brochure about `company_name` and re-render it as each chunk arrives.

    Only a code fence wrapping the whole reply is removed before display. The
    word "markdown" and code blocks inside the brochure body are preserved.
    Nothing is returned; the brochure is shown in the cell output.
    """
    stream = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
        stream=True
    )

    raw_response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        # Some chunks may carry no choices; skip them rather than index into an empty list.
        if not chunk.choices:
            continue
        raw_response += chunk.choices[0].delta.content or ''
        update_display(Markdown(_strip_markdown_fence(raw_response)), display_id=display_handle.display_id)

In [21]:
# Build the brochure again, this time updating the rendered output as the response streams in.
stream_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'company page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'join page', 'url': 'https://huggingface.co/join'}]}


# Hugging Face Brochure

## About Us
Hugging Face is a vibrant and collaborative community dedicated to building the future of artificial intelligence (AI). We empower machine learning enthusiasts and professionals to work together on innovative models, datasets, and applications. Our platform is at the heart of machine learning, enabling users to create, discover, and collaborate more effectively than ever before.

## Our Offerings
- **Models**: With over 1 million models available, Hugging Face provides a rich repository for machine learning applications ranging from text to 3D images.
- **Datasets**: Access more than 250,000 datasets to enhance your AI projects and enrich your learning experience.
- **Spaces**: Create and run applications within our user-friendly environment to bring your ideas to life.
- **Community**: Join a global network of over 50,000 organizations, including industry giants like Google, Microsoft, and Amazon, who use Hugging Face for their AI needs.

## Company Culture
At Hugging Face, we believe in democratizing AI and making it accessible to everyone. Our open-source ethos fosters a collaborative environment where creativity and innovation thrive. We celebrate diversity and aim to create a welcoming space for all voices in the AI community. 

## Career Opportunities
We are continually on the lookout for passionate and talented individuals to join our team. Whether you're a seasoned professional or just starting in the field, we have roles that cater to a diverse range of skills and experiences. Join us to be part of a mission-driven company that values creativity, teamwork, and personal growth.

### Current Openings
- Research Scientists
- Software Engineers
- Community Managers
- Data Scientists

To learn more about our available positions, visit the **[Jobs page](https://huggingface.co/jobs)**.

## Join Us!
Be part of the AI revolution. Whether you're a customer looking to leverage AI for your business, an investor seeking innovative opportunities, or a recruit aiming to build an exciting career, Hugging Face is the place to be.

---

## Contact Us
- **Website**: [Hugging Face](https://huggingface.co)
- **Social Media**: 
  - [GitHub](https://github.com/huggingface)
  - [Twitter](https://twitter.com/huggingface)
  - [LinkedIn](https://linkedin.com/company/huggingface)
  - [Discord](https://discord.gg/huggingface)

Together, let’s build the future of AI!