In [1]:
# imports
# If these fail, please check you're running from an 'activated' environment with (llms) in the command prompt

import os
import json
import sys
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display
from openai import OpenAI

# Add parent directory of the notebook to sys.path to import functions from scraper.py file
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Now you can import modules from parent directory
from scraper import fetch_website_contents, fetch_website_links

In [2]:
# Initialize and constants

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:
    print("API key looks good so far")
else:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
    
MODEL = 'gpt-5-nano'
openai = OpenAI()

API key looks good so far


In [3]:
links = fetch_website_links("https://edwarddonner.com")
links

['https://edwarddonner.com/',
 'https://edwarddonner.com/connect-four/',
 'https://edwarddonner.com/outsmart/',
 'https://edwarddonner.com/about-me-and-about-nebula/',
 'https://edwarddonner.com/posts/',
 'https://edwarddonner.com/',
 'https://news.ycombinator.com',
 'https://nebula.io/?utm_source=ed&utm_medium=referral',
 'https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html',
 'https://patents.google.com/patent/US20210049536A1/',
 'https://www.linkedin.com/in/eddonner/',
 'https://edwarddonner.com/2025/11/11/ai-live-event/',
 'https://edwarddonner.com/2025/11/11/ai-live-event/',
 'https://edwarddonner.com/2025/09/15/ai-in-production-gen-ai-and-agentic-ai-on-aws-at-scale/',
 'https://edwarddonner.com/2025/09/15/ai-in-production-gen-ai-and-agentic-ai-on-aws-at-scale/',
 'https://edwarddonner.com/2025/05/28/connecting-my-courses-become-an-llm-expert-and-leader/',
 'https://edwarddonner.com/2025/05/28/connecting-my-cou

In [4]:
link_system_prompt = """
You are provided with a list of links found on a webpage.
You are able to decide which of the links would be most relevant to include in a brochure about the company,
such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:

{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [5]:
def get_links_user_prompt(url):
    user_prompt = f"""
Here is the list of links on the website {url} -
Please decide which of these are relevant web links for a brochure about the company, 
respond with the full https URL in JSON format.
Do not include Terms of Service, Privacy, email links.

Links (some might be relative links):

"""
    links = fetch_website_links(url)
    user_prompt += "\n".join(links)
    return user_prompt

In [6]:
print(get_links_user_prompt("https://edwarddonner.com"))


Here is the list of links on the website https://edwarddonner.com -
Please decide which of these are relevant web links for a brochure about the company, 
respond with the full https URL in JSON format.
Do not include Terms of Service, Privacy, email links.

Links (some might be relative links):

https://edwarddonner.com/
https://edwarddonner.com/connect-four/
https://edwarddonner.com/outsmart/
https://edwarddonner.com/about-me-and-about-nebula/
https://edwarddonner.com/posts/
https://edwarddonner.com/
https://news.ycombinator.com
https://nebula.io/?utm_source=ed&utm_medium=referral
https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html
https://patents.google.com/patent/US20210049536A1/
https://www.linkedin.com/in/eddonner/
https://edwarddonner.com/2025/11/11/ai-live-event/
https://edwarddonner.com/2025/11/11/ai-live-event/
https://edwarddonner.com/2025/09/15/ai-in-production-gen-ai-and-agentic-ai-on-aws-at-scale/
htt

In [7]:
def select_relevant_links(url):
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(url)}
        ],
        response_format={"type": "json_object"}
    )
    result = response.choices[0].message.content
    links = json.loads(result)
    return links

In [8]:
select_relevant_links("https://edwarddonner.com")

{'links': [{'type': 'home page', 'url': 'https://edwarddonner.com/'},
  {'type': 'about page',
   'url': 'https://edwarddonner.com/about-me-and-about-nebula/'},
  {'type': 'company/product page',
   'url': 'https://nebula.io/?utm_source=ed&utm_medium=referral'},
  {'type': 'event page',
   'url': 'https://edwarddonner.com/2025/11/11/ai-live-event/'},
  {'type': 'blog post',
   'url': 'https://edwarddonner.com/2025/09/15/ai-in-production-gen-ai-and-agentic-ai-on-aws-at-scale/'},
  {'type': 'blog post',
   'url': 'https://edwarddonner.com/2025/05/28/connecting-my-courses-become-an-llm-expert-and-leader/'},
  {'type': 'blog post',
   'url': 'https://edwarddonner.com/2025/05/18/2025-ai-executive-briefing/'},
  {'type': 'blog page', 'url': 'https://edwarddonner.com/posts/'}]}

In [9]:
def select_relevant_links(url):
    print(f"Selecting relevant links for {url} by calling {MODEL}")
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(url)}
        ],
        response_format={"type": "json_object"}
    )
    result = response.choices[0].message.content
    links = json.loads(result)
    print(f"Found {len(links['links'])} relevant links")
    return links

In [10]:
select_relevant_links("https://edwarddonner.com")

Selecting relevant links for https://edwarddonner.com by calling gpt-5-nano
Found 8 relevant links


{'links': [{'type': 'home page', 'url': 'https://edwarddonner.com/'},
  {'type': 'about page',
   'url': 'https://edwarddonner.com/about-me-and-about-nebula/'},
  {'type': 'project page', 'url': 'https://edwarddonner.com/connect-four/'},
  {'type': 'project page', 'url': 'https://edwarddonner.com/outsmart/'},
  {'type': 'company page',
   'url': 'https://nebula.io/?utm_source=ed&utm_medium=referral'},
  {'type': 'linkedin page', 'url': 'https://www.linkedin.com/in/eddonner/'},
  {'type': 'twitter page', 'url': 'https://twitter.com/edwarddonner'},
  {'type': 'facebook page',
   'url': 'https://www.facebook.com/edward.donner.52'}]}

In [11]:
select_relevant_links("https://huggingface.co")

Selecting relevant links for https://huggingface.co by calling gpt-5-nano
Found 12 relevant links


{'links': [{'type': 'brand page', 'url': 'https://huggingface.co/brand'},
  {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'},
  {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'},
  {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'blog', 'url': 'https://huggingface.co/blog'},
  {'type': 'GitHub page', 'url': 'https://github.com/huggingface'},
  {'type': 'LinkedIn page',
   'url': 'https://www.linkedin.com/company/huggingface/'},
  {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'},
  {'type': 'Discussions forum', 'url': 'https://discuss.huggingface.co'},
  {'type': 'Status page', 'url': 'https://status.huggingface.co/'},
  {'type': 'Endpoints product', 'url': 'https://endpoints.huggingface.co'},
  {'type': 'Docs hub', 'url': 'https://huggingface.co/docs'}]}

In [12]:
def fetch_page_and_all_relevant_links(url):
    contents = fetch_website_contents(url)
    relevant_links = select_relevant_links(url)
    result = f"## Landing Page:\n\n{contents}\n## Relevant Links:\n"
    for link in relevant_links['links']:
        result += f"\n\n### Link: {link['type']}\n"
        result += fetch_website_contents(link["url"])
    return result

In [13]:
print(fetch_page_and_all_relevant_links("https://huggingface.co"))

Selecting relevant links for https://huggingface.co by calling gpt-5-nano
Found 10 relevant links
## Landing Page:

Hugging Face ‚Äì The AI community building the future.

Hugging Face
Models
Datasets
Spaces
Community
Docs
Enterprise
Pricing
Log In
Sign Up
The AI community building the future.
The platform where the machine learning community collaborates on models, datasets, and applications.
Explore AI Apps
or
Browse 1M+ models
Trending on
this week
Models
moonshotai/Kimi-K2-Thinking
Updated
5 days ago
‚Ä¢
105k
‚Ä¢
1.13k
baidu/ERNIE-4.5-VL-28B-A3B-Thinking
Updated
1 day ago
‚Ä¢
4.41k
‚Ä¢
355
maya-research/maya1
Updated
1 day ago
‚Ä¢
18.6k
‚Ä¢
543
dx8152/Qwen-Edit-2509-Multiple-angles
Updated
1 day ago
‚Ä¢
40.4k
‚Ä¢
559
MiniMaxAI/MiniMax-M2
Updated
about 4 hours ago
‚Ä¢
891k
‚Ä¢
1.28k
Browse 1M+ models
Spaces
Running
on
Zero
729
729
Qwen Image Edit Camera Control
üé¨
Fast 4 step inference with Qwen Image Edit 2509
Running
on
CPU Upgrade
2.14k
2.14k
The Smol Training Playbook
üìö
The

In [14]:
brochure_system_prompt = """
You are an assistant that analyzes the contents of several relevant pages from a company website
and creates a short brochure about the company for prospective customers, investors and recruits.
Respond in markdown without code blocks.
Include details of company culture, customers and careers/jobs if you have the information.
"""

# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':

# brochure_system_prompt = """
# You are an assistant that analyzes the contents of several relevant pages from a company website
# and creates a short, humorous, entertaining, witty brochure about the company for prospective customers, investors and recruits.
# Respond in markdown without code blocks.
# Include details of company culture, customers and careers/jobs if you have the information.
# """

In [15]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"""
You are looking at a company called: {company_name}
Here are the contents of its landing page and other relevant pages;
use this information to build a short brochure of the company in markdown without code blocks.\n\n
"""
    user_prompt += fetch_page_and_all_relevant_links(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [16]:
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

Selecting relevant links for https://huggingface.co by calling gpt-5-nano
Found 8 relevant links


'\nYou are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages;\nuse this information to build a short brochure of the company in markdown without code blocks.\n\n\n## Landing Page:\n\nHugging Face ‚Äì The AI community building the future.\n\nHugging Face\nModels\nDatasets\nSpaces\nCommunity\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\nmoonshotai/Kimi-K2-Thinking\nUpdated\n5 days ago\n‚Ä¢\n105k\n‚Ä¢\n1.13k\nbaidu/ERNIE-4.5-VL-28B-A3B-Thinking\nUpdated\n1 day ago\n‚Ä¢\n4.41k\n‚Ä¢\n355\nmaya-research/maya1\nUpdated\n1 day ago\n‚Ä¢\n18.6k\n‚Ä¢\n543\ndx8152/Qwen-Edit-2509-Multiple-angles\nUpdated\n1 day ago\n‚Ä¢\n40.4k\n‚Ä¢\n559\nMiniMaxAI/MiniMax-M2\nUpdated\nabout 4 hours ago\n‚Ä¢\n891k\n‚Ä¢\n1.28k\nBrowse 1M+ models\nSpa

In [17]:
def create_brochure(company_name, url):
    response = openai.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[
            {"role": "system", "content": brochure_system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
        ],
    )
    result = response.choices[0].message.content
    display(Markdown(result))

In [18]:
create_brochure("HuggingFace", "https://huggingface.co")

Selecting relevant links for https://huggingface.co by calling gpt-5-nano
Found 7 relevant links


# Hugging Face Brochure

---

## Who We Are

**Hugging Face** is the thriving AI community building the future of machine learning. As a leading platform for collaboration, we connect machine learning engineers, scientists, and AI enthusiasts globally to create, share, and advance open-source AI models, datasets, and applications. Our mission is to empower the next generation of AI innovators by fostering an open, ethical, and collaborative ecosystem.

---

## What We Offer

- **Hugging Face Hub**: A central platform hosting over **1 million machine learning models**, **250,000+ datasets**, and **400,000+ applications** covering text, image, video, audio, and 3D modalities.
- **Spaces**: Interactive environments where users run, demo, and experiment with AI applications.
- **Open-Source Stack**: Our open-source tools and libraries help accelerate ML development and deployment.
- **Enterprise Solutions**: Dedicated offerings for businesses needing scalable, secure, and private ML infrastructure.
- **Compute Services**: Paid compute resources to accelerate research and production workflows.

---

## Our Community & Customers

- **Global Community**: Tens of thousands of ML practitioners collaborate daily on our platform.
- **Diverse Use Cases**: From autonomous vehicles (NVIDIA datasets) to natural language processing (Facebook omnilingual ASR) and generative AI for images and videos.
- **Collaborative Innovation**: Users share cutting-edge models like moonshotai/Kimi-K2-Thinking and baidu/ERNIE-4.5-VL, accessible to all to build on and innovate.

---

## Our Culture

- **Open and Ethical AI**: We champion transparency and inclusivity, promoting responsible AI development.
- **Collaborative Spirit**: Knowledge sharing and community contributions are at the core of everything we do.
- **Empowerment**: Providing tools and resources for all skill levels to create, learn, and grow in ML.
- **Innovation-Driven**: Constantly evolving to support the latest developments in machine learning research and applications.

---

## Career Opportunities

Join us if you‚Äôre passionate about building the future of AI through open source and community collaboration. We seek talented individuals in:

- Machine Learning Research & Engineering
- Software Development
- DevOps and Infrastructure
- Community Management
- Product and Design
- Enterprise Solutions and Support

**Grow your career while shaping the responsible AI landscape!**

---

## Get Involved

- **Explore AI Apps and Models**: Try out state-of-the-art models and datasets on our website.
- **Contribute**: Share your own models, data, and applications with the community.
- **Join the Community**: Participate in forums, meetups, and events.
- **Sign Up**: Build your ML portfolio and accelerate your projects with Hugging Face.

**Website:** [huggingface.co](https://huggingface.co)

---

## Brand Essence

- Vibrant yellow (#FFD21E) and warm orange (#FF9D00) reflect our bright, welcoming, and innovative community.
- Simple, friendly logos embody our open, accessible approach to machine learning.
- Trusted by leading innovators, we are the home for machine learning collaboration worldwide.

---

**Hugging Face** ‚Äî The Home of Machine Learning Collaboration.  
Building the future, together.

In [19]:
def stream_brochure(company_name, url):
    stream = openai.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[
            {"role": "system", "content": brochure_system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
        stream=True
    )    
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        response += chunk.choices[0].delta.content or ''
        update_display(Markdown(response), display_id=display_handle.display_id)

In [20]:
stream_brochure("HuggingFace", "https://huggingface.co")

Selecting relevant links for https://huggingface.co by calling gpt-5-nano
Found 12 relevant links


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


# Hugging Face Brochure

---

## About Hugging Face

Hugging Face is the AI community building the future of machine learning. It is the leading collaboration platform where machine learning engineers, researchers, and enthusiasts come together to create, share, and explore open-source AI models, datasets, and applications. 

The platform hosts over **1 million models**, **250,000+ datasets**, and **400,000+ applications**, covering diverse modalities including text, image, video, audio, and even 3D.

---

## What We Offer

### Hugging Face Hub  
A central place to **share, discover, and experiment** with machine learning resources. Users can host unlimited public models, datasets, and applications, building their portfolios and machine learning profiles.

### Open-Source Stack  
Accelerate your ML projects with Hugging Face‚Äôs powerful open-source libraries and tools that enable faster development cycles and innovation.

### Spaces  
Run and share ML applications directly on the platform, ranging from text-to-image models to video generation and multilingual transcription.

### Enterprise Solutions  
Hugging Face provides paid compute and enterprise-grade solutions for businesses ready to deploy AI responsibly and at scale.

---

## Our Community & Culture

- **Open and Ethical AI** - Hugging Face is committed to building an open and ethical AI ecosystem.  
- **Collaborative Spirit** - The community is the heart of Hugging Face, empowering the next generation of ML engineers and scientists to learn, share, and innovate together.  
- **Diversity in AI Modalities** - Supporting a wide range of AI applications from natural language processing to autonomous vehicles and multimedia.

---

## Who Uses Hugging Face?

- **Researchers and Scientists** - Access cutting-edge models and datasets for experimentation and discovery.  
- **Developers and Engineers** - Quickly build, share, and deploy ML models and apps.  
- **Enterprises** - Leverage scalable AI infrastructure and support for real-world AI deployment.  
- **Educators and Students** - Learn and build portfolios with a rich repository of open-source resources.

---

## Careers at Hugging Face

Hugging Face is growing its team to further empower the AI community worldwide. The company values passionate individuals who thrive in a collaborative, innovative, and ethical work environment. 

**Opportunities include roles in:**

- Machine Learning Research  
- Software Engineering  
- Community and Developer Relations  
- Enterprise Solutions and Customer Support  

If you're excited about redefining the future of AI with a vibrant and inclusive community, Hugging Face invites you to join their team.

---

## Brand Identity

- **Colors:** Bright, inclusive, and energetic hues (#FFD21E, #FF9D00, #6B7280)  
- **Logo:** Friendly and recognizable, symbolizing collaboration and openness.

---

## Get Started with Hugging Face

- **Explore AI Applications**  
- **Browse millions of models and datasets**  
- **Join a fast-growing global machine learning community**  
- **Sign up and build your AI future today**

Visit [huggingface.co](https://huggingface.co) to join the AI revolution!

---

Hugging Face ‚Äî The AI community building the future.