In [2]:
import os
import re
import requests
import socket
from requests.exceptions import ConnectionError
from requests.exceptions import MissingSchema
from requests.exceptions import InvalidSchema
from urllib3.exceptions import MaxRetryError, NameResolutionError
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
import ollama

In [3]:
load_dotenv(override=True)
MODEL = 'llama3.2'

In [17]:
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \
(KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found" 
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]): 
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]
    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents: \n{self.text}\n\n"

In [18]:
page = Website("https://huggingface.co")
page.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/deepseek-ai/DeepSeek-R1-0528',
 '/deepseek-ai/DeepSeek-R1-0528-Qwen3-8B',
 '/ResembleAI/chatterbox',
 '/google/gemma-3n-E4B-it-litert-preview',
 '/ByteDance-Seed/BAGEL-7B-MoT',
 '/models',
 '/spaces/ResembleAI/Chatterbox',
 '/spaces/enzostvs/deepsite',
 '/spaces/multimodalart/wan2-1-fast',
 '/spaces/wushuang98/Direct3D-S2-v1.0-demo',
 '/spaces/NihalGazi/Text-To-Speech-Unlimited',
 '/spaces',
 '/datasets/open-r1/Mixture-of-Thoughts',
 '/datasets/yandex/yambda',
 '/datasets/MiniMaxAI/SynLogic',
 '/datasets/cognitivecomputations/china-refusals',
 '/datasets/fka/awesome-chatgpt-prompts',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/gramma

In [19]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in \
a brochure about the company, such as links to an About page, or a Company page, \
or Careers/Jobs pages.\n"
link_system_prompt += "You should respond only in JSON, without text, object as in this example:" 
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"}, 
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}
"""

In [20]:
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond only in JSON, without text, object as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"}, 
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}



In [21]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for \
    a brochure about the company, respond with the full https URL in clean JSON format \
    without text json on the beginning of the response. \
    Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

print(get_links_user_prompt(page))

Here is the list of links on the website of https://huggingface.co - please decide which of these are relevant web links for     a brochure about the company, respond with the full https URL in clean JSON format     without text json on the beginning of the response.     Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
/
/models
/datasets
/spaces
/docs
/enterprise
/pricing
/login
/join
/spaces
/models
/deepseek-ai/DeepSeek-R1-0528
/deepseek-ai/DeepSeek-R1-0528-Qwen3-8B
/ResembleAI/chatterbox
/google/gemma-3n-E4B-it-litert-preview
/ByteDance-Seed/BAGEL-7B-MoT
/models
/spaces/ResembleAI/Chatterbox
/spaces/enzostvs/deepsite
/spaces/multimodalart/wan2-1-fast
/spaces/wushuang98/Direct3D-S2-v1.0-demo
/spaces/NihalGazi/Text-To-Speech-Unlimited
/spaces
/datasets/open-r1/Mixture-of-Thoughts
/datasets/yandex/yambda
/datasets/MiniMaxAI/SynLogic
/datasets/cognitivecomputations/china-refusals
/datasets/fka/awesome-chatgpt-prompts
/datasets
/join
/pricing#

In [22]:
def get_links(url):
    website = Website(url)
    response = ollama.chat(
        model=MODEL,
    messages=[
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(website)}
    ],
    options={"format":"json"}
    )
    result = response['message']['content']

    print(result)
    try:
        content_json = json.loads(result)
        return content_json
    except json.JSONDecodeError:
        print("Odpowiedź nie jest poprawnym JSON")
huggingface = Website("https://huggingface.co")
huggingface.links
get_links("https://huggingface.co")

{
  "links": [
    {"type": "About page", "url": "https://huggingface.co/"},
    {"type": "Company page", "url": "https://huggingface.co/allenai"},
    {"type": "Careers/Jobs page", "url": "https://apply.workable.com/huggingface/"},
    {"type": "Blog", "url": "https://blog.huggingface.co/"},
    {"type": "Discussions forum", "url": "https://discuss.huggingface.co/"},
    {"type": "Status page", "url": "https://status.huggingface.co/"},
    {"type": "GitHub repository", "url": "https://github.com/huggingface"},
    {"type": "Twitter account", "url": "https://twitter.com/huggingface"},
    {"type": "LinkedIn company page", "url": "https://www.linkedin.com/company/huggingface/"}
  ]
}


{'links': [{'type': 'About page', 'url': 'https://huggingface.co/'},
  {'type': 'Company page', 'url': 'https://huggingface.co/allenai'},
  {'type': 'Careers/Jobs page',
   'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'Blog', 'url': 'https://blog.huggingface.co/'},
  {'type': 'Discussions forum', 'url': 'https://discuss.huggingface.co/'},
  {'type': 'Status page', 'url': 'https://status.huggingface.co/'},
  {'type': 'GitHub repository', 'url': 'https://github.com/huggingface'},
  {'type': 'Twitter account', 'url': 'https://twitter.com/huggingface'},
  {'type': 'LinkedIn company page',
   'url': 'https://www.linkedin.com/company/huggingface/'}]}

In [23]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    for link in links["links"]:
        try:
            result += f"\n\n{link['type']}\n"
            result += Website(link["url"]).get_contents()
        except socket.gaierror as e:
            print(f"DNS resolution failed: {e}")
        except NameResolutionError as e:
            print(f"Name resolution error: {e}")
        except MaxRetryError as e:
            print(f"Max retries exceeded: {e}")
        except ConnectionError as e:
            print(f"Connection error: {e}")
        except MissingSchema as e:
            print(f"Invalid URL schema: {e}")
        except InvalidSchema as e:
            print(f"Omitted unsupported URL (InvalidSchema): {e}")
    return result
print(get_all_details("https://huggingface.co"))

{
    "links": [
        {"type": "about page", "url": "https://huggingface.co/"},
        {"type": "careers page", "url": "https://apply.workable.com/huggingface/"},
        {"type": "company page", "url": "https://www.linkedin.com/company/huggingface/"},
        {"type": "blog page", "url": "https://discuss.huggingface.co/"},
        {"type": "status page", "url": "https://status.huggingface.co/"},
        {"type": "github page", "url": "https://github.com/huggingface"},
        {"type": "twitter page", "url": "https://twitter.com/huggingface"}
    ]
}
Landing page:
Webpage Title:
Hugging Face – The AI community building the future.
Webpage Contents: 
Hugging Face
Models
Datasets
Spaces
Community
Docs
Enterprise
Pricing
Log In
Sign Up
The AI community building the future.
The platform where the machine learning community collaborates on models, datasets, and applications.
Explore AI Apps
or
Browse 1M+ models
Trending on
this week
Models
deepseek-ai/DeepSeek-R1-0528
Updated
4 days ago

In [24]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."

def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; \
    use this information to build a short brochure of te company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000]
    return user_prompt


In [27]:
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

{
    "links": [
        {"type": "about page", "url": "https://huggingface.co/"},
        {"type": "brand page", "url": "https://huggingface.co/"},
        {"type": "blog page", "url": "https://discuss.huggingface.co/"},
        {"type": "status page", "url": "https://status.huggingface.co/"},
        {"type": "github page", "url": "https://github.com/huggingface"},
        {"type": "twitter page", "url": "https://twitter.com/huggingface"},
        {"type": "linkedin company page", "url": "https://www.linkedin.com/company/huggingface/"}
    ]
}


'You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages;     use this information to build a short brochure of te company in markdown.\nLanding page:\nWebpage Title:\nHugging Face – The AI community building the future.\nWebpage Contents: \nHugging Face\nModels\nDatasets\nSpaces\nCommunity\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\ndeepseek-ai/DeepSeek-R1-0528\nUpdated\n4 days ago\n•\n41.6k\n•\n1.62k\ndeepseek-ai/DeepSeek-R1-0528-Qwen3-8B\nUpdated\n4 days ago\n•\n55.8k\n•\n579\nResembleAI/chatterbox\nUpdated\n3 days ago\n•\n502\ngoogle/gemma-3n-E4B-it-litert-preview\nUpdated\n7 days ago\n•\n803\nByteDance-Seed/BAGEL-7B-MoT\nUpdated\n11 days ago\n•\n8.22k\n•\n921\nBrowse 1M+ models\nSpaces\nRunning\non\nZero\n580\

In [28]:
def create_brochure(company_name, url):
    response = ollama.chat(
            model=MODEL,
            messages=[
                {"role":"system", "content":system_prompt},
                {"role":"user", "content":get_brochure_user_prompt(company_name, url)}
            ],
    )
    result = response['message']['content']
    display(Markdown(result))

In [31]:
create_brochure("HuggingFace", "https://huggingface.co")

{
  "links": [
    {"type": "About page", "url": "https://huggingface.co/"},
    {"type": "Company page", "url": "https://huggingface.co/"},
    {"type": "Blog", "url": "https://blog.huggingface.co/"},
    {"type": "Discussions", "url": "https://discuss.huggingface.co/"},
    {"type": "GitHub repository", "url": "https://github.com/huggingface"},
    {"type": "LinkedIn page", "url": "https://www.linkedin.com/company/huggingface/"},
    {"type": "Twitter profile", "url": "https://twitter.com/huggingface"},
    {"type": "Join Discord community", "url": "https://joindiscord.huggingface.co/"},
    {"type": "Changelog", "url": "https://huggingface.co/changelog"}
  ]
}
Connection error: HTTPSConnectionPool(host='blog.huggingface.co', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000195144D44D0>: Failed to resolve 'blog.huggingface.co' ([Errno 11001] getaddrinfo failed)"))
Connection error: HTTPSConnectionPool(hos

# Hugging Face: Building the Future of AI Together

Welcome to Hugging Face, the leading platform for the machine learning community. Our mission is to provide a collaborative space for researchers and developers to build, share, and apply AI models to real-world problems.

## Our Story

Hugging Face was founded on the idea that AI should be accessible to everyone, not just a select few. We believe in the power of community-driven innovation and open-source collaboration. Our platform is built on top of the latest advancements in natural language processing (NLP) and computer vision, allowing developers to easily build, train, and deploy their own AI models.

## Our Community

Our community is diverse and vibrant, with over 50,000 organizations worldwide relying on our platform for their AI needs. We're proud to have some of the most influential companies in the world as part of our ecosystem, including Meta, Google, Amazon, Intel, Microsoft, and Grammarly.

## What We Offer

* **1M+ Models**: Browse our vast library of pre-trained models, covering text, image, video, audio, and 3D modalities.
* **Collaboration Platform**: Host and collaborate on unlimited public models, datasets, and applications with our community.
* **Open-Source Stack**: Use our free and open-source stack to build your own AI models, including Transformers, Diffusers, Safetensors, Tokenizers, and more.
* **Compute and Enterprise Solutions**: Get access to optimized inference endpoints, GPU deployment, and enterprise-grade security with our paid Compute and Enterprise solutions.

## Our Mission

At Hugging Face, we're committed to making AI accessible to everyone. We believe that by working together, we can:

* **Accelerate Innovation**: By providing a collaborative platform for researchers and developers to build and share AI models.
* **Foster Community Growth**: By supporting the growth of our community through open-source collaboration and education.
* **Empower Developers**: By providing the tools and resources needed to build and deploy AI models.

## Join Our Journey

Ready to be part of the Hugging Face community? Sign up for our platform today and start building, sharing, and applying AI models to real-world problems. Explore our 1M+ models, collaboration platform, open-source stack, and Compute and Enterprise solutions. Together, let's build a future powered by AI.

## Get in Touch

Want to learn more about Hugging Face or get in touch with our community? Follow us on social media:

* Twitter: [@HuggingFace](https://twitter.com/HuggingFace)
* LinkedIn: [Hugging Face](https://linkedin.com/company/hugging-face)
* GitHub: [Hugging Face](https://github.com/huggingface)

Let's build a future together!

In [32]:
def stream_brochure(company_name, url):
    stream = ollama.chat(
        model = MODEL,
        messages=[
                {"role":"system", "content":system_prompt},
                {"role":"user", "content":get_brochure_user_prompt(company_name, url)}
            ],
        stream=True
    )
    result=""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        result += chunk['message']['content'] or ''
        result = result.replace("```", "").replace("markdown", "")
        update_display(Markdown(result), display_id=display_handle.display_id)

In [33]:
stream_brochure("HuggingFace", "https://huggingface.co")

{
  "links": [
    {"type": "About page", "url": "https://huggingface.co/"},
    {"type": "Company page", "url": "https://huggingface.co/"},
    {"type": "Careers/Jobs page", "url": "https://apply.workable.com/huggingface/"},
    {"type": "Blog", "url": "https://blog.huggingface.co/"}
  ]
}
Connection error: HTTPSConnectionPool(host='blog.huggingface.co', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000019514F3FF10>: Failed to resolve 'blog.huggingface.co' ([Errno 11001] getaddrinfo failed)"))


# Hugging Face Brochure
=====================================

Welcome to Hugging Face, the AI community building the future.

## About Us
Hugging Face is a platform where the machine learning community collaborates on models, datasets, and applications. Our mission is to accelerate the development of artificial intelligence by providing a comprehensive suite of tools and resources for researchers and practitioners alike.

### Company Culture

At Hugging Face, we value:

*   Collaboration: We believe that the best ideas come from diverse perspectives and expertise.
*   Innovation: We're committed to pushing the boundaries of what's possible with AI.
*   Community: We build strong relationships with our users, partners, and contributors to drive growth and progress.

### Customers
Hugging Face serves a wide range of customers, including:

*   AI Researchers
*   Machine Learning Engineers
*   Data Scientists
*   Companies of all sizes (including startups, enterprises, non-profits, and more)

## Key Features

*   **1M+ Models**: Browse our vast library of pre-trained models for popular tasks like text classification, object detection, and more.
*   **Datasets**: Access a massive collection of datasets for training and testing your models.
*   **Spaces**: Collaborate with others on public models, datasets, and applications using our secure platform.

## Products

*   **Compute**: Deploy models on optimized inference endpoints or update Spaces applications to GPU in just a few clicks.
*   **Enterprise**: Get access to enterprise-grade security, access controls, dedicated support, and more for your team's AI needs.

### Pricing
Our pricing plans start at:

*   $0.60/hour for GPU Compute
*   $20/user/month for Enterprise solutions

## Community
Join our vibrant community of users, contributors, and partners to share knowledge, learn from others, and accelerate your AI journey.

[GitHub](https://github.com/huggingface)
[Twitter](https://twitter.com/huggingface)
[LinkedIn](https://linkedin.com/company/hugging-face)

### Join the Conversation

*   [Forum](https://forum.huggingface.co/)
*   [Blog](https://blog.huggingface.co/)

Get started with Hugging Face today and build your portfolio, train PyTorch models, or collaborate on a project. Sign up now!

In [34]:
import sys

In [36]:
try:
    response = requests.get("http://localhost:11434/api/version", timeout=5)
    print(f"Server Ollama responds: {response.text}")
except Exception as e:
    print(f"Cant connect to the Ollama server: {e}")
    print("Make sure that Ollama is started with this command: ollama serve")
    sys.exit(1)

try:
    client = ollama.Client(host='http://localhost:11434')
    models = client.list()
    print(f"Available models: {models}")
except Exception as e:
    print(f"Error while using ollama client: {e}")

Server Ollama responds: {"version":"0.9.0"}
Available models: models=[Model(model='gemma2:27b', modified_at=datetime.datetime(2025, 6, 2, 19, 7, 7, 992291, tzinfo=TzInfo(+02:00)), digest='53261bc9c192c1cb5fcc898dd3aa15da093f5ab6f08e17e48cf838bb1c58abfe', size=15628387458, details=ModelDetails(parent_model='', format='gguf', family='gemma2', families=['gemma2'], parameter_size='27.2B', quantization_level='Q4_0')), Model(model='deepseek-r1:8b', modified_at=datetime.datetime(2025, 6, 2, 18, 19, 44, 176974, tzinfo=TzInfo(+02:00)), digest='6995872bfe4c521a67b32da386cd21d5c6e819b6e0d62f79f64ec83be99f5763', size=5225376047, details=ModelDetails(parent_model='', format='gguf', family='qwen3', families=['qwen3'], parameter_size='8.2B', quantization_level='Q4_K_M')), Model(model='llama3.2:latest', modified_at=datetime.datetime(2025, 6, 2, 18, 0, 18, 274458, tzinfo=TzInfo(+02:00)), digest='a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72', size=2019393189, details=ModelDetails(par