In [1]:
import os #Access environment variables like API keys (os.getenv()), file paths.
import requests#Make HTTP requests to fetch webpage content and send POST requests to Ollama API.
import json#Parse or format JSON data (for input/output with Ollama).
from typing import List#From typing, used for type hinting lists in functions or classes.
from dotenv import load_dotenv#Load variables from .env file into environment (good for secrets like API keys).
from bs4 import BeautifulSoup#Parses HTML for easy text and tag extraction.
from IPython.display import Markdown, display, update_display#Used in Jupyter Notebooks to show nicely formatted markdown outputs
from json import JSONDecodeError
from urllib.parse import urlparse, urljoin


In [2]:
OLLAMA_MODEL = "llama3.2"
OLLAMA_API = "http://localhost:11434/api/chat"
#Defines which local model you’ll use via Ollama (llama2, mistral, gemma).

In [3]:
headers={
    "user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}#Some websites block scrapers. This makes your request look like it's from a browser.

In [15]:
class website:
    def __init__(self, url):
        self.url = url
        self.title = "No title found"
        self.text = ""
        self.links = []
        
        try:
            # Add timeout to prevent hanging
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # Raise an exception for bad status codes
            
            self.body = response.content
            soup = BeautifulSoup(self.body, 'html.parser')
            
            self.title = soup.title.string if soup.title else "No title found"
            
            if soup.body:
                for tag in soup.body(["script", "style", "img", "input"]):
                    tag.decompose()
                self.text = soup.body.get_text(separator="\n", strip=True)
            
            # Extract and normalize links
            base_url = self.get_base_url(url)
            raw_links = [link.get('href') for link in soup.find_all('a')]
            self.links = [self.normalize_url(link, base_url) for link in raw_links if link]
            
        except (requests.RequestException, Exception) as e:
            print(f"Error fetching {url}: {str(e)}")
            # Initialize with empty/default values on error
    
    def get_base_url(self, url):
        """Extract base URL for resolving relative URLs"""
        parsed = urlparse(url)
        return f"{parsed.scheme}://{parsed.netloc}"
    
    def normalize_url(self, link, base_url):
        """Convert relative URLs to absolute"""
        if not link:
            return None
        
        # Skip mailto: links, javascript:, etc.
        if link.startswith(('mailto:', 'javascript:', 'tel:')):
            return None
            
        # Handle fragment-only links
        if link.startswith('#'):
            return None
        
        # Convert relative to absolute
        if not link.startswith(('http://', 'https://')):
            return urljoin(base_url, link)
        
        return link
    
    def get_contents(self):
        return f"webpage title:\n{self.title}\nwebpage contents:\n{self.text}\n\n"

In [16]:
link_system_prompt = """You are provided with a list of links..."""#A system prompt that instructs the LLM how to identify important links (e.g., About, Careers).
#✅ Use this when giving clear roles to the LLM for specific tasks (like classification or summarization).
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url":"https://another.full.url/careers"}
    ]
}
"""
print(link_system_prompt)

You are provided with a list of links...You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url":"https://another.full.url/careers"}
    ]
}



In [17]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt
    #Takes the scraped links and converts them into a natural prompt for the LLM to decide relevance.

In [18]:
import re

def get_links(url):
    site_obj = website(url)  # Updated class name
    payload = {
        "model": OLLAMA_MODEL,
        "messages": [
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(site_obj)}
        ]
    }
    resp = requests.post(OLLAMA_API, json=payload, stream=True)
    # 1) Accumulate all the chunks into one string
    raw = ""
    for line in resp.iter_lines():
        if not line:
            continue
        chunk = json.loads(line.decode("utf-8"))
        raw += chunk.get("message", {}).get("content", "")
    # 2) Strip code fences/backticks
    cleaned = raw.replace("```", "").strip()
    # 3) Extract the JSON object using regex
    match = re.search(r"\{.*\}", cleaned, flags=re.DOTALL)
    if not match:
        raise RuntimeError(f"Failed to extract JSON. Here's a snippet:\n{cleaned[:200]}…")
    # 4) Extracted JSON string
    json_str = match.group(0)
    # 5) Parse and return the JSON data
    try:
        return json.loads(json_str)
    except json.JSONDecodeError as e:
        raise RuntimeError(f"Failed to parse JSON:\n{json_str[:200]}...") from e

In [19]:
def get_all_details(url):
    result = "landing page:\n"
    try:
        landing_page = Website(url)
        result += landing_page.text
        
        # Try to get links, but handle errors
        try:
            links = get_links(url)
            if links and "links" in links:
                for link in links["links"]:
                    try:
                        link_url = link.get("url")
                        if link_url:
                            result += f"\n\n{link['type']}\n"
                            result += Website(link_url).get_contents()
                    except Exception as e:
                        result += f"\n\nError fetching {link.get('type', 'unknown')} page: {str(e)}\n"
        except Exception as e:
            result += f"\n\nError getting links: {str(e)}\n"
            
    except Exception as e:
        result += f"Error fetching main page: {str(e)}\n"
        
    return result
#Returns the full combined content of:
#The landing page, and
#The important subpages selected by the LLM.

In [20]:
system_prompt = """You are an assistant that analyzes the contents..."""
def get_brochure_user_prompt(company_name, url):
    prompt = f"You are looking at a company called: {company_name}\n"
    prompt += "Here are the contents..."
    prompt += get_all_details(url)
    return prompt[:5000]


In [21]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt


In [22]:
def create_brochure(company_name, url):
    payload = {
        "model": OLLAMA_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user",   "content": get_brochure_user_prompt(company_name, url)}
        ]
    }
    resp = requests.post(OLLAMA_API, json=payload)
    try:
        data = resp.json()
        if "response" in data:
            brochure_md = data["response"]
        elif "message" in data and "content" in data["message"]:
            brochure_md = data["message"]["content"]
        else:
            raise KeyError(f"Couldn't find a response in:\n{data}")
    except JSONDecodeError:
        # Handle streaming response if needed
        raw = ""
        for line in resp.iter_lines():
            if not line:
                continue
            try:
                chunk = json.loads(line.decode("utf-8"))
                raw += chunk.get("message", {}).get("content", "")
            except:
                continue
        brochure_md = raw
    
    display(Markdown(brochure_md))



In [23]:
create_brochure("HuggingFace", "https://huggingface.co")


Unfortunately, it seems like there's an issue with the content. However, I can still provide a generic brochure for Hugging Face based on my general knowledge.

**Hugging Face**
================

### About Us

Hugging Face is a leading open-source software company that provides tools and technologies for natural language processing (NLP) and machine learning.

### Our Mission

Empowering developers to build intelligent applications through cutting-edge NLP and ML libraries.

### What We Do

* Develop and maintain popular open-source libraries such as Transformers, Hugging Face Datasets, and SparkNLP.
* Provide pre-trained models for a wide range of NLP tasks, including language modeling, sentiment analysis, and text classification.
* Offer a suite of tools for building and deploying ML models, including the popular Transformers library.

### Our Products

#### **Transformers**

A popular open-source library for transformer-based architectures, offering state-of-the-art performance in NLP tasks.

#### **Hugging Face Datasets**

A comprehensive collection of pre-processed datasets for various NLP tasks, making it easy to get started with your projects.

#### **SparkNLP**

A Java library that provides a simple and efficient way to build ML models for text data.

### Why Choose Us

* **Community-driven**: Our libraries are built and maintained by a passionate community of developers.
* **High-performance**: Our models are optimized for speed and accuracy, making them perfect for production environments.
* **Easy to use**: Our libraries provide simple and intuitive APIs, making it easy to get started with NLP and ML.

### Join the Community

* Visit our [GitHub page](https://github.com/huggingface) to explore our open-source projects.
* Follow us on [Twitter](https://twitter.com/huggingface) for updates and news.
* Join our [Discord community](https://discord.huggingface.co/) to connect with other developers and experts.

### Get Started

Visit our [website](http://huggingface.co) to learn more about our products and libraries.