In [84]:
import os #Access environment variables like API keys (os.getenv()), file paths.
import requests#Make HTTP requests to fetch webpage content and send POST requests to Ollama API.
import json#Parse or format JSON data (for input/output with Ollama).
from typing import List#From typing, used for type hinting lists in functions or classes.
from dotenv import load_dotenv#Load variables from .env file into environment (good for secrets like API keys).
from bs4 import BeautifulSoup#Parses HTML for easy text and tag extraction.
from IPython.display import Markdown, display, update_display#Used in Jupyter Notebooks to show nicely formatted markdown outputs
from json import JSONDecodeError


In [69]:
OLLAMA_MODEL = "llama3.2"
OLLAMA_API = "http://localhost:11434/api/chat"
#Defines which local model you’ll use via Ollama (llama2, mistral, gemma).

In [70]:
headers={
    "user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}#Some websites block scrapers. This makes your request look like it's from a browser.

In [71]:
class website:
    def __init__(self,url):
        self.url = url#Stores the input URL in an instance variable so it can be used later.
        response = requests.get(url,headers = headers)#Uses the requests library to send an HTTP GET request to the given URL.
        self.body = response.content#Stores the raw HTML content of the webpage as self.body.
        soup = BeautifulSoup(self.body,'html.parser')
        #Beautiful Soup is a Python library used for parsing HTML and XML documents. It creates a parse tree from the page source code that makes it super easy to extract, search, and modify the contents of web pages.
        self.title = soup.title.string if soup.title else "no title found"#If a <title> tag is found, it stores the text content (.string), otherwise defaults to "No title found".
        if soup.body:
            for tag in soup.body(["script","style","img","input"]):#decompose() permanently removes these tags from the soup object.
                tag.decompose()#Removes unwanted tags from the body that do not contribute meaningful text:<script>: JavaScript||<style>: CSS||<img>: Images||<input>: Form fields
            self.text = soup.body.get_text(separator= "\n",strip =True) #Extracts the clean, readable text from the <body> tag.||separator="\n" adds newlines between elements.
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')] #Finds all <a> tags on the page.||Extracts their href attribute (the actual link).
        self.links = [link for link in links if link]    #Filters out any None or empty links.
    def get_contents(self): #Returns the cleaned and readable version of a webpage
        return f"webpage title:\n{self.title}\nwebpage contents:\n{self.text}\n\n"    

In [72]:
link_system_prompt = """You are provided with a list of links..."""#A system prompt that instructs the LLM how to identify important links (e.g., About, Careers).
#✅ Use this when giving clear roles to the LLM for specific tasks (like classification or summarization).
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url", "https://full.url/goes/here/about"},
        {"type": "careers page": "url","https://another.full.url/careers"}
    ]
}
"""
print(link_system_prompt)

You are provided with a list of links...You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url", "https://full.url/goes/here/about"},
        {"type": "careers page": "url","https://another.full.url/careers"}
    ]
}



In [73]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt
    #Takes the scraped links and converts them into a natural prompt for the LLM to decide relevance.

In [100]:
import re
import json
import requests

def get_links(url):
    site_obj = website(url)
    payload = {
        "model": OLLAMA_MODEL,
        "messages": [
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(site_obj)}
        ]
    }

    resp = requests.post(OLLAMA_API, json=payload, stream=True)

    # 1) Accumulate all the chunks into one string
    raw = ""
    for line in resp.iter_lines():
        if not line:
            continue
        chunk = json.loads(line.decode("utf-8"))
        raw += chunk.get("message", {}).get("content", "")

    # 2) Strip code fences/backticks
    cleaned = raw.replace("```", "").strip()

    # 3) Extract the JSON object using regex
    # We look for the first `{` and matching `}` to capture the JSON string
    match = re.search(r"\{.*\}", cleaned, flags=re.DOTALL)
    if not match:
        raise RuntimeError(f"Failed to extract JSON. Here’s a snippet:\n{cleaned[:200]}…")

    # 4) Extracted JSON string
    json_str = match.group(0)

    # 5) Parse and return the JSON data
    try:
        return json.loads(json_str)
    except json.JSONDecodeError as e:
        raise RuntimeError(f"Failed to parse JSON:\n{json_str[:200]}...") from e



In [75]:
def get_all_details(url):
    result = "landing page:\n"
    result += website(url).text
    links = get_links(url)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += website(link["url"]).get_contents()
    return result
#Returns the full combined content of:
#The landing page, and
#The important subpages selected by the LLM.

In [76]:
system_prompt = """You are an assistant that analyzes the contents..."""
def get_brochure_user_prompt(company_name, url):
    prompt = f"You are looking at a company called: {company_name}\n"
    prompt += "Here are the contents..."
    prompt += get_all_details(url)
    return prompt[:5000]


In [77]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt


In [93]:
def create_brochure(company_name, url):
    payload = {
        "model": OLLAMA_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user",   "content": get_brochure_user_prompt(company_name, url)}
        ]
    }

    resp = requests.post(OLLAMA_API, json=payload, stream=False)
    data = resp.json()
    # debug:
    # print("Response keys:", data.keys())

    if "response" in data:
        brochure_md = data["response"]
    elif "message" in data and "content" in data["message"]:
        brochure_md = data["message"]["content"]
    else:
        raise KeyError(f"Couldn't find a response in:\n{data}")

    display(Markdown(brochure_md))



In [101]:
create_brochure("HuggingFace", "https://huggingface.co")


ConnectionError: HTTPSConnectionPool(host='blog.huggingface.co', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000027E723AF490>: Failed to resolve 'blog.huggingface.co' ([Errno 11001] getaddrinfo failed)"))