WEBSITE SUMMARY BY SCRAPING


In [None]:
import nest_asyncio

# Patch asyncio so that asyncio.run() (used by summarize_website) can be called
# from inside a notebook, where an event loop is presumably already running.
nest_asyncio.apply()

In [None]:
import os
import time
import ollama
import asyncio
from openai import OpenAI
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import Markdown, display
from playwright.async_api import async_playwright

In [None]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

_raw_openai_key = os.getenv("OPENAI_API_KEY")

# Remove all whitespace characters anywhere in the key (spaces, tabs, newlines).
sanitized_key = "".join(_raw_openai_key.split()) if _raw_openai_key else ""

# A value that is unset, empty, or made of whitespace only is "not configured":
# OPENAI_API_KEY is then None rather than an empty string reported as found.
OPENAI_API_KEY = sanitized_key or None

if OPENAI_API_KEY:
    if OPENAI_API_KEY != _raw_openai_key:
        print("Sanitized OPENAI_API_KEY by removing whitespace.")
    # Ensure downstream libraries that read from env get the sanitized key
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
    print("OPENAI_API_KEY found!")
else:
    # Drop an unusable (empty / whitespace-only) value so that libraries reading
    # the environment see the key as unset, matching the message below.
    os.environ.pop("OPENAI_API_KEY", None)
    print(
        "OPENAI_API_KEY is not set. The app will start, but calls that require OpenAI will fail until it is configured."
    )

In [None]:
def time_now():
    """Return the current local time as a ``YYYY-MM-DD HH:MM:SS`` string."""
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    return time.strftime(timestamp_format, time.localtime())

In [None]:
class Website:
    """A web page whose title and text are extracted from browser-rendered HTML.

    ``title`` and ``text`` stay ``None`` until :meth:`scrape` has been awaited.
    """

    url: str
    title: str  # None until scrape() has run
    text: str  # None until scrape() has run

    def __init__(self, url: str):
        """Remember *url*; nothing is fetched until scrape() is awaited."""
        self.url = url
        self.title = None
        self.text = None

    async def scrape(self):
        """Load ``self.url`` in Chromium via Playwright and fill ``title``/``text``.

        The page HTML is taken from the browser (``page.content()``), then
        parsed with BeautifulSoup. Progress is printed with timestamps.
        Exceptions raised by Playwright (launch or navigation failures)
        propagate to the caller.
        """
        print(time_now() + " Scraping Started...")
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            try:
                page = await browser.new_page()
                await page.goto(self.url)
                content = await page.content()
            finally:
                # Close the browser even when navigation fails or times out.
                await browser.close()

        self._parse(content)
        print(time_now() + " Scraping Completed")

    def _parse(self, content: str):
        """Extract the title and the text of ``<body>`` from the HTML *content*."""
        soup = BeautifulSoup(content, "html.parser")

        # .string is None when <title> is empty or contains nested markup, so
        # fall back to the placeholder in that case as well as when it is absent.
        title = soup.title.string if soup.title else None
        self.title = "No Title Found" if title is None else title

        body = soup.body
        if body is None:
            # Documents without a <body> have no text to summarise.
            self.text = ""
            return

        # Drop elements that carry no readable content before extracting text.
        for irrelevant in body(["script", "style", "img", "input"]):
            irrelevant.decompose()
        self.text = body.get_text(separator="\n", strip=True)

In [None]:
# System message sent with every summary request: sets the assistant's role
# and asks for a markdown answer.
system_prompt = (
    "You are an assistant that analyzes the contents of a website "
    "and provides a short summary, ignoring text that might be navigation related. "
    "Respond in markdown."
)


def user_prompt_for(website: Website):
    """Build the user message asking for a markdown summary of *website*.

    The prompt names the page title, states the request, and then appends the
    scraped page text. ``website.text`` must already be populated (i.e.
    ``scrape()`` has run); a ``None`` text raises ``TypeError``.

    The instruction text is assembled from adjacent string literals rather
    than backslash continuations inside one indented literal, so no runs of
    indentation spaces end up embedded in the prompt.
    """
    instructions = (
        f"You are looking at a website titled '{website.title}' "
        "\nThe contents of this website is as follows; "
        "please provide a short summary of this website in markdown. "
        "If it includes news or announcements, then summarize these too.\n\n"
    )
    return instructions + website.text

In [None]:
def messages_for(website: Website):
    """Return the chat message list (system, then user) for summarising *website*."""
    system_message = dict(role="system", content=system_prompt)
    user_message = dict(role="user", content=user_prompt_for(website))
    return [system_message, user_message]

In [None]:
# Create the OpenAI client only when a key is configured. OpenAI() raises at
# construction time if OPENAI_API_KEY is missing, which would break this cell
# even though the Ollama path used below needs no key. Without a key, `openai`
# is None and only the (currently commented-out) OpenAI path is unavailable.
openai = OpenAI() if os.getenv("OPENAI_API_KEY") else None


def summarize_website(url: str, model: str = "gemma3:1b"):
    """Scrape *url* and return a markdown summary generated by an Ollama model.

    Args:
        url: Address of the page to scrape and summarise.
        model: Name of the Ollama model to query. Defaults to ``"gemma3:1b"``,
            the model this function previously hard-coded.

    Returns:
        The content of the model's reply message.

    Progress is printed with timestamps. Errors from scraping or from the
    Ollama call propagate to the caller.
    """
    website = Website(url)
    # scrape() is a coroutine; run it to completion before building the prompt.
    asyncio.run(website.scrape())

    print(time_now() + " Summary requested...")

    # Alternative backend (OpenAI), kept for reference:
    # response = openai.chat.completions.create(
    #     model="gpt-4o-mini", messages=messages_for(website)
    # )
    # print(time_now() + " Summary received")
    # return response.choices[0].message.content

    # Ollama
    response = ollama.chat(
        model=model,
        messages=messages_for(website),
    )
    print(time_now() + " Summary received")
    return response["message"]["content"]

In [None]:
def display_website_summary(url: str):
    """Summarise the page at *url* and render the result as Markdown output."""
    markdown_text = summarize_website(url)
    print(f"{time_now()} Process Completed\n\n")
    rendered = Markdown(markdown_text)
    display(rendered)

In [None]:
# Example run: scrape the page, request a summary, and render it below the cell.
display_website_summary("https://abhaseeb.com")