In [2]:
!pip install openai



In [3]:
# imports
import json
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import Markdown, display
from openai import OpenAI

In [4]:
# Load environment variables and check API key
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Check the key.
# Whitespace is tested before the prefix: a key with a leading space or tab
# would otherwise be reported as "doesn't start sk-proj-", which hides the
# real problem (stray whitespace around an otherwise valid key).
if not api_key:
    print("No API key was found - please head over to the troubleshooting notebook!")
elif api_key.strip() != api_key:
    print("An API key was found, but it looks like it might have space or tab characters")
elif not api_key.startswith("sk-proj-"):
    print("An API key was found, but it doesn't start sk-proj-; please check you're using the right key")
else:
    print("API key found and looks good so far!")

API key found and looks good so far!


In [5]:
# Initialize OpenAI client
# No arguments are passed; presumably the SDK reads OPENAI_API_KEY from the
# process environment — TODO confirm against the OpenAI Python SDK docs.
# The client instance is bound to the module-level name `openai`.
openai = OpenAI()

In [6]:
class Website:
    """A scraped web page: its title, visible body text and outgoing links.

    Constructing an instance performs the HTTP request and parses the HTML
    immediately.

    Attributes:
        url: The URL that was requested.
        headers: Request headers sent with the fetch (a browser-like User-Agent).
        body: Raw bytes of the HTTP response.
        title: Stripped text of the ``<title>`` tag, or "No title found".
        text: Newline-separated text of ``<body>`` with script/style/img/input
            tags removed, or "" when the document has no body.
        links: Every non-empty ``href`` found on an ``<a>`` tag, unmodified
            (entries may be relative URLs).
    """

    def __init__(self, url, timeout=30):
        """Fetch and parse ``url``.

        Args:
            url: Address of the page to scrape.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout a stalled server would block the notebook forever.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                times out.
        """
        self.url = url
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
        }
        # Fetch the webpage with proper headers
        response = requests.get(url, headers=self.headers, timeout=timeout)
        self.body = response.content

        # Parse the content using BeautifulSoup
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = self._extract_title(soup)

        # Process the webpage body to remove irrelevant tags
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        # Extract all valid links from the webpage (anchors without href are dropped)
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    @staticmethod
    def _extract_title(soup):
        """Return the page title as a plain string, or "No title found".

        ``soup.title.string`` is ``None`` when the ``<title>`` tag is empty or
        contains nested markup; that case (and a whitespace-only title) falls
        back to the placeholder instead of leaking ``None`` into prompts.
        """
        title_tag = soup.title
        raw_title = title_tag.string if title_tag is not None else None
        cleaned = raw_title.strip() if raw_title else ""
        return cleaned or "No title found"

    def get_contents(self):
        """Returns the formatted webpage contents including title and text"""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [7]:
# Define our system prompts

# Used for the summary and brochure requests: asks for a short markdown
# summary that ignores navigation text.
system_prompt = "You are an assistant that analyzes the contents of a website \
and provides a short summary, ignoring text that might be navigation related. \
Respond in markdown."

# Used for link selection: asks for a JSON object with a "links" list whose
# entries carry "type" and "url". The embedded example must itself be valid
# JSON, since the model is told to answer "as in this example".
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [8]:
def user_prompt_for(website):
    """Build the user message asking for a markdown summary of ``website``.

    The message names the page title, states the summarization request, and
    ends with the page's text. ``website`` must expose ``title`` and ``text``.
    """
    instructions = (
        "\nThe contents of this website is as follows; "
        "please provide a short summary of this website in markdown. "
        "If it includes news or announcements, then summarize these too.\n\n"
    )
    heading = f"You are looking at a website titled {website.title}"
    return heading + instructions + website.text

def get_links_user_prompt(website):
    """Build the user message asking the model to pick brochure-worthy links.

    The message names the site URL, states the selection rules, and lists
    every entry of ``website.links`` on its own line.
    """
    sections = [
        f"Here is the list of links on the website of {website.url} - ",
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n",
        "Links (some might be relative links):\n",
        "\n".join(website.links),
    ]
    return "".join(sections)

In [9]:
def get_links(url):
    """Scrape ``url`` and ask the model which of its links suit a brochure.

    Sends the link-selection system prompt together with the page's link list,
    requesting a JSON-object reply, and returns that reply parsed with
    ``json.loads``.
    """
    page = Website(url)
    conversation = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(page)},
    ]
    completion = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation,
        response_format={"type": "json_object"},
    )
    reply_text = completion.choices[0].message.content
    return json.loads(reply_text)
    
def summarize(url):
    """Scrape ``url`` and return the model's summary of the page as text."""
    page = Website(url)
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt_for(page)},
    ]
    completion = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [10]:
def display_summary(url):
    """Summarize the page at ``url`` and render the result as Markdown."""
    rendered = Markdown(summarize(url))
    display(rendered)

def get_brochure_user_prompt(company_name, url):
    """Build the user message for a brochure request.

    Combines the company name, the brochure instruction and the scraped
    details of ``url`` (via ``get_all_details``), then caps the whole message
    at 5,000 characters.
    """
    pieces = (
        f"You are looking at a company called: {company_name}\n",
        "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n",
        get_all_details(url),
    )
    full_prompt = "".join(pieces)
    # Truncate if more than 5,000 characters
    return full_prompt[:5_000]

def get_all_details(url):
    """Collect the text of the landing page and of each model-selected link.

    Args:
        url: Landing page of the company site.

    Returns:
        A single string: a "Landing page:" section followed by one section per
        selected link, each headed by the link's ``type``.
    """
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    # The model's reply is not guaranteed to contain a "links" list; treat a
    # missing or null value as "no extra pages" instead of raising KeyError.
    for link in links.get("links") or []:
        result += f"\n\n{link['type']}\n"
        # The model is asked for full URLs but may echo a relative href;
        # urljoin resolves it against the landing page and leaves absolute
        # URLs unchanged.
        result += Website(urljoin(url, link["url"])).get_contents()
    return result

In [11]:
def create_brochure(company_name, url):
    """Ask the model for a company brochure and render it as Markdown.

    The request pairs the notebook's ``system_prompt`` with the brochure user
    prompt built from ``company_name`` and ``url``. Nothing is returned; the
    reply is displayed.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    completion = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation,
    )
    brochure_markdown = completion.choices[0].message.content
    display(Markdown(brochure_markdown))

def stream_brochure(company_name, url):
    """Ask the model for a company brochure and render it live while streaming.

    Args:
        company_name: Name of the company, inserted into the user prompt.
        url: Landing page of the company site.

    Nothing is returned; a single display area is updated in place as
    chunks arrive.
    """
    # The page is scraped inside get_brochure_user_prompt, so no separate
    # Website(url) fetch is needed here.
    stream = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
        ],
        stream=True
    )

    raw_response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        # Defensive: skip any chunk that arrives with an empty choices list.
        if not chunk.choices:
            continue
        raw_response += chunk.choices[0].delta.content or ''
        # Clean the full raw text on every update so a code fence split across
        # chunks is still matched. Only the fence and its "markdown" language
        # tag are removed; the word "markdown" in ordinary text is preserved.
        cleaned = raw_response.replace("```markdown", "").replace("```", "")
        display_handle.update(Markdown(cleaned))

In [12]:
# Example usage
# Runs the three entry points against the same site. Each call scrapes the
# page and queries the model independently, so this cell performs network
# requests and its rendered output can differ between runs.
url = "https://www.example.com"
company_name = "Example Company"

# 1) Short markdown summary of the landing page.
print("Website Summary:")
display_summary(url)

# 2) Brochure rendered once the complete reply is available.
print("\nCompany Brochure:")
create_brochure(company_name, url)

# 3) Same brochure request, rendered incrementally as the reply streams in.
print("\nStreaming Brochure:")
stream_brochure(company_name, url)

Website Summary:


# Example Domain Overview

The **Example Domain** is designated for use in illustrative examples within documents. It explicitly allows users to utilize this domain in literature without the need for prior coordination or permission. 

There are no news or announcements provided on the website.


Company Brochure:
Found links: {'links': []}


# Example Company Brochure

## Overview
**Example Company** operates under the domain "Example Domain" which serves as a resource for illustrative examples in various documents. This domain is intended for use in literature, allowing users to incorporate it freely without requiring prior coordination or permission. 

## Key Information
- **Purpose**: The Example Domain is specifically designed for illustrative purposes, making it a valuable tool for writers and educators.
- **Usage**: Users can utilize this domain in documents, literature, and examples as needed, promoting ease of access and flexibility.

## Additional Notes
For more information on how to effectively use the Example Domain, please explore further resources available on the website. 

---

Utilize the Example Domain effectively in your next project!


Streaming Brochure:
Found links: {'links': []}


# Example Company Brochure

## About Us
**Example Company** provides distinct illustrative examples for documents. Our domain is designed for use in various examples within literature, ensuring creators can easily reference and utilize our resources without the need for prior coordination or permission.

## Our Mission
To support creators, educators, and professionals by offering a reliable example domain that simplifies the process of including illustrative content in their works.

## Usage
- **Creative Works:** Utilize our domain in literature and various forms of creative expression.
- **Educational Materials:** Incorporate our domain in teaching materials and academic examples.
  
## Contact
For more information, please visit our website or reach out to us directly.

---

Feel free to customize any sections to better match your vision!