In [50]:
import ollama
import requests
from IPython.display import Markdown, display
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from openai import OpenAI
import os
import json

In [14]:
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:
    print("API key looks good so far")
else:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
    
MODEL = 'gpt-4o-mini'
openai = OpenAI()

API key looks good so far


In [38]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}
"""

In [6]:
link_system_prompt

'You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.\nYou should respond in JSON as in this example:\n{\n    "links": [\n        {"type": "about page", "url": "https://full.url/goes/here/about"},\n        {"type": "careers page": "url": "https://another.full.url/careers"}\n    ]\n}\n'

In [16]:
headers = {
 "User-Agent": 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36'
}


In [76]:
class Website:
    def __init__(self, url):
        self.url = url
        response = requests.get(self.url, headers = headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found" 
        if soup.body:
            for irrelevant in soup.body.find_all(['style', 'script', 'img', 'input']):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator = '\n', strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]
        
    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"


In [22]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)

    return user_prompt

In [None]:
def get_links(url):
    website = Website(url)
    response = openai.responses.create(
        model="gpt-4o-mini",
        input=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
      ]
    )
    raw_response_text = response.output_text
    cleaned_response_text = raw_response_text.strip()
    # Remove '```json' from the start and '```' from the end
    if cleaned_response_text.startswith('```json'):
        cleaned_response_text = cleaned_response_text[len('```json'):].strip()
    if cleaned_response_text.endswith('```'):
        cleaned_response_text = cleaned_response_text[:-len('```')].strip()
    try:
        return json.loads(cleaned_response_text)
    except json.JSONDecodeError as e:
        print(f"\nJSONDecodeError: Failed to decode JSON from API response for URL: {url}")
        print(f"Error details: {e}")
        print(f"The problematic string that caused the error was: '{response.output_text}'")
        raise
    except Exception as e:
        print(f"\nAn unexpected error occurred in get_links for URL: {url}")
        print(f"Error details: {e}")
        raise # Re-raise for now to see the full traceback
    

In [78]:
url = "https://www.cursor.com/"
name = "Cursor"

In [75]:
# print(get_all_details(url))

Found links: {'links': [{'type': 'homepage', 'url': 'https://www.cursor.com/'}, {'type': 'pricing page', 'url': 'https://www.cursor.com/pricing'}, {'type': 'features page', 'url': 'https://www.cursor.com/features'}, {'type': 'enterprise page', 'url': 'https://www.cursor.com/enterprise'}, {'type': 'blog', 'url': 'https://www.cursor.com/blog'}, {'type': 'community page', 'url': 'https://www.cursor.com/community'}, {'type': 'downloads page', 'url': 'https://www.cursor.com/downloads'}, {'type': 'documentation', 'url': 'https://docs.cursor.com'}, {'type': 'students page', 'url': 'https://www.cursor.com/students'}, {'type': 'changelog', 'url': 'https://www.cursor.com/changelog'}, {'type': 'GitHub', 'url': 'https://github.com/getcursor/cursor'}, {'type': 'forum', 'url': 'https://forum.cursor.com'}]}


AttributeError: 'NoneType' object has no attribute 'find_all'

In [None]:
brochure_system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."

In [None]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [None]:
def create_brochure(company_name, url):
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
    )
    result = response.choices[0].message.content
    display(Markdown(result))

In [None]:
create_brochure(name, url)

In [None]:
### Heelo
