In [1]:
# imports
# If these fail, please check you're running from an 'activated' environment with (llms) in the command prompt

import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [2]:
# Load environment settings, sanity-check the API key, and create the client

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Shape check only: the key must be present, carry the 'sk-proj-' prefix and
# be longer than 10 characters. Nothing here verifies it against the API.
if not api_key or not api_key.startswith('sk-proj-') or len(api_key) <= 10:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
else:
    print("API key looks good so far")

MODEL = 'gpt-4o-mini'
openai = OpenAI()

API key looks good so far


In [10]:
# A class to represent a Webpage

# Request headers sent with every fetch; some websites need a proper
# User-Agent before they will serve the page.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    )
}

class scrapeWebsite:
    """
    A utility class to represent a Website that we have scraped, now with links.

    Attributes:
        url: The address that was fetched.
        body: Raw response bytes returned by the server.
        title: Text of the <title> tag, or "No title found" when the page
            has no usable title.
        text: Text of <body> with script/style/img/input elements removed,
            fragments joined by newlines; "" when there is no <body>.
        links: Every non-empty href found on <a> tags (may contain relative
            URLs and duplicates).
    """

    def __init__(self, url, timeout=10):
        """
        Fetch `url` and extract its title, text and links.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout a stalled server would block forever.

        Raises:
            requests.exceptions.RequestException: if the request fails or
                times out (propagated from `requests.get`).
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # `.string` is None for an empty <title> or one with nested tags,
        # so fall back in that case as well as when the tag is missing.
        title_text = soup.title.string if soup.title else None
        self.title = title_text if title_text else "No title found"

        if soup.body:
            # Drop elements that carry no readable content before extracting text.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        # Keep only anchors that actually have a non-empty href.
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the page title and page text formatted as one string."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [11]:
# System prompt for the link-selection step: describes the task and shows the
# exact JSON shape the reply should follow. The example must itself be valid
# JSON, otherwise the model is being shown a malformed template to imitate.
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a childrens brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
)

link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [12]:
# Print the assembled system prompt so its final text can be inspected
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a childrens broucher about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}



In [16]:
#Get Links User prompt

def get_links_user_prompt(website):
    """
    Build the user prompt asking which of a page's links suit the brochure.

    Args:
        website: Object exposing `url` (str) and `links` (iterable of str),
            such as a scraped website instance.

    Returns:
        str: The instruction text followed by the links, one per line.
    """
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    # Adjacent literals instead of a backslash continuation inside the string:
    # the continuation pulled the next line's indentation into the prompt.
    user_prompt += (
        "please decide which of these are relevant web links for a childrens brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Contact, Privacy pages or links and email links.\n"
    )
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [18]:
# Scrape a sample site, show the links that were found, then print the user
# prompt built from them. The call below was previously left unterminated,
# which made the whole cell a SyntaxError.
pages = scrapeWebsite("https://www.arla.com/")
display(pages.links)
print(get_links_user_prompt(pages))

['/',
 '/',
 '/all-our-brands/',
 '/sustainability/',
 '/sustainability/the-food/',
 '/sustainability/the-packaging/',
 '/sustainability/the-transport/',
 '/sustainability/the-dairies/',
 '/sustainability/the-farms/',
 '/sustainability/the-cows/',
 '/sustainability/the-land/',
 '/sustainability/arlas-climate-ambition/',
 '/sustainability/lets-talk-about-dairy/',
 '/company/investor/annual-reports/',
 '/company/arla-farmers/farm-ahead/',
 '/company/',
 '/sustainability/',
 '/sustainability/the-farms/arlas-sustainability-incentive-model-qa/',
 'https://ec.europa.eu/clima/policies/international/negotiations/paris_en',
 'https://www.arla.com/company/news-and-press/2022/pressrelease/arla-doubles-co2e-target-for-operations/',
 '/sustainability/the-farms/how-arla-farmers-reduce-dairys-carbon-footprint/',
 '/sustainability/lets-talk-about-dairy/',
 '/company/news-and-press/2025/pressrelease/creating-the-future-of-dairy-arla-foods-and-dmk-group-announce-intention-to-merge/',
 'https://www.arla.