# Web scraping to extract content from websites (Free)

In [None]:
# imports
import requests
import ollama
from bs4 import BeautifulSoup
from IPython.display import Markdown, display

## Installation of Ollama

Simply visit ollama.com, then download and install Ollama.

Once complete, the ollama server should already be running locally.
If you visit:
http://localhost:11434/

You should see the message Ollama is running.

If not, bring up a new Terminal (Mac) or PowerShell (Windows) and enter `ollama serve`.
Then, in another Terminal (Mac) or PowerShell (Windows), enter `ollama pull llama3.2`.
Then try http://localhost:11434/ again.

If Ollama is slow on your machine, try using `llama3.2:1b` as an alternative. Run `ollama pull llama3.2:1b` from a Terminal or PowerShell, and change the model name in the `summarize` cell below from `"llama3.2"` to `"llama3.2:1b"`.

In [None]:
# Creation of the class Website
class Website:
    """A web page reduced to its title and visible text.

    Attributes set by the constructor:
        url:   The URL that was requested.
        title: The string of the page's ``<title>`` tag, or
               ``"No title found"`` when there is no usable title.
        text:  Text of the ``<body>`` with script, style, img and input
               elements removed, one fragment per line; ``""`` when the
               document has no ``<body>``.
    """

    # Browser-like User-Agent sent when the caller supplies no headers.
    DEFAULT_HEADERS = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
        )
    }

    def __init__(self, url, headers=None, timeout=30):
        """Fetch ``url`` and extract its title and body text.

        Args:
            url: Address of the page to download.
            headers: Optional dict of HTTP request headers; defaults to
                ``Website.DEFAULT_HEADERS``.
            timeout: Seconds to wait for the server before ``requests``
                gives up, so an unresponsive site cannot hang the notebook.
        """
        self.url = url
        request_headers = self.DEFAULT_HEADERS if headers is None else headers
        response = requests.get(url, headers=request_headers, timeout=timeout)
        soup = BeautifulSoup(response.content, "html.parser")

        # ``.string`` is None when <title> is empty or contains nested tags,
        # so fall back to the placeholder in that case as well.
        title_tag = soup.title
        if title_tag is not None and title_tag.string:
            self.title = title_tag.string
        else:
            self.title = "No title found"

        body = soup.body
        if body is None:
            # Fragments and non-HTML responses have no <body> to read.
            self.text = ""
            return

        # Drop elements whose content is not readable page text.
        for irrelevant in body(["script", "style", "img", "input"]):
            irrelevant.decompose()
        self.text = body.get_text(separator="\n", strip=True)

In [None]:
# Define the user prompt for the chosen website
def user_prompt_for(website):
    """Build the user message asking the model to summarize ``website``.

    Args:
        website: Object exposing string attributes ``title`` and ``text``
            (for example a ``Website`` instance).

    Returns:
        str: Instructions that name the page title, followed by a blank
        line and the page text.
    """
    # Interpolate the title; formatting the object itself would put its
    # repr into the prompt instead.
    user_prompt = f"You are looking at a website titled {website.title}"
    # Adjacent string literals are used instead of backslash continuations
    # so the source indentation does not end up inside the prompt.
    user_prompt += (
        "\nThe contents of this website is as follows; "
        "please provide a short summary of this website in markdown. "
        "If it includes news or announcements, then summarize these too.\n\n"
    )
    user_prompt += website.text
    return user_prompt

In [None]:
# Define the roles
def messages_for(website):
    """Build the chat message list used to summarize ``website``.

    Args:
        website: Object accepted by ``user_prompt_for``.

    Returns:
        list[dict]: Two messages with ``role`` and ``content`` keys: the
        system instructions, then the user prompt built from ``website``.
    """
    # Adjacent string literals keep the source indentation out of the
    # prompt; a backslash continuation inside the literal would embed it.
    system_prompt = (
        "You are an assistant that analyzes the contents of a website "
        "and provides a short summary, ignoring text that might be navigation related. "
        "Respond in markdown."
    )
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt_for(website)},
    ]

In [None]:
# Call the ollama api and return the output
# Default Ollama model; change to "llama3.2:1b" on slower machines.
MODEL = "llama3.2"


def summarize(url, model=None):
    """Fetch ``url`` and return the model's summary of the page.

    Args:
        url: Address of the page to summarize.
        model: Name of the Ollama model to use. When omitted, the current
            value of the module-level ``MODEL`` is used (looked up at call
            time, so reassigning ``MODEL`` takes effect immediately).

    Returns:
        The ``content`` field of the message in the ``ollama.chat`` response.
    """
    website = Website(url)
    response_llm = ollama.chat(model=model or MODEL, messages=messages_for(website))
    return response_llm['message']['content']

In [None]:
# Render the summary as Markdown in the notebook output
def display_summary(url):
    """Summarize the page at ``url`` and render the result as Markdown.

    Args:
        url: Address of the page, passed straight to ``summarize``.
    """
    display(Markdown(summarize(url)))

In [None]:
# Example run: summarize the Anthropic home page and render the result.
display_summary("https://www.anthropic.com")