In [None]:
# Web Scraping and Summarization

In [1]:
# imports

import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI

In [2]:
# Load environment variables in a file called .env

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Check the key
# The whitespace test runs before the prefix test: a key with a leading space or tab
# also fails startswith("sk-proj-"), and would otherwise be reported as having the
# wrong prefix instead of pointing at the stray whitespace that is the real problem.

if not api_key:
    print("No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!")
elif api_key.strip() != api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
elif not api_key.startswith("sk-proj-"):
    print("An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")


API key found and looks good so far!


In [3]:
# Create the client object that the API calls further down go through
# (openai.chat.completions.create). No key is passed explicitly here;
# presumably the client picks up OPENAI_API_KEY from the environment loaded above - confirm.
openai = OpenAI()

# If this doesn't work, try Kernel menu >> Restart Kernel and Clear Outputs Of All Cells, then run the cells from the top of this notebook down.
# If it STILL doesn't work (horrors!) then please see the Troubleshooting notebook in this folder for full instructions

In [4]:
# A class to represent a Webpage
# If you're not familiar with Classes, check out the "Intermediate Python" notebook

# Some websites need you to use proper headers when fetching them:
# Request headers sent with every page fetch; the User-Agent value is a desktop Chrome string
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
}

class Website:
    """
    A downloaded web page reduced to its title and its visible text.

    Attributes set by __init__:
        url   - the address that was requested
        title - the string of the <title> tag, or "No title found"
        text  - page text with script/style/img/input elements removed,
                fragments separated by newlines
    """

    def __init__(self, url, timeout=30):
        """
        Create this Website object from the given url using the BeautifulSoup library

        Args:
            url: address of the page to download
            timeout: seconds passed to requests.get as its timeout; without one
                the request can wait on an unresponsive server indefinitely

        Raises:
            requests.exceptions.RequestException: if the request fails or times out
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        # .string is None when <title> is empty or wraps nested tags, so guard
        # both the missing tag and the missing string to keep title a real str.
        title_tag = soup.title
        title_text = title_tag.string if title_tag is not None else None
        self.title = title_text if title_text else "No title found"

        # html.parser does not invent a <body> for fragments or non-HTML responses;
        # fall back to the whole document instead of failing on None.
        content_root = soup.body if soup.body is not None else soup
        for irrelevant in content_root(["script", "style", "img", "input"]):
            irrelevant.decompose()
        self.text = content_root.get_text(separator="\n", strip=True)

In [5]:
# Let's try one out. Change the website and add print statements to follow along.

smk = Website("https://www.nasdaq.com")
# Title on the first line, page text underneath
print(smk.title, smk.text, sep="\n")

Nasdaq: Stock Market, Data Updates, Reports & News
Skip to main content
Nasdaq+
Weekly Macro+
Scorecard
Market Activity
U.S. Market Activity
->
Stocks
Options
ETFs
Mutual Funds
Indexes
Cryptocurrency
Currencies
Fixed Income
Trading & Market Services
North American Markets
Nasdaq-100 Index
Nasdaq-100 Index Options
Market Data
European Markets
->
Shares
Indexes
Fixed Income
Options & Futures
ETPs
Warrants & Certificates
Funds
News
European Commodities
Market Regulation
->
U.S. Regulation
European Regulation
U.S. Market Quick Links
After-Hours Quotes
Pre-Market Quotes
Nasdaq-100
Symbol Screener
Glossary
Symbol Change History
IPO Performance
Ownership Search
Dividend History
U.S. Market Events
Economic Calendar
Earnings
IPO Calendar
Dividend Calendar
SPO Calendar
Holiday Schedule
U.S. Markets Analyst Activity
Daily Earnings Surprise
Forecast Changes
Nasdaq U.S.  Data
Statistical Milestones
Daily Market Statistics
Most Active
Explore All U.S. Market Activity
->
Insights
Insights + News
Nasd

In [6]:
# Define our system prompt - you can experiment with this later, changing the last sentence to "Respond in markdown in Spanish."

# Adjacent string literals are joined into one prompt string; each piece ends with
# the space that separates it from the next sentence fragment.
system_prompt = (
    "You are an assistant that analyzes the contents of a website "
    "and provides a short summary, ignoring text that might be navigation related. "
    "Respond in markdown."
)

In [7]:
# A function that writes a User Prompt that asks for summaries of websites:

def user_prompt_for(website):
    """
    Build the user prompt asking for a markdown summary of a page.

    Args:
        website: object exposing `title` and `text` attributes

    Returns:
        A string made of an intro line naming the title, the summary
        instructions, a blank line, and then the page text.
    """
    intro = f"You are looking at a website titled {website.title}"
    instructions = (
        "\nThe contents of this website is as follows; "
        "please provide a short summary of this website in markdown. "
        "If it includes news or announcements, then summarize these too.\n\n"
    )
    return intro + instructions + website.text

In [8]:
# Preview the complete user prompt built for the page fetched earlier (smk)
print(user_prompt_for(smk))

You are looking at a website titled Nasdaq: Stock Market, Data Updates, Reports & News
The contents of this website is as follows; please provide a short summary of this website in markdown. If it includes news or announcements, then summarize these too.

Skip to main content
Nasdaq+
Weekly Macro+
Scorecard
Market Activity
U.S. Market Activity
->
Stocks
Options
ETFs
Mutual Funds
Indexes
Cryptocurrency
Currencies
Fixed Income
Trading & Market Services
North American Markets
Nasdaq-100 Index
Nasdaq-100 Index Options
Market Data
European Markets
->
Shares
Indexes
Fixed Income
Options & Futures
ETPs
Warrants & Certificates
Funds
News
European Commodities
Market Regulation
->
U.S. Regulation
European Regulation
U.S. Market Quick Links
After-Hours Quotes
Pre-Market Quotes
Nasdaq-100
Symbol Screener
Glossary
Symbol Change History
IPO Performance
Ownership Search
Dividend History
U.S. Market Events
Economic Calendar
Earnings
IPO Calendar
Dividend Calendar
SPO Calendar
Holiday Schedule
U.S. Mar

In [9]:
# Example conversation: one system message followed by one user message
messages = [
    {
        "role": "system",
        "content": "You are a snarky assistant",
    },
    {
        "role": "user",
        "content": "What is 2 + 2?",
    },
]

In [10]:
# To give you a preview -- calling OpenAI with system and user messages:

response = openai.chat.completions.create(
    model="gpt-4o-mini",
    messages=messages,
)
# Show the text of the first choice in the reply
print(response.choices[0].message.content)

Oh, let me dust off my calculator for this one... The thrilling answer is 4! Shocking, I know.


In [11]:
# See how this function creates exactly the format above

def messages_for(website):
    """
    Build the two-message list for a summary request.

    Args:
        website: page object handed on to user_prompt_for

    Returns:
        A list holding a system message (content: system_prompt) followed by
        a user message (content: the prompt built by user_prompt_for).
    """
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_prompt_for(website)}
    return [system_message, user_message]

In [13]:
# Try this out, and then try for a few more websites

# As the last expression in the cell, the returned message list is shown as the cell output
messages_for(smk)

[{'role': 'system',
  'content': 'You are an assistant that analyzes the contents of a website and provides a short summary, ignoring text that might be navigation related. Respond in markdown.'},
 {'role': 'user',
  'content': "You are looking at a website titled Nasdaq: Stock Market, Data Updates, Reports & News\nThe contents of this website is as follows; please provide a short summary of this website in markdown. If it includes news or announcements, then summarize these too.\n\nSkip to main content\nNasdaq+\nWeekly Macro+\nScorecard\nMarket Activity\nU.S. Market Activity\n->\nStocks\nOptions\nETFs\nMutual Funds\nIndexes\nCryptocurrency\nCurrencies\nFixed Income\nTrading & Market Services\nNorth American Markets\nNasdaq-100 Index\nNasdaq-100 Index Options\nMarket Data\nEuropean Markets\n->\nShares\nIndexes\nFixed Income\nOptions & Futures\nETPs\nWarrants & Certificates\nFunds\nNews\nEuropean Commodities\nMarket Regulation\n->\nU.S. Regulation\nEuropean Regulation\nU.S. Market Quick

In [14]:
# And now: call the OpenAI API. You will get very familiar with this!

def summarize(url, model="gpt-4o-mini"):
    """
    Fetch a web page and ask the chat model for a summary of it.

    Args:
        url: address of the page to fetch and summarize
        model: name of the chat model to call; defaults to "gpt-4o-mini",
            so existing summarize(url) calls behave as before

    Returns:
        The message content of the first choice in the API response.
    """
    website = Website(url)
    response = openai.chat.completions.create(
        model=model,
        messages=messages_for(website),
    )
    return response.choices[0].message.content

In [16]:
# Fetches the page and calls the model; as a bare expression the result is shown
# as a raw Python string, so the markdown appears with escaped newlines
summarize("https://www.nasdaq.com")

'# Nasdaq Overview\n\nThe Nasdaq website serves as a comprehensive platform for market data, updates, reports, and news related to finance and investments. Here are the main highlights:\n\n## Market Data and Services\n- **U.S. Market Activity**: Includes information on stocks, options, ETFs, mutual funds, indices, cryptocurrencies, currencies, and fixed income. \n- **European Markets**: Offers data on shares, indices, fixed income, options and futures, and exchange-traded products (ETPs).\n- **Economic Calendar**: Lists upcoming earnings, IPOs, dividends, and market events.\n\n## Insights and News\n- **Nasdaq Europe IPO Leadership**: Nasdaq Europe has emerged as a leader in the IPO market for the first half of 2025.\n- **Partnerships**: Nasdaq announced a collaboration with AWS to enhance market liquidity and growth while ensuring security and resilience.\n- **Innovation in Sports**: Nasdaq has joined forces with the Mercedes-AMG PETRONAS Formula One Team to accelerate innovation.\n\n#

In [17]:
# A function to display this nicely in the Jupyter output, using markdown

def display_summary(url):
    """
    Summarize the page at `url` and render the result as markdown in the cell output.

    Args:
        url: address of the page, passed straight to summarize
    """
    display(Markdown(summarize(url)))

In [20]:
# Runs summarize again (a new fetch and a new model call) and renders the markdown
display_summary("https://www.nasdaq.com")

# Summary of Nasdaq: Stock Market, Data Updates, Reports & News

The Nasdaq website serves as a comprehensive platform for market activity, data updates, and financial news. It covers various asset classes including stocks, options, ETFs, mutual funds, cryptocurrencies, and more. The site emphasizes facilitating trading and market services across North American and European markets.

## Key Features:
- **Market Activity & Data**: Provides real-time data and analytics on U.S. and European markets, including indexes like the Nasdaq-100.
- **Financial News**: Offers insights, news articles, and reports focusing on market trends and significant events in the finance sector.
- **Solutions & Services**: Nasdaq provides a wide range of services aimed at enhancing market efficiency, supporting IPO processes, and integrating technology into financial services.
- **Insights & Education**: The platform includes educational resources, newsletters, and insights tailored to investors, regulators, and market participants.

## Recent News:
1. **Nasdaq Europe Leads IPO Market**: Nasdaq Europe outperformed all other exchange groups in IPO performance in the first half of 2025.
2. **Partnership with AWS**: Nasdaq and AWS have launched a suite of solutions designed to enhance liquidity and support global capital markets.
3. **Collaboration with Mercedes-AMG PETRONAS**: Nasdaq has partnered with the F1 team to accelerate innovation in financial markets.

Overall, Nasdaq’s website acts as a central hub for market participants to access essential information and services, positioning itself as a leader in the global financial landscape.