In [1]:
import requests

# Source page: the AWS News Blog roundup of re:Invent 2023 announcements.
url = "https://aws.amazon.com/blogs/aws/top-announcements-of-aws-reinvent-2023/"

# The timeout keeps the cell from hanging indefinitely on a stalled connection;
# raise_for_status() stops the notebook here on a 4xx/5xx response instead of
# letting later cells parse an error page.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Size of the downloaded HTML in characters (quick sanity check).
len(page.text)

107219

In [4]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(page.text, "html.parser")

# Build one record per announcement paragraph: title, summary and link.
# NOTE(review): the parsing below assumes each announcement is a <p> whose first
# link is the announcement post and whose second text fragment is the summary —
# confirm against the page markup.
announcements = []
results = soup.find_all("p")
for result in results:  # every paragraph is a candidate announcement
    a = result.find("a", href=True)
    # Skip paragraphs with no link, or whose first link is not on aws.amazon.com.
    if not a or not a["href"].startswith("https://aws.amazon.com"):
        continue

    announcement_title = a.get_text(strip=True)
    print(announcement_title)

    # Non-empty, whitespace-stripped text fragments of the paragraph in document
    # order. The summary is the second fragment; a paragraph that only contains
    # the link text gets an empty summary instead of raising IndexError.
    text_parts = list(result.stripped_strings)
    announcement_summary = text_parts[1] if len(text_parts) > 1 else ""

    announcements.append({
        "title": announcement_title,
        "summary": announcement_summary,
        "link": a["href"],
    })

print(f"No of announcements: {len(announcements)}")

Introducing Amazon Q Generative SQL in Amazon Redshift (Preview)
Introducing new AI-driven scaling and optimizations in Amazon Redshift Serverless (Preview)
Announcing Amazon Aurora PostgreSQL zero-ETL integration with Amazon Redshift (Preview)
Announcing Amazon RDS for MySQL zero-ETL integration with Amazon Redshift (Preview)
Announcing Amazon DynamoDB zero-ETL integration with Amazon Redshift (Preview)
Introducing highly durable Amazon OpenSearch Service clusters with 30% price/performance improvement
AWS Clean Rooms Differential Privacy enhances privacy protection of your users’ data (preview)
AWS Clean Rooms ML helps customers and partners apply ML models without sharing raw data (preview)
Announcing Amazon OpenSearch Service zero-ETL integration with Amazon S3 (preview)
New generative AI capabilities for Amazon DataZone further simplify data cataloging and discovery (preview)
Analyze large amounts of graph data to get insights and find trends with Amazon Neptune Analytics
New Amaz

In [8]:
# Download every announcement post and store its text under the "content" key.
for announcement in announcements:
    # Bounded wait per request so one stalled download cannot hang the loop.
    r = requests.get(announcement["link"], timeout=30)
    # Fail loudly on an HTTP error rather than storing an error page as content.
    r.raise_for_status()

    s = BeautifulSoup(r.text, "html.parser")
    # Text of every <section> element, joined with a single space so the last
    # word of one section is not fused with the first word of the next.
    announcement["content"] = " ".join(
        section.get_text(strip=True, separator=" ")
        for section in s.find_all("section")
    )

In [41]:
from llama_index.core.node_parser import SentenceSplitter

# Splitter used to cut each announcement's text into chunks before indexing:
# chunk_size 512, no overlap between consecutive chunks, "." as the separator.
text_parser = SentenceSplitter(separator=".", chunk_overlap=0, chunk_size=512)

import chromadb
import uuid

# Set up the Chroma collection (cosine distance) and load the chunked
# announcement text into it.
# NOTE(review): create_collection is expected to raise if this cell is re-run
# in a kernel where the collection already exists — confirm, and restart the
# kernel (or delete the collection) before re-running.
chroma_client = chromadb.Client()
collection = chroma_client.create_collection(
    name="awsreinvent2023_collection",
    metadata={"hnsw:space": "cosine"} # l2 is the default
)

chunk_count = 0
for announcement in announcements:
    chunks = text_parser.split_text(announcement["content"])
    if not chunks:
        # Nothing to index for this announcement; avoid an empty add() call.
        continue

    # Every chunk carries the metadata of the announcement it came from.
    chunk_metadata = {
        "title": announcement["title"],
        "summary": announcement["summary"],
        "link": announcement["link"],
    }

    # One add() per announcement instead of one per chunk: same records, far
    # fewer calls into the collection. Each chunk still gets its own random id.
    collection.add(
        documents=chunks,
        metadatas=[dict(chunk_metadata) for _ in chunks],
        ids=[str(uuid.uuid4()) for _ in chunks],
    )
    chunk_count += len(chunks)

print(f"{chunk_count} chunks added to chroma db")

235 chunks added to chroma db


In [44]:
# Query the collection for the three chunks closest to the question below.
results = collection.query(
    n_results=3,
    query_texts=["what are the major announcements about security improvements"],
)

# Display the raw query response (ids, distances, metadatas, ...).
results

{'ids': [['a7b56f25-4975-4ab2-a180-4a1a953c4538',
   '4fbc5dd9-34fa-4641-a655-de4f6684cb9a',
   '0cb8492e-fda7-4158-9666-b005de47986c']],
 'distances': [[0.6450698375701904, 0.6631356477737427, 0.6663980484008789]],
 'metadatas': [[{'link': 'https://aws.amazon.com/blogs/aws/amazon-detective-adds-investigations-and-finding-group-summaries-to-help-you-investigate-security-findings/',
    'summary': 'Amazon Detective adds four new capabilities to help you save time and strengthen your security operations.',
    'title': 'Amazon Detective adds new capabilities to accelerate and improve your cloud security investigations'},
   {'link': 'https://aws.amazon.com/blogs/aws/introducing-amazon-guardduty-ecs-runtime-monitoring-including-aws-fargate/',
    'summary': 'The new capability helps detect potential runtime security issues in Amazon Elastic Container Service (Amazon ECS) clusters running on both AWS Fargate and Amazon Elastic Compute Cloud (Amazon EC2).',
    'title': 'Detect runtime secu