In [2]:
import requests

# URL of the "top announcements of AWS re:Invent 2023" blog post to scrape.
url = "https://aws.amazon.com/blogs/aws/top-announcements-of-aws-reinvent-2023/"

# The timeout keeps the cell from hanging indefinitely on a stalled connection;
# raise_for_status() stops here on an HTTP error response instead of handing
# an error page to the code that parses `page.text`.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Number of characters in the downloaded HTML (quick sanity check).
len(page.text)

107219

In [3]:
from bs4 import BeautifulSoup

# Parse the downloaded page and collect one dict per announcement:
#   "title"   - text of the paragraph's first link
#   "summary" - second text fragment of the paragraph
#   "link"    - the link's href
soup = BeautifulSoup(page.text, "html.parser")

announcements = []
results = soup.find_all("p")
for result in results:  # every <p> is a candidate announcement
    a = result.find("a", href=True)
    # Keep only paragraphs whose first link points at aws.amazon.com.
    if not (a and a["href"].startswith("https://aws.amazon.com")):
        continue

    # Break the paragraph into its stripped text fragments by joining them
    # with a sentinel and splitting on it; fragment 1 is used as the summary.
    fragments = result.get_text(separator="#@#", strip=True).split("#@#")
    if len(fragments) < 2:
        # Only one fragment (e.g. a paragraph that is nothing but a link):
        # there is no summary to read, and indexing [1] would raise
        # IndexError and abort the whole scrape. Skip such paragraphs.
        continue

    announcement_title = a.get_text(strip=True)
    print(announcement_title)

    announcements.append({
        "title": announcement_title,
        "summary": fragments[1],
        "link": a["href"],
    })

print(f"No of announcements: {len(announcements)}")

Introducing Amazon Q Generative SQL in Amazon Redshift (Preview)
Introducing new AI-driven scaling and optimizations in Amazon Redshift Serverless (Preview)
Announcing Amazon Aurora PostgreSQL zero-ETL integration with Amazon Redshift (Preview)
Announcing Amazon RDS for MySQL zero-ETL integration with Amazon Redshift (Preview)
Announcing Amazon DynamoDB zero-ETL integration with Amazon Redshift (Preview)
Introducing highly durable Amazon OpenSearch Service clusters with 30% price/performance improvement
AWS Clean Rooms Differential Privacy enhances privacy protection of your users’ data (preview)
AWS Clean Rooms ML helps customers and partners apply ML models without sharing raw data (preview)
Announcing Amazon OpenSearch Service zero-ETL integration with Amazon S3 (preview)
New generative AI capabilities for Amazon DataZone further simplify data cataloging and discovery (preview)
Analyze large amounts of graph data to get insights and find trends with Amazon Neptune Analytics
New Amaz

In [4]:
# Download each announcement's own page and store the text of its <section>
# elements under announcement["content"].
for announcement in announcements:
    announcement_link = announcement["link"]

    # Bound the wait on each request and fail loudly on an HTTP error so an
    # error page is never stored as announcement content.
    r = requests.get(announcement_link, timeout=30)
    r.raise_for_status()

    s = BeautifulSoup(r.text, "html.parser")
    section_texts = (
        section.get_text(strip=True, separator=" ")
        for section in s.find_all("section")
    )
    # Join with a space: plain concatenation glued the last word of one
    # section to the first word of the next. Empty sections are dropped so
    # they do not contribute stray separators.
    # NOTE(review): if <section> elements are nested, the inner text appears
    # once per enclosing section - confirm against the page markup.
    announcement["content"] = " ".join(text for text in section_texts if text)

In [9]:
from llama_index.core.node_parser import SentenceSplitter

# Splitter configuration: chunk_size 512, "." as the separator, and no
# overlap between consecutive chunks.
text_parser = SentenceSplitter(chunk_size=512, chunk_overlap=0, separator=".")

# Split every announcement's content and keep the resulting chunk list on the
# announcement dict itself under "chunks".
for announcement in announcements:
    announcement["chunks"] = text_parser.split_text(announcement["content"])

# Total number of chunks across all announcements.
chunk_count = sum(len(entry["chunks"]) for entry in announcements)

print(f"{chunk_count} chunks after splitting")

235 chunks after splitting


In [10]:
from llama_index.core import Document

# Wrap every chunk in a Document that carries its parent announcement's title
# and summary as metadata.
documents = [
    Document(
        text=chunk,
        metadata={"title": announcement["title"], "summary": announcement["summary"]},
    )
    for announcement in announcements
    for chunk in announcement["chunks"]
]

# Report the count rather than printing the whole list: dumping every
# Document repr floods the cell output without telling the reader anything.
print(f"{len(documents)} documents created")



In [20]:
import chromadb

# Create an in-memory client and fetch-or-create the collection.
# get_or_create_collection is used instead of create_collection so the cell
# can be re-run in the same kernel: create_collection raises when a collection
# with this name already exists.
chroma_client = chromadb.EphemeralClient()
chroma_collection = chroma_client.get_or_create_collection(
    name="awsreinvent2023_announcements_collection",
    metadata={"hnsw:space": "cosine"},  # l2 is the default
)
# NOTE(review): when the collection already exists it keeps whatever records
# were added earlier; restart the kernel for a clean index - confirm this is
# the desired re-run behaviour.

In [21]:
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Embedding model used when the index embeds the documents.
embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Back the index with the Chroma collection, then build it from the documents.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index_options = dict(
    storage_context=storage_context,
    show_progress=True,
    embed_model=embed_model,
)
index = VectorStoreIndex.from_documents(documents, **index_options)

Parsing nodes: 100%|██████████| 235/235 [00:00<00:00, 751.97it/s]
Generating embeddings: 100%|██████████| 235/235 [01:14<00:00,  3.14it/s]


In [38]:
# Query the index: fetch the top 3 matches by similarity for a sample question
# and print each match's score, metadata and text.
retriever = index.as_retriever(similarity_top_k=3)
nodes = retriever.retrieve("List some announcements about dynamodb")
for hit in nodes:
    score, metadata, text = hit.get_score(), hit.metadata, hit.get_text()
    print(score, metadata, text)

0.8621403134099135 {'title': 'Amazon DynamoDB zero-ETL integration with Amazon OpenSearch Service is now available', 'summary': 'This capability lets you perform a search on your DynamoDB data by automatically replicating and transforming it without custom code or infrastructure.'} To learn more, see DynamoDB zero-ETL integration with Amazon OpenSearch Service and Using an OpenSearch Ingestion pipeline with Amazon DynamoDB in the AWS documentation. Give it a try and send feedback to AWS re:Post for Amazon OpenSearch Service or through your usual AWS Support contacts. — Channy
0.8579389615031549 {'title': 'Amazon DynamoDB zero-ETL integration with Amazon OpenSearch Service is now available', 'summary': 'This capability lets you perform a search on your DynamoDB data by automatically replicating and transforming it without custom code or infrastructure.'} Today, we are announcing the general availability of Amazon DynamoDB zero-ETL integration with Amazon OpenSearch Service , which lets 