In [3]:
import requests
import re
import openai
import json

from bs4 import BeautifulSoup
from urllib.parse import urljoin
from typing import List, Dict, Tuple
from tqdm import tqdm

In [4]:
def scrape_links(url):
    response = requests.get(url)    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all links in page
    links = soup.find_all('a')    
    return [urljoin(url, link.get('href')) for link in links if link.get('href')]

In [10]:
# Traverse through all links using BFS
def scrape_notion_help_center(start_url, bad_prefix = 'https://www.notion.so/help/notion-academy/', MAX_LINKS = 20):
    visited = set()
    to_visit = [start_url]
    
    while to_visit and len(visited) < MAX_LINKS:
        current_url = to_visit.pop(0)
        
        if current_url in visited:
            continue
        elif current_url.startswith(bad_prefix):
            continue
        
        print(f"Scraping: {current_url}")
        visited.add(current_url)
        
        links = scrape_links(current_url)
                
        for link in links:
            if link.startswith(start_url) and link not in visited:
                to_visit.append(link)
    
    return visited

In [11]:
start_url = "https://www.notion.so/help"
all_links = scrape_notion_help_center(start_url, MAX_LINKS = 20)

Scraping: https://www.notion.so/help
Scraping: https://www.notion.so/help/reference
Scraping: https://www.notion.so/help/guides
Scraping: https://www.notion.so/help/category/new-to-notion
Scraping: https://www.notion.so/help/start-here
Scraping: https://www.notion.so/help/what-is-a-block
Scraping: https://www.notion.so/help/create-your-first-page
Scraping: https://www.notion.so/help/category/meet-your-workspace
Scraping: https://www.notion.so/help/intro-to-workspaces
Scraping: https://www.notion.so/help/navigate-with-the-sidebar
Scraping: https://www.notion.so/help/create-delete-and-switch-workspaces
Scraping: https://www.notion.so/help/category/write-edit-and-customize
Scraping: https://www.notion.so/help/writing-and-editing-basics
Scraping: https://www.notion.so/help/columns-headings-and-dividers
Scraping: https://www.notion.so/help/customize-and-style-your-content
Scraping: https://www.notion.so/help/category/databases
Scraping: https://www.notion.so/help/intro-to-databases
Scraping

In [12]:
print("Num Links:", len(all_links))
for idx, link in enumerate(all_links):
    print("Idx:", idx, "Link:", link)

Num Links: 20
Idx: 0 Link: https://www.notion.so/help/what-is-a-block
Idx: 1 Link: https://www.notion.so/help
Idx: 2 Link: https://www.notion.so/help/guides
Idx: 3 Link: https://www.notion.so/help/category/write-edit-and-customize
Idx: 4 Link: https://www.notion.so/help/intro-to-workspaces
Idx: 5 Link: https://www.notion.so/help/guides/notion-ai-for-docs
Idx: 6 Link: https://www.notion.so/help/database-properties
Idx: 7 Link: https://www.notion.so/help/start-here
Idx: 8 Link: https://www.notion.so/help/create-your-first-page
Idx: 9 Link: https://www.notion.so/help/views-filters-and-sorts
Idx: 10 Link: https://www.notion.so/help/category/new-to-notion
Idx: 11 Link: https://www.notion.so/help/writing-and-editing-basics
Idx: 12 Link: https://www.notion.so/help/category/databases
Idx: 13 Link: https://www.notion.so/help/columns-headings-and-dividers
Idx: 14 Link: https://www.notion.so/help/navigate-with-the-sidebar
Idx: 15 Link: https://www.notion.so/help/customize-and-style-your-content
I

In [13]:
import tiktoken

def num_tokens_from_string(string: str, model_name: str = "gpt-3.5-turbo-16k"):
    """
    Returns the number of tokens in a text string.
    """
    
    encoding = tiktoken.encoding_for_model(model_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def fetch_and_parse_html(url: str):
    """
    Fetch HTML content from a URL and parse it with BeautifulSoup.
    """
    
    response = requests.get(url)
    return BeautifulSoup(response.content, 'html.parser')

def get_content_blocks(soup: BeautifulSoup):
    """
    Extract content blocks (headers with associated content) from parsed HTML.
    """
    
    blocks = []
    current_header = None
    current_content = []

    for element in soup.body.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol']):
        if element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            if current_header:
                blocks.append((current_header, ''.join(map(str, current_content))))
                
            current_header = str(element)
            current_content = []
            
        else:
            current_content.append(str(element))

    if current_header:
        blocks.append((current_header, ''.join(map(str, current_content))))

    return blocks


def split_content_blocks(blocks: List[Tuple[str, str]], max_tokens: int):
    """
    Split content blocks into parts that fit within the token limit.
    """
    
    parts = []
    current_part = ""
    current_tokens = 0

    for header, content in blocks:
        block = header + content
        block_tokens = num_tokens_from_string(block)

        if current_tokens + block_tokens > max_tokens:
            if current_part:
                parts.append(current_part)
                current_part = ""
                current_tokens = 0

        if block_tokens > max_tokens:
            # If a single block exceeds max_tokens, split it further
            words = block.split()
            temp_part = ""
            
            for word in words:
                if num_tokens_from_string(temp_part + word) > max_tokens:
                    parts.append(temp_part)
                    temp_part = word + " "
                    
                else:
                    temp_part += word + " "
                    
            if temp_part:
                current_part += temp_part
                current_tokens = num_tokens_from_string(current_part)
                
        else:
            current_part += block
            current_tokens += block_tokens

    if current_part:
        parts.append(current_part)

    return parts

In [14]:
import random

# Input + Ouput + Prompt Buffer <= 16k tokens (gpt-3.5-turbo-16k model max tokens)
HTML_TOKEN_INPUT_UPPERBOUND = 4096
MODEL_TOKEN_OUTPUT_UPPERBOUND = 7500
PROMPT_BUFFER = 1000

# Normally would store as environment variable, but repo is private so this seems okay? Would have to ask
OPENAI_KEY = "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"

client = openai.OpenAI(api_key = OPENAI_KEY)

def split_into_chunks(url: str, max_chars: int = 750, model = "gpt-3.5-turbo-16k") -> List[str]:
    """
    Utilize LLM to split the HTML content into chunks of 'max_chars' size.
    """

    CHARS_PER_WORD = 4.7
    prompt_template = f"""
You are an AI assistant specializing in content analysis and chunking for RAG (Retrieval-Augmented Generation) systems. Your task is to extract the relevant text from HTML and partition it into chunk of english text roughly {max_chars // CHARS_PER_WORD} words long. Make sure to keep headers and paragraphs together, and don't break up bulleted lists mid-list. Each of your chunks should be roughly {max_chars // CHARS_PER_WORD} words or less, but they could be more if it's necessary to keep relevant content together. 

Here's the HTML content to process:
{{}}

Please provide your output as a valid JSON array of chunks, where each chunk is the extracted english and NOT HTML.
"""

    all_chunks = []
    
    try:
        processed_soup = fetch_and_parse_html(url)
        blocks = get_content_blocks(processed_soup)
        html_parts = split_content_blocks(blocks, HTML_TOKEN_INPUT_UPPERBOUND)
        
        for part in html_parts:
            prompt = prompt_template.format(part)
                                    
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a content chunking assistant."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.1,
                max_tokens=MODEL_TOKEN_OUTPUT_UPPERBOUND + PROMPT_BUFFER,
            )
            
            chunks_json = response.choices[0].message.content.strip()
                        
            try:
                chunks = json.loads(chunks_json)                
                
                # Eliminate chunks that are too small or too large (language models are finicky, so range of characters is wide)
                all_chunks.extend([c for c in chunks if len(c) >= max_chars - 250 and len(c) <= max_chars + 250])
                
            except Exception as e:
                # Sometimes we get malformed output, can't load as a valid json
                continue
    
    except Exception as e:
        print(f"Error in splitting content: {e}")
        return [] 

    return all_chunks

def process_articles(links: List[str]) -> List[Dict[str, List[str]]]:
    """
    Process array of links, returns array of chunks to be processed by RAG-type system
    """
    
    processed_articles = []
    
    for link in tqdm(links, desc = "Processing Links", unit = "Link"):        
        chunks = split_into_chunks(link) + ["TMP"]
        
        processed_articles.append({
            "url": link,
            "chunks": chunks
        })
        
        print("Current URL:", link)
        print("Number of extracted chunks:", len(chunks))
        print("Example chunk:", chunks[0])
    
    return processed_articles

In [15]:
# Testing purposes
NUM_SAMPLE_ARTICLES = 5

articles_processed = process_articles(list(all_links)[:NUM_SAMPLE_ARTICLES])

for article in articles_processed[:NUM_SAMPLE_ARTICLES]:
    print("URL:", article['url'])
    print("Num Chunks:", len(article['chunks']))
    
    for chunk in article['chunks']:
        print("Len Chunk:", len(chunk))
        print("Chunk Text:\n", chunk)

Processing Links:   0%|          | 0/5 [00:00<?, ?Link/s]

Processing Links:  20%|██        | 1/5 [00:08<00:35,  8.80s/Link]

Current URL: https://www.notion.so/help/what-is-a-block
Number of extracted chunks: 1
Example chunk: TMP


Processing Links:  40%|████      | 2/5 [00:18<00:28,  9.40s/Link]

Current URL: https://www.notion.so/help
Number of extracted chunks: 1
Example chunk: TMP


Processing Links:  60%|██████    | 3/5 [00:31<00:22, 11.19s/Link]

Current URL: https://www.notion.so/help/guides
Number of extracted chunks: 1
Example chunk: TMP


Processing Links:  80%|████████  | 4/5 [00:43<00:11, 11.26s/Link]

Current URL: https://www.notion.so/help/category/write-edit-and-customize
Number of extracted chunks: 1
Example chunk: TMP


Processing Links: 100%|██████████| 5/5 [01:25<00:00, 17.18s/Link]

Current URL: https://www.notion.so/help/intro-to-workspaces
Number of extracted chunks: 11
Example chunk: Control panel

The control panel at the top of your sidebar contains several key features:

Workspace switcher: Click on your current workspace's name to switch between the workspaces you belong to, create a new one, join another one, or log out. You can also access your settings from inside the workspace switcher. Learn more about switching workspaces here →

Search: Click to open Notion's search window, where you can either type in what you're looking for or quickly jump to a recently visited page. Learn more about Search here →

Home: Click to see the pages and tasks that need your attention. Learn more about Home here →

Inbox: Click to see all your notifications in one place. This menu combines revisions that were made on pages you follow, mentions of you across your workspace, and new work assignments. A red notification badge will appear here when you have unread notificatio




In [16]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

class RAGLookupSystem:
    def __init__(self, processed_articles: List[Dict[str, any]]):
        self.processed_articles = processed_articles
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.chunk_embeddings = []
        self.chunks = []
        self.urls = []
        self.client = openai.OpenAI(api_key = OPENAI_KEY)

        for article in processed_articles:
            url = article['url']
            
            for chunk in article['chunks']:
                self.chunks.append(chunk)
                self.urls.append(url)
        
        self.chunk_embeddings = self.model.encode(self.chunks)

    def find_relevant_chunks(self, query: str, top_k: int = 10) -> List[Dict[str, any]]:
        query_embedding = self.model.encode([query])
        similarities = cosine_similarity(query_embedding, self.chunk_embeddings)[0]
        top_indices = np.argsort(similarities)[-top_k:][::-1]
        
        relevant_chunks = []
        
        for idx in top_indices:
            relevant_chunks.append({
                "chunk": self.chunks[idx],
                "url": self.urls[idx],
                "similarity": similarities[idx]
            })
        
        return relevant_chunks

    def perform_rag_lookup(self, query: str) -> str:
        relevant_chunks = self.find_relevant_chunks(query)
        
        context = "\n\n".join([f"Chunk from {chunk['url']}:\n{chunk['chunk']}" for chunk in relevant_chunks])
        
        prompt = f"""Given the following context and query, provide a comprehensive answer. 
        If the context doesn't contain relevant information, say so and provide a general response based on your knowledge.

        Context:
        {context}

        Query: {query}

        Answer:"""

        response = self.client.chat.completions.create(
            model="gpt-3.5-turbo-16k",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that provides accurate information based on the given context."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            max_tokens=4000
        )   
        
        return response.choices[0].message.content.strip()

In [18]:
rag_system = RAGLookupSystem(articles_processed)

queries = [
    'How do I switch my workspace in notion?', 
    'What steps should I take to access my update menu?', 
    'What is a notion block?'
]

for cur_query in queries:
    response = rag_system.perform_rag_lookup(cur_query)
    
    print("Query:", cur_query)
    print("Response:", response)



Query: How do I switch my workspace in notion?
Response: To switch your workspace in Notion, you can follow these steps:

1. Click on the workspace switcher located in the control panel at the top of your sidebar.
2. In the workspace switcher, click on the name of your current workspace.
3. A dropdown menu will appear with options to switch between the workspaces you belong to, create a new one, join another one, or log out.
4. Select the workspace you want to switch to from the dropdown menu.

Please note that this answer is based on the information provided in the given context. If there are any specific instructions or features related to switching workspaces in Notion that are not mentioned in the context, please refer to the official Notion documentation or contact Notion support for further assistance.
Query: What steps should I take to access my update menu?
Response: To access the update menu in Notion, follow these steps:

1. Open Notion and navigate to the page or workspace w