In [44]:
import requests
import re
import openai
import json

from bs4 import BeautifulSoup
from urllib.parse import urljoin
from typing import List, Dict, Tuple
from tqdm import tqdm

In [4]:
def scrape_links(url):
    response = requests.get(url)    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all links in page
    links = soup.find_all('a')    
    return [urljoin(url, link.get('href')) for link in links if link.get('href')]

In [5]:
# Traverse through all links using BFS
def scrape_notion_help_center(start_url, bad_prefix = 'https://www.notion.so/help/notion-academy/'):
    visited = set()
    to_visit = [start_url]
    
    while to_visit:
        current_url = to_visit.pop(0)
        
        if current_url in visited:
            continue
        elif current_url.startswith(bad_prefix):
            continue
        
        print(f"Scraping: {current_url}")
        visited.add(current_url)
        
        links = scrape_links(current_url)
                
        for link in links:
            if link.startswith(start_url) and link not in visited:
                to_visit.append(link)
    
    return visited

In [6]:
start_url = "https://www.notion.so/help"
all_links = scrape_notion_help_center(start_url)

Scraping: https://www.notion.so/help
Scraping: https://www.notion.so/help/reference
Scraping: https://www.notion.so/help/guides
Scraping: https://www.notion.so/help/category/new-to-notion
Scraping: https://www.notion.so/help/start-here
Scraping: https://www.notion.so/help/what-is-a-block
Scraping: https://www.notion.so/help/create-your-first-page
Scraping: https://www.notion.so/help/category/meet-your-workspace
Scraping: https://www.notion.so/help/intro-to-workspaces
Scraping: https://www.notion.so/help/navigate-with-the-sidebar
Scraping: https://www.notion.so/help/create-delete-and-switch-workspaces
Scraping: https://www.notion.so/help/category/write-edit-and-customize
Scraping: https://www.notion.so/help/writing-and-editing-basics
Scraping: https://www.notion.so/help/columns-headings-and-dividers
Scraping: https://www.notion.so/help/customize-and-style-your-content
Scraping: https://www.notion.so/help/category/databases
Scraping: https://www.notion.so/help/intro-to-databases
Scraping

In [8]:
print("Num Links:", len(all_links))
for idx, link in enumerate(all_links):
    print("Idx:", idx, "Link:", link)

Num Links: 990
Idx: 0 Link: https://www.notion.so/help/guides/10-ai-prompts-to-help-marketers-write-better-copy-faster
Idx: 1 Link: https://www.notion.so/help/workspaces-on-mobile#faq
Idx: 2 Link: https://www.notion.so/help/gdpr-at-notion#subprocessors
Idx: 3 Link: https://www.notion.so/help/export-your-content#export-your-entire-workspace
Idx: 4 Link: https://www.notion.so/help/add-and-manage-connections-with-the-api#add-connections-to-pages
Idx: 5 Link: https://www.notion.so/help/guides/5-steps-to-adopt-notion-for-your-entire-organization
Idx: 6 Link: https://www.notion.so/help/notion-calendar-for-teams#see-a-teammate's-calendar
Idx: 7 Link: https://www.notion.so/help/intro-to-workspaces#editor-tools
Idx: 8 Link: https://www.notion.so/help/formulas#adding-a-formula-property
Idx: 9 Link: https://www.notion.so/help/code-blocks#add-a-code-block
Idx: 10 Link: https://www.notion.so/help/category/enterprise-admin
Idx: 11 Link: https://www.notion.so/help/sales-tax
Idx: 12 Link: https://www.

In [60]:
import tiktoken

def num_tokens_from_string(string: str, model_name: str = "gpt-3.5-turbo-16k"):
    """
    Returns the number of tokens in a text string.
    """
    
    encoding = tiktoken.encoding_for_model(model_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def fetch_and_parse_html(url: str):
    """
    Fetch HTML content from a URL and parse it with BeautifulSoup.
    """
    
    response = requests.get(url)
    return BeautifulSoup(response.content, 'html.parser')

def get_content_blocks(soup: BeautifulSoup):
    """
    Extract content blocks (headers with associated content) from parsed HTML.
    """
    
    blocks = []
    current_header = None
    current_content = []

    for element in soup.body.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol']):
        if element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            if current_header:
                blocks.append((current_header, ''.join(map(str, current_content))))
                
            current_header = str(element)
            current_content = []
            
        else:
            current_content.append(str(element))

    if current_header:
        blocks.append((current_header, ''.join(map(str, current_content))))

    return blocks


def split_content_blocks(blocks: List[Tuple[str, str]], max_tokens: int):
    """
    Split content blocks into parts that fit within the token limit.
    """
    
    parts = []
    current_part = ""
    current_tokens = 0

    for header, content in blocks:
        block = header + content
        block_tokens = num_tokens_from_string(block)

        if current_tokens + block_tokens > max_tokens:
            if current_part:
                parts.append(current_part)
                current_part = ""
                current_tokens = 0

        if block_tokens > max_tokens:
            # If a single block exceeds max_tokens, split it further
            words = block.split()
            temp_part = ""
            
            for word in words:
                if num_tokens_from_string(temp_part + word) > max_tokens:
                    parts.append(temp_part)
                    temp_part = word + " "
                    
                else:
                    temp_part += word + " "
                    
            if temp_part:
                current_part += temp_part
                current_tokens = num_tokens_from_string(current_part)
                
        else:
            current_part += block
            current_tokens += block_tokens

    if current_part:
        parts.append(current_part)

    return parts

In [109]:
import random

# Input + Ouput + Prompt Buffer <= 16k tokens (gpt-3.5-turbo-16k model max tokens)
HTML_TOKEN_INPUT_UPPERBOUND = 4096
MODEL_TOKEN_OUTPUT_UPPERBOUND = 7500
PROMPT_BUFFER = 1000

# Normally would store as environment variable, but repo is private so this seems okay? Would have to ask
OPENAI_KEY = "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"

client = openai.OpenAI(api_key = OPENAI_KEY)

def split_into_chunks(url: str, max_chars: int = 750, model = "gpt-3.5-turbo-16k") -> List[str]:
    """
    Utilize LLM to split the HTML content into chunks of 'max_chars' size.
    """

    CHARS_PER_WORD = 4.7
    prompt_template = f"""
You are an AI assistant specializing in content analysis and chunking for RAG (Retrieval-Augmented Generation) systems. Your task is to extract the relevant text from HTML and partition it into chunk of english text roughly {max_chars // CHARS_PER_WORD} words long. Make sure to keep headers and paragraphs together, and don't break up bulleted lists mid-list. Each of your chunks should be roughly {max_chars // CHARS_PER_WORD} words or less, but they could be more if it's necessary to keep relevant content together. 

Here's the HTML content to process:
{{}}

Please provide your output as a valid JSON array of chunks, where each chunk is the extracted english and NOT HTML.
"""

    all_chunks = []
    
    try:
        processed_soup = fetch_and_parse_html(url)
        blocks = get_content_blocks(processed_soup)
        html_parts = split_content_blocks(blocks, HTML_TOKEN_INPUT_UPPERBOUND)
        
        for part in html_parts:
            prompt = prompt_template.format(part)
                                    
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a content chunking assistant."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.1,
                max_tokens=MODEL_TOKEN_OUTPUT_UPPERBOUND + PROMPT_BUFFER,
            )
            
            chunks_json = response.choices[0].message.content.strip()
                        
            try:
                chunks = json.loads(chunks_json)                
                
                # Eliminate chunks that are too small or too large (language models are finicky, so range of characters is wide)
                all_chunks.extend([c for c in chunks if len(c) >= max_chars - 250 and len(c) <= max_chars + 250])
                
            except Exception as e:
                # Sometimes we get malformed output, can't load as a valid json
                continue
    
    except Exception as e:
        print(f"Error in splitting content: {e}")
        return [] 

    return all_chunks

def process_articles(links: List[str]) -> List[Dict[str, List[str]]]:
    """
    Process array of links, returns array of chunks to be processed by RAG-type system
    """
    
    processed_articles = []
    
    for link in tqdm(links, desc = "Processing Links", unit = "Link"):        
        chunks = split_into_chunks(link) + ["TMP"]
        
        processed_articles.append({
            "url": link,
            "chunks": chunks
        })
        
        print("Current URL:", link)
        print("Number of extracted chunks:", len(chunks))
        print("Example chunk:", chunks[0])
    
    return processed_articles

In [110]:
# Testing purposes
NUM_SAMPLE_ARTICLES = 5

articles_processed = process_articles(list(all_links)[:NUM_SAMPLE_ARTICLES])

for article in articles_processed[:NUM_SAMPLE_ARTICLES]:
    print("URL:", article['url'])
    print("Num Chunks:", len(article['chunks']))
    
    for chunk in article['chunks']:
        print("Len Chunk:", len(chunk))
        print("Chunk Text:\n", chunk)

Processing Links:  20%|██        | 1/5 [00:24<01:38, 24.51s/Link]

Current URL: https://www.notion.so/help/guides/10-ai-prompts-to-help-marketers-write-better-copy-faster
Number of extracted chunks: 10
Example chunk: Social Media Posts

Publishing social media posts every day is essential to keep your audiences engaged and your brand top of mind for your followers, but it can be challenging to think of new content ideas every day.

One of the best ways to use AI for social media posts is to ask for a series of social media posts based on another piece of content, like a blog post.

Try the following prompt on a page containing a blog post draft:

Create a series of 10 [LinkedIn] posts based on the content of [this article]. Make the posts eye-catching and engaging to B2B decision-makers, so they feel motivated to request a demo.

AI can also help you write captions for your Instagram posts if you explain the content of the image or video and what you want to talk about in the caption.


Processing Links:  40%|████      | 2/5 [00:54<01:22, 27.53s/Link]

Current URL: https://www.notion.so/help/workspaces-on-mobile#faq
Number of extracted chunks: 6
Example chunk: Updates menu

Tap the 🔔 at the bottom to open this menu, which shows you notifications and changes that are relevant to you on Notion.

Tap Inbox to see all revisions on pages you follow, and all mentions of you in the workspace.

Tap Followed just to see edits made on pages you follow.

Tap All to see all edits made by all users across the entire workspace.

Tap This page to see changes, comments and mentions relevant only to the page you're currently looking at. You can also choose to follow the page here. Click on the clock icon to the right of any revision to go to that version (or restore it) in Page history.


Processing Links:  60%|██████    | 3/5 [01:07<00:41, 20.87s/Link]

Current URL: https://www.notion.so/help/gdpr-at-notion#subprocessors
Number of extracted chunks: 2
Example chunk: GDPR at Notion

At Notion, we're committed to the success of our customers and the protection of their data through complying with the General Data Protection Regulation (GDPR) and other privacy-related regulations 🇪🇺

Data Processing Addendum

Data portability & management tools

Data transfers

Subprocessors

The General Data Protection Regulation (GDPR) is a comprehensive data protection law that regulates the use of EU residents’ personal data, providing individuals rights to exercise control over their data and requiring organizations that process personal data to meet certain obligations.


Processing Links:  80%|████████  | 4/5 [01:37<00:24, 24.67s/Link]

Current URL: https://www.notion.so/help/export-your-content#export-your-entire-workspace
Number of extracted chunks: 10
Example chunk: Export your content

Need to share your content in PDF, CSV, or HTML format? We got you covered! You can export a Notion page, database, or entire workspace at any time 📤

Export as PDF

On desktop

On mobile

Export as HTML

On desktop

On mobile

Export as Markdown & CSV

On desktop

On mobile

Print a Notion page

On your browser

On the desktop app

Export your entire workspace

Export as PDF

You can save any Notion page or database to your computer as a PDF file. This is one way to back up your information if you want to keep it in some form on your hard drive.


Processing Links: 100%|██████████| 5/5 [01:56<00:00, 23.21s/Link]

Current URL: https://www.notion.so/help/add-and-manage-connections-with-the-api#add-connections-to-pages
Number of extracted chunks: 4
Example chunk: Add and manage integrations

You can connect other software to Notion, automate actions within your workspace, and access connections built by our partners 🤖

Add connections in your workspace

Add connections to pages

Manage connections in your workspace

Install from a developer

Install directly from a partner platform via OAuth

Install via internal integration token

Note: On Enterprise Plans, the following can be restricted to workspace owners:

The ability to add connections

The ability to install workspace-wide security and compliance integrations

The ability to add connections

The ability to install workspace-wide security and compliance integrations

Members can add connections to a workspace in Settings & members → My connections. Workspace owners can add connections to a workspace in Settings & members → Connections. Once 




In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

class RAGLookupSystem:
    def __init__(self, processed_articles: List[Dict[str, any]]):
        self.processed_articles = processed_articles
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.chunk_embeddings = []
        self.chunks = []
        self.urls = []
        self.client = openai.OpenAI(api_key = OPENAI_KEY)

        for article in processed_articles:
            url = article['url']
            
            for chunk in article['chunks']:
                self.chunks.append(chunk)
                self.urls.append(url)
        
        self.chunk_embeddings = self.model.encode(self.chunks)

    def find_relevant_chunks(self, query: str, top_k: int = 10) -> List[Dict[str, any]]:
        query_embedding = self.model.encode([query])
        similarities = cosine_similarity(query_embedding, self.chunk_embeddings)[0]
        top_indices = np.argsort(similarities)[-top_k:][::-1]
        
        relevant_chunks = []
        
        for idx in top_indices:
            relevant_chunks.append({
                "chunk": self.chunks[idx],
                "url": self.urls[idx],
                "similarity": similarities[idx]
            })
        
        return relevant_chunks

    def perform_rag_lookup(self, query: str) -> str:
        relevant_chunks = self.find_relevant_chunks(query)
        
        context = "\n\n".join([f"Chunk from {chunk['url']}:\n{chunk['chunk']}" for chunk in relevant_chunks])
        
        prompt = f"""Given the following context and query, provide a comprehensive answer. 
        If the context doesn't contain relevant information, say so and provide a general response based on your knowledge.

        Context:
        {context}

        Query: {query}

        Answer:"""

        response = self.client.chat.completions.create(
            model="gpt-3.5-turbo-16k",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that provides accurate information based on the given context."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            max_tokens=4000
        )   
        
        return response.choices[0].message.content.strip()

  from tqdm.autonotebook import tqdm, trange


NameError: name 'List' is not defined

In [None]:
rag_system = RAGLookupSystem(articles_processed)

queries = [
    'How do I export my entire notion workspace?', 
    'What steps should I take to print a notion page?', 
    'How should I use editor tools for notion on mobile?'
]

for cur_query in queries:
    response = rag_system.perform_rag_lookup(cur_query)
    
    print("Query:", cur_query)
    print("Response:", response)

NameError: name 'RAGLookupSystem' is not defined