# Ingesting an entire textbook

In [2]:
# Using the PyPDF2 library to read a PDF file
import PyPDF2       # conda install PyPDF2
from tqdm import tqdm 

In [3]:
from openai import OpenAI
from datetime import datetime, timezone
import hashlib
import re
import os
from tqdm import tqdm
import numpy as np

import logging
import keyring

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [4]:
# Get OPENAI API API KEY values

openai_api_key = keyring.get_password('openai', 'ahn283')

client = OpenAI(
    api_key=openai_api_key
)
INDEX_NAME = 'semantic-search'
NAMESPACE = 'default'
ENGINE = 'text-embedding-ada-002'

In [5]:
# helper functions to get lists of embeddings from the OpenAI API
def get_embeddings(texts, engine=ENGINE):
    response = client.embeddings.create(
        input=texts,
        model=engine
    )
    
    return [d.embedding for d in list(response.data)]

def get_embedding(text):
    return get_embeddings([text])[0]

len(get_embedding('hi')), len(get_embeddings(['hi', 'hello']))

(1536, 2)

In [6]:
# # Open the PDF file in read-binary mode
# with open('./data/pds2.pdf', 'rb') as file:

# file path
file_path = './data/pds2.pdf'

# Creating a PDF reader object
reader = PyPDF2.PdfReader(file_path)
    
# Intializing an empty string hold the next
principles_of_ds = ""
    
# Loop through each page in the PDF file
for page in tqdm(reader.pages):
        
    # Extract the text from the page
    text = page.extract_text()
        
    # Find the starting point of the next we want to extact
    # In this case, we are extracting text starting from the string']' 
    principles_of_ds += '\n\n' + text[text.find(']') + 2:]
        
# Strip any leading or trailing whitespace from the resulting string
principles_of_ds = principles_of_ds.strip()

100%|██████████| 428/428 [07:19<00:00,  1.03s/it]


In [7]:
principles_of_ds



# Chunking the textbook with and without overlap

In [9]:
# Function to split the text into chunks of a maximum number of tokens.
import re
import tiktoken
# from tokenizers import BertWordPieceTokenizer
# tokenizer = BertWordPieceTokenizer(lowercase=False, strip_accents=False)

tokenizer = tiktoken.get_encoding('cl100k_base')


def overlapping_chunks(text, max_tokens=500, overlapping_factor=5):
    """ 
    max_tokens : tokens we want per chunk
    overlapping_factor : number of sentences to start each chunk with that overlaps with the previous chunk
    """
    
    # Split the text using punctuation
    sentences = re.split(r'[.?!]', text)
    
    # Get the number of tokens for each sentence
    n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]
    chunks, tokens_so_far, chunk = [], 0, []
    
    # Loop through the sentences and tokens joined together in a tuple
    for sentence, token in zip(sentences, n_tokens):
        
        # If the number of tokens so far plus the number of tokens in the current sentence is greater
        # than the max number of tokens, then add the chunk to the list of chunks and reset
        # the chunk and tokens so far
        if tokens_so_far + token > max_tokens:
            chunks.append(". ".join(chunk) + ".")
            
            if overlapping_factor > 0:
                chunk = chunk[-overlapping_factor:]
                tokens_so_far = sum([len(tokenizer.encode(c)) for c in chunk])
            else:
                chunk = []
                tokens_so_far = 0
                
        # If the number of tokens in the current sentence is greater than the max number of
        # tokens, go to the next sentence
        if token > max_tokens:
            continue
        
        # Otherwise, add the sentence to the chunk and add the number of tokens to the total
        chunk.append(sentence)
        
        tokens_so_far += token + 1
    
    if chunk:
        chunk.append(". ".join(chunk) + ".")
        
    return chunks

In [10]:
split = overlapping_chunks(principles_of_ds, overlapping_factor=0)
avg_length = sum([len(tokenizer.encode(t)) for t in split]) / len(split)
print(f'non-overlapping chunking approach has {len(split)} document with average length {avg_length:.1f} tokens')

non-overlapping chunking approach has 286 document with average length 474.2 tokens


In [11]:
# with 5 overlapping sentences per chunk

split = overlapping_chunks(principles_of_ds, overlapping_factor=5)
avg_length = sum([len(tokenizer.encode(t)) for t in split]) / len(split)
print(f'overlapping chunking approach has {len(split)} documents with average length {avg_length:.1f} tokens')

overlapping chunking approach has 393 documents with average length 485.6 tokens


# Chunking the textbook with natural whitespace

In [12]:
# Import the Counter and re libraries
from collections import Counter
import re

# Find all occurences of one or more spaces in 'principles_of_ds'
matches = re.findall(r'[\s]{1,}', principles_of_ds)

# The 5 most frequent spaces that occur in the document
most_common_spaces = Counter(matches).most_common(5)

# Print the most common spaces and their frequencies
print(most_common_spaces)

[(' ', 82243), ('\n', 9220), ('  ', 1592), ('\n\n', 339), ('\n   ', 250)]


# Clustering pages of the document by semantic similarity

In [13]:
# create embeddings
embeddings = None
for s in tqdm(range(0, len(split), 100)):
    if embeddings is None:
        embeddings = np.array(get_embeddings(split[s:s+100], engine=ENGINE))
    else:
        embeddings = np.vstack([embeddings, np.array(get_embeddings(split[s:s+100], engine=ENGINE))])


100%|██████████| 4/4 [00:09<00:00,  2.33s/it]


In [21]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Assume you have a list of text embeddings called 'embeddings' 
# First, compute the cosine similarity matrix between all pairs of embeddings
cosine_sim_matrix = cosine_similarity(embeddings)

# Instantiate the AgglomerativeClustering model
agg_clustering = AgglomerativeClustering(
    n_clusters=None,            # The algorithm will determine the optimal number of clusters based on the data
    distance_threshold=0.1,     # Clusters will be formed until all pairwise distances between clusters are greater than 0.1
    # affinity='precomputed',     # We are providing a precomputed distance matrix (1 - similarity matrix) as input # deprecated
    metric='precomputed',
    linkage='complete'          # form clusters by iteratively merging the smallest clusters based on the maximum distance between their components
)

# Fit the model to the cosine distance matrix (1 - similarity matrix)
agg_clustering.fit(1 - cosine_sim_matrix)

# Get the cluster labels from each embedding
cluster_labels = agg_clustering.labels_

# print the number of embeddings in each cluster
unique_labels, counts = np.unique(cluster_labels, return_counts=True)
for label, count in zip(unique_labels, counts):
    print(f'Cluster {label}: {count} embeddings')

Cluster 0: 2 embeddings
Cluster 1: 4 embeddings
Cluster 2: 2 embeddings
Cluster 3: 2 embeddings
Cluster 4: 2 embeddings
Cluster 5: 2 embeddings
Cluster 6: 2 embeddings
Cluster 7: 2 embeddings
Cluster 8: 3 embeddings
Cluster 9: 2 embeddings
Cluster 10: 2 embeddings
Cluster 11: 2 embeddings
Cluster 12: 4 embeddings
Cluster 13: 2 embeddings
Cluster 14: 2 embeddings
Cluster 15: 2 embeddings
Cluster 16: 2 embeddings
Cluster 17: 2 embeddings
Cluster 18: 2 embeddings
Cluster 19: 2 embeddings
Cluster 20: 2 embeddings
Cluster 21: 2 embeddings
Cluster 22: 2 embeddings
Cluster 23: 2 embeddings
Cluster 24: 2 embeddings
Cluster 25: 2 embeddings
Cluster 26: 2 embeddings
Cluster 27: 2 embeddings
Cluster 28: 2 embeddings
Cluster 29: 2 embeddings
Cluster 30: 2 embeddings
Cluster 31: 4 embeddings
Cluster 32: 2 embeddings
Cluster 33: 2 embeddings
Cluster 34: 1 embeddings
Cluster 35: 2 embeddings
Cluster 36: 2 embeddings
Cluster 37: 2 embeddings
Cluster 38: 2 embeddings
Cluster 39: 3 embeddings
Cluster 40