In [6]:
# I want to tokenize a book in my library, and then spit out key metrics like token count, unique token count, and token frequency.
# Use the following code as inspiration

"""Create the dataset for the thesis.

Making the dataset for the thesis, by loading datasources (currently PDFs)
and creating a persistent Chroma vector store from them.
"""

import os
import openai
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

import tiktoken
encoding = tiktoken.get_encoding("cl100k_base")
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")


def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

data_dir = f'../data/raw/DL_IanGoodfellow'


# Process each PDF file in the directory
all_docs = []
for filename in os.listdir(data_dir):
    if filename.endswith('.pdf'):
        file_path = os.path.join(data_dir, filename)

        # Load PDF document
        loader = PyPDFLoader(file_path)
        documents = loader.load()

        # Split documents into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=4,
            chunk_overlap=0,
            separators=["\n\n", "\n", r"(?<=\. )", " ", ""]
        )
        docs = text_splitter.split_documents(documents)
        print(f"{len(docs)} chunks created from {filename}")

        num_tokens = num_tokens_from_string(str(documents), "cl100k_base")
        print(f"Number of tokens: {num_tokens}")
        





523673 chunks created from DeepLearning_IanGoodfellow.pdf
Number of tokens: 470832


In [8]:
180000/470832

0.38230196758079316