In [2]:
# Goal: tokenize a book from my library and report key metrics:
# token count, unique token count, and token frequency.
# The code below is adapted from an existing script and used as inspiration.

"""Create the dataset for the thesis.

Builds the dataset for the thesis by loading data sources (currently PDFs)
and creating a persistent Chroma vector store from them.
"""

import os
import openai
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

import tiktoken
# Module-level tokenizer resolved from the model name "gpt-3.5-turbo".
# The original cell first assigned tiktoken.get_encoding("cl100k_base") to the
# same name and overwrote it on the very next line; that dead store is dropped
# so only the assignment that actually takes effect remains.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")


def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens a named tiktoken encoding produces for ``string``.

    Args:
        string: Text to tokenize.
        encoding_name: Encoding name handed to ``tiktoken.get_encoding``
            (the notebook passes ``"cl100k_base"``).

    Returns:
        The length of the token sequence returned by the encoding's
        ``encode`` method.
    """
    # Look the tokenizer up by name on every call; a distinct local name
    # avoids shadowing the module-level ``encoding`` variable.
    tokenizer = tiktoken.get_encoding(encoding_name)
    return len(tokenizer.encode(string))





In [8]:
# Directory holding the source PDFs, relative to the notebook's working directory.
data_dir = '../data/raw/DL_IanGoodfellow'

# The splitter configuration does not depend on the file being processed,
# so it is built once instead of once per PDF.
# NOTE(review): chunk_size=4 is kept from the original cell; it looks like a
# debugging value -- confirm before relying on the chunks downstream.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4,
    chunk_overlap=0,
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
    # The sentence separator is a regex look-behind. Without this flag the
    # splitter escapes each separator and matches it as a literal string,
    # so the sentence-boundary rule would never fire.
    is_separator_regex=True,
)

# Process each PDF file in the directory
all_docs = []
for filename in sorted(os.listdir(data_dir)):  # sorted: reproducible order across runs
    if not filename.endswith('.pdf'):
        continue
    file_path = os.path.join(data_dir, filename)

    # Load PDF document
    loader = PyPDFLoader(file_path)
    documents = loader.load()

    # Split documents into chunks and collect them across all files
    # (the original initialised all_docs but never added anything to it).
    docs = text_splitter.split_documents(documents)
    all_docs.extend(docs)
    print(f"{len(docs)} chunks created from {filename}")

    # Count tokens of the extracted text only. str(documents) would tokenize
    # the repr of the Document list (metadata, object wrappers, escaped
    # newlines), which distorts the count.
    book_text = "\n".join(doc.page_content for doc in documents)
    num_tokens = num_tokens_from_string(book_text, "cl100k_base")
    print(f"Number of tokens: {num_tokens}")

0.38230196758079316

In [3]:
# Sample text passed to num_tokens_from_string below. The literal is kept
# exactly as written (including trailing spaces), since any change to it
# changes the resulting token count.
string1 = """Teaching Assistant is a large language model trained by OpenAI.

Teaching Assistant is designed to be able to assist with teaching related tasks, from answering simple questions to providing in-depth explanations and discussions on STEM topics. 

As a language model, Teaching Assistant is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.

Teaching Assistant is constantly learning and improving, and its capabilities are constantly evolving. 

It is able to process and understand large amounts of text, and can use this knowledge to provide accurate and informative responses to a wide range of questions. 

Additionally, Teaching Assistant is able to generate its own text based on the input it receives, allowing it to engage in discussions and provide explanations and descriptions on a wide range of topics.

Overall, Teaching Assistant is a powerful tool that can help students with understanding STEM topics. 

Whether you need help with a specific question or just want to have a conversation about a particular topic, Teaching Assistant is here to assist."""


# Count the tokens of the sample text with the "cl100k_base" encoding and print the result.
num_tokens = num_tokens_from_string(string1, "cl100k_base")
print(f"Number of tokens: {num_tokens}")

Number of tokens: 217
