##### Load Package

In [4]:
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAI

import os
from dotenv import load_dotenv

##### Set Up Environment 

In [3]:
# Read the project's .env file so the OpenAI credentials are available to
# this notebook without hardcoding them.
load_dotenv(dotenv_path="../Key/.env")

# Keep the key in a notebook-level variable for later cells.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Fail fast with an actionable message: without this guard a missing key
# surfaces as an opaque `TypeError: str expected, not NoneType` on the
# assignment below.
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Define it in ../Key/.env or export it "
        "in the environment before running this notebook."
    )

os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# What Is a Document
* A document is an object that holds a piece of text together with metadata (i.e., additional information about the text). \
  The metadata is a dictionary.

# What Is a Document Loader
* A document loader is a tool that imports data from other sources and returns it as documents.

# Build A Document

In [3]:
# Build a Document by hand: the text goes in `page_content`, and descriptive
# fields about that text go in the `metadata` dictionary. As the last
# expression of the cell, the object's repr is displayed.
Document(
    page_content="this is the document.",
    metadata={
        "document_id": 123,
        "document_source": "math",
        "document_create_time": "2024-05-27",
    },
)

Document(page_content='this is the document.', metadata={'document_id': 123, 'document_source': 'math', 'document_create_time': '2024-05-27'})

# Build A Document Loader

## Build A TextLoader

In [4]:
# Point a TextLoader at the transcript file, then load it.
# `load()` returns a list of Document objects (see the printed output),
# so `document` is a list even though the name is singular.
loader = TextLoader(file_path="../Data/Amazon_Transcript.txt")
document = loader.load()

print(document)

[Document(page_content="Hello, and welcome to our Q4 2016 financial results conference call\nJoining us today to answer your questions is Brian Olsavsky, our CFO\nAs you listen to today's conference call, we encourage you to have our press release in front of you, which includes our financial results as well as metrics and commentary on the quarter\nPlease note, unless otherwise stated, all comparisons in this call will be against our results for the comparable period of 2015. Our comments and responses to your questions reflect management's views as of today, February 2, 2017 only and will include forward-looking statements\nActual results may differ materially\nAdditional information about factors that could potentially impact our financial results is included in today's press release and our filings with the SEC, including our most recent Annual Report on Form 10-K and subsequent filings\nDuring this call, we may discuss certain non-GAAP financial measures\nIn our press release, sli

# Build A Text Splitter

In [10]:
# Split the loaded document(s) into smaller, slightly overlapping chunks.
# chunk_size caps the length of each chunk and chunk_overlap is how much
# consecutive chunks may share (presumably measured in characters, the
# splitter's default length function — confirm against the LangChain docs).
document_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=20,
)

# Backward-compatible alias: this variable was originally misspelled as
# `documnet_splitter`; keep the old name bound in case other cells use it.
documnet_splitter = document_splitter

chunks = document_splitter.split_documents(document)

# Show all chunks, how many there are, and the text of the first one.
print(chunks)
print(len(chunks))
print(chunks[0].page_content)

[Document(page_content='Hello, and welcome to our Q4 2016 financial results conference call\nJoining us today to answer your questions is Brian Olsavsky, our CFO', metadata={'source': '../Data/Amazon_Transcript.txt'}), Document(page_content="As you listen to today's conference call, we encourage you to have our press release in front of you, which includes our financial results as well as metrics and commentary on the quarter", metadata={'source': '../Data/Amazon_Transcript.txt'}), Document(page_content="Please note, unless otherwise stated, all comparisons in this call will be against our results for the comparable period of 2015. Our comments and responses to your questions reflect management's views as of today, February 2, 2017 only and will include forward-looking statements", metadata={'source': '../Data/Amazon_Transcript.txt'}), Document(page_content="Actual results may differ materially\nAdditional information about factors that could potentially impact our financial results is

# Check Token Count

In [5]:
# Instantiate the OpenAI LLM wrapper with its default settings.
# NOTE(review): presumably it picks up OPENAI_API_KEY from the environment
# configured in the setup cell — confirm.
llm = OpenAI()

In [7]:
# Ask the LLM wrapper how many tokens a short sample string contains,
# then print that count.
text = "this is a text"
num_tokens = llm.get_num_tokens(text=text)

print(num_tokens)

4
