In [2]:
import os
from dotenv import load_dotenv, find_dotenv

# Locate the nearest .env file and load its variables into the process
# environment. NOTE: `load_dotenv` returns a success flag, so despite its
# name `env_file` holds a bool rather than a file path.
env_file = load_dotenv(dotenv_path=find_dotenv())

# Index os.environ directly so a missing key fails immediately with KeyError
# instead of surfacing later as an authentication error.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

In [3]:
from langchain_openai import ChatOpenAI

# Keep the model identifier in one variable so it can be changed in one place.
model = "gpt-3.5-turbo-0125"

# Chat client shared by the cells below.
chat_model = ChatOpenAI(model=model)

In [4]:
from langchain_community.document_loaders import TextLoader

# Point a TextLoader at the source text file. The path is relative, so it is
# resolved against the notebook's current working directory.
loader = TextLoader('dataset/be-good.txt')

In [5]:
# Read the file through the loader. Later cells use
# `loaded_data[0].page_content`, i.e. the text of the first returned document.
loaded_data = loader.load()

In [6]:
# Uncomment to inspect the full text of the loaded document:
# loaded_data[0].page_content

In [7]:
from langchain_core.prompts import ChatPromptTemplate

# Single-turn prompt with one placeholder, `{dataset}`, that receives the
# text we want the model to title.
chat_prompt = ChatPromptTemplate.from_template(
    "Can you suggest me a title to this dataset: {dataset}?"
)

# Fill the placeholder with the document's *text*. Passing `loaded_data`
# itself would interpolate the repr of a list of Document objects
# (metadata, class names, escaped newlines) into the prompt instead of the
# plain content.
message = chat_prompt.format_messages(
    dataset=loaded_data[0].page_content
)

# Send the formatted chat messages to the model; `resp.content` holds the reply.
resp = chat_model.invoke(message)

In [8]:
# Show the text content of the model's reply.
resp.content

'"Exploring the Power of Benevolence in Business and Startups"'

# RAG Method of handling large datasets/volumes

# _CharacterTextSplitter_ Technique

In [9]:
from langchain_text_splitters import CharacterTextSplitter

# Character-based splitter: cut on blank lines and pack the pieces into
# chunks of at most ~1000 characters, carrying 200 characters of overlap
# between neighbouring chunks.
splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separator="\n\n",          # paragraph boundary
    is_separator_regex=False,  # treat the separator as a literal string
    length_function=len,       # measure chunk size in characters
)

In [10]:
# Split the loaded document's text into chunk documents. `create_documents`
# takes a list of strings, hence the single-element list.
text = splitter.create_documents([loaded_data[0].page_content])

In [11]:
# How many chunks did the splitter produce?
len(text)

2

In [12]:
# One metadata dict per *input text*. `metadatas` is matched positionally to
# the list of texts, so every chunk cut from the first copy is tagged
# {'chunk': 0} and every chunk from the second copy {'chunk': 1}; the value
# is therefore an input index, not a per-chunk counter.
metadata = [{'chunk': index} for index in range(2)]

# The same document text is supplied twice purely to demonstrate the pairing.
text_with_metadata = splitter.create_documents(
    [loaded_data[0].page_content] * 2,
    metadatas=metadata,
)

In [13]:
# Inspect the first chunk together with the metadata attached to it.
text_with_metadata[0]

Document(metadata={'chunk': 0}, page_content='Be good')

# _RecursiveCharacterTextSplitter_ Technique

In [14]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Deliberately tiny chunks (26 characters, 4 of overlap) so the splitting
# behaviour is easy to see on the short sample strings below.
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=26,
    chunk_overlap=4,
)

In [15]:
# The 26 lowercase letters with no whitespace: exactly one chunk's worth for
# a splitter configured with chunk_size=26.
text1 = "abcdefghijklmnopqrstuvwxyz"

In [16]:
# Multi-line sample made of two paragraphs separated by a blank line; used
# below to show how the recursive splitter breaks up longer text.
text2 = """
Data that Speak
LLM Applications are revolutionizing industries such as 
banking, healthcare, insurance, education, legal, tourism, 
construction, logistics, marketing, sales, customer service, 
and even public administration.

The aim of our programs is for students to learn how to 
create LLM Applications in the context of a business, 
which presents a set of challenges that are important 
to consider in advance.
"""

In [17]:
# `text1` is 26 characters long, which fits within chunk_size=26, so it comes
# back as a single chunk.
recursive_splitter.split_text(text1)

['abcdefghijklmnopqrstuvwxyz']

In [18]:
# `text2` is far longer than chunk_size=26, so it is cut into many short
# pieces. Some neighbouring pieces repeat a short word (e.g. 'are', 'a',
# 'of'), which is the effect of chunk_overlap=4.
recursive_splitter.split_text(text2)

['Data that Speak',
 'LLM Applications are',
 'are revolutionizing',
 'industries such as',
 'banking, healthcare,',
 'insurance, education,',
 'legal, tourism,',
 'construction, logistics,',
 'marketing, sales,',
 'customer service,',
 'and even public',
 'administration.',
 'The aim of our programs',
 'is for students to learn',
 'how to',
 'create LLM Applications',
 'in the context of a',
 'a business,',
 'which presents a set of',
 'of challenges that are',
 'are important',
 'to consider in advance.']

In [20]:
# Sentence-aware recursive splitter: try paragraph breaks first, then line
# breaks, then the position just after ". " (end of a sentence), then single
# spaces, and finally individual characters.
second_recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=150,
    # The lookbehind `(?<=\. )` is a regular expression, so it is written as
    # a raw string (a plain "\." is an invalid escape sequence) ...
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
    # ... and the splitter must be told to interpret separators as regexes.
    # Without this flag each separator is escaped and matched literally, so
    # the sentence separator would never match and would be skipped silently.
    # The other separators contain no regex metacharacters and behave the same.
    is_separator_regex=True,
)