# Splitting and Embedding Text Using LangChain
Reference: [LangChain document loaders documentation](https://python.langchain.com/docs/modules/data_connection/document_loaders/)

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
# Load settings from a .env file into the process environment.
# override=True lets values from .env replace variables that are already set.
# NOTE(review): the OpenAI and Pinecone clients further down are created without
# explicit keys, so they presumably read them from these variables — confirm.
load_dotenv(find_dotenv(), override=True)

True

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
# Read the whole speech transcript into memory as one string.
# The location can be overridden with the CHURCHILL_SPEECH_PATH environment
# variable (for example from .env); otherwise the original local path is used.
speech_path = os.environ.get(
    'CHURCHILL_SPEECH_PATH',
    '/Users/Chabi/Documents/LLM_project/LangChain_Pinecone_OpenAI/99-Data/churchill_speech.txt',
)
# An explicit encoding keeps decoding independent of the machine's locale default.
# NOTE(review): assumes the transcript is stored as UTF-8 — confirm.
with open(speech_path, encoding='utf-8') as f:
    churchil_speech = f.read()

In [4]:
# Recursive splitter: chunk size 100 and overlap 20, both measured with len,
# i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
)

In [5]:
# Split the speech into chunk objects. create_documents is given a list of
# texts, hence the single-item list around the speech.
chunks = text_splitter.create_documents([churchil_speech])

In [6]:
# Printing a chunk object shows it with its field name (page_content='...').
print(chunks[2])

page_content='From the moment that the French defenses at Sedan and on the Meuse were broken at the end of the'


In [7]:
# page_content holds just the text of the chunk.
print(chunks[2].page_content)

From the moment that the French defenses at Sedan and on the Meuse were broken at the end of the


In [8]:
# Number of chunks the splitter produced.
print(f'Chunks: {len(chunks)}')

Chunks: 300


### Embedding Cost

In [9]:
def print_embedding_cost(texts, model='text-embedding-ada-002', price_per_1k_tokens=0.0004):
    """Print the token count of ``texts`` and the estimated cost of embedding them.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string
            (for example the chunks produced by the text splitter).
        model: Model name passed to ``tiktoken.encoding_for_model`` to pick
            the tokenizer. Defaults to the model name that was hard-coded before.
        price_per_1k_tokens: Price in USD per 1,000 tokens. The default keeps
            the rate that was hard-coded before.
            NOTE(review): provider prices change; check the current price list
            and pass the up-to-date rate.

    Returns:
        None. The two summary lines are written to stdout.
    """
    # Imported lazily so tiktoken is only required when an estimate is requested.
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    # Generator expression: no throwaway list of per-chunk counts is built.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens :.6f}')

In [10]:
# Estimate the token count and embedding cost for all chunks.
print_embedding_cost(chunks)

Total tokens: 4820
Embedding Cost in USD: 0.001928


In [11]:
# OpenAIEmbeddings is imported from langchain_openai (the same package the chat
# model further down comes from); the langchain.embeddings path is deprecated
# and triggers a deprecation warning.
from langchain_openai import OpenAIEmbeddings

# The model is pinned so the vector length (1536) stays in step with the
# Pinecone index dimension and with the tokenizer used for the cost estimate.
embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')

  warn_deprecated(


In [12]:
# Embed a short test string and show the length of the resulting vector.
vector = embeddings.embed_query('abc')
len(vector)

1536

# Inserting the Embeddings into a Pinecone Index

In [15]:
# Two similar names are in play: `pinecone` is the client package, while
# `Pinecone` (capitalised) is the LangChain vector-store class.
import pinecone

from langchain_community.vectorstores import Pinecone

# Pinecone client; no API key is passed here.
# NOTE(review): presumably the key is read from the environment — confirm.
pc = pinecone.Pinecone()

In [18]:
# The free plan allows only one index, so every existing index is removed first.
# WARNING: this deletes all indexes returned by pc.list_indexes(), not just
# the one this notebook creates.
indexes = pc.list_indexes().names()
for existing_index in indexes:
    print('Deleting all indexes ...', end='')
    pc.delete_index(existing_index)
    print('Done')

Deleting all indexes ...Done


In [19]:
# Make sure the index for the speech exists, creating it when it is missing.
from pinecone import ServerlessSpec

index_name = 'churchill-speech'
if index_name in pc.list_indexes().names():
    print(f'Index {index_name} already exists!')
else:
    print(f'Creating index {index_name}')
    pc.create_index(
        name=index_name,
        dimension=1536,  # same as the length of the test embedding computed earlier
        metric='cosine',
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    print('Index created! 😊')

Creating index churchill-speech
Index created! 😊


In [20]:
# Build the vector store from the chunks, the embedding model and the index
# name; the returned object is what the searches below run against.
vector_store = Pinecone.from_documents(
    chunks,
    embeddings,
    index_name=index_name,
)

In [22]:
# Open a handle to the index and display its statistics
# (dimension and vector counts, as the recorded output shows).
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 300}},
 'total_vector_count': 300}

# Asking a Question (Similarity Search)

In [23]:
# Look up the chunks most similar to the question. No result count is passed;
# the recorded run returned four documents.
query = 'Where should we fight?'
result = vector_store.similarity_search(query)
print(result)

[Document(page_content='shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and'), Document(page_content='front, now on that, fighting'), Document(page_content='end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing'), Document(page_content='When we consider how much greater would be our advantage in defending the air above this Island')]


In [24]:
# Show each retrieved chunk's text followed by a 50-dash separator line.
for hit in result:
    print(hit.page_content, '-' * 50, sep='\n')

shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and
--------------------------------------------------
front, now on that, fighting
--------------------------------------------------
end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing
--------------------------------------------------
When we consider how much greater would be our advantage in defending the air above this Island
--------------------------------------------------


## Answering in Natural Language using an LLM

In [25]:
from langchain.chains import RetrievalQA
# ChatOpenAI comes from langchain_openai, matching the import used in the next
# cell; langchain.chat_models is the deprecated location of this class.
from langchain_openai import ChatOpenAI

In [26]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

# Chat model that phrases the final answer.
llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=0.2)

# Retriever over the vector store: similarity search limited to the top 3 results.
retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 3},
)

# RetrievalQA chain of type 'stuff' wiring the retriever and the LLM together.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
)

In [27]:
# Ask through the QA chain. The instruction to answer only from the provided
# input is part of the query text itself.
query = 'Answer only from the provided input. Where should we fight?'
# The recorded output shows invoke returning a dict with 'query' and 'result' keys.
answer = chain.invoke(query)
print(answer)

{'query': 'Answer only from the provided input. Where should we fight?', 'result': 'We shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields.'}


In [28]:
# Follow-up question. Only this query string is passed to the chain, so
# "at that time" is not tied to the previous question by this code.
query = 'Who was the king of Belgium at that time?'
answer = chain.invoke(query)
print(answer)

{'query': 'Who was the king of Belgium at that time?', 'result': 'The king of Belgium at that time was King Leopold.'}


In [29]:
# Another question sent on its own; only this query string is passed to the chain.
query = 'What about the French Armies??'
answer = chain.invoke(query)
print(answer)

{'query': 'What about the French Armies??', 'result': 'The French Armies were involved in the fighting during the battle, and they were holding the territory that was being contested. Additionally, a French Army was created to advance across the Somme with great strength during the battle.'}
