# Splitting and Embedding Text Using LangChain
https://python.langchain.com/docs/modules/data_connection/document_loaders/

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
# Load settings from the .env file found by find_dotenv(); later cells read
# values such as PINECONE_API_KEY and PINECONE_ENV from os.environ.
# Being the last expression in the cell, the call's return value is displayed.
load_dotenv(find_dotenv(), override=True)

True

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [8]:
# Read the whole speech into memory as a single string.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a local copy of the file.
with open('/Users/Chabi/Documents/LLM_project/Data/churchill_speech.txt', encoding='utf-8') as f:
    churchil_speech = f.read()

In [3]:
# Splitter configuration. Lengths are measured with the built-in len(), so
# chunk_size and chunk_overlap are expressed in characters.
# NOTE(review): 100-character chunks are very small for retrieval; confirm
# this is intended for the demo.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
)

In [9]:
# Split the speech into a list of Document objects; each one exposes its text
# through the .page_content attribute used in the following cells.
chunks = text_splitter.create_documents([churchil_speech])

In [13]:
# Printing a Document shows its field form (page_content='...').
print(chunks[2])

page_content='From the moment that the French defenses at Sedan and on the Meuse were broken at the end of the'


In [15]:
# .page_content holds just the raw text of the chunk.
print(chunks[2].page_content)

From the moment that the French defenses at Sedan and on the Meuse were broken at the end of the


In [16]:
# Report how many chunks the splitter produced.
print(f'Chunks: {len(chunks)}')

Chunks: 300


### Embedding Cost

In [19]:
def print_embedding_cost(texts, model='text-embedding-ada-002', usd_per_1k_tokens=0.0004):
    """Print the total token count of ``texts`` and the estimated embedding cost.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string, such as
            the chunks produced by the text splitter.
        model: Model name used to select the tiktoken encoding.
        usd_per_1k_tokens: Price in USD per 1,000 tokens. The default keeps the
            rate that used to be hard-coded here; provider prices change, so
            pass the current rate when the estimate needs to be accurate.

    Returns:
        None. The totals are only printed.
    """
    # Local import: tiktoken is needed only for this estimate.
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    # A generator avoids building a throwaway list of per-chunk counts.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * usd_per_1k_tokens:.6f}')

In [20]:
# Estimate token usage and cost for the chunks before they are embedded.
print_embedding_cost(chunks)

Total tokens: 4820
Embedding Cost in USD: 0.001928


In [33]:
from langchain.embeddings import OpenAIEmbeddings
# Embeddings client built with no arguments, so credentials must come from
# outside this call: presumably OPENAI_API_KEY from the loaded .env (verify).
embeddings = OpenAIEmbeddings()

In [34]:
# Sanity check: embed a short string and display the vector length. The
# Pinecone index created later must use the same value for its dimension.
vector = embeddings.embed_query('abc')
len(vector)

1536

# Inserting the Embeddings into a Pinecone Index

In [26]:
import pinecone
from langchain.vectorstores import Pinecone

In [27]:
# Connect the Pinecone client using settings taken from the environment.
# A missing variable yields None here rather than raising.
pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),
    environment=os.getenv('PINECONE_ENV'),
)

In [31]:
# The free Pinecone tier allows only one index, so every existing index is
# removed before a new one is created.
# WARNING: destructive. This deletes all indexes visible to the account.
indexes = pinecone.list_indexes()
for stale_index in indexes:
    print('Deleting all indexes ...', end='')
    pinecone.delete_index(stale_index)
    print('Done')

Deleting all indexes ...Done


In [32]:
# Create the index only when it does not exist yet, so re-running the cell
# does not attempt a duplicate creation.
# dimension=1536 matches the embedding vector length checked earlier.
index_name = 'churchill-speech'
if index_name not in pinecone.list_indexes():
    print(f'Creating index {index_name} ...')
    pinecone.create_index(index_name, dimension=1536, metric='cosine')
    print('Done!')

Creating index churchill-speech ...
Done!


In [37]:
# Build the vector store from the chunks, using `embeddings` to vectorise
# them and the index named above as the storage target.
vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)

# Asking Questions (Similarity Search)

In [38]:
# Retrieve the stored chunks most similar to the question. No result count is
# passed here; the recorded output shows four Documents coming back.
query = 'Where should we fight?'
result = vector_store.similarity_search(query)
print(result)

[Document(page_content='shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and'), Document(page_content='front, now on that, fighting'), Document(page_content='end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing'), Document(page_content='Winston Churchill Speech - We Shall Fight on the Beaches\nWe Shall Fight on the Beaches\nJune 4, 1940')]


In [39]:
# Show each retrieved chunk's text followed by a 50-dash separator line.
for doc in result:
    print(doc.page_content, '-' * 50, sep='\n')

shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and
--------------------------------------------------
front, now on that, fighting
--------------------------------------------------
end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing
--------------------------------------------------
Winston Churchill Speech - We Shall Fight on the Beaches
We Shall Fight on the Beaches
June 4, 1940
--------------------------------------------------


In [41]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

In [45]:
# Chat model that words the final answer.
# NOTE(review): temperature=1 is high for factual question answering; a lower
# value would presumably give steadier answers. Confirm the intent.
llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

# Retriever over the Pinecone store: similarity search with k=3 per query.
retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 3},
)

# Question-answering chain that combines the retriever with the chat model.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
)

In [46]:
# Same question as the raw similarity search, now answered through the
# RetrievalQA chain so the model composes a sentence from retrieved chunks.
query = 'Where should we fight?'
answer = chain.run(query)
print(answer)

We should fight on the beaches, on the landing grounds, in the fields, in France, on the seas and oceans, and anywhere necessary to secure victory.


In [47]:
# A second question sent through the same chain.
query = 'Who was the king of Belgium at that time?'
answer = chain.run(query)
print(answer)

The king of Belgium at that time was King Leopold.


In [48]:
# A third question sent through the same chain.
# NOTE(review): the recorded answer says the French fought "against the
# British armies", which looks unreliable. Possible causes are the
# 100-character chunks, k=3 retrieval and temperature=1; verify before
# trusting the output.
query = 'What about the French Armies?'
answer = chain.run(query)
print(answer)

The French Armies were involved in the fighting against the British armies. They were responsible for holding the area in question and had planned to advance across the Somme with a strong force to secure it.
