## Splitting and Embedding Text using LangChain

In [18]:
import os
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its variables into the process environment.
# override=True lets values from the file replace variables that are already set.
load_dotenv(find_dotenv(), override=True)

True

In [19]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the speech with an explicit encoding: the text contains typographic
# apostrophes, which a platform-default codec may fail to decode correctly.
with open('files/churchill_speech.txt', encoding='utf-8') as f:
    churchill_speech = f.read()


text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,       # upper bound on chunk length, as measured by length_function
    chunk_overlap=20,     # maximum overlap between chunks to maintain continuity
    length_function=len   # chunk length is counted in characters
)

In [20]:
# Split the speech into Document chunks, then show one sample chunk and the total count.
chunks = text_splitter.create_documents([churchill_speech])
sample_chunk = chunks[6]
print(sample_chunk.page_content)
chunk_count = len(chunks)
print(f'Now you have {chunk_count}')

of His Majesty’s Government-every man of them. That is the will of Parliament and the nation.
Now you have 23


## Embedding cost

In [21]:
def print_embedding_cost(texts, model='text-embedding-ada-002', usd_per_1k_tokens=0.0004):
    """Print the token count and the estimated embedding cost for some documents.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        model: Model name used to select the tiktoken encoding. The default
            keeps the notebook's original choice; 'text-embedding-3-small' or
            'text-embedding-3-large' are the recommended newer models.
        usd_per_1k_tokens: Price in USD charged per 1,000 tokens. The default
            keeps the notebook's original figure; check the provider's current
            pricing for the model you pass.

    Returns:
        None. Two lines are printed: the total token count and the cost.
    """
    import tiktoken  # local import: only needed when the estimate is actually run

    enc = tiktoken.encoding_for_model(model)
    # A generator avoids building a throwaway list just to sum it.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * usd_per_1k_tokens:.6f}')

# Report the token count and estimated embedding cost for the chunks.
print_embedding_cost(chunks)

Total Tokens: 417
Embedding Cost in USD: 0.000167


In [22]:
from langchain.embeddings import OpenAIEmbeddings
# Embedding client built with library defaults (no model or API key is passed here).
# NOTE(review): presumably the OpenAI API key comes from the environment — confirm.
embedding = OpenAIEmbeddings()

In [23]:
# Embed the text of the first chunk; the bare name on the last line displays the vector.
first_chunk_text = chunks[0].page_content
vector = embedding.embed_query(first_chunk_text)
vector

[-0.031240572103723028,
 -0.003525813434811504,
 -0.02519815667021573,
 -0.03923713145501061,
 -0.008562230727999833,
 0.0021566278862740415,
 -0.01722731113588284,
 -0.01585169736593735,
 -0.00493999583471318,
 0.0016504150546565192,
 0.005923495316519394,
 0.014797489544118731,
 -0.0021084172734517916,
 -0.014000404990685443,
 -0.026638052188580147,
 -0.004023991164291985,
 0.008800070961975645,
 -0.017060181570225933,
 0.016725918713621724,
 0.0012116970344240167,
 -0.004515741138025742,
 -0.00817011690552186,
 0.013344738048599567,
 -0.0023253656132285407,
 0.002031280013539412,
 -0.02269119828420054,
 -0.006582375787232657,
 -0.02545528180124626,
 0.015208887332328823,
 -0.0147332077956998,
 0.010535657680189636,
 -0.028463632609522563,
 -0.009012198310319363,
 0.013177607551620061,
 -0.012406235883818863,
 0.0013097256064588344,
 -0.01397469210505335,
 -0.011120614885279205,
 0.022395505221705768,
 -0.007077339289593801,
 0.00045880571736744416,
 0.014450371641682374,
 0.00123982

# Inserting the Embeddings into a Pinecone Index

In [24]:
import pinecone
from langchain_community.vectorstores import Pinecone
# Client handle used by the following cells to list, delete and create indexes.
# NOTE(review): no api_key is passed, so credentials presumably come from the environment — confirm.
pc = pinecone.Pinecone()

In [25]:
# WARNING: destructive — deletes every index returned by list_indexes(),
# not only the one this notebook creates.
existing_index_names = pc.list_indexes().names()
for stale_index in existing_index_names:
    print('Deleting all indexes...')
    pc.delete_index(stale_index)
    print('Done')

Deleting all indexes...
Done


In [26]:
index_name = 'churchill-speech'
# Create the index only when it is missing, so re-running this cell is harmless.
if index_name not in pc.list_indexes().names():
    print(f'Creating index {index_name}...')
    pc.create_index(
        name=index_name,
        # NOTE(review): must equal the length of the vectors produced by `embedding`;
        # 1536 presumably matches the default OpenAI embedding model — confirm.
        dimension=1536,
        metric='cosine',
        spec=pinecone.PodSpec(
            environment='gcp-starter'
        )
    )
    print('Done')

Creatinnnnng inde churchill-speech...
Done


In [28]:
# Build the vector store from the chunks and the embedding client, targeting the index named by index_name.
# NOTE(review): presumably embeds and uploads every chunk on each run, so re-running may insert duplicates — confirm.
vector_store = Pinecone.from_documents(chunks,embedding,index_name=index_name)

In [29]:
# load vector_store from existing index
# This rebinds vector_store to a store attached to the index named by index_name.
# NOTE(review): presumably meant for later sessions where the index is already populated — confirm.
vector_store = Pinecone.from_existing_index(index_name=index_name, embedding=embedding)

## Asking Questions (Similarity Search)

In [31]:
# Fetch the stored chunks most similar to the question and print the raw list of Documents.
# No result count is passed, so the number of hits is whatever the library defaults to.
query = 'Where should we fight?'
result = vector_store.similarity_search(query)
print(result)

[Document(page_content='we shall fight on the beaches,\nwe shall fight on the landing grounds,'), Document(page_content='we shall fight in the fields and in the streets,\nwe shall fight in the hills;'), Document(page_content='We shall go on to the end, we shall fight in France,\nwe shall fight on the seas and oceans,'), Document(page_content='we shall fight with growing confidence and growing strength in the air, we shall defend our Island,')]


In [32]:
# Print each retrieved chunk's text followed by a 50-character rule for readability.
separator = '-' * 50
for doc in result:
    print(doc.page_content)
    print(separator)

we shall fight on the beaches,
we shall fight on the landing grounds,
--------------------------------------------------
we shall fight in the fields and in the streets,
we shall fight in the hills;
--------------------------------------------------
We shall go on to the end, we shall fight in France,
we shall fight on the seas and oceans,
--------------------------------------------------
we shall fight with growing confidence and growing strength in the air, we shall defend our Island,
--------------------------------------------------


In [33]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Chat model that writes the final answer.
llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

# Similarity-based retriever; k=3 asks for the three chunks closest to the user's query.
search_options = {'k': 3}
retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs=search_options,
)

# Question-answering chain wired to the model and retriever above, using the 'stuff' chain type.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
)

In [37]:
# Alternative questions to try (swap which `query` line is active):
# query = 'Where should we fight?'
# query = 'Who was the king of Belgium at that time?'
query = 'What about the French Armies?'
# Run the retrieval QA chain on the question and print the generated answer.
answer = chain.run(query)
print(answer)

The British Empire and the French Republic were allied during World War II and fought together against the Axis powers. The excerpt you provided from Winston Churchill's speech indicates that the British and French were linked together in their cause and need to fight.
