# Splitting and Embedding Text Using LangChain

In [1]:
# Environment setup: locate a .env file and load it before any API client is created.
import os
from dotenv import load_dotenv, find_dotenv
# override=True is passed, so values from the .env file presumably replace variables
# that are already set in the process environment (confirm against python-dotenv docs).
# The clients created later in this notebook (OpenAIEmbeddings(), pinecone.Pinecone())
# are built without explicit keys, so they presumably read credentials from here.
load_dotenv(find_dotenv(), override=True)

True

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the whole speech into memory. The encoding is stated explicitly so decoding
# does not depend on the platform's default locale encoding.
# NOTE(review): assumes the file is stored as UTF-8 - confirm.
with open('files/churchill_speech.txt', encoding='utf-8') as f:
    churchill_speech = f.read()

# Splitter configuration. Lengths are measured with len(), i.e. in characters of the
# Python string: chunks target 100 characters, and chunk_overlap=20 presumably lets
# neighbouring chunks share up to 20 characters of context (confirm in LangChain docs).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len
)

In [4]:
# create_documents takes a list of texts, so the single speech string is wrapped in one.
speech_texts = [churchill_speech]
chunks = text_splitter.create_documents(speech_texts)

# Report how many chunk documents the splitter produced.
chunk_total = len(chunks)
print(f'Now you have {chunk_total}')

Now you have 69


In [5]:
def print_embedding_cost(texts, model='text-embedding-ada-002', price_per_1k_tokens=0.0004):
    """Print the token count of ``texts`` and the estimated cost of embedding them.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string (the chunk
            documents produced by the text splitter in this notebook).
        model: Model name handed to ``tiktoken.encoding_for_model`` to select the
            tokenizer used for counting.
        price_per_1k_tokens: Price in USD per 1,000 tokens. The default keeps the
            value that used to be hard-coded here.
            NOTE(review): confirm it against the provider's current price list.

    Returns:
        None. Two lines are printed: the total token count and the estimated cost
        in USD formatted with six decimals.
    """
    # Imported inside the function so tiktoken is only required when an estimate
    # is actually requested.
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    # A generator avoids building a throw-away list; an empty input sums to 0.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    cost_usd = total_tokens / 1000 * price_per_1k_tokens
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {cost_usd:.6f}')
    
# Print the token total and estimated cost for all chunks before they are embedded
# further down in the notebook.
print_embedding_cost(chunks)

Total Tokens: 977
Embedding Cost in USD: 0.000391


In [15]:
from langchain_openai import OpenAIEmbeddings
# Embedding client built with library defaults (no model name or API key passed here).
# NOTE(review): the Pinecone index in this notebook is created with dimension=1536,
# so the default model presumably returns 1536-dimensional vectors - confirm.
embeddings = OpenAIEmbeddings()

In [16]:
# Embed the text of the first chunk only, as a quick check of the embedding client.
vector = embeddings.embed_query(chunks[0].page_content)
# Bare expression: the notebook displays the value as the cell output.
# NOTE(review): this dumps the entire vector; consider len(vector) and vector[:5] instead.
vector

[-0.04456757365602211,
 -0.037887539578928785,
 -0.0029496059181736835,
 -0.00799309738843945,
 0.01574398012783039,
 0.022589743056483192,
 -0.02858137887914321,
 -0.009650358895679657,
 0.001049333131840556,
 0.007336567129750244,
 0.007789127035119313,
 0.032788272829683496,
 0.007413056186868278,
 -0.011696439194526594,
 0.006374081108250189,
 -0.005386098268716171,
 0.01316885179781892,
 -0.00249863973855472,
 0.013589540447814892,
 -0.010963419413058069,
 -0.008171572320709482,
 -0.026847628780446258,
 0.02962672886076172,
 -0.0038658801793480546,
 -0.01445641549716337,
 -0.018523080208145852,
 0.010835938116855964,
 -0.018612315811635725,
 0.003054778313503319,
 -0.01434168214431696,
 0.007081604071684751,
 -0.008560391112316212,
 -0.016508869767688155,
 0.005150257591345507,
 -0.018331857332520126,
 -0.02385181180043882,
 -0.02237302429414607,
 -0.00874523908494152,
 0.022678980522618207,
 -0.01267167269372106,
 0.013615037265848855,
 0.004605273699663785,
 0.008751613056619367

## Inserting the Embedding into a Pinecone Index

In [11]:
# Two different things are named "Pinecone" here: pinecone.Pinecone is the client of
# the pinecone package, while the imported Pinecone is LangChain's vector-store class.
import pinecone
from langchain_community.vectorstores import Pinecone
# Client created without arguments; presumably it reads its API key from the
# environment - confirm.
pc = pinecone.Pinecone()

Create Pinecone index

In [12]:
# WARNING: destructive. This deletes every index returned by pc.list_indexes(), not
# only the one used in this notebook.
# NOTE(review): presumably done because the starter environment allows a single
# index - confirm before running against a shared account.
for i in pc.list_indexes().names():
    # The message is printed once per index, even though it says "all indexes".
    print('Deleting all indexes ...', end='')
    pc.delete_index(i)
    print('Done')

Deleting all indexes ...Done


In [14]:
index_name = 'churchill-speech'

# Create the index only when no index with this name exists yet, so the cell can be
# re-run without attempting a duplicate creation.
existing_indexes = pc.list_indexes().names()
if index_name not in existing_indexes:
    print(f'Creating index {index_name}...')
    starter_spec = pinecone.PodSpec(environment='gcp-starter')
    # NOTE(review): dimension=1536 presumably has to match the length of the vectors
    # returned by the embedding client - confirm.
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric='cosine',
        spec=starter_spec,
    )
    print('Done')

Creating index churchill-speech...
Done


In [18]:
# Upload the vectors to Pinecone using langchain
# from_documents receives the chunks, the embedding client and the target index name;
# presumably it embeds every chunk and upserts the vectors into that index - confirm.
vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)

In [19]:
# Loading the vector store from an existing index.
# The index_name variable set when the index was created is reused here instead of
# repeating the 'churchill-speech' literal, so the two names cannot drift apart.
load_vector_store = Pinecone.from_existing_index(index_name=index_name, embedding=embeddings)

## Asking Questions (Similarity Search)

In [21]:
# Ask the vector store for the chunks most similar to the question. No k is passed,
# so the number of hits is the library default (four documents in the recorded output).
query = 'Where should we fight?'
result = load_vector_store.similarity_search(query)
# Prints the raw list of Document objects.
print(result)

[Document(page_content='shall fight on the beaches, we shall'), Document(page_content='we shall fight in the hills; we shall'), Document(page_content='fight on the landing grounds, we shall fight in the fields and in the streets, we shall fight in'), Document(page_content='shall fight in France, we shall fight on the seas and oceans, we shall fight with growing')]


In [23]:
# Show only the text of each hit, followed by a 50-dash rule as a visual divider.
separator = '-' * 50
for doc in result:
    print(doc.page_content)
    print(separator)

shall fight on the beaches, we shall
--------------------------------------------------
we shall fight in the hills; we shall
--------------------------------------------------
fight on the landing grounds, we shall fight in the fields and in the streets, we shall fight in
--------------------------------------------------
shall fight in France, we shall fight on the seas and oceans, we shall fight with growing
--------------------------------------------------


In [26]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

# Retriever: fetch the three most similar chunks for every query.
retriever = load_vector_store.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 3},
)

# Chat model that writes the final answer.
# NOTE(review): temperature=1 presumably makes answers vary between runs; consider 0
# if reproducible question answering is wanted - confirm.
llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

# Question-answering chain combining the retriever and the chat model ("stuff" chain type).
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

In [30]:
# Run the question through the retrieval chain. invoke() returns a mapping, and the
# generated answer text is stored under its 'result' key.
query = 'Where should we fight?'
answer = chain.invoke(query)
print(answer['result'])

We shall fight on the landing grounds, in the fields, and in the streets.


In [31]:
# Second question, sent through the same chain.
# NOTE(review): the recorded answer names King Albert I, which looks wrong for 1940
# (presumably Leopold III) - the retrieved chunks may not contain the name, so the
# model may be answering from its own knowledge. Fact-check before relying on it.
query2 = 'Who was the king of Belgium at that time?'
answer2 = chain.invoke(query2)
print(answer2['result'])

The king of Belgium at that time was King Albert I.
