# Splitting and Embedding Text Using LangChain

In [None]:
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load the variables it defines into the environment.
# NOTE(review): override=True presumably lets values from the .env file
# replace variables that are already set — confirm against python-dotenv.
load_dotenv(find_dotenv(), override=True)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the whole speech into memory. The encoding is passed explicitly so the
# decoded text does not depend on the platform's default locale encoding.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open('files/churchill_speech.txt', encoding='utf-8') as f:
    churchill_speech = f.read()

# length_function=len means chunk_size and chunk_overlap are measured in
# characters: chunks of up to 100 characters, overlapping by up to 20.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len
)

In [None]:
# The speech is wrapped in a one-element list because a list of texts is
# what is passed to create_documents here.
chunks = text_splitter.create_documents([churchill_speech])
# Uncomment to inspect a single chunk:
# print(chunks[2])
print(f'Now you have {len(chunks)} chunks')

In [None]:
def print_embedding_cost(texts, *, model='text-embedding-ada-002',
                         price_per_1k_tokens=0.0004, encoder=None):
    """Print the token count of ``texts`` and the estimated embedding cost.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        model: Model name used to look up the tiktoken encoding when no
            ``encoder`` is supplied.
        price_per_1k_tokens: Price in USD per 1,000 tokens. The default keeps
            the rate that used to be hard-coded here.
            NOTE(review): confirm it against the provider's current pricing.
        encoder: Optional object with an ``encode(str)`` method that returns
            a sized sequence of tokens. When given, tiktoken is not imported.

    Returns:
        None. Two summary lines are written to stdout.
    """
    if encoder is None:
        # Imported lazily so tiktoken is only needed when no encoder is passed.
        import tiktoken
        encoder = tiktoken.encoding_for_model(model)

    # Generator expression: no intermediate list is needed just to sum it.
    total_tokens = sum(len(encoder.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}')
    
# Report the token count and estimated cost of embedding every chunk.
print_embedding_cost(chunks)

In [None]:
from langchain_openai import OpenAIEmbeddings
# Embedding client built with library defaults; no model or key is passed.
# NOTE(review): presumably reads the OpenAI API key from the environment
# populated by load_dotenv — confirm.
embeddings = OpenAIEmbeddings()

In [None]:
# Embed the text of the first chunk as a single query.
vector = embeddings.embed_query(chunks[0].page_content)
# Bare expression: as the last line of a notebook cell it displays `vector`.
vector

## Inserting the Embeddings into a Pinecone Index

In [None]:
import pinecone
from langchain_community.vectorstores import Pinecone
# Two different names are in scope: `pinecone.Pinecone` is the client from
# the pinecone package, while the bare `Pinecone` imported above is the
# LangChain vector-store class.
# NOTE(review): the client is built with no arguments, so it presumably
# reads its API key from the environment — confirm.
pc = pinecone.Pinecone()

Create the Pinecone index (after deleting any existing indexes)

In [None]:
# WARNING: destructive. This deletes every index returned by
# pc.list_indexes(), not only the one this notebook creates.
for existing_index in pc.list_indexes().names():
    # The progress line is printed once per index being removed.
    print('Deleting all indexes ...', end='')
    pc.delete_index(existing_index)
    print('Done')

In [None]:
index_name = 'churchill-speech'

# Create the index only when no index with this name exists yet.
existing_indexes = pc.list_indexes().names()
if index_name not in existing_indexes:
    print(f'Creating index {index_name}...')
    pod_spec = pinecone.PodSpec(environment='gcp-starter')
    # NOTE(review): 1536 is presumably the length of the vectors produced by
    # `embeddings` — confirm if the embedding model ever changes.
    pc.create_index(name=index_name, dimension=1536, metric='cosine', spec=pod_spec)
    print('Done')

In [None]:
# Upload the chunks to the Pinecone index named by `index_name` through the
# LangChain vector-store class.
# NOTE(review): presumably each chunk is embedded with `embeddings` before
# it is uploaded — confirm.
vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)

In [None]:
# Load the vector store from an existing index without uploading anything.
# The index name is given as a literal here rather than via `index_name`.
load_vector_store = Pinecone.from_existing_index(index_name='churchill-speech', embedding=embeddings)

## Asking Questions (Similarity Search)

In [None]:
# Run a similarity search for the question against the loaded vector store.
query = 'Where should we fight?'
result = load_vector_store.similarity_search(query)
# Prints the raw search result object.
print(result)

In [None]:
# Show the text of each retrieved document, followed by a 50-dash divider.
divider = '-' * 50
for doc in result:
    print(doc.page_content)
    print(divider)

In [None]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

# Chat model that the chain uses to produce answers.
llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

# Retrieve the three most similar chunks
retriever = load_vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

# Question-answering chain that combines the model with the retriever.
chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

In [None]:
# Ask the chain the question and print the 'result' entry of its response.
query = 'Where should we fight?'
answer = chain.invoke(query)
print(answer['result'])

In [None]:
# Ask a second question through the same chain and print its 'result' entry.
query2 = 'Who was the king of Belgium at that time?'
answer2 = chain.invoke(query2)
print(answer2['result'])