## Splitting and Embedding Text Using LangChain

In [4]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [20]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

with open('./files/churchill_speech.txt') as f:
    churchill_speech = f.read()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len
)


In [23]:
chunks = text_splitter.create_documents([churchill_speech])
print(chunks[10])

page_content='inflicted upon the enemy and the evident exhaustion of the enemy, it may well be the thought that'


In [24]:
print(chunks[10].page_content)

inflicted upon the enemy and the evident exhaustion of the enemy, it may well be the thought that


In [26]:
print(f'Now you have {len(chunks)}')

Now you have 316


### Embedding cost

In [28]:
def print_embedding_cost(texts):
    import tiktoken
    enc = tiktoken.encoding_for_model('text-embedding-ada-002')
    total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
    print(f'Total tokens: {total_tokens}')
    print(f'Embeeding cost in USD: {total_tokens / 1000 * 0.0004:.6f}')
    
print_embedding_cost(chunks)

Total tokens: 6166
Embeeding cost in USD: 0.002466


In [43]:
from langchain.embeddings import OpenAIEmbeddings 
embeddings = OpenAIEmbeddings()

In [44]:
vector = embeddings.embed_query(chunks[0].page_content)
print(vector)

[-0.03153502996889863, -0.00500315390060287, 0.015395379897472325, 0.007532296369785813, 0.0072290749635972726, 0.021818161396517294, -0.0003058057390673021, -0.016911488325398915, 0.011625786517484355, -0.01849650810580748, 0.03974957461902398, 0.022452170426267827, -0.0008958815755401924, -0.013734553252117416, -0.014458149927194545, -0.0060919948068055675, 0.008221436437299365, 0.015546990833397242, 0.006116114664930719, -0.0017331862197303616, 0.0021621757569251668, 0.008745182460201274, 0.014010209837916392, -2.3608418322481505e-05, 0.004965251399452288, -0.027496671790285695, 0.013458897597641031, -0.04327797221172821, -0.0014816503069638046, 0.005699184823967841, -0.002966746274683967, -0.015519425547346382, -0.01039911731838217, -0.00462068159852616, -0.035614737007910444, -0.013300395805864694, 0.004682703957801892, -0.022204079126519713, -0.0013524366316165065, -0.025167379973277972, 0.038288601419812066, -0.006374541782794667, 0.02719345178108104, -0.0025188054869138706, -0.

### Inserting the Embeddings into a Pinecone index

In [45]:
import os
import pinecone
from langchain.vectorstores import Pinecone

pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))

In [46]:
# deleting all indexes
indexes = pinecone.list_indexes()
for i in indexes:
    print('Deleting all indexes...', end='')
    pinecone.delete_index(i)
    print('Done')


Deleting all indexes...Done


In [47]:
index_name = 'churhill-speech'
if index_name not in pinecone.list_indexes():
    print(f'Creating index {index_name}...')
    pinecone.create_index(index_name, dimension=1536, metric='cosine')
    print('Done!')

Creating index churhill-speech...
Done!


In [48]:
# upload vectors to pinecone using langchain
vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)

### Asking Questions (Similarity Search)

In [49]:
query = 'Where should we fight?'
result = vector_store.similarity_search(query)
print(result)

[Document(page_content='which have been fought with the Germans. In France, where we were at a considerable disadvantage'), Document(page_content='believe ourselves possessed of the capacity to continue the war in the air under better conditions'), Document(page_content='institutions and our Empire. The whole fury and might of the enemy must very soon be turned on us.'), Document(page_content='continuous battle — as continuous battle it will surely be.')]


In [50]:
for r in result:
    print(r.page_content)
    print('.'*50)

which have been fought with the Germans. In France, where we were at a considerable disadvantage
..................................................
believe ourselves possessed of the capacity to continue the war in the air under better conditions
..................................................
institutions and our Empire. The whole fury and might of the enemy must very soon be turned on us.
..................................................
continuous battle — as continuous battle it will surely be.
..................................................


In [55]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(model='gpt-3.5-turbo',temperature=1)


retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k':3})

chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)


In [57]:
answer = chain.run(query)
print(answer)

Based on the given context, it is not explicitly mentioned where "we" should fight. However, it can be inferred that the speaker is referring to the war being fought against the Germans and the likelihood of the enemy turning their full force on "us," indicating that the fighting may continue in France or potentially in other parts of their Empire.
