In [1]:
import os
from dotenv import find_dotenv, load_dotenv
# Locate the .env file first, then load it; override=True is forwarded unchanged.
# The load call stays last so the cell still displays its return value.
env_file = find_dotenv()
load_dotenv(env_file, override=True)

True

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter settings: chunk size and overlap are measured with `len`,
# i.e. in characters (100 per chunk, 20 shared between neighbours).
splitter_settings = {
    'chunk_size': 100,
    'chunk_overlap': 20,
    'length_function': len,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

In [3]:
# Read the whole speech into memory. The encoding is passed explicitly so the
# decoded text does not depend on the platform's default locale encoding.
# NOTE(review): assumes churchill_speech.txt is UTF-8 encoded — confirm.
with open("churchill_speech.txt", encoding="utf-8") as f:
    churchill_speech = f.read()

# Split the text into Document chunks with the splitter configured earlier,
# then show the first chunk as a quick check.
chunks = text_splitter.create_documents([churchill_speech])
print(chunks[0])

page_content='Winston Churchill Speech - We Shall Fight on the Beaches
We Shall Fight on the Beaches
June 4, 1940'


In [4]:
# Report how many chunks the splitter produced
chunk_count = len(chunks)
print(f'Now you have ------> {chunk_count} chunks')

Now you have ------> 300 chunks


In [5]:
def print_embedding_cost(texts, model='text-embedding-3-small', price_per_1k_tokens=0.00002):
    """Print the token count of ``texts`` and the estimated embedding cost.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string
            (only that attribute is read).
        model: Model name handed to ``tiktoken.encoding_for_model`` to pick
            the tokenizer. Defaults to the model this notebook embeds with.
        price_per_1k_tokens: USD price per 1,000 tokens. The default is the
            value that used to be hard-coded here; check
            https://openai.com/pricing for current prices.

    Returns:
        None. The totals are printed only.
    """
    # Imported locally so tiktoken is only required when this helper is called
    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    # Generator expression: no intermediate list of per-chunk token counts
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}')
    
# Estimate the cost of embedding every chunk before calling the API
print_embedding_cost(texts=chunks)

Total Tokens: 4820
Embedding Cost in USD: 0.000096


In [6]:
from langchain_openai import OpenAIEmbeddings
# Embeddings client for the model whose cost was estimated above
embeddings = OpenAIEmbeddings(
    model='text-embedding-3-small',
    dimensions=1536,  # 512 works as well
)

In [7]:
# Embed the text of the first chunk as a quick check of the embeddings client;
# the bare name on the last line makes the cell display the vector.
first_chunk_text = chunks[0].page_content
vector = embeddings.embed_query(first_chunk_text)
vector

[0.021032487973570824,
 0.042088139802217484,
 0.07935826480388641,
 0.01917940564453602,
 0.0002213203115388751,
 -0.030761171132326126,
 -0.0038943688850849867,
 -0.0015099727315828204,
 -0.009757637977600098,
 0.018079137429594994,
 0.03261425346136093,
 -0.002582733752205968,
 0.016955705359578133,
 -0.035417042672634125,
 -0.004612438380718231,
 0.016700906679034233,
 -0.013608574867248535,
 0.003934904932975769,
 0.009827128611505032,
 0.020905088633298874,
 -0.02608213759958744,
 -0.004537156783044338,
 -0.042713552713394165,
 -0.01885511539876461,
 -0.03411988168954849,
 0.007696083746850491,
 -0.0034339935518801212,
 -0.04709146171808243,
 0.023036133497953415,
 -0.01291366945952177,
 0.030575862154364586,
 -0.0181138813495636,
 -0.0016315813409164548,
 0.03685317933559418,
 0.02791205607354641,
 -0.052951835095882416,
 0.04445081949234009,
 -0.013237958773970604,
 -0.04259773716330528,
 0.0020499725360423326,
 -0.006636351812630892,
 -0.04855076223611832,
 0.02995044738054275

## Inserting The Embeddings Into A Pinecone Index

In [9]:
import pinecone
from langchain_community.vectorstores import Pinecone
# Pinecone client used by the index-management cells below.
# NOTE(review): no credentials are passed here — presumably they come from the
# environment populated by load_dotenv in the first cell; confirm.
pc = pinecone.Pinecone()

  from tqdm.autonotebook import tqdm


In [11]:
# Remove every index currently listed for this Pinecone client.
# The progress message is printed once per index.
indexes = pc.list_indexes().names()
for existing_index in indexes:
    print('Deleting all indexes ... ', end='')
    pc.delete_index(existing_index)
    print('Done')

Deleting all indexes ... Done


In [13]:
index_name = "churcill-speech"
# Ask Pinecone for the current index names instead of reusing `indexes`:
# that list was captured before the deletion loop ran, so it can still contain
# this name and would wrongly skip creating an index that no longer exists.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name = index_name,
        dimension = 1536,  # same value as dimensions=1536 given to OpenAIEmbeddings
        metric = "cosine",
        spec=pinecone.PodSpec(
            environment = "gcp-starter"
        )
    )
    print("Done...")

Done...


In [14]:
# Embed the chunks with the `OpenAIEmbeddings` instance, insert the resulting
# vectors into the named index, and keep the returned Pinecone vector store.
vector_store = Pinecone.from_documents(
    documents=chunks,
    embedding=embeddings,
    index_name=index_name,
)

### Asking Questions (Similarity Search)

In [15]:
# Look up the stored chunks most similar to the question and show the raw
# Document objects that come back.
query = 'Where should we fight?'
result = vector_store.similarity_search(query=query)
print(result)

[Document(page_content='shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and'), Document(page_content='end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing'), Document(page_content='streets, we shall fight in the hills; we shall never surrender, and even if, which I do not for a'), Document(page_content='number of the enemy, and fought fiercely on some of the old grounds that so many of us knew so')]


In [19]:
# Print each hit's text followed by a 100-character rule
separator = '-' * 100
for doc in result:
    print(doc.page_content, separator, sep='\n')

shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and
----------------------------------------------------------------------------------------------------
end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing
----------------------------------------------------------------------------------------------------
streets, we shall fight in the hills; we shall never surrender, and even if, which I do not for a
----------------------------------------------------------------------------------------------------
number of the enemy, and fought fiercely on some of the old grounds that so many of us knew so
----------------------------------------------------------------------------------------------------


### Answering in Natural Language using an LLM

In [20]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

# Chat model that phrases the final answer
llm = ChatOpenAI(model='gpt-4o-mini', temperature=0.2)

# Retriever over the vector store: similarity search, top 3 results
retriever_settings = {'search_type': 'similarity', 'search_kwargs': {'k': 3}}
retriever = vector_store.as_retriever(**retriever_settings)

# RetrievalQA chain wiring the LLM and the retriever together with chain type 'stuff'
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
)

In [21]:
# Ask the chain the question, prefixed with an instruction to stay within the
# provided input, and show the full response dictionary.
query = 'Answer only from the provided input. Where should we fight?'
answer = chain.invoke(input=query)
print(answer)

{'query': 'Answer only from the provided input. Where should we fight?', 'result': 'We shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields, we shall fight in France, we shall fight on the seas and oceans, we shall fight in the streets, and we shall fight in the hills.'}


In [23]:
# Show only the "result" field of the most recent chain response
print(answer["result"])

We shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields, we shall fight in France, we shall fight on the seas and oceans, we shall fight in the streets, and we shall fight in the hills.


In [24]:
# Follow-up question; only the "result" field of the response is printed
query = 'What about the French Armies??'
answer = chain.invoke(input=query)
answer_text = answer["result"]
print(answer_text)

I don't know.


In [25]:
# Another question for the chain; only the "result" field of the response is printed
query = 'Who was the king of Belgium at that time?'
answer = chain.invoke(input=query)
answer_text = answer["result"]
print(answer_text)

The king of Belgium at that time was King Leopold.
