### Base Imports

In [3]:
import os
import tiktoken
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its variables into the process environment.
# override=True is passed so values from the file take precedence over
# variables that are already set.
load_dotenv(find_dotenv(), override=True)

True

### Splitting and Embedding Text Using LangChain (Similarity Search)

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the whole speech into memory. The encoding is given explicitly so the
# text is decoded the same way on every platform instead of depending on the
# locale default (assumes the file is stored as UTF-8).
with open('files/churchill_speech.txt', encoding='utf-8') as f:
    churchill_speech = f.read()

# chunk_size and chunk_overlap are measured with length_function; with `len`
# that means they are counted in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len  # indicates how the length of the chunks should be calculated
)

In [11]:
# Creating the chunks
chunks = text_splitter.create_documents([churchill_speech])

# Print the number of chunks
print(f'Now you have {len(chunks)} chunks')

# Print the content of the chunk at index 10 (i.e. the 11th chunk)
print(chunks[10].page_content)

Now you have 300 chunks
penetration were realized and when a new French Generalissimo, General Weygand, assumed


#### Embedding Cost

In [13]:
def print_embedding_cost(texts, model='text-embedding-3-small', price_per_1k_tokens=0.00002):
    """Print the token count of ``texts`` and the estimated cost of embedding them.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string
            (this notebook passes the chunks produced by the text splitter).
        model: Name of the embedding model whose tokenizer is used for counting.
        price_per_1k_tokens: Price in USD per 1,000 tokens for ``model``. The
            default is the rate this function previously hard-coded; check the
            provider's current price list before relying on it.

    Returns:
        None. The totals are written to standard output.
    """
    # Tokenizer associated with the chosen embedding model
    enc = tiktoken.encoding_for_model(model)

    # Total number of tokens over all texts (generator: no intermediate list)
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)

    # Printing the total number of tokens and the embedding cost in USD
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}')

# Report the token count and estimated embedding cost for all chunks
print_embedding_cost(chunks)

Total Tokens: 4820
Embedding Cost in USD: 0.000096


### Creating embeddings

In [14]:
from langchain_openai import OpenAIEmbeddings

# Creating the OpenAIEmbeddings object.
# NOTE: the Pinecone index created later in this notebook is declared with
# dimension=1536, so a different `dimensions` value here presumably requires
# changing the index dimension to match — confirm before switching.
embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)  # 512 works as well

In [None]:
# Embed the content of the first chunk into a vector
vector = embeddings.embed_query(chunks[0].page_content)

# Show only the length and the first few components instead of dumping the
# entire vector into the cell output.
print(f'Vector length: {len(vector)}')
vector[:5]

### Inserting the Embeddings into a Pinecone Index

In [7]:
# Importing the necessary libraries and initializing the Pinecone client
import os
import pinecone
from langchain_community.vectorstores import Pinecone

# Initializing the Pinecone client.
# No API key is passed explicitly — presumably the client reads its credentials
# from the environment (e.g. the values loaded from .env earlier); TODO confirm.
# Note: `pinecone.Pinecone` (the client) and the imported `Pinecone`
# (the LangChain vector store) are two different classes.
pc = pinecone.Pinecone()

### Code to Delete all the existing indexes

In [8]:
# Deleting all indexes
indexes = pc.list_indexes().names()
for existing_index in indexes:
    # The loop runs once per index, so the progress line names the index
    # being removed rather than repeating a generic message.
    print(f'Deleting index {existing_index} ... ', end='')
    pc.delete_index(existing_index)
    print('Done')

Deleting all indexes ... Done


### Code to Create a New Index

In [15]:
# 1. Creating an index
from pinecone import PodSpec

# Name of the index this notebook works with
index_name = 'churchill-speech'

# Only create the index when no index with that name is listed yet
if index_name in pc.list_indexes().names():
    print(f'Index {index_name} already exists!')
else:
    print(f'Creating index {index_name}')

    # 1536-dimensional vectors compared with the cosine metric,
    # hosted on the 'gcp-starter' pod environment
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric='cosine',
        spec=PodSpec(environment='gcp-starter'),
    )

    print('Index created! 😊')

Creating index churchill-speech
Index created! 😊


In [16]:
# Processing the input documents, generating embeddings using the provided `OpenAIEmbeddings` instance,
# inserting the embeddings into the index and returning a new Pinecone vector store object.
# NOTE(review): re-running this cell presumably uploads the chunks again — confirm
# whether that duplicates vectors in the index before re-executing.
vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)

### Load the Vector Store from an Existing Index

In [23]:
# Build a vector store object on top of an index that already exists.
# NOTE(review): the index name is hard-coded here and repeats the value of
# `index_name` — keep the two in sync.
vectorStoreExisting = Pinecone.from_existing_index(index_name='churchill-speech', embedding=embeddings)

In [17]:
# First select the index
index = pc.Index(index_name)
# Last expression of the cell: the index statistics are shown as the cell output
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.003,
 'namespaces': {'': {'vector_count': 300}},
 'total_vector_count': 300}

### Asking Questions (Similarity Search)

In [21]:
# Retrieve the chunks most similar to the question. No result count is passed,
# so the vector store's default applies.
query = 'Where should we fight?'
result = vector_store.similarity_search(query)
print(f'Number of results returned: {len(result)}')

Number of results returned: 4


### Print the content of the Results

In [19]:
# Show each retrieved chunk followed by a 50-character separator line
separator = '-' * 50
for doc in result:
    print(doc.page_content, separator, sep='\n')

shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and
--------------------------------------------------
end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing
--------------------------------------------------
streets, we shall fight in the hills; we shall never surrender, and even if, which I do not for a
--------------------------------------------------
number of the enemy, and fought fiercely on some of the old grounds that so many of us knew so
--------------------------------------------------


### Answering in Natural Language using an LLM

In [24]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

# Initialize the LLM with the specified model and temperature
llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=0.2)

# Use the provided vector store with similarity search and retrieve top 3 results
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

# Create a RetrievalQA chain using the defined LLM, chain type 'stuff', and retriever.
# NOTE(review): 'stuff' presumably places all retrieved chunks into a single
# prompt — confirm against the LangChain documentation.
chain = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=retriever)


In [25]:
# Ask the chain a question. The instruction to answer only from the provided
# input is part of the query text itself.
query = 'Answer only from the provided input. Where should we fight?'
answer = chain.invoke(query)
# Printing the whole return value shows a dict with 'query' and 'result' keys
print(answer)

{'query': 'Answer only from the provided input. Where should we fight?', 'result': 'We shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and streets, we shall fight in the hills.'}


In [35]:
# Ask a follow-up question and print the question and the answer separately
query = 'Who was the king of Belgium at that time?'
answer = chain.invoke(query)
print(f'Query: {answer["query"]}')
print(f'Answer: {answer["result"]}')

Query: Who was the king of Belgium at that time?
Answer: The king of Belgium at that time was King Leopold.


In [29]:
# Ask another question and print only the answer text
query = 'What about the French Armies??'
answer = chain.invoke(query)
print(answer['result'])

The French Armies were involved in the fighting, particularly in the area around the Somme. They were tasked with advancing across the Somme in great strength.
